Merge branch release-4-6 into master

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 19 Jul 2013 07:41:13 +0000 (09:41 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 24 Jul 2013 06:03:12 +0000 (08:03 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 19 Jul 2013 07:41:13 +0000 (09:41 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 24 Jul 2013 06:03:12 +0000 (08:03 +0200)
diff --cc CMakeLists.txt

index 396aa452fd5d56a00542240ceac4a765385778e1,380b169c5db63346952fb25c3df65b65370b2f4d..ebbbdc4d5de52d2e7f374399b498d1ba915b3198
--- 1/CMakeLists.txt
--- 2/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -1166,10 -1175,21 +1166,14 @@@ get_compiler_info(C BUILD_C_COMPILER BU
   if (CMAKE_CXX_COMPILER_LOADED)
       get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
   endif ()
+ if(GMX_GPU)
+     get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
+ endif(GMX_GPU)
+ 
   
   ########################################################################
- -# Specify install locations and which subdirectories to process        #
+ +# Specify install locations
   ########################################################################
- -if (GMX_USE_RELATIVE_INSTALL_PATH)
- -    set(GMX_INSTALL_PREFIX "" CACHE STRING "Prefix gets appended to CMAKE_INSTALL_PREFIX. For cpack it sets the root folder of the archive.")
- -    mark_as_advanced(GMX_INSTALL_PREFIX)
- -else()
- -    set(GMX_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/")
- -endif()
- -
   if ( NOT DEFINED GMXLIB )
       set(GMXLIB lib)
   endif()
diff --cc admin/mkhtml

index fc9e32bb3ed3b1e0d24dfecf7b3a263abeeaf68d,ab1d8a07d66f2f764854cb13665c1680be180178..67e4504b5a90fa2e9c59031642255293544d9f56
--- 1/admin/mkhtml
--- 2/admin/mkhtml
+++ b/admin/mkhtml
@@@ -136,8 -134,9 +134,9 @@@ echo "---------------------------------
   
   cd $dir
   
+ setenv GMX_MAXBACKUP -1
   foreach program ( $PROGRAMS )
-   if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) ) then
- -  if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "luck" ) && ( $program != "demux.pl" ) ) then
++  if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "demux.pl" ) ) then
       echo -n "$program "
       cd $HTMLOL
       $GMXBINDIR/$program -quiet -man html >& /dev/null
diff --cc cmake/gmxManageGPU.cmake
Simple merge
diff --cc cmake/gmxManageNvccConfig.cmake
Simple merge
diff --cc src/config.h.cmakein
Simple merge
diff --cc src/gromacs/gmxana/gmx_genion.c

index 8c521daae19228d63d81cb4cd64709c49922e928,0000000000000000000000000000000000000000..759db606f59ebc9078abdad615f9b4ec1307fec9

mode 100644,000000..100644
--- 1/src/gromacs/gmxana/gmx_genion.c
--- /dev/null
+++ b/src/gromacs/gmxana/gmx_genion.c
@@@ -1,556 -1,0 +1,544 @@@
- static int greatest_common_divisor(int p, int q)
- {
-     int tmp;
-     while (q != 0)
-     {
-         tmp = q;
-         q = p % q;
-         p = tmp;
-     }
-     return p;
- }
- 
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Green Red Orange Magenta Azure Cyan Skyblue
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <ctype.h>
+ +#include "string2.h"
+ +#include "smalloc.h"
+ +#include "sysstuff.h"
+ +#include "confio.h"
+ +#include "statutil.h"
+ +#include "pbc.h"
+ +#include "force.h"
+ +#include "gmx_fatal.h"
+ +#include "futil.h"
+ +#include "maths.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "tpxio.h"
+ +#include "mdrun.h"
+ +#include "main.h"
+ +#include "random.h"
+ +#include "index.h"
+ +#include "mtop_util.h"
+ +#include "gmx_ana.h"
+ +
-         int gcd = greatest_common_divisor(n_q, p_q);
+ +/* Replace one randomly selected solvent molecule by an ion.
+ + *
+ + * A molecule index is drawn with rando() until one is found whose bSet[]
+ + * entry is still FALSE; after (*nwater)*1000 unsuccessful draws the run is
+ + * aborted with a fatal error. The first atom of the chosen molecule gets
+ + * charge q, its remaining nsa-1 atoms get charge zero, and repl[] stores
+ + * sign for that molecule. When rmin is positive, every molecule that is
+ + * still available and whose first atom lies closer than rmin (distance
+ + * taken from pbc_dx) is flagged in bSet[] too, so it cannot be picked by
+ + * a later call.
+ + */
+ +static void insert_ion(int nsa, int *nwater,
+ +                       gmx_bool bSet[], int repl[], atom_id index[],
+ +                       rvec x[], t_pbc *pbc,
+ +                       int sign, int q, const char *ionname,
+ +                       t_atoms *atoms,
+ +                       real rmin, int *seed)
+ +{
+ +    const int       nsolvent      = *nwater;
+ +    gmx_large_int_t attempts_left = nsolvent;
+ +    int             chosen, k;
+ +    real            cutoff2;
+ +    rvec            delta;
+ +
+ +    attempts_left *= 1000;
+ +
+ +    /* Keep drawing until a free molecule turns up or the budget is spent */
+ +    for (;; )
+ +    {
+ +        chosen = nsolvent*rando(seed);
+ +        attempts_left--;
+ +        if (!bSet[chosen] || !(attempts_left > 0))
+ +        {
+ +            break;
+ +        }
+ +    }
+ +    if (bSet[chosen])
+ +    {
+ +        gmx_fatal(FARGS, "No more replaceable solvent!");
+ +    }
+ +
+ +    fprintf(stderr, "Replacing solvent molecule %d (atom %d) with %s\n",
+ +            chosen, index[nsa*chosen], ionname);
+ +
+ +    /* Book-keeping: the molecule is taken and carries an ion of this sign */
+ +    repl[chosen] = sign;
+ +    bSet[chosen] = TRUE;
+ +
+ +    /* The first atom becomes the ion, the rest of the molecule is neutral */
+ +    atoms->atom[index[nsa*chosen]].q = q;
+ +    for (k = 1; k < nsa; k++)
+ +    {
+ +        atoms->atom[index[nsa*chosen+k]].q = 0;
+ +    }
+ +
+ +    if (!(rmin > 0))
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Exclude all free solvent molecules closer than rmin to the new ion */
+ +    cutoff2 = rmin*rmin;
+ +    for (k = 0; k < nsolvent; k++)
+ +    {
+ +        if (bSet[k])
+ +        {
+ +            continue;
+ +        }
+ +        pbc_dx(pbc, x[index[nsa*chosen]], x[index[nsa*k]], delta);
+ +        if (iprod(delta, delta) < cutoff2)
+ +        {
+ +            bSet[k] = TRUE;
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Derive an atom name from an ion molecule name.
+ + *
+ + * Returns a newly allocated copy of mname from which trailing digits and
+ + * '+'/'-' signs have been removed. Stripping stops at index 1, so the
+ + * first two characters are always kept (e.g. "MG2+" becomes "MG", while
+ + * "K+" is returned unchanged). The copy comes from strdup(); the caller
+ + * owns it.
+ + */
+ +static char *aname(const char *mname)
+ +{
+ +    char *str;
+ +    int   i;
+ +
+ +    str = strdup(mname);
+ +    i   = (int)strlen(str) - 1;
+ +    /* isdigit() is only defined for unsigned char values and EOF, so the
+ +     * character must not be passed as a (possibly negative) plain char. */
+ +    while (i > 1 && (isdigit((unsigned char)str[i]) || (str[i] == '+') || (str[i] == '-')))
+ +    {
+ +        str[i] = '\0';
+ +        i--;
+ +    }
+ +
+ +    return str;
+ +}
+ +
+ +/* Reorder the solvent group so that untouched solvent comes first,
+ + * followed by the positive and then the negative ions.
+ + *
+ + * repl[i] is 0 for an unchanged solvent molecule, > 0 for a molecule
+ + * replaced by a positive ion and < 0 for one replaced by a negative ion.
+ + * Each ion keeps only the first atom of the molecule it replaces, so
+ + * atoms->nr shrinks by (nsa-1) per ion and all atoms behind the solvent
+ + * group are shifted down accordingly. Atom and residue names of the ions
+ + * are set from p_name/n_name (atom names via aname()); these name strings
+ + * stay referenced by atoms and are therefore not freed here.
+ + */
+ +void sort_ions(int nsa, int nw, int repl[], atom_id index[],
+ +               t_atoms *atoms, rvec x[],
+ +               const char *p_name, const char *n_name)
+ +{
+ +    int    i, j, k, r, np, nn, starta, startr, npi, nni;
+ +    rvec  *xt;
+ +    char **pptr = NULL, **nptr = NULL, **paptr = NULL, **naptr = NULL;
+ +
+ +    snew(xt, atoms->nr);
+ +
+ +    /* Put all the solvent in front and count the added ions */
+ +    np = 0;
+ +    nn = 0;
+ +    j  = index[0];
+ +    for (i = 0; i < nw; i++)
+ +    {
+ +        r = repl[i];
+ +        if (r == 0)
+ +        {
+ +            for (k = 0; k < nsa; k++)
+ +            {
+ +                copy_rvec(x[index[nsa*i+k]], xt[j++]);
+ +            }
+ +        }
+ +        else if (r > 0)
+ +        {
+ +            np++;
+ +        }
+ +        else if (r < 0)
+ +        {
+ +            nn++;
+ +        }
+ +    }
+ +
+ +    if (np+nn > 0)
+ +    {
+ +        /* Put the positive and negative ions at the end */
+ +        starta = index[nsa*(nw - np - nn)];
+ +        startr = atoms->atom[starta].resind;
+ +
+ +        if (np)
+ +        {
+ +            snew(pptr, 1);
+ +            pptr[0] = strdup(p_name);
+ +            snew(paptr, 1);
+ +            paptr[0] = aname(p_name);
+ +        }
+ +        if (nn)
+ +        {
+ +            snew(nptr, 1);
+ +            nptr[0] = strdup(n_name);
+ +            snew(naptr, 1);
+ +            naptr[0] = aname(n_name);
+ +        }
+ +        npi = 0;
+ +        nni = 0;
+ +        for (i = 0; i < nw; i++)
+ +        {
+ +            r = repl[i];
+ +            if (r > 0)
+ +            {
+ +                j = starta+npi;
+ +                k = startr+npi;
+ +                copy_rvec(x[index[nsa*i]], xt[j]);
+ +                atoms->atomname[j]     = paptr;
+ +                atoms->atom[j].resind  = k;
+ +                atoms->resinfo[k].name = pptr;
+ +                npi++;
+ +            }
+ +            else if (r < 0)
+ +            {
+ +                j = starta+np+nni;
+ +                k = startr+np+nni;
+ +                copy_rvec(x[index[nsa*i]], xt[j]);
+ +                atoms->atomname[j]     = naptr;
+ +                atoms->atom[j].resind  = k;
+ +                atoms->resinfo[k].name = nptr;
+ +                nni++;
+ +            }
+ +        }
+ +        /* Shift everything behind the solvent group down over the atoms
+ +         * that disappeared with the replaced molecules */
+ +        for (i = index[nsa*nw-1]+1; i < atoms->nr; i++)
+ +        {
+ +            j                  = i-(nsa-1)*(np+nn);
+ +            atoms->atomname[j] = atoms->atomname[i];
+ +            atoms->atom[j]     = atoms->atom[i];
+ +            copy_rvec(x[i], xt[j]);
+ +        }
+ +        atoms->nr -= (nsa-1)*(np+nn);
+ +
+ +        /* Copy the new positions back */
+ +        for (i = index[0]; i < atoms->nr; i++)
+ +        {
+ +            copy_rvec(xt[i], x[i]);
+ +        }
+ +    }
+ +
+ +    /* The scratch coordinates are released on every path; previously they
+ +     * leaked when no ion had been placed. */
+ +    sfree(xt);
+ +}
+ +
+ +/* Rewrite the [ molecules ] section of a topology file after ion insertion.
+ + *
+ + * The file topinout is copied to a temporary file. All lines outside the
+ + * [ molecules ] section are passed through unchanged. Lines inside it are
+ + * collected, and the last line whose first word equals grpname (case
+ + * insensitive) is replaced by the reduced solvent count followed by one
+ + * line for each ion type that was actually added. Finally the temporary
+ + * file is renamed over topinout; opening topinout for writing first is
+ + * what triggers the backup of the old file.
+ + *
+ + * Fatal errors are raised when no matching molecule line exists, when that
+ + * line holds fewer molecules than were replaced, or when the final rename
+ + * fails.
+ + */
+ +static void update_topol(const char *topinout, int p_num, int n_num,
+ +                         const char *p_name, const char *n_name, char *grpname)
+ +{
+ +#define TEMP_FILENM "temp.top"
+ +    FILE    *fpin, *fpout;
+ +    char     buf[STRLEN], buf2[STRLEN], *temp, **mol_line = NULL;
+ +    int      i, nmol_line, sol_line, nsol_last;
+ +    size_t   len;
+ +    gmx_bool bMolecules;
+ +
+ +    printf("\nProcessing topology\n");
+ +    fpin  = ffopen(topinout, "r");
+ +    fpout = ffopen(TEMP_FILENM, "w");
+ +
+ +    bMolecules = FALSE;
+ +    nmol_line  = 0;
+ +    sol_line   = -1;
+ +    nsol_last  = -1;
+ +    while (fgets(buf, STRLEN, fpin))
+ +    {
+ +        /* Work on a trimmed copy, the original line is written out as is */
+ +        strcpy(buf2, buf);
+ +        if ((temp = strchr(buf2, '\n')) != NULL)
+ +        {
+ +            temp[0] = '\0';
+ +        }
+ +        ltrim(buf2);
+ +        if (buf2[0] == '[')
+ +        {
+ +            /* A directive line: find out whether it opens [ molecules ] */
+ +            buf2[0] = ' ';
+ +            rtrim(buf2);
+ +            len = strlen(buf2);
+ +            /* Guard the len-1 index against an empty string */
+ +            if (len > 0 && buf2[len-1] == ']')
+ +            {
+ +                buf2[len-1] = '\0';
+ +                ltrim(buf2);
+ +                rtrim(buf2);
+ +                bMolecules = (gmx_strcasecmp(buf2, "molecules") == 0);
+ +            }
+ +            fprintf(fpout, "%s", buf);
+ +        }
+ +        else if (!bMolecules)
+ +        {
+ +            fprintf(fpout, "%s", buf);
+ +        }
+ +        else
+ +        {
+ +            /* Check if this is a line with solvent molecules */
+ +            sscanf(buf, "%s", buf2);
+ +            if (gmx_strcasecmp(buf2, grpname) == 0)
+ +            {
+ +                sol_line = nmol_line;
+ +                sscanf(buf, "%*s %d", &nsol_last);
+ +            }
+ +            /* Store this molecules section line */
+ +            srenew(mol_line, nmol_line+1);
+ +            mol_line[nmol_line] = strdup(buf);
+ +            nmol_line++;
+ +        }
+ +    }
+ +    ffclose(fpin);
+ +
+ +    if (sol_line == -1)
+ +    {
+ +        ffclose(fpout);
+ +        gmx_fatal(FARGS, "No line with moleculetype '%s' found in the [ molecules ] section of file '%s'", grpname, topinout);
+ +    }
+ +    if (nsol_last < p_num+n_num)
+ +    {
+ +        ffclose(fpout);
+ +        gmx_fatal(FARGS, "The last entry for moleculetype '%s' in the [ molecules ] section of file '%s' has less solvent molecules (%d) than were replaced (%d)", grpname, topinout, nsol_last, p_num+n_num);
+ +    }
+ +
+ +    /* Print all the molecule entries */
+ +    for (i = 0; i < nmol_line; i++)
+ +    {
+ +        if (i != sol_line)
+ +        {
+ +            fprintf(fpout, "%s", mol_line[i]);
+ +        }
+ +        else
+ +        {
+ +            printf("Replacing %d solvent molecules in topology file (%s) "
+ +                   " by %d %s and %d %s ions.\n",
+ +                   p_num+n_num, topinout, p_num, p_name, n_num, n_name);
+ +            nsol_last -= p_num + n_num;
+ +            if (nsol_last > 0)
+ +            {
+ +                fprintf(fpout, "%-10s  %d\n", grpname, nsol_last);
+ +            }
+ +            if (p_num > 0)
+ +            {
+ +                fprintf(fpout, "%-15s  %d\n", p_name, p_num);
+ +            }
+ +            if (n_num > 0)
+ +            {
+ +                fprintf(fpout, "%-15s  %d\n", n_name, n_num);
+ +            }
+ +        }
+ +    }
+ +    ffclose(fpout);
+ +    /* use ffopen to generate backup of topinout */
+ +    fpout = ffopen(topinout, "w");
+ +    ffclose(fpout);
+ +    /* A failed rename would otherwise silently leave an empty topology */
+ +    if (rename(TEMP_FILENM, topinout) != 0)
+ +    {
+ +        gmx_fatal(FARGS, "Could not rename '%s' to '%s'", TEMP_FILENM, topinout);
+ +    }
+ +#undef TEMP_FILENM
+ +}
+ +
+ +/* Entry point of the genion tool.
+ + *
+ + * Reads a run input file, determines how many positive and negative ions
+ + * to add (from -np/-nn, from -conc and the box volume, and/or from
+ + * -neutral), asks for a continuous solvent group, replaces randomly chosen
+ + * solvent molecules by ions, optionally updates the topology given with
+ + * -p, and writes the resulting structure to the -o file.
+ + *
+ + * Returns 0 on success; invalid input ends in gmx_fatal(). When there is
+ + * nothing to add the program exits with status 0 without writing output.
+ + */
+ +int gmx_genion(int argc, char *argv[])
+ +{
+ +    const char        *desc[] = {
+ +        "[TT]genion[tt] randomly replaces solvent molecules with monoatomic ions.",
+ +        "The group of solvent molecules should be continuous and all molecules",
+ +        "should have the same number of atoms.",
+ +        "The user should add the ion molecules to the topology file or use",
+ +        "the [TT]-p[tt] option to automatically modify the topology.[PAR]",
+ +        "The ion molecule type, residue and atom names in all force fields",
+ +        "are the capitalized element names without sign. This molecule name",
+ +        "should be given with [TT]-pname[tt] or [TT]-nname[tt], and the",
+ +        "[TT][molecules][tt] section of your topology updated accordingly,",
+ +        "either by hand or with [TT]-p[tt]. Do not use an atom name instead!",
+ +        "[PAR]Ions which can have multiple charge states get the multiplicity",
+ +        "added, without sign, for the uncommon states only.[PAR]",
+ +        "For larger ions, e.g. sulfate we recommended using [TT]genbox[tt]."
+ +    };
+ +    const char        *bugs[] = {
+ +        "If you specify a salt concentration existing ions are not taken into "
+ +        "account. In effect you therefore specify the amount of salt to be added.",
+ +    };
+ +    static int         p_num   = 0, n_num = 0, p_q = 1, n_q = -1;
+ +    static const char *p_name  = "NA", *n_name = "CL";
+ +    static real        rmin    = 0.6, conc = 0;
+ +    static int         seed    = 1993;
+ +    static gmx_bool    bNeutral = FALSE;
+ +    static t_pargs     pa[]    = {
+ +        { "-np",    FALSE, etINT,  {&p_num}, "Number of positive ions"       },
+ +        { "-pname", FALSE, etSTR,  {&p_name}, "Name of the positive ion"      },
+ +        { "-pq",    FALSE, etINT,  {&p_q},   "Charge of the positive ion"    },
+ +        { "-nn",    FALSE, etINT,  {&n_num}, "Number of negative ions"       },
+ +        { "-nname", FALSE, etSTR,  {&n_name}, "Name of the negative ion"      },
+ +        { "-nq",    FALSE, etINT,  {&n_q},   "Charge of the negative ion"    },
+ +        { "-rmin",  FALSE, etREAL, {&rmin},  "Minimum distance between ions" },
+ +        { "-seed",  FALSE, etINT,  {&seed},  "Seed for random number generator" },
+ +        { "-conc",  FALSE, etREAL, {&conc},
+ +          "Specify salt concentration (mol/liter). This will add sufficient ions to reach up to the specified concentration as computed from the volume of the cell in the input [TT].tpr[tt] file. Overrides the [TT]-np[tt] and [TT]-nn[tt] options." },
+ +        { "-neutral", FALSE, etBOOL, {&bNeutral}, "This option will add enough ions to neutralize the system. These ions are added on top of those specified with [TT]-np[tt]/[TT]-nn[tt] or [TT]-conc[tt]. "}
+ +    };
+ +    t_topology        top;
+ +    rvec              *x, *v;
+ +    real               vol, qtot;
+ +    matrix             box;
+ +    t_atoms            atoms;
+ +    t_pbc              pbc;
+ +    int               *repl, ePBC;
+ +    atom_id           *index;
+ +    char              *grpname, title[STRLEN];
+ +    gmx_bool          *bSet;
+ +    int                i, nw, nwa, nsa, nsalt, iqtot;
+ +    output_env_t       oenv;
+ +    t_filenm           fnm[] = {
+ +        { efTPX, NULL,  NULL,      ffREAD  },
+ +        { efNDX, NULL,  NULL,      ffOPTRD },
+ +        { efSTO, "-o",  NULL,      ffWRITE },
+ +        { efTOP, "-p",  "topol",   ffOPTRW }
+ +    };
+ +#define NFILE asize(fnm)
+ +
+ +    parse_common_args(&argc, argv, PCA_BE_NICE, NFILE, fnm, asize(pa), pa,
+ +                      asize(desc), desc, asize(bugs), bugs, &oenv);
+ +
+ +    /* Check input for something sensible */
+ +    if ((p_num < 0) || (n_num < 0))
+ +    {
+ +        gmx_fatal(FARGS, "Negative number of ions to add?");
+ +    }
+ +
+ +    if (conc > 0 && (p_num > 0 || n_num > 0))
+ +    {
+ +        fprintf(stderr, "WARNING: -conc specified, overriding -nn and -np.\n");
+ +    }
+ +
+ +    /* Read atom positions and charges */
+ +    read_tps_conf(ftp2fn(efTPX, NFILE, fnm), title, &top, &ePBC, &x, &v, box, FALSE);
+ +    atoms = top.atoms;
+ +
+ +    /* Compute total charge */
+ +    qtot = 0;
+ +    for (i = 0; (i < atoms.nr); i++)
+ +    {
+ +        qtot += atoms.atom[i].q;
+ +    }
+ +    iqtot = gmx_nint(qtot);
+ +
+ +    if (conc > 0)
+ +    {
+ +        /* Compute number of ions to be added */
+ +        vol = det(box);
+ +        nsalt = gmx_nint(conc*vol*AVOGADRO/1e24);
+ +        p_num = abs(nsalt*n_q);
+ +        n_num = abs(nsalt*p_q);
+ +    }
+ +    if (bNeutral)
+ +    {
+ +        int qdelta = p_num*p_q + n_num*n_q + iqtot;
+ +        int gcd;
+ +
+ +        /* The balancing loop below adds p_q while the charge is negative
+ +         * and n_q while it is positive. That can only reach zero when p_q
+ +         * is positive and n_q is negative; anything else would loop
+ +         * forever or, with both charges zero, take a modulo by zero. */
+ +        if ((p_q <= 0) || (n_q >= 0))
+ +        {
+ +            gmx_fatal(FARGS, "Can't neutralize this system: -pq (%d) must be positive and"
+ +                      " -nq (%d) must be negative.\n", p_q, n_q);
+ +        }
+ +
+ +        /* Check if the system is neutralizable
+ +         * is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? */
++        gcd = gmx_greatest_common_divisor(n_q, p_q);
+ +        if ((qdelta % gcd) != 0)
+ +        {
+ +            gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
+ +                    " -pq %d.\n", n_q, p_q);
+ +        }
+ +
+ +        while (qdelta != 0)
+ +        {
+ +            while (qdelta < 0)
+ +            {
+ +                p_num++;
+ +                qdelta += p_q;
+ +            }
+ +            while (qdelta > 0)
+ +            {
+ +                n_num++;
+ +                qdelta += n_q;
+ +            }
+ +        }
+ +    }
+ +
+ +    if ((p_num == 0) && (n_num == 0))
+ +    {
+ +        fprintf(stderr, "No ions to add.\n");
+ +        exit(0);
+ +    }
+ +    else
+ +    {
+ +        printf("Will try to add %d %s ions and %d %s ions.\n",
+ +               p_num, p_name, n_num, n_name);
+ +        printf("Select a continuous group of solvent molecules\n");
+ +        get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &nwa, &index, &grpname);
+ +        for (i = 1; i < nwa; i++)
+ +        {
+ +            if (index[i] != index[i-1]+1)
+ +            {
+ +                gmx_fatal(FARGS, "The solvent group %s is not continuous: "
+ +                          "index[%d]=%d, index[%d]=%d",
+ +                          grpname, i, index[i-1]+1, i+1, index[i]+1);
+ +            }
+ +        }
+ +        /* The number of atoms per solvent molecule is taken from the
+ +         * length of the first residue in the group */
+ +        nsa = 1;
+ +        while ((nsa < nwa) &&
+ +               (atoms.atom[index[nsa]].resind ==
+ +                atoms.atom[index[nsa-1]].resind))
+ +        {
+ +            nsa++;
+ +        }
+ +        if (nwa % nsa)
+ +        {
+ +            gmx_fatal(FARGS, "Your solvent group size (%d) is not a multiple of %d",
+ +                      nwa, nsa);
+ +        }
+ +        nw = nwa/nsa;
+ +        fprintf(stderr, "Number of (%d-atomic) solvent molecules: %d\n", nsa, nw);
+ +        if (p_num+n_num > nw)
+ +        {
+ +            gmx_fatal(FARGS, "Not enough solvent for adding ions");
+ +        }
+ +    }
+ +
+ +    if (opt2bSet("-p", NFILE, fnm))
+ +    {
+ +        update_topol(opt2fn("-p", NFILE, fnm), p_num, n_num, p_name, n_name, grpname);
+ +    }
+ +
+ +    snew(bSet, nw);
+ +    snew(repl, nw);
+ +
+ +    /* No velocity array is allocated here: velocities are not written
+ +     * (write_sto_conf gets NULL), and allocating one used to overwrite
+ +     * the pointer filled in by read_tps_conf. */
+ +    snew(atoms.pdbinfo, atoms.nr);
+ +
+ +    set_pbc(&pbc, ePBC, box);
+ +
+ +    /* Now loop over the ions that have to be placed */
+ +    while (p_num-- > 0)
+ +    {
+ +        insert_ion(nsa, &nw, bSet, repl, index, x, &pbc,
+ +                   1, p_q, p_name, &atoms, rmin, &seed);
+ +    }
+ +    while (n_num-- > 0)
+ +    {
+ +        insert_ion(nsa, &nw, bSet, repl, index, x, &pbc,
+ +                   -1, n_q, n_name, &atoms, rmin, &seed);
+ +    }
+ +    fprintf(stderr, "\n");
+ +
+ +    if (nw)
+ +    {
+ +        sort_ions(nsa, nw, repl, index, &atoms, x, p_name, n_name);
+ +    }
+ +
+ +    sfree(atoms.pdbinfo);
+ +    atoms.pdbinfo = NULL;
+ +    write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *top.name, &atoms, x, NULL, ePBC,
+ +                   box);
+ +
+ +    /* Release the per-molecule book-keeping arrays */
+ +    sfree(bSet);
+ +    sfree(repl);
+ +
+ +    return 0;
+ +}
diff --cc src/gromacs/gmxana/gmx_tune_pme.c

index c2ad074419ceeeaccd647455e9be20dc04dc2259,0000000000000000000000000000000000000000..6c8ec79bea9645abf9872b128a620b732d100182

mode 100644,000000..100644
--- 1/src/gromacs/gmxana/gmx_tune_pme.c
--- /dev/null
+++ b/src/gromacs/gmxana/gmx_tune_pme.c
@@@ -1,2492 -1,0 +1,2501 @@@
-         sprintf(bbuf, " -np %d ", nnodes);
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2008, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +
+ +#include <time.h>
+ +#ifdef HAVE_SYS_TIME_H
+ +#include <sys/time.h>
+ +#endif
+ +
+ +
+ +
+ +#include "statutil.h"
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "vec.h"
+ +#include "copyrite.h"
+ +#include "statutil.h"
+ +#include "tpxio.h"
+ +#include "string2.h"
+ +#include "readinp.h"
+ +#include "calcgrid.h"
+ +#include "checkpoint.h"
+ +#include "macros.h"
+ +#include "gmx_ana.h"
+ +#include "names.h"
+ +#include "perf_est.h"
+ +
+ +
+ +
+ +/* Enum for situations that can occur during log file parsing, the
+ + * corresponding string entries can be found in do_the_tests() in
+ + * const char* ParseLog[] */
+ +enum {
+ +    eParselogOK,           /* performance line found and the counter reset was verified (or no presteps) */
+ +    eParselogNotFound,     /* the log file does not exist */
+ +    eParselogNoPerfData,
+ +    eParselogTerm,         /* the log reports that a signal stopped the run */
+ +    eParselogResetProblem, /* performance line found, but the counter reset could not be verified */
+ +    eParselogNoDDGrid,     /* log line starts with "There is no domain decomposition for" */
+ +    eParselogTPXVersion,   /* log line starts with "reading tpx file" */
+ +    eParselogNotParallel,  /* log line starts with "The -dd or -npme option request a parallel simulation" */
+ +    eParselogFatal,
+ +    eParselogNr            /* last entry: equals the number of codes above */
+ +};
+ +
+ +
+ +/* Performance results for one benchmark setup. The pointer members are
+ + * arrays indexed by test number (see cleandata() and parse_logfile()). */
+ +typedef struct
+ +{
+ +    int     nPMEnodes;    /* number of PME-only nodes used in this test */
+ +    int     nx, ny, nz;   /* DD grid */
+ +    int     guessPME;     /* if nPMEnodes == -1, this is the guessed number of PME nodes */
+ +    double *Gcycles;      /* This can contain more than one value if doing multiple tests */
+ +    double  Gcycles_Av;   /* NOTE(review): presumably the average over Gcycles[] - confirm */
+ +    float  *ns_per_day;   /* ns/day read from the "Performance:" line of the log, per test */
+ +    float   ns_per_day_Av; /* NOTE(review): presumably the average over ns_per_day[] - confirm */
+ +    float  *PME_f_load;     /* PME mesh/force load average*/
+ +    float   PME_f_load_Av;  /* Average average ;) ... */
+ +    char   *mdrun_cmd_line; /* Mdrun command line used for this test */
+ +} t_perf;
+ +
+ +
+ +/* Settings belonging to the set of input files that are benchmarked.
+ + * NOTE(review): the pointer members look like arrays with one entry per
+ + * input file (see the rcoulomb comment) - confirm against the code that
+ + * allocates them. */
+ +typedef struct
+ +{
+ +    int             nr_inputfiles;  /* The number of tpr and mdp input files */
+ +    gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
+ +    gmx_large_int_t orig_init_step; /* Init step for the real simulation */
+ +    real           *rcoulomb;       /* The coulomb radii [0...nr_inputfiles] */
+ +    real           *rvdw;           /* The vdW radii */
+ +    real           *rlist;          /* Neighbourlist cutoff radius */
+ +    real           *rlistlong;      /* NOTE(review): presumably the long-range neighbourlist cutoff - confirm */
+ +    int            *nkx, *nky, *nkz; /* NOTE(review): presumably the PME grid dimensions - confirm */
+ +    real           *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
+ +} t_inputinfo;
+ +
+ +
+ +/* Write a blank line followed by a horizontal rule of dashes to fp */
+ +static void sep_line(FILE *fp)
+ +{
+ +    fputs("\n------------------------------------------------------------\n", fp);
+ +}
+ +
+ +
+ +/* Wrapper for system calls.
+ + *
+ + * Runs command through system(3) and returns its result. On platforms
+ + * built with GMX_NO_SYSTEM the call is refused with a fatal error. */
+ +static int gmx_system_call(char *command)
+ +{
+ +#ifdef GMX_NO_SYSTEM
+ +    gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
+ +    /* Give the non-void function a value on this path as well, in case
+ +     * the compiler cannot tell that the fatal error does not return */
+ +    return -1;
+ +#else
+ +    return ( system(command) );
+ +#endif
+ +}
+ +
+ +
+ +/* Return non-zero when the first strlen(substring) characters of string
+ + * are equal to substring */
+ +static gmx_bool str_starts(const char *string, const char *substring)
+ +{
+ +    const size_t prefix_len = strlen(substring);
+ +
+ +    return (0 == strncmp(string, substring, prefix_len));
+ +}
+ +
+ +
+ +/* Zero the cycle count, ns/day and PME load entries of test test_nr */
+ +static void cleandata(t_perf *perfdata, int test_nr)
+ +{
+ +    perfdata->PME_f_load[test_nr] = 0.0;
+ +    perfdata->ns_per_day[test_nr] = 0.0;
+ +    perfdata->Gcycles[test_nr]    = 0.0;
+ +}
+ +
+ +/* Return TRUE when a and b differ by less than 1.0e-7 in absolute value,
+ + * FALSE otherwise */
+ +static gmx_bool is_equal(real a, real b)
+ +{
+ +    const real eps  = 1.0e-7;
+ +    real       diff = a - b;
+ +
+ +    /* Absolute value of the difference */
+ +    if (diff < 0.0)
+ +    {
+ +        diff = -diff;
+ +    }
+ +
+ +    return (diff < eps) ? TRUE : FALSE;
+ +}
+ +
+ +
+ +/* Copy the contents of the file fn_out to stdout, framed by blank lines.
+ + *
+ + * When the file cannot be opened a warning is printed to stderr and
+ + * nothing is copied; previously the NULL stream was passed on to fgets()
+ + * and fclose(). */
+ +static void finalize(const char *fn_out)
+ +{
+ +    char  buf[STRLEN];
+ +    FILE *fp;
+ +
+ +
+ +    fp = fopen(fn_out, "r");
+ +    if (fp == NULL)
+ +    {
+ +        fprintf(stderr, "WARNING: Could not open %s for reading.\n", fn_out);
+ +        return;
+ +    }
+ +    fprintf(stdout, "\n\n");
+ +
+ +    while (fgets(buf, STRLEN-1, fp) != NULL)
+ +    {
+ +        fprintf(stdout, "%s", buf);
+ +    }
+ +    fclose(fp);
+ +    fprintf(stdout, "\n\n");
+ +}
+ +
+ +
+ +/* Parser states of parse_logfile(): which of the strings that appear in
+ + * a fixed order in the log file has been found most recently */
+ +enum {
+ +    eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
+ +};
+ +
+ +static int parse_logfile(const char *logfile, const char *errfile,
+ +                         t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
+ +                         int nnodes)
+ +{
+ +    FILE           *fp;
+ +    char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
+ +    const char      matchstrdd[]  = "Domain decomposition grid";
+ +    const char      matchstrcr[]  = "resetting all time and cycle counters";
+ +    const char      matchstrbal[] = "Average PME mesh/force load:";
+ +    const char      matchstring[] = "R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G";
+ +    const char      errSIG[]      = "signal, stopping at the next";
+ +    int             iFound;
+ +    int             procs;
+ +    float           dum1, dum2, dum3, dum4;
+ +    int             ndum;
+ +    int             npme;
+ +    gmx_large_int_t resetsteps     = -1;
+ +    gmx_bool        bFoundResetStr = FALSE;
+ +    gmx_bool        bResetChecked  = FALSE;
+ +
+ +
+ +    if (!gmx_fexist(logfile))
+ +    {
+ +        fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
+ +        cleandata(perfdata, test_nr);
+ +        return eParselogNotFound;
+ +    }
+ +
+ +    fp = fopen(logfile, "r");
+ +    perfdata->PME_f_load[test_nr] = -1.0;
+ +    perfdata->guessPME            = -1;
+ +
+ +    iFound = eFoundNothing;
+ +    if (1 == nnodes)
+ +    {
+ +        iFound = eFoundDDStr; /* Skip some case statements */
+ +    }
+ +
+ +    while (fgets(line, STRLEN, fp) != NULL)
+ +    {
+ +        /* Remove leading spaces */
+ +        ltrim(line);
+ +
+ +        /* Check for TERM and INT signals from user: */
+ +        if (strstr(line, errSIG) != NULL)
+ +        {
+ +            fclose(fp);
+ +            cleandata(perfdata, test_nr);
+ +            return eParselogTerm;
+ +        }
+ +
+ +        /* Check whether cycle resetting  worked */
+ +        if (presteps > 0 && !bFoundResetStr)
+ +        {
+ +            if (strstr(line, matchstrcr) != NULL)
+ +            {
+ +                sprintf(dumstring, "step %s", gmx_large_int_pfmt);
+ +                sscanf(line, dumstring, &resetsteps);
+ +                bFoundResetStr = TRUE;
+ +                if (resetsteps == presteps+cpt_steps)
+ +                {
+ +                    bResetChecked = TRUE;
+ +                }
+ +                else
+ +                {
+ +                    sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
+ +                    sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
+ +                    fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
+ +                            "         though they were supposed to be reset at step %s!\n",
+ +                            dumstring, dumstring2);
+ +                }
+ +            }
+ +        }
+ +
+ +        /* Look for strings that appear in a certain order in the log file: */
+ +        switch (iFound)
+ +        {
+ +            case eFoundNothing:
+ +                /* Look for domain decomp grid and separate PME nodes: */
+ +                if (str_starts(line, matchstrdd))
+ +                {
+ +                    sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
+ +                           &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
+ +                    if (perfdata->nPMEnodes == -1)
+ +                    {
+ +                        perfdata->guessPME = npme;
+ +                    }
+ +                    else if (perfdata->nPMEnodes != npme)
+ +                    {
+ +                        gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
+ +                    }
+ +                    iFound = eFoundDDStr;
+ +                }
+ +                /* Catch a few errors that might have occured: */
+ +                else if (str_starts(line, "There is no domain decomposition for"))
+ +                {
+ +                    fclose(fp);
+ +                    return eParselogNoDDGrid;
+ +                }
+ +                else if (str_starts(line, "reading tpx file"))
+ +                {
+ +                    fclose(fp);
+ +                    return eParselogTPXVersion;
+ +                }
+ +                else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
+ +                {
+ +                    fclose(fp);
+ +                    return eParselogNotParallel;
+ +                }
+ +                break;
+ +            case eFoundDDStr:
+ +                /* Look for PME mesh/force balance (not necessarily present, though) */
+ +                if (str_starts(line, matchstrbal))
+ +                {
+ +                    sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
+ +                }
+ +                /* Look for matchstring */
+ +                if (str_starts(line, matchstring))
+ +                {
+ +                    iFound = eFoundAccountingStr;
+ +                }
+ +                break;
+ +            case eFoundAccountingStr:
+ +                /* Already found matchstring - look for cycle data */
+ +                if (str_starts(line, "Total  "))
+ +                {
+ +                    sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
+ +                    iFound = eFoundCycleStr;
+ +                }
+ +                break;
+ +            case eFoundCycleStr:
+ +                /* Already found cycle data - look for remaining performance info and return */
+ +                if (str_starts(line, "Performance:"))
+ +                {
+ +                    ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
+ +                    /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
+ +                    perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
+ +                    fclose(fp);
+ +                    if (bResetChecked || presteps == 0)
+ +                    {
+ +                        return eParselogOK;
+ +                    }
+ +                    else
+ +                    {
+ +                        return eParselogResetProblem;
+ +                    }
+ +                }
+ +                break;
+ +        }
+ +    } /* while */
+ +
+ +    /* Close the log file */
+ +    fclose(fp);
+ +
+ +    /* Check why there is no performance data in the log file.
+ +     * Did a fatal error occur? */
+ +    if (gmx_fexist(errfile))
+ +    {
+ +        fp = fopen(errfile, "r");
+ +        while (fgets(line, STRLEN, fp) != NULL)
+ +        {
+ +            if (str_starts(line, "Fatal error:") )
+ +            {
+ +                if (fgets(line, STRLEN, fp) != NULL)
+ +                {
+ +                    fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
+ +                            "%s\n", line);
+ +                }
+ +                fclose(fp);
+ +                cleandata(perfdata, test_nr);
+ +                return eParselogFatal;
+ +            }
+ +        }
+ +        fclose(fp);
+ +    }
+ +    else
+ +    {
+ +        fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
+ +    }
+ +
+ +    /* Giving up ... we could not find out why there is no performance data in
+ +     * the log file. */
+ +    fprintf(stdout, "No performance data in log file.\n");
+ +    cleandata(perfdata, test_nr);
+ +
+ +    return eParselogNoPerfData;
+ +}
+ +
+ +
+ +/* Evaluate the benchmark results stored in perfdata and report the winner.
+ + *
+ + * For each of the ntprs tpr settings and each of the ntests PME node counts
+ + * the Gcycles, PME/f load and ns/day values of the nrepeats repeats are
+ + * averaged. A setting counts as successful if both the averaged Gcycles and
+ + * the averaged ns/day are positive; the successful setting with the lowest
+ + * averaged Gcycles wins. The summary is written to fp (the per-setting table
+ + * only if nrepeats > 1). fn is only used in the fatal error message issued
+ + * when no setting was successful.
+ + *
+ + * Returns TRUE if the winning settings have the same rcoulomb, rvdw and
+ + * Fourier grid as settings no. 0 (i.e. the original tpr can be used),
+ + * FALSE otherwise. */
+ +static gmx_bool analyze_data(
+ +        FILE         *fp,
+ +        const char   *fn,
+ +        t_perf      **perfdata,
+ +        int           nnodes,
+ +        int           ntprs,
+ +        int           ntests,
+ +        int           nrepeats,
+ +        t_inputinfo  *info,
+ +        int          *index_tpr,    /* OUT: Nr of mdp file with best settings */
+ +        int          *npme_optimal) /* OUT: Optimal number of PME nodes */
+ +{
+ +    int      i, j, k;
+ +    int      line  = 0, line_win = -1;
+ +    int      k_win = -1, i_win = -1, winPME;
+ +    double   s     = 0.0; /* standard deviation */
+ +    t_perf  *pd;
+ +    char     strbuf[STRLEN];
+ +    /* "%12.3f" only sets a minimum width, so leave room for large values
+ +     * instead of the 12 characters + terminator of the usual case */
+ +    char     str_PME_f_load[STRLEN];
+ +    gmx_bool bCanUseOrigTPR;
+ +    gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
+ +
+ +
+ +    if (nrepeats > 1)
+ +    {
+ +        sep_line(fp);
+ +        fprintf(fp, "Summary of successful runs:\n");
+ +        fprintf(fp, "Line tpr PME nodes  Gcycles Av.     Std.dev.       ns/day        PME/f");
+ +        if (nnodes > 1)
+ +        {
+ +            fprintf(fp, "    DD grid");
+ +        }
+ +        fprintf(fp, "\n");
+ +    }
+ +
+ +
+ +    for (k = 0; k < ntprs; k++)
+ +    {
+ +        for (i = 0; i < ntests; i++)
+ +        {
+ +            /* Select the right dataset: */
+ +            pd = &(perfdata[k][i]);
+ +
+ +            pd->Gcycles_Av    = 0.0;
+ +            pd->PME_f_load_Av = 0.0;
+ +            pd->ns_per_day_Av = 0.0;
+ +
+ +            /* If the number of PME nodes was chosen automatically (-1),
+ +             * show the guessed value in parentheses */
+ +            if (pd->nPMEnodes == -1)
+ +            {
+ +                sprintf(strbuf, "(%3d)", pd->guessPME);
+ +            }
+ +            else
+ +            {
+ +                sprintf(strbuf, "     ");
+ +            }
+ +
+ +            /* Get the average run time of a setting */
+ +            for (j = 0; j < nrepeats; j++)
+ +            {
+ +                pd->Gcycles_Av    += pd->Gcycles[j];
+ +                pd->PME_f_load_Av += pd->PME_f_load[j];
+ +            }
+ +            pd->Gcycles_Av    /= nrepeats;
+ +            pd->PME_f_load_Av /= nrepeats;
+ +
+ +            for (j = 0; j < nrepeats; j++)
+ +            {
+ +                if (pd->ns_per_day[j] > 0.0)
+ +                {
+ +                    pd->ns_per_day_Av += pd->ns_per_day[j];
+ +                }
+ +                else
+ +                {
+ +                    /* Somehow the performance number was not acquired for this run,
+ +                     * therefore set the average to some negative value
+ +                     * (it becomes -1 after the division below): */
+ +                    pd->ns_per_day_Av = -1.0f*nrepeats;
+ +                    break;
+ +                }
+ +            }
+ +            pd->ns_per_day_Av /= nrepeats;
+ +
+ +            /* Nicer output: */
+ +            if (pd->PME_f_load_Av > 0.0)
+ +            {
+ +                sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
+ +            }
+ +            else
+ +            {
+ +                sprintf(str_PME_f_load, "%s", "         -  ");
+ +            }
+ +
+ +
+ +            /* We assume we had a successful run if both averages are positive */
+ +            if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
+ +            {
+ +                /* Output statistics if repeats were done */
+ +                if (nrepeats > 1)
+ +                {
+ +                    /* Calculate the standard deviation */
+ +                    s = 0.0;
+ +                    for (j = 0; j < nrepeats; j++)
+ +                    {
+ +                        s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
+ +                    }
+ +                    s /= (nrepeats - 1);
+ +                    s  = sqrt(s);
+ +
+ +                    fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
+ +                            line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
+ +                            pd->ns_per_day_Av, str_PME_f_load);
+ +                    if (nnodes > 1)
+ +                    {
+ +                        fprintf(fp, "  %3d %3d %3d", pd->nx, pd->ny, pd->nz);
+ +                    }
+ +                    fprintf(fp, "\n");
+ +                }
+ +                /* Store the indices of the best run found so far (lowest
+ +                 * averaged Gcycles) in k_win, i_win and line_win: */
+ +                if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
+ +                {
+ +                    k_win    = k;
+ +                    i_win    = i;
+ +                    line_win = line;
+ +                }
+ +                /* Only successful runs get a line number */
+ +                line++;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (k_win == -1)
+ +    {
+ +        gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
+ +    }
+ +
+ +    sep_line(fp);
+ +
+ +    winPME = perfdata[k_win][i_win].nPMEnodes;
+ +
+ +    if (1 == ntests)
+ +    {
+ +        /* We stuck to a fixed number of PME-only nodes */
+ +        sprintf(strbuf, "settings No. %d", k_win);
+ +    }
+ +    else
+ +    {
+ +        /* We have optimized the number of PME-only nodes */
+ +        if (winPME == -1)
+ +        {
+ +            sprintf(strbuf, "%s", "the automatic number of PME nodes");
+ +        }
+ +        else
+ +        {
+ +            sprintf(strbuf, "%d PME nodes", winPME);
+ +        }
+ +    }
+ +    fprintf(fp, "Best performance was achieved with %s", strbuf);
+ +    if ((nrepeats > 1) && (ntests > 1))
+ +    {
+ +        fprintf(fp, " (see line %d)", line_win);
+ +    }
+ +    fprintf(fp, "\n");
+ +
+ +    /* Only mention settings if they were modified: */
+ +    bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
+ +    bRefinedVdW  = !is_equal(info->rvdw[k_win], info->rvdw[0]    );
+ +    bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
+ +                     info->nky[k_win] == info->nky[0] &&
+ +                     info->nkz[k_win] == info->nkz[0]);
+ +
+ +    if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
+ +    {
+ +        fprintf(fp, "Optimized PME settings:\n");
+ +        bCanUseOrigTPR = FALSE;
+ +    }
+ +    else
+ +    {
+ +        bCanUseOrigTPR = TRUE;
+ +    }
+ +
+ +    if (bRefinedCoul)
+ +    {
+ +        fprintf(fp, "   New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
+ +    }
+ +
+ +    if (bRefinedVdW)
+ +    {
+ +        fprintf(fp, "   New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
+ +    }
+ +
+ +    if (bRefinedGrid)
+ +    {
+ +        fprintf(fp, "   New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
+ +                info->nkx[0], info->nky[0], info->nkz[0]);
+ +    }
+ +
+ +    if (bCanUseOrigTPR && ntprs > 1)
+ +    {
+ +        fprintf(fp, "and original PME settings.\n");
+ +    }
+ +
+ +    fflush(fp);
+ +
+ +    /* Return the index of the mdp file that showed the highest performance
+ +     * and the optimal number of PME nodes */
+ +    *index_tpr    = k_win;
+ +    *npme_optimal = winPME;
+ +
+ +    return bCanUseOrigTPR;
+ +}
+ +
+ +
+ +/* Get the commands we need to set up the runs from environment variables
+ + * and verify with a short test run that mdrun can be executed.
+ + *
+ + * *cmd_mpirun is set to a copy of $MPIRUN (default "mpirun"), or to an empty
+ + * string if bThreads is set. *cmd_mdrun is set to a copy of $MDRUN (default
+ + * "mdrun"). Both strings are allocated with strdup().
+ + *
+ + * If repeats > 0, "<mpirun><cmd_np><mdrun> -version" is executed with its
+ + * output redirected to benchtest.log, and that output is scanned to check
+ + * that mdrun started and that its MPI support matches bThreads. Any
+ + * mismatch is a fatal error. */
+ +static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char cmd_np[],
+ +                              char *cmd_mdrun[], int repeats)
+ +{
+ +    char      *command = NULL;
+ +    char      *cp;
+ +    char       line[STRLEN];
+ +    FILE      *fp;
+ +    const char def_mpirun[]   = "mpirun";
+ +    const char def_mdrun[]    = "mdrun";
+ +    const char filename[]     = "benchtest.log";
+ +
+ +    /* This string should always be identical to the one in copyrite.c,
+ +     * gmx_print_version_info() in the defined(GMX_MPI) section */
+ +    const char match_mpi[]    = "MPI library:        MPI";
+ +    const char match_mdrun[]  = "Program: ";
+ +    const char empty_mpirun[] = "";
+ +    gmx_bool   bMdrun         = FALSE;
+ +    gmx_bool   bMPI           = FALSE;
+ +
+ +
+ +    /* Get the commands we need to set up the runs from environment variables */
+ +    if (!bThreads)
+ +    {
+ +        if ( (cp = getenv("MPIRUN")) != NULL)
+ +        {
+ +            *cmd_mpirun = strdup(cp);
+ +        }
+ +        else
+ +        {
+ +            *cmd_mpirun = strdup(def_mpirun);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        *cmd_mpirun = strdup(empty_mpirun);
+ +    }
+ +
+ +    if ( (cp = getenv("MDRUN" )) != NULL)
+ +    {
+ +        *cmd_mdrun  = strdup(cp);
+ +    }
+ +    else
+ +    {
+ +        *cmd_mdrun  = strdup(def_mdrun);
+ +    }
+ +
+ +
+ +    /* If no simulations have to be performed, we are done here */
+ +    if (repeats <= 0)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Run a small test to see whether mpirun + mdrun work  */
+ +    fprintf(stdout, "Making sure that mdrun can be executed. ");
+ +    if (bThreads)
+ +    {
+ +        snew(command, strlen(*cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
+ +        sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", *cmd_mdrun, cmd_np, filename);
+ +    }
+ +    else
+ +    {
+ +        snew(command, strlen(*cmd_mpirun) + strlen(cmd_np) + strlen(*cmd_mdrun) + strlen(filename) + 50);
+ +        sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", *cmd_mpirun, cmd_np, *cmd_mdrun, filename);
+ +    }
+ +    fprintf(stdout, "Trying '%s' ... ", command);
+ +    make_backup(filename);
+ +    gmx_system_call(command);
+ +
+ +    /* Check if we find the characteristic string in the output: */
+ +    if (!gmx_fexist(filename))
+ +    {
+ +        gmx_fatal(FARGS, "Output from test run could not be found.");
+ +    }
+ +
+ +    fp = fopen(filename, "r");
+ +    if (fp == NULL)
+ +    {
+ +        gmx_fatal(FARGS, "Output from test run (%s) could not be opened for reading.", filename);
+ +    }
+ +    /* We need to scan the whole output file, since sometimes the queuing system
+ +     * also writes stuff to stdout/err. Looping on the fgets() result (rather
+ +     * than on feof()) also terminates the scan if a read error occurs. */
+ +    while (fgets(line, STRLEN, fp) != NULL)
+ +    {
+ +        if (str_starts(line, match_mdrun) )
+ +        {
+ +            bMdrun = TRUE;
+ +        }
+ +        if (str_starts(line, match_mpi) )
+ +        {
+ +            bMPI = TRUE;
+ +        }
+ +    }
+ +    fclose(fp);
+ +
+ +    if (bThreads)
+ +    {
+ +        if (bMPI)
+ +        {
+ +            gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
+ +                      "(%s)\n"
+ +                      "seems to have been compiled with MPI instead.",
+ +                      *cmd_mdrun);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        if (bMdrun && !bMPI)
+ +        {
+ +            gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
+ +                      "(%s)\n"
+ +                      "seems to have been compiled without MPI support.",
+ +                      *cmd_mdrun);
+ +        }
+ +    }
+ +
+ +    if (!bMdrun)
+ +    {
+ +        gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
+ +                  filename);
+ +    }
+ +
+ +    fprintf(stdout, "passed.\n");
+ +
+ +    /* Clean up ... */
+ +    remove(filename);
+ +    sfree(command);
+ +}
+ +
+ +
+ +/* Assemble the mdrun command line for the production run, write it to the
+ + * log file fp and, if bLaunch is set, execute it with gmx_system_call(). */
+ +static void launch_simulation(
+ +        gmx_bool    bLaunch,        /* Should the simulation be launched? */
+ +        FILE       *fp,             /* General log file */
+ +        gmx_bool    bThreads,       /* whether to use threads */
+ +        char       *cmd_mpirun,     /* Command for mpirun */
+ +        char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
+ +        char       *cmd_mdrun,      /* Command for mdrun */
+ +        char       *args_for_mdrun, /* Arguments for mdrun */
+ +        const char *simulation_tpr, /* This tpr will be simulated */
+ +        int         nPMEnodes)      /* Number of PME nodes to use */
+ +{
+ +    char  *command;
+ +
+ +
+ +    /* Make enough space for the system call command,
+ +     * (100 extra chars for -npme ... etc. options should suffice): */
+ +    snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
+ +
+ +    /* Note that the -passall options requires args_for_mdrun to be at the end
+ +     * of the command line string */
+ +    if (bThreads)
+ +    {
+ +        /* No mpirun prefix when running with threads */
+ +        sprintf(command, "%s%s-npme %d -s %s %s",
+ +                cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
+ +    }
+ +    else
+ +    {
+ +        sprintf(command, "%s%s%s -npme %d -s %s %s",
+ +                cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
+ +    }
+ +
+ +    fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
+ +    sep_line(fp);
+ +    fflush(fp);
+ +
+ +    /* Now the real thing! */
+ +    if (bLaunch)
+ +    {
+ +        fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
+ +        sep_line(stdout);
+ +        fflush(stdout);
+ +        gmx_system_call(command);
+ +    }
+ +
+ +    /* The command string is not needed any more */
+ +    sfree(command);
+ +}
+ +
+ +
+ +/* Read the tpr file fn_best_tpr, set its nsteps and init_step to the given
+ + * values and write the result to fn_sim_tpr. */
+ +static void modify_PMEsettings(
+ +        gmx_large_int_t simsteps,    /* Set this value as number of time steps */
+ +        gmx_large_int_t init_step,   /* Set this value as init_step */
+ +        const char     *fn_best_tpr, /* tpr file with the best performance */
+ +        const char     *fn_sim_tpr)  /* name of tpr file to be launched */
+ +{
+ +    t_inputrec   *ir;
+ +    t_state       state;
+ +    gmx_mtop_t    mtop;
+ +
+ +    snew(ir, 1);
+ +    read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
+ +
+ +    /* Reset nsteps and init_step to the value of the input .tpr file */
+ +    ir->nsteps    = simsteps;
+ +    ir->init_step = init_step;
+ +
+ +    /* Write the tpr file which will be launched. The file name is printed
+ +     * as a "%s" argument and never becomes part of a format string, so its
+ +     * length and content (e.g. a '%' character) cannot cause problems. */
+ +    fprintf(stdout, "Writing optimized simulation file %s with nsteps=", fn_sim_tpr);
+ +    fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
+ +    fprintf(stdout, ".\n");
+ +    fflush(stdout);
+ +    write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
+ +
+ +    sfree(ir);
+ +}
+ +
+ +
+ +/* Evaluates to true if the Coulomb type e is eelPMESWITCH or
+ + * eelPMEUSERSWITCH */
+ +#define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
+ +
+ +/* Make additional TPR files with more computational load for the
+ + * direct space processors.
+ + *
+ + * The input file fn_sim_tpr is read and *ntprs benchmark files are written
+ + * under the names stored in fn_bench_tprs[] (input name without ".tpr" plus
+ + * "_benchNN.tpr"). All benchmark files use nsteps = benchsteps (+ statesteps
+ + * if statesteps > 0) and init_step = 0; the original values are saved in
+ + * info->orig_sim_steps and info->orig_init_step. File no. 0 keeps the
+ + * original cutoffs and grid, for the others rcoulomb is varied between rmin
+ + * and rmax and the Fourier grid is recalculated with the spacing scaled by
+ + * the same factor. The settings of each file are stored in info and a table
+ + * of them is written to fp.
+ + *
+ + * It is a fatal error if the input does not use PME electrostatics or if
+ + * its rcoulomb/rlist combination fails the checks below.
+ + *
+ + * NOTE(review): for *ntprs == 2 with rcoulomb different from both rmin and
+ + * rmax the step size below divides by zero; presumably the caller rules
+ + * this combination out - confirm. */
+ +static void make_benchmark_tprs(
+ +        const char     *fn_sim_tpr,      /* READ : User-provided tpr file                 */
+ +        char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files           */
+ +        gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs       */
+ +        gmx_large_int_t statesteps,      /* Step counter in checkpoint file               */
+ +        real            rmin,            /* Minimal Coulomb radius                        */
+ +        real            rmax,            /* Maximal Coulomb radius                        */
+ +        real            bScaleRvdw,      /* Scale rvdw along with rcoulomb (declared as
+ +                                            real, but used as a boolean flag)             */
+ +        int            *ntprs,           /* No. of TPRs to write, each with a different
+ +                                            rcoulomb and fourierspacing                   */
+ +        t_inputinfo    *info,            /* Contains information about mdp file options   */
+ +        FILE           *fp)              /* Write the output here                         */
+ +{
+ +    int           i, j, d;
+ +    t_inputrec   *ir;
+ +    t_state       state;
+ +    gmx_mtop_t    mtop;
+ +    real          nlist_buffer;     /* Thickness of the buffer regions for PME-switch potentials */
+ +    char          buf[200];
+ +    rvec          box_size;
+ +    gmx_bool      bNote = FALSE;
+ +    real          add;              /* Add this to rcoul for the next test    */
+ +    real          fac = 1.0;        /* Scaling factor for Coulomb radius      */
+ +    real          fourierspacing;   /* Basic fourierspacing from tpr          */
+ +
+ +
+ +    sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
+ +            *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
+ +    fprintf(stdout, buf, benchsteps);
+ +    if (statesteps > 0)
+ +    {
+ +        sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
+ +        fprintf(stdout, buf, statesteps);
+ +        benchsteps += statesteps;
+ +    }
+ +    fprintf(stdout, ".\n");
+ +
+ +
+ +    snew(ir, 1);
+ +    read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
+ +
+ +    /* Check if some kind of PME was chosen */
+ +    if (EEL_PME(ir->coulombtype) == FALSE)
+ +    {
+ +        gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
+ +                  EELTYPE(eelPME));
+ +    }
+ +
+ +    /* Check if rcoulomb == rlist, which is necessary for plain PME. */
+ +    if (  (ir->cutoff_scheme != ecutsVERLET) &&
+ +          (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
+ +    {
+ +        gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
+ +                  EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
+ +    }
+ +    /* For other PME types, rcoulomb is allowed to be smaller than rlist */
+ +    else if (ir->rcoulomb > ir->rlist)
+ +    {
+ +        gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
+ +                  EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
+ +    }
+ +
+ +    if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
+ +    {
+ +        fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
+ +        bScaleRvdw = FALSE;
+ +    }
+ +
+ +    /* Reduce the number of steps for the benchmarks */
+ +    info->orig_sim_steps = ir->nsteps;
+ +    ir->nsteps           = benchsteps;
+ +    /* We must not use init_step from the input tpr file for the benchmarks */
+ +    info->orig_init_step = ir->init_step;
+ +    ir->init_step        = 0;
+ +
+ +    /* For PME-switch potentials, keep the radial distance of the buffer region */
+ +    nlist_buffer   = ir->rlist - ir->rcoulomb;
+ +
+ +    /* Determine length of triclinic box vectors */
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        box_size[d] = 0;
+ +        for (i = 0; i < DIM; i++)
+ +        {
+ +            box_size[d] += state.box[d][i]*state.box[d][i];
+ +        }
+ +        box_size[d] = sqrt(box_size[d]);
+ +    }
+ +
+ +    if (ir->fourier_spacing > 0)
+ +    {
+ +        info->fsx[0] = ir->fourier_spacing;
+ +        info->fsy[0] = ir->fourier_spacing;
+ +        info->fsz[0] = ir->fourier_spacing;
+ +    }
+ +    else
+ +    {
+ +        /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
+ +        info->fsx[0] = box_size[XX]/ir->nkx;
+ +        info->fsy[0] = box_size[YY]/ir->nky;
+ +        info->fsz[0] = box_size[ZZ]/ir->nkz;
+ +    }
+ +
+ +    /* Choose the base fourierspacing that gets scaled for the benchmark tprs:
+ +     * the value stored in the tpr file if one is set there, otherwise the
+ +     * reconstruction from grid and box size */
+ +    if (ir->fourier_spacing > 0)
+ +    {
+ +        /* Use the spacing from the tpr */
+ +        fourierspacing = ir->fourier_spacing;
+ +    }
+ +    else
+ +    {
+ +        /* Use the maximum observed spacing */
+ +        fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
+ +    }
+ +
+ +    fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
+ +
+ +    /* For performance comparisons the number of particles is useful to have */
+ +    fprintf(fp, "   Number of particles  : %d\n", mtop.natoms);
+ +
+ +    /* Print information about settings of which some are potentially modified: */
+ +    fprintf(fp, "   Coulomb type         : %s\n", EELTYPE(ir->coulombtype));
+ +    fprintf(fp, "   Grid spacing x y z   : %f %f %f\n",
+ +            box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
+ +    fprintf(fp, "   Van der Waals type   : %s\n", EVDWTYPE(ir->vdwtype));
+ +    if (EVDW_SWITCHED(ir->vdwtype))
+ +    {
+ +        fprintf(fp, "   rvdw_switch          : %f nm\n", ir->rvdw_switch);
+ +    }
+ +    if (EPME_SWITCHED(ir->coulombtype))
+ +    {
+ +        fprintf(fp, "   rlist                : %f nm\n", ir->rlist);
+ +    }
+ +    if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
+ +    {
+ +        fprintf(fp, "   rlistlong            : %f nm\n", ir->rlistlong);
+ +    }
+ +
+ +    /* Print a descriptive line about the tpr settings tested */
+ +    fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
+ +    fprintf(fp, " No.   scaling  rcoulomb");
+ +    fprintf(fp, "  nkx  nky  nkz");
+ +    fprintf(fp, "   spacing");
+ +    if (evdwCUT == ir->vdwtype)
+ +    {
+ +        fprintf(fp, "      rvdw");
+ +    }
+ +    if (EPME_SWITCHED(ir->coulombtype))
+ +    {
+ +        fprintf(fp, "     rlist");
+ +    }
+ +    if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
+ +    {
+ +        fprintf(fp, " rlistlong");
+ +    }
+ +    fprintf(fp, "  tpr file\n");
+ +
+ +    /* Loop to create the requested number of tpr input files */
+ +    for (j = 0; j < *ntprs; j++)
+ +    {
+ +        /* The first .tpr is the provided one, just need to modify nsteps,
+ +         * so skip the following block */
+ +        if (j != 0)
+ +        {
+ +            /* Determine which Coulomb radii rc to use in the benchmarks */
+ +            add = (rmax-rmin)/(*ntprs-1);
+ +            if (is_equal(rmin, info->rcoulomb[0]))
+ +            {
+ +                ir->rcoulomb = rmin + j*add;
+ +            }
+ +            else if (is_equal(rmax, info->rcoulomb[0]))
+ +            {
+ +                ir->rcoulomb = rmin + (j-1)*add;
+ +            }
+ +            else
+ +            {
+ +                /* rmin != rcoul != rmax, ergo test between rmin and rmax */
+ +                add          = (rmax-rmin)/(*ntprs-2);
+ +                ir->rcoulomb = rmin + (j-1)*add;
+ +            }
+ +
+ +            /* Determine the scaling factor fac */
+ +            fac = ir->rcoulomb/info->rcoulomb[0];
+ +
+ +            /* Scale the Fourier grid spacing */
+ +            ir->nkx = ir->nky = ir->nkz = 0;
+ +            calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
+ +
+ +            /* Adjust other radii since various conditions need to be fulfilled */
+ +            if (eelPME == ir->coulombtype)
+ +            {
+ +                /* plain PME, rcoulomb must be equal to rlist */
+ +                ir->rlist = ir->rcoulomb;
+ +            }
+ +            else
+ +            {
+ +                /* rlist must be >= rcoulomb, we keep the size of the buffer region */
+ +                ir->rlist = ir->rcoulomb + nlist_buffer;
+ +            }
+ +
+ +            if (bScaleRvdw && evdwCUT == ir->vdwtype)
+ +            {
+ +                /* For vdw cutoff, rvdw >= rlist */
+ +                ir->rvdw = max(info->rvdw[0], ir->rlist);
+ +            }
+ +
+ +            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
+ +
+ +        } /* end of "if (j != 0)" */
+ +
+ +        /* for j==0: Save the original settings
+ +         * for j >0: Save modified radii and Fourier grids */
+ +        info->rcoulomb[j]  = ir->rcoulomb;
+ +        info->rvdw[j]      = ir->rvdw;
+ +        info->nkx[j]       = ir->nkx;
+ +        info->nky[j]       = ir->nky;
+ +        info->nkz[j]       = ir->nkz;
+ +        info->rlist[j]     = ir->rlist;
+ +        info->rlistlong[j] = ir->rlistlong;
+ +        info->fsx[j]       = fac*fourierspacing;
+ +        info->fsy[j]       = fac*fourierspacing;
+ +        info->fsz[j]       = fac*fourierspacing;
+ +
+ +        /* Write the benchmark tpr file. strncpy() does not terminate the
+ +         * copied prefix, so terminate it explicitly before appending. */
+ +        strncpy(fn_bench_tprs[j], fn_sim_tpr, strlen(fn_sim_tpr)-strlen(".tpr"));
+ +        fn_bench_tprs[j][strlen(fn_sim_tpr)-strlen(".tpr")] = '\0';
+ +        sprintf(buf, "_bench%.2d.tpr", j);
+ +        strcat(fn_bench_tprs[j], buf);
+ +        fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
+ +        fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
+ +        if (j > 0)
+ +        {
+ +            fprintf(stdout, ", scaling factor %f\n", fac);
+ +        }
+ +        else
+ +        {
+ +            fprintf(stdout, ", unmodified settings\n");
+ +        }
+ +
+ +        write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
+ +
+ +        /* Write information about modified tpr settings to log file */
+ +        fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
+ +        fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
+ +        fprintf(fp, " %9f ", info->fsx[j]);
+ +        if (evdwCUT == ir->vdwtype)
+ +        {
+ +            fprintf(fp, "%10f", ir->rvdw);
+ +        }
+ +        if (EPME_SWITCHED(ir->coulombtype))
+ +        {
+ +            fprintf(fp, "%10f", ir->rlist);
+ +        }
+ +        /* Same condition as for the table header, evaluated on the original
+ +         * settings so that every row has the same columns */
+ +        if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
+ +        {
+ +            fprintf(fp, "%10f", ir->rlistlong);
+ +        }
+ +        fprintf(fp, "  %-14s\n", fn_bench_tprs[j]);
+ +
+ +        /* Make it clear to the user that some additional settings were modified */
+ +        if (!is_equal(ir->rvdw, info->rvdw[0])
+ +            || !is_equal(ir->rlistlong, info->rlistlong[0]) )
+ +        {
+ +            bNote = TRUE;
+ +        }
+ +    }
+ +    if (bNote)
+ +    {
+ +        fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
+ +                "other input settings were also changed (see table above).\n"
+ +                "Please check if the modified settings are appropriate.\n");
+ +    }
+ +    fflush(stdout);
+ +    fflush(fp);
+ +    sfree(ir);
+ +}
+ +
+ +
+ +/* Tidy up after one benchmark run: the -bg log file (and, if bKeepStderr is
+ + * set, the -err file) is renamed to "<name>_no<k>_np<nnodes>_npme<nPMEnodes>"
+ + * with "_<nr>" appended if nr > 0; the -p file is left alone; all other
+ + * -b* files that are set or non-optional are removed. */
+ +static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
+ +                    int nPMEnodes, int nr, gmx_bool bKeepStderr)
+ +{
+ +    char        suffix[STRLEN];
+ +    char        renamed[STRLEN];
+ +    const char *path = NULL;
+ +    const char *optname;
+ +    int         f;
+ +
+ +
+ +    fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
+ +
+ +    /* The repeat number suffix is the same for all renamed files */
+ +    suffix[0] = '\0';
+ +    if (nr > 0)
+ +    {
+ +        sprintf(suffix, "_%d", nr);
+ +    }
+ +
+ +    for (f = 0; f < nfile; f++)
+ +    {
+ +        optname = fnm[f].opt;
+ +
+ +        if (0 == strcmp(optname, "-p"))
+ +        {
+ +            /* Keep this file untouched */
+ +            continue;
+ +        }
+ +
+ +        if (0 == strcmp(optname, "-bg"))
+ +        {
+ +            /* Encode the parameters used in the name of the log file */
+ +            path = opt2fn("-bg", nfile, fnm);
+ +            sprintf(renamed, "%s_no%d_np%d_npme%d%s", path, k, nnodes, nPMEnodes, suffix);
+ +            if (gmx_fexist(path))
+ +            {
+ +                fprintf(stdout, "renaming log file to %s\n", renamed);
+ +                make_backup(renamed);
+ +                rename(path, renamed);
+ +            }
+ +            continue;
+ +        }
+ +
+ +        if (0 == strcmp(optname, "-err"))
+ +        {
+ +            /* The stderr output is only worth keeping if something went wrong */
+ +            path = opt2fn(optname, nfile, fnm);
+ +            sprintf(renamed, "%s_no%d_np%d_npme%d%s", path, k, nnodes, nPMEnodes, suffix);
+ +            if (!gmx_fexist(path))
+ +            {
+ +                continue;
+ +            }
+ +            if (bKeepStderr)
+ +            {
+ +                fprintf(stdout, "Saving stderr output in %s\n", renamed);
+ +                make_backup(renamed);
+ +                rename(path, renamed);
+ +            }
+ +            else
+ +            {
+ +                fprintf(stdout, "Deleting %s\n", path);
+ +                remove(path);
+ +            }
+ +            continue;
+ +        }
+ +
+ +        /* Remaining per-run benchmark files (options -b*) are removed */
+ +        if ( (0 == strncmp(optname, "-b", 2)) && (opt2bSet(optname, nfile, fnm) || !is_optional(&fnm[f])) )
+ +        {
+ +            path = opt2fn(optname, nfile, fnm);
+ +            if (gmx_fexist(path))
+ +            {
+ +                fprintf(stdout, "Deleting %s\n", path);
+ +                remove(path);
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Returns the largest integer that divides both n1 and n2, found by
+ + * searching downwards from min(n1, n2). Returns 0 if min(n1, n2) is not
+ + * positive. */
+ +static int largest_common_factor(int n1, int n2)
+ +{
+ +    int candidate = min(n1, n2);
+ +
+ +    while (candidate > 0)
+ +    {
+ +        if ((n1 % candidate) == 0 && (n2 % candidate) == 0)
+ +        {
+ +            break;
+ +        }
+ +        candidate--;
+ +    }
+ +
+ +    /* A positive start value always ends at a divisor (1 at the latest) */
+ +    return (candidate > 0) ? candidate : 0;
+ +}
+ +
+ +/* Ways of choosing which numbers of PME nodes are put on the test list.
+ + * eNpmeNr is presumably the number of entries - TODO confirm. */
+ +enum {
+ +    eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
+ +};
+ +
+ +/* Create a list of numbers of PME nodes to test.
+ + *
+ + * The list contains, in descending order, the accepted values from
+ + * maxPMEnodes down to minPMEnodes (zero itself is left out of that range),
+ + * followed by the two values that are always tested: 0 and -1 (the latter is
+ + * reported as "auto"). A value is accepted if the largest common factor of the
+ + * resulting numbers of PP and PME nodes reaches a minimum that depends on the
+ + * chosen strategy. The array is allocated here and returned via nPMEnodes. */
+ +static void make_npme_list(
+ +        const char *npmevalues_opt, /* Make a complete list with all
+ +                                     * possibilities or a short list that keeps only
+ +                                     * reasonable numbers of PME nodes                  */
+ +        int        *nentries,       /* Number of entries we put in the nPMEnodes list   */
+ +        int        *nPMEnodes[],    /* Each entry contains the value for -npme          */
+ +        int         nnodes,         /* Total number of nodes to do the tests on         */
+ +        int         minPMEnodes,    /* Minimum number of PME nodes                      */
+ +        int         maxPMEnodes)    /* Maximum number of PME nodes                      */
+ +{
+ +    int i, npme, npp;
+ +    int min_factor = 1;   /* We request that npp and npme have this minimal
+ +                           * largest common factor (depends on npp)           */
+ +    int nlistmax;         /* Max. list size                                   */
+ +    int nlist;            /* Actual number of entries in list                 */
+ +    int eNPME = 0;
+ +
+ +
+ +    /* Do we need to check all possible values for -npme or is a reduced list enough? */
+ +    if (0 == strcmp(npmevalues_opt, "all") )
+ +    {
+ +        eNPME = eNpmeAll;
+ +    }
+ +    else if (0 == strcmp(npmevalues_opt, "subset") )
+ +    {
+ +        eNPME = eNpmeSubset;
+ +    }
+ +    else /* "auto" or "range" */
+ +    {
+ +        if (nnodes <= 64)
+ +        {
+ +            eNPME = eNpmeAll;
+ +        }
+ +        else if (nnodes < 128)
+ +        {
+ +            eNPME = eNpmeReduced;
+ +        }
+ +        else
+ +        {
+ +            eNPME = eNpmeSubset;
+ +        }
+ +    }
+ +
+ +    /* Calculate how many entries we could possibly have (in case of -npme all).
+ +     * The count always includes the two trailing entries 0 and -1. */
+ +    if (nnodes > 2)
+ +    {
+ +        nlistmax = maxPMEnodes - minPMEnodes + 3;
+ +        if (0 == minPMEnodes)
+ +        {
+ +            /* The range would end at 0, which is already one of the two
+ +             * trailing entries */
+ +            nlistmax--;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* There is no range to scan, but the two trailing entries (0 and -1)
+ +         * written below still need their slots. A list size of 1 here made
+ +         * the second of these writes land outside the allocated array. */
+ +        nlistmax = 2;
+ +    }
+ +
+ +    /* Now make the actual list which is at most of size nlistmax */
+ +    snew(*nPMEnodes, nlistmax);
+ +    nlist = 0; /* start counting again, now the real entries in the list */
+ +    for (i = 0; i < nlistmax - 2; i++)
+ +    {
+ +        npme = maxPMEnodes - i;
+ +        npp  = nnodes-npme;
+ +        switch (eNPME)
+ +        {
+ +            case eNpmeAll:
+ +                min_factor = 1;
+ +                break;
+ +            case eNpmeReduced:
+ +                min_factor = 2;
+ +                break;
+ +            case eNpmeSubset:
+ +                /* For 2d PME we want a common largest factor of at least the cube
+ +                 * root of the number of PP nodes */
+ +                min_factor = (int) pow(npp, 1.0/3.0);
+ +                break;
+ +            default:
+ +                gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
+ +                break;
+ +        }
+ +        if (largest_common_factor(npp, npme) >= min_factor)
+ +        {
+ +            (*nPMEnodes)[nlist] = npme;
+ +            nlist++;
+ +        }
+ +    }
+ +    /* We always test 0 PME nodes and the automatic number */
+ +    *nentries             = nlist + 2;
+ +    (*nPMEnodes)[nlist  ] =  0;
+ +    (*nPMEnodes)[nlist+1] = -1;
+ +
+ +    fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
+ +    for (i = 0; i < *nentries-1; i++)
+ +    {
+ +        fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
+ +    }
+ +    fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
+ +}
+ +
+ +
+ +/* Allocate memory to store the performance data.
+ + *
+ + * For each of the ntprs input files an array of 'datasets' t_perf entries is
+ + * allocated, and every entry gets arrays with one Gcycles, ns_per_day and
+ + * PME_f_load slot per repeat. */
+ +static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
+ +{
+ +    int i, k;
+ +
+ +
+ +    for (k = 0; k < ntprs; k++)
+ +    {
+ +        snew(perfdata[k], datasets);
+ +        for (i = 0; i < datasets; i++)
+ +        {
+ +            /* Each array is allocated exactly once with 'repeats' elements.
+ +             * Doing this inside a loop over the repeats would assign a new
+ +             * buffer to the same pointers again and again and lose all but
+ +             * the last one. */
+ +            snew(perfdata[k][i].Gcycles, repeats);
+ +            snew(perfdata[k][i].ns_per_day, repeats);
+ +            snew(perfdata[k][i].PME_f_load, repeats);
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Check for errors on mdrun -h.
+ + *
+ + * Runs the benchmark command line with "-h -quiet" appended, so that an
+ + * argument that is not accepted shows up before any benchmark is started.
+ + * If the call returns non-zero, an explanatory message that includes the
+ + * offending command is written to stderr and to fp, and the program exits
+ + * with that return value.
+ + *
+ + * mdrun_cmd_line  command line to test
+ + * length          size used to dimension the internal buffers; has to be at
+ + *                 least the length of mdrun_cmd_line
+ + * fp              output file that also receives the error message */
+ +static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
+ +{
+ +    char *command, *msg;
+ +    int   ret;
+ +
+ +
+ +    /* Extra room for the appended "-h -quiet" and for the fixed message text */
+ +    snew(command, length +  15);
+ +    snew(msg, length + 500);
+ +
+ +    fprintf(stdout, "Making sure the benchmarks can be executed ...\n");
+ +    sprintf(command, "%s-h -quiet", mdrun_cmd_line);
+ +    ret = gmx_system_call(command);
+ +
+ +    if (0 != ret)
+ +    {
+ +        /* To prevent confusion, do not again issue a gmx_fatal here since we already
+ +         * get the error message from mdrun itself */
+ +        sprintf(msg,    "Cannot run the benchmark simulations! Please check the error message of\n"
+ +                "mdrun for the source of the problem. Did you provide a command line\n"
+ +                "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
+ +                "\n%s\n\n", command);
+ +
+ +        fprintf(stderr, "%s", msg);
+ +        sep_line(fp);
+ +        fprintf(fp, "%s", msg);
+ +
+ +        sfree(command);
+ +        sfree(msg    );
+ +
+ +        exit(ret);
+ +    }
+ +
+ +    sfree(command);
+ +    sfree(msg    );
+ +}
+ +
+ +
+ +/* Run all benchmarks and record the timings.
+ + *
+ + * For every tpr file, every -npme value on the test list and every repeat, an
+ + * mdrun command line is assembled and executed through gmx_system_call() with
+ + * stdout discarded and stderr redirected to the -err file. The outcome is
+ + * parsed with parse_logfile(), written as one line to fp, and the files of
+ + * that run are cleaned up. Before the very first benchmark,
+ + * make_sure_it_runs() checks that the command line is accepted at all.
+ + * If repeats is 0, nothing is benchmarked and the program exits.
+ + * The number of -npme values that are tested is returned in *pmeentries. */
+ +static void do_the_tests(
+ +        FILE           *fp,             /* General g_tune_pme output file         */
+ +        char          **tpr_names,      /* Filenames of the input files to test   */
+ +        int             maxPMEnodes,    /* Max fraction of nodes to use for PME   */
+ +        int             minPMEnodes,    /* Min fraction of nodes to use for PME   */
+ +        int             npme_fixed,     /* If >= -1, test fixed number of PME
+ +                                         * nodes only                             */
+ +        const char     *npmevalues_opt, /* Which -npme values should be tested    */
+ +        t_perf        **perfdata,       /* Here the performance data is stored    */
+ +        int            *pmeentries,     /* Entries in the nPMEnodes list          */
+ +        int             repeats,        /* Repeat each test this often            */
+ +        int             nnodes,         /* Total number of nodes = nPP + nPME     */
+ +        int             nr_tprs,        /* Total number of tpr files to test      */
+ +        gmx_bool        bThreads,       /* Threads or MPI?                        */
+ +        char           *cmd_mpirun,     /* mpirun command string                  */
+ +        char           *cmd_np,         /* "-np", "-n", whatever mpirun needs     */
+ +        char           *cmd_mdrun,      /* mdrun command string                   */
+ +        char           *cmd_args_bench, /* arguments for mdrun in a string        */
+ +        const t_filenm *fnm,            /* List of filenames from command line    */
+ +        int             nfile,          /* Number of files specified on the cmdl. */
+ +        int             presteps,       /* DLB equilibration steps, is checked    */
+ +        gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint    */
+ +{
+ +    int      i, nr, k, ret, count = 0, totaltests;
+ +    int     *nPMEnodes = NULL;
+ +    t_perf  *pd        = NULL;
+ +    int      cmdline_length;
+ +    char    *command, *cmd_stub;
+ +    char     buf[STRLEN];
+ +    gmx_bool bResetProblem = FALSE;
+ +    gmx_bool bFirst        = TRUE;
+ +
+ +
+ +    /* This string array corresponds to the eParselog enum type at the start
+ +     * of this file */
+ +    const char* ParseLog[] = {
+ +        "OK.",
+ +        "Logfile not found!",
+ +        "No timings, logfile truncated?",
+ +        "Run was terminated.",
+ +        "Counters were not reset properly.",
+ +        "No DD grid found for these settings.",
+ +        "TPX version conflict!",
+ +        "mdrun was not started in parallel!",
+ +        "An error occured."
+ +    };
+ +    char        str_PME_f_load[13];
+ +
+ +
+ +    /* Allocate space for the mdrun command line. 100 extra characters should
+ +       be more than enough for the -npme etcetera arguments.
+ +       NOTE(review): only the length of tpr_names[0] enters the estimate; this
+ +       assumes the other tpr names are not much longer - confirm. */
+ +    cmdline_length =  strlen(cmd_mpirun)
+ +        + strlen(cmd_np)
+ +        + strlen(cmd_mdrun)
+ +        + strlen(cmd_args_bench)
+ +        + strlen(tpr_names[0]) + 100;
+ +    snew(command, cmdline_length);
+ +    snew(cmd_stub, cmdline_length);
+ +
+ +    /* Construct the part of the command line that stays the same for all tests: */
+ +    if (bThreads)
+ +    {
+ +        sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
+ +    }
+ +    else
+ +    {
+ +        sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
+ +    }
+ +
+ +    /* Create a list of numbers of PME nodes to test */
+ +    if (npme_fixed < -1)
+ +    {
+ +        make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
+ +                       nnodes, minPMEnodes, maxPMEnodes);
+ +    }
+ +    else
+ +    {
+ +        *pmeentries  = 1;
+ +        snew(nPMEnodes, 1);
+ +        nPMEnodes[0] = npme_fixed;
+ +        fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
+ +    }
+ +
+ +    if (0 == repeats)
+ +    {
+ +        fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
+ +        ffclose(fp);
+ +        finalize(opt2fn("-p", nfile, fnm));
+ +        exit(0);
+ +    }
+ +
+ +    /* Allocate one dataset for each tpr input file: */
+ +    init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
+ +
+ +    /*****************************************/
+ +    /* Main loop over all tpr files to test: */
+ +    /*****************************************/
+ +    totaltests = nr_tprs*(*pmeentries)*repeats;
+ +    for (k = 0; k < nr_tprs; k++)
+ +    {
+ +        fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
+ +        fprintf(fp, "PME nodes      Gcycles       ns/day        PME/f    Remark\n");
+ +        /* Loop over various numbers of PME nodes: */
+ +        for (i = 0; i < *pmeentries; i++)
+ +        {
+ +            pd = &perfdata[k][i];
+ +
+ +            /* Loop over the repeats for each scenario: */
+ +            for (nr = 0; nr < repeats; nr++)
+ +            {
+ +                pd->nPMEnodes = nPMEnodes[i];
+ +
+ +                /* Add -npme and -s to the command line and save it. Note that
+ +                 * the -passall (if set) options requires cmd_args_bench to be
+ +                 * at the end of the command line string.
+ +                 * NOTE(review): the buffer is allocated anew in every repeat
+ +                 * and the one from the previous repeat is not released in
+ +                 * this function - looks like a small leak, confirm. */
+ +                snew(pd->mdrun_cmd_line, cmdline_length);
+ +                sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
+ +                        cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
+ +
+ +                /* To prevent that all benchmarks fail due to a show-stopper argument
+ +                 * on the mdrun command line, we make a quick check with mdrun -h first */
+ +                if (bFirst)
+ +                {
+ +                    make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
+ +                }
+ +                bFirst = FALSE;
+ +
+ +                /* Do a benchmark simulation: */
+ +                if (repeats > 1)
+ +                {
+ +                    sprintf(buf, ", pass %d/%d", nr+1, repeats);
+ +                }
+ +                else
+ +                {
+ +                    buf[0] = '\0';
+ +                }
+ +                fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
+ +                        (100.0*count)/totaltests,
+ +                        k+1, nr_tprs, i+1, *pmeentries, buf);
+ +                make_backup(opt2fn("-err", nfile, fnm));
+ +                sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, opt2fn("-err", nfile, fnm));
+ +                fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
+ +                gmx_system_call(command);
+ +
+ +                /* Collect the performance data from the log file; also check stderr
+ +                 * for fatal errors */
+ +                ret = parse_logfile(opt2fn("-bg", nfile, fnm), opt2fn("-err", nfile, fnm),
+ +                                    pd, nr, presteps, cpt_steps, nnodes);
+ +                if ((presteps > 0) && (ret == eParselogResetProblem))
+ +                {
+ +                    bResetProblem = TRUE;
+ +                }
+ +
+ +                /* For the automatic setting (-1), show the guessed number of
+ +                 * PME nodes next to it; otherwise pad with blanks */
+ +                if (-1 == pd->nPMEnodes)
+ +                {
+ +                    sprintf(buf, "(%3d)", pd->guessPME);
+ +                }
+ +                else
+ +                {
+ +                    sprintf(buf, "     ");
+ +                }
+ +
+ +                /* Nicer output */
+ +                if (pd->PME_f_load[nr] > 0.0)
+ +                {
+ +                    sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
+ +                }
+ +                else
+ +                {
+ +                    sprintf(str_PME_f_load, "%s", "         -  ");
+ +                }
+ +
+ +                /* Write the data we got to disk */
+ +                fprintf(fp, "%4d%s %12.3f %12.3f %s    %s", pd->nPMEnodes,
+ +                        buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
+ +                if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
+ +                {
+ +                    fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
+ +                }
+ +                fprintf(fp, "\n");
+ +                fflush(fp);
+ +                count++;
+ +
+ +                /* Do some cleaning up and delete the files we do not need any more */
+ +                cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
+ +
+ +                /* If the first run with this number of processors already failed, do not try again: */
+ +                if (pd->Gcycles[0] <= 0.0 && repeats > 1)
+ +                {
+ +                    fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
+ +                    /* Keep the progress counter in step with the skipped passes */
+ +                    count += repeats-(nr+1);
+ +                    break;
+ +                }
+ +            } /* end of repeats loop */
+ +        }     /* end of -npme loop */
+ +    }         /* end of tpr file loop */
+ +
+ +    if (bResetProblem)
+ +    {
+ +        sep_line(fp);
+ +        fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
+ +        sep_line(fp);
+ +    }
+ +    sfree(command);
+ +    sfree(cmd_stub);
+ +}
+ +
+ +
+ +/* Validate the user input and adjust dependent settings.
+ + *
+ + * A fatal error is raised for a missing -s file, for identical -cpi and -bcpo
+ + * files when sim_part > 1, for negative repeats, for nnodes < 1, for
+ + * rmin > rmax, for PME fractions outside [0, 0.5] or max < min, for negative
+ + * step or prestep counts, and for a fixed number of PME-only nodes that
+ + * exceeds half of nnodes. Questionable but legal settings only produce
+ + * warnings or notes on stderr.
+ + * *ntprs, *rmin and *rmax may be modified: non-positive radii are replaced by
+ + * rcoulomb, and the number of tpr files is adapted to the requested radii. */
+ +static void check_input(
+ +        int             nnodes,
+ +        int             repeats,
+ +        int            *ntprs,
+ +        real           *rmin,
+ +        real            rcoulomb,
+ +        real           *rmax,
+ +        real            maxPMEfraction,
+ +        real            minPMEfraction,
+ +        int             npme_fixed,
+ +        gmx_large_int_t bench_nsteps,
+ +        const t_filenm *fnm,
+ +        int             nfile,
+ +        int             sim_part,
+ +        int             presteps,
+ +        int             npargs,
+ +        t_pargs        *pa)
+ +{
+ +    int old;
+ +
+ +
+ +    /* Make sure the input file exists */
+ +    if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
+ +    {
+ +        gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
+ +    }
+ +
+ +    /* Make sure that the checkpoint file is not overwritten during benchmarking */
+ +    if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
+ +    {
+ +        gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
+ +                  "The checkpoint input file must not be overwritten during the benchmarks.\n");
+ +    }
+ +
+ +    /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
+ +    if (repeats < 0)
+ +    {
+ +        gmx_fatal(FARGS, "Number of repeats < 0!");
+ +    }
+ +
+ +    /* Check number of nodes */
+ +    if (nnodes < 1)
+ +    {
+ +        gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
+ +    }
+ +
+ +    /* Automatically choose -ntpr if not set */
+ +    if (*ntprs < 1)
+ +    {
+ +        if (nnodes < 16)
+ +        {
+ +            *ntprs = 1;
+ +        }
+ +        else
+ +        {
+ +            *ntprs = 3;
+ +            /* Set a reasonable scaling factor for rcoulomb */
+ +            if (*rmax <= 0)
+ +            {
+ +                *rmax = rcoulomb * 1.2;
+ +            }
+ +        }
+ +        fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
+ +    }
+ +    else
+ +    {
+ +        if (1 == *ntprs)
+ +        {
+ +            fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
+ +        }
+ +    }
+ +
+ +    /* Make sure that rmin <= rcoulomb <= rmax; radii that were not set
+ +     * (non-positive values) default to rcoulomb */
+ +    if (*rmin <= 0)
+ +    {
+ +        *rmin = rcoulomb;
+ +    }
+ +    if (*rmax <= 0)
+ +    {
+ +        *rmax = rcoulomb;
+ +    }
+ +    if (!(*rmin <= *rmax) )
+ +    {
+ +        gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
+ +                  "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
+ +    }
+ +    /* Add test scenarios if rmin or rmax were set */
+ +    if (*ntprs <= 2)
+ +    {
+ +        if (!is_equal(*rmin, rcoulomb) && (*ntprs == 1) )
+ +        {
+ +            (*ntprs)++;
+ +            fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
+ +                    *rmin, *ntprs);
+ +        }
+ +        if (!is_equal(*rmax, rcoulomb) && (*ntprs == 1) )
+ +        {
+ +            (*ntprs)++;
+ +            fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
+ +                    *rmax, *ntprs);
+ +        }
+ +    }
+ +    /* Remember the current value so that a change by the two checks below
+ +     * can be reported */
+ +    old = *ntprs;
+ +    /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
+ +    if (!is_equal(*rmax, rcoulomb) || !is_equal(*rmin, rcoulomb) )
+ +    {
+ +        *ntprs = max(*ntprs, 2);
+ +    }
+ +
+ +    /* If both rmin, rmax are set, we need 3 tpr files at minimum */
+ +    if (!is_equal(*rmax, rcoulomb) && !is_equal(*rmin, rcoulomb) )
+ +    {
+ +        *ntprs = max(*ntprs, 3);
+ +    }
+ +
+ +    if (old != *ntprs)
+ +    {
+ +        fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
+ +    }
+ +
+ +    if (*ntprs > 1)
+ +    {
+ +        if (is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax)) /* We have just a single rc */
+ +        {
+ +            fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
+ +                    "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
+ +                    "with correspondingly adjusted PME grid settings\n");
+ +            *ntprs = 1;
+ +        }
+ +    }
+ +
+ +    /* Check whether max and min fraction are within required values */
+ +    if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
+ +    {
+ +        gmx_fatal(FARGS, "-max must be between 0 and 0.5");
+ +    }
+ +    if (minPMEfraction > 0.5 || minPMEfraction < 0)
+ +    {
+ +        gmx_fatal(FARGS, "-min must be between 0 and 0.5");
+ +    }
+ +    if (maxPMEfraction < minPMEfraction)
+ +    {
+ +        gmx_fatal(FARGS, "-max must be larger or equal to -min");
+ +    }
+ +
+ +    /* Check whether the number of steps - if it was set - has a reasonable value */
+ +    if (bench_nsteps < 0)
+ +    {
+ +        gmx_fatal(FARGS, "Number of steps must be positive.");
+ +    }
+ +
+ +    if (bench_nsteps > 10000 || bench_nsteps < 100)
+ +    {
+ +        fprintf(stderr, "WARNING: steps=");
+ +        fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
+ +        fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
+ +    }
+ +
+ +    if (presteps < 0)
+ +    {
+ +        gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
+ +    }
+ +
+ +    /* Check for rcoulomb scaling if more than one .tpr file is tested */
+ +    if (*ntprs > 1)
+ +    {
+ +        if (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25)
+ +        {
+ +            fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
+ +        }
+ +    }
+ +
+ +    /* If a fixed number of PME nodes is set we do rcoulomb and PME grid tuning
+ +     * only. We need to check whether the requested number of PME-only nodes
+ +     * makes sense. */
+ +    if (npme_fixed > -1)
+ +    {
+ +        /* No more than 50% of all nodes can be assigned as PME-only nodes. */
+ +        if (2*npme_fixed > nnodes)
+ +        {
+ +            gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
+ +                      nnodes/2, nnodes, npme_fixed);
+ +        }
+ +        /* Warn if fewer than 20% of the nodes are PME-only nodes */
+ +        if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
+ +        {
+ +            fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
+ +                    100.0*((real)npme_fixed / (real)nnodes));
+ +        }
+ +        if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
+ +        {
+ +            fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
+ +                    "      fixed number of PME-only nodes is requested with -fix.\n");
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Decides whether option "opt" has to appear on the command line of the final
+ + * (launch) run. Options whose name begins with "-b", "-s", "-err" or "-p" are
+ + * never passed on; any other option is passed on exactly when it was set.
+ + * NOTE(review): these are prefix comparisons, so for instance an option named
+ + * "-px" is caught by the "-p" test as well - confirm that this is intended. */
+ +static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
+ +{
+ +    gmx_bool bExcluded;
+ +
+ +    /* Benchmark-only files, the input .tpr and the tuning output/log files
+ +     * play no role at launch time */
+ +    bExcluded = (   0 == strncmp(opt, "-b", 2)
+ +                 || 0 == strncmp(opt, "-s", 2)
+ +                 || 0 == strncmp(opt, "-err", 4)
+ +                 || 0 == strncmp(opt, "-p", 2) );
+ +
+ +    return bExcluded ? FALSE : bSet;
+ +}
+ +
+ +
+ +/* Returns TRUE when "opt" defines a file which is needed for the benchmark runs.
+ + *
+ + * - Options starting with "-s" (the input .tpr) are never benchmark files.
+ + * - Options starting with "-b" are _b_enchmark files; they are needed when
+ + *   they are mandatory (bOptional is FALSE) or were set.
+ + * - Any other option is needed only if it is an input file (bIsOutput is
+ + *   FALSE) that was set, like -cpi or -ei. */
+ +static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
+ +{
+ +    if (0 == strncmp(opt, "-s", 2))
+ +    {
+ +        return FALSE;
+ +    }
+ +
+ +    /* No second test for "-s" is needed here: that case has already
+ +     * returned above */
+ +    if (0 == strncmp(opt, "-b", 2))
+ +    {
+ +        return (!bOptional || bSet) ? TRUE : FALSE;
+ +    }
+ +
+ +    /* These are additional input files like -cpi -ei */
+ +    return (!bIsOutput && bSet) ? TRUE : FALSE;
+ +}
+ +
+ +
+ +/* Appends 'buf' to the string '*str'. The buffer behind '*str' is resized
+ + * with srenew() to hold both parts plus the terminating '\0', so '*str' may
+ + * point to a different address afterwards. */
+ +static void add_to_string(char **str, char *buf)
+ +{
+ +    int needed = strlen(*str) + strlen(buf) + 1;
+ +
+ +    srenew(*str, needed);
+ +    strcat(*str, buf);
+ +}
+ +
+ +
+ +/* Build the two argument strings that are appended to the mdrun command line:
+ + * one for the benchmark runs and one for the real (launch) run.
+ + *
+ + * Both strings are allocated here. Switches are handled first, then every
+ + * entry of fnm is added as "<option> <filename> " to the string(s) it belongs
+ + * to, and finally extra_args is appended to both strings. */
+ +static void create_command_line_snippets(
+ +        gmx_bool  bAppendFiles,
+ +        gmx_bool  bKeepAndNumCPT,
+ +        gmx_bool  bResetHWay,
+ +        int       presteps,
+ +        int       nfile,
+ +        t_filenm  fnm[],
+ +        char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
+ +        char     *cmd_args_launch[], /* command line arguments for simulation run */
+ +        char      extra_args[])      /* Add this to the end of the command line */
+ +{
+ +    int         ifile;
+ +    char       *opt;
+ +    const char *filename;
+ +    gmx_bool    bSet;
+ +    char        snippet[STRLEN];
+ +
+ +
+ +    /* Both strings start out empty, so that strlen and strcat can be used */
+ +    snew(*cmd_args_bench, 1);
+ +    snew(*cmd_args_launch, 1);
+ +    (*cmd_args_launch)[0] = '\0';
+ +    (*cmd_args_bench)[0]  = '\0';
+ +
+ +
+ +    /* Part 1: switches. The equilibration steps only matter for the
+ +     * benchmarks ... */
+ +    if (presteps > 0)
+ +    {
+ +        sprintf(snippet, "-resetstep %d ", presteps);
+ +        add_to_string(cmd_args_bench, snippet);
+ +    }
+ +    /* ... whereas the following switches only matter at launch time */
+ +    if (!bAppendFiles)
+ +    {
+ +        add_to_string(cmd_args_launch, "-noappend ");
+ +    }
+ +    if (bKeepAndNumCPT)
+ +    {
+ +        add_to_string(cmd_args_launch, "-cpnum ");
+ +    }
+ +    if (bResetHWay)
+ +    {
+ +        add_to_string(cmd_args_launch, "-resethway ");
+ +    }
+ +
+ +    /* Part 2: file options */
+ +    for (ifile = 0; ifile < nfile; ifile++)
+ +    {
+ +        opt      = (char *)fnm[ifile].opt;
+ +        filename = opt2fn(opt, nfile, fnm);
+ +        bSet     = opt2bSet(opt, nfile, fnm);
+ +
+ +        /* Default form of the snippet: option name followed by file name */
+ +        sprintf(snippet, "%s %s ", opt, filename);
+ +
+ +        if (is_bench_file(opt, bSet, is_optional(&fnm[ifile]), is_output(&fnm[ifile])) )
+ +        {
+ +            /* mdrun does not know the -b* names: strip the 'b' so that
+ +             * e.g. "-bfoo" is handed over as "-foo" */
+ +            if (0 == strncmp(opt, "-b", 2))
+ +            {
+ +                sprintf(snippet, "-%s %s ", &opt[2], filename);
+ +            }
+ +
+ +            add_to_string(cmd_args_bench, snippet);
+ +        }
+ +
+ +        if (is_launch_file(opt, bSet) )
+ +        {
+ +            add_to_string(cmd_args_launch, snippet);
+ +        }
+ +    }
+ +
+ +    /* The extra arguments go to the very end of both command lines */
+ +    add_to_string(cmd_args_bench, extra_args);
+ +    add_to_string(cmd_args_launch, extra_args);
+ +}
+ +
+ +
+ +/* Mark every entry of fnm whose option name equals opt as 'set' by adding the
+ + * ffSET bit to its flag. Entries with other names are left untouched. */
+ +static void setopt(const char *opt, int nfile, t_filenm fnm[])
+ +{
+ +    int ifile;
+ +
+ +    for (ifile = 0; ifile < nfile; ifile++)
+ +    {
+ +        if (0 != strcmp(opt, fnm[ifile].opt))
+ +        {
+ +            continue;
+ +        }
+ +        fnm[ifile].flag |= ffSET;
+ +    }
+ +}
+ +
+ +
+ +/* Read the -s tpr file and
+ + * 1. mark those output files as 'set' that the run will write because of a
+ + *    tpr setting (pulling, free energy, test particle insertion, normal
+ + *    mode analysis), so that they can be cleaned up after each tuning run,
+ + * 2. store the Coulomb radius from the tpr file in *rcoulomb,
+ + * 3. return the value of pme_load_estimate() for this system. */
+ +static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
+ +{
+ +    t_inputrec   ir;
+ +    t_state      state;
+ +    gmx_mtop_t   mtop;
+ +
+ +
+ +    read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
+ +
+ +    /* Pulling requested: pull force and position output */
+ +    if (epullNO != ir.ePull)
+ +    {
+ +        setopt("-pf", nfile, fnm);
+ +        setopt("-px", nfile, fnm);
+ +    }
+ +    /* Free energy simulation requested */
+ +    if (efepNO != ir.efep)
+ +    {
+ +        setopt("-dhdl", nfile, fnm);
+ +    }
+ +    /* Test particle insertion requested */
+ +    if (EI_TPI(ir.eI))
+ +    {
+ +        setopt("-tpi", nfile, fnm);
+ +        setopt("-tpid", nfile, fnm);
+ +    }
+ +    /* Normal mode analysis requested */
+ +    if (eiNM == ir.eI)
+ +    {
+ +        setopt("-mtx", nfile, fnm);
+ +    }
+ +
+ +    *rcoulomb = ir.rcoulomb;
+ +
+ +    return pme_load_estimate(&mtop, &ir, state.box);
+ +}
+ +
+ +
+ +/* Keep each optional file option and its benchmark counterpart ("-b...") in
+ + * sync: whenever one of the two is set, the other one is marked as set as
+ + * well, e.g. -eo <-> -beo. */
+ +static void couple_files_options(int nfile, t_filenm fnm[])
+ +{
+ +    int   ifile;
+ +    char *opt;
+ +    char  partner[20];
+ +
+ +
+ +    for (ifile = 0; ifile < nfile; ifile++)
+ +    {
+ +        /* Only optional files that are set need a partner */
+ +        if (!is_optional(&fnm[ifile]) || 0 == (fnm[ifile].flag & ffSET))
+ +        {
+ +            continue;
+ +        }
+ +
+ +        opt = (char *)fnm[ifile].opt;
+ +        if (0 == strncmp(opt, "-b", 2))
+ +        {
+ +            /* If -beo is set, then -eo also needs to be! */
+ +            sprintf(partner, "-%s", &opt[2]);
+ +        }
+ +        else
+ +        {
+ +            /* If e.g. -eo is set, then -beo also needs to be set */
+ +            sprintf(partner, "-b%s", &opt[1]);
+ +        }
+ +        setopt(partner, nfile, fnm);
+ +    }
+ +}
+ +
+ +
+ +/* Returns the current time in seconds. If HAVE_GETTIMEOFDAY is defined, the
+ + * value comes from gettimeofday() and includes the microsecond part;
+ + * otherwise the value of time() is returned. */
+ +static double gettime()
+ +{
+ +#ifdef HAVE_GETTIMEOFDAY
+ +    struct timeval now;
+ +
+ +    gettimeofday(&now, NULL);
+ +
+ +    return (double) now.tv_sec + 1e-6*(double)now.tv_usec;
+ +#else
+ +    return (double) time(NULL);
+ +#endif
+ +}
+ +
+ +
+ +/* NOTE(review): presumably the default number of time steps of a benchmark
+ + * run (the help text below mentions a default of 1000 steps) - confirm at
+ + * the place where the macro is used */
+ +#define BENCHSTEPS (1000)
+ +
+ +int gmx_tune_pme(int argc, char *argv[])
+ +{
+ +    const char     *desc[] = {
+ +        "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically",
+ +        "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines",
+ +        "which setting is fastest. It will also test whether performance can",
+ +        "be enhanced by shifting load from the reciprocal to the real space",
+ +        "part of the Ewald sum. ",
+ +        "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options",
+ +        "for [TT]mdrun[tt] as needed.[PAR]",
+ +        "Which executables are used can be set in the environment variables",
+ +        "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
+ +        "will be used as defaults. Note that for certain MPI frameworks you",
+ +        "need to provide a machine- or hostfile. This can also be passed",
+ +        "via the MPIRUN variable, e.g.[PAR]",
+ +        "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
+ +        "Please call [TT]g_tune_pme[tt] with the normal options you would pass to",
+ +        "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the",
+ +        "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
+ +        "to repeat each test several times to get better statistics. [PAR]",
+ +        "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads",
+ +        "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
+ +        "written with enlarged cutoffs and smaller Fourier grids respectively.",
+ +        "Typically, the first test (number 0) will be with the settings from the input",
+ +        "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
+ +        "specified by [TT]-rmax[tt] with a somwhat smaller PME grid at the same time. ",
+ +        "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
+ +        "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
+ +        "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
+ +        "if you just seek the optimal number of PME-only nodes; in that case",
+ +        "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
+ +        "For the benchmark runs, the default of 1000 time steps should suffice for most",
+ +        "MD systems. The dynamic load balancing needs about 100 time steps",
+ +        "to adapt to local load imbalances, therefore the time step counters",
+ +        "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
+ +        "for a higher accuarcy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
+ +        "From the 'DD' load imbalance entries in the md.log output file you",
+ +        "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
+ +        "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
+ +        "After calling [TT]mdrun[tt] several times, detailed performance information",
+ +        "is available in the output file [TT]perf.out.[tt] ",
+ +        "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
+ +        "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
+ +        "If you want the simulation to be started automatically with the",
+ +        "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
+ +    };
+ +
+ +    int             nnodes         = 1;
+ +    int             repeats        = 2;
+ +    int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
+ +    real            maxPMEfraction = 0.50;
+ +    real            minPMEfraction = 0.25;
+ +    int             maxPMEnodes, minPMEnodes;
+ +    float           guessPMEratio;                    /* guessed PME:PP ratio based on the tpr file */
+ +    float           guessPMEnodes;
+ +    int             npme_fixed     = -2;              /* If >= -1, use only this number
+ +                                                       * of PME-only nodes                */
+ +    int             ntprs          = 0;
+ +    real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
+ +    real            rcoulomb       = -1.0;            /* Coulomb radius as set in .tpr file */
+ +    gmx_bool        bScaleRvdw     = TRUE;
+ +    gmx_large_int_t bench_nsteps   = BENCHSTEPS;
+ +    gmx_large_int_t new_sim_nsteps = -1;  /* -1 indicates: not set by the user */
+ +    gmx_large_int_t cpt_steps      = 0;   /* Step counter in .cpt input file   */
+ +    int             presteps       = 100; /* Do a full cycle reset after presteps steps */
+ +    gmx_bool        bOverwrite     = FALSE, bKeepTPR;
+ +    gmx_bool        bLaunch        = FALSE;
+ +    char           *ExtraArgs      = NULL;
+ +    char          **tpr_names      = NULL;
+ +    const char     *simulation_tpr = NULL;
+ +    int             best_npme, best_tpr;
+ +    int             sim_part = 1; /* For benchmarks with checkpoint files */
+ +    char            bbuf[STRLEN];
+ +
+ +    /* Default program names if nothing else is found */
+ +    char         *cmd_mpirun = NULL, *cmd_mdrun = NULL;
+ +    char         *cmd_args_bench, *cmd_args_launch;
+ +    char         *cmd_np = NULL;
+ +
+ +    t_perf      **perfdata = NULL;
+ +    t_inputinfo  *info;
+ +    int           i;
+ +    FILE         *fp;
+ +    t_commrec    *cr;
+ +
+ +    /* Print out how long the tuning took */
+ +    double          seconds;
+ +
+ +    static t_filenm fnm[] = {
+ +        /* g_tune_pme */
+ +        { efOUT, "-p",      "perf",     ffWRITE },
+ +        { efLOG, "-err",    "bencherr", ffWRITE },
+ +        { efTPX, "-so",     "tuned",    ffWRITE },
+ +        /* mdrun: */
+ +        { efTPX, NULL,      NULL,       ffREAD },
+ +        { efTRN, "-o",      NULL,       ffWRITE },
+ +        { efXTC, "-x",      NULL,       ffOPTWR },
+ +        { efCPT, "-cpi",    NULL,       ffOPTRD },
+ +        { efCPT, "-cpo",    NULL,       ffOPTWR },
+ +        { efSTO, "-c",      "confout",  ffWRITE },
+ +        { efEDR, "-e",      "ener",     ffWRITE },
+ +        { efLOG, "-g",      "md",       ffWRITE },
+ +        { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
+ +        { efXVG, "-field",  "field",    ffOPTWR },
+ +        { efXVG, "-table",  "table",    ffOPTRD },
+ +        { efXVG, "-tabletf", "tabletf",   ffOPTRD },
+ +        { efXVG, "-tablep", "tablep",   ffOPTRD },
+ +        { efXVG, "-tableb", "table",    ffOPTRD },
+ +        { efTRX, "-rerun",  "rerun",    ffOPTRD },
+ +        { efXVG, "-tpi",    "tpi",      ffOPTWR },
+ +        { efXVG, "-tpid",   "tpidist",  ffOPTWR },
+ +        { efEDI, "-ei",     "sam",      ffOPTRD },
+ +        { efXVG, "-eo",     "edsam",    ffOPTWR },
+ +        { efGCT, "-j",      "wham",     ffOPTRD },
+ +        { efGCT, "-jo",     "bam",      ffOPTWR },
+ +        { efXVG, "-ffout",  "gct",      ffOPTWR },
+ +        { efXVG, "-devout", "deviatie", ffOPTWR },
+ +        { efXVG, "-runav",  "runaver",  ffOPTWR },
+ +        { efXVG, "-px",     "pullx",    ffOPTWR },
+ +        { efXVG, "-pf",     "pullf",    ffOPTWR },
+ +        { efXVG, "-ro",     "rotation", ffOPTWR },
+ +        { efLOG, "-ra",     "rotangles", ffOPTWR },
+ +        { efLOG, "-rs",     "rotslabs", ffOPTWR },
+ +        { efLOG, "-rt",     "rottorque", ffOPTWR },
+ +        { efMTX, "-mtx",    "nm",       ffOPTWR },
+ +        { efNDX, "-dn",     "dipole",   ffOPTWR },
+ +        /* Output files that are deleted after each benchmark run */
+ +        { efTRN, "-bo",     "bench",    ffWRITE },
+ +        { efXTC, "-bx",     "bench",    ffWRITE },
+ +        { efCPT, "-bcpo",   "bench",    ffWRITE },
+ +        { efSTO, "-bc",     "bench",    ffWRITE },
+ +        { efEDR, "-be",     "bench",    ffWRITE },
+ +        { efLOG, "-bg",     "bench",    ffWRITE },
+ +        { efXVG, "-beo",    "benchedo", ffOPTWR },
+ +        { efXVG, "-bdhdl",  "benchdhdl", ffOPTWR },
+ +        { efXVG, "-bfield", "benchfld", ffOPTWR },
+ +        { efXVG, "-btpi",   "benchtpi", ffOPTWR },
+ +        { efXVG, "-btpid",  "benchtpid", ffOPTWR },
+ +        { efGCT, "-bjo",    "bench",    ffOPTWR },
+ +        { efXVG, "-bffout", "benchgct", ffOPTWR },
+ +        { efXVG, "-bdevout", "benchdev", ffOPTWR },
+ +        { efXVG, "-brunav", "benchrnav", ffOPTWR },
+ +        { efXVG, "-bpx",    "benchpx",  ffOPTWR },
+ +        { efXVG, "-bpf",    "benchpf",  ffOPTWR },
+ +        { efXVG, "-bro",    "benchrot", ffOPTWR },
+ +        { efLOG, "-bra",    "benchrota", ffOPTWR },
+ +        { efLOG, "-brs",    "benchrots", ffOPTWR },
+ +        { efLOG, "-brt",    "benchrott", ffOPTWR },
+ +        { efMTX, "-bmtx",   "benchn",   ffOPTWR },
+ +        { efNDX, "-bdn",    "bench",    ffOPTWR }
+ +    };
+ +
+ +    gmx_bool        bThreads     = FALSE;
+ +
+ +    int             nthreads = 1;
+ +
+ +    const char     *procstring[] =
+ +    { NULL, "-np", "-n", "none", NULL };
+ +    const char     *npmevalues_opt[] =
+ +    { NULL, "auto", "all", "subset", NULL };
+ +
+ +    gmx_bool     bAppendFiles          = TRUE;
+ +    gmx_bool     bKeepAndNumCPT        = FALSE;
+ +    gmx_bool     bResetCountersHalfWay = FALSE;
+ +    gmx_bool     bBenchmark            = TRUE;
+ +
+ +    output_env_t oenv = NULL;
+ +
+ +    t_pargs      pa[] = {
+ +        /***********************/
+ +        /* g_tune_pme options: */
+ +        /***********************/
+ +        { "-np",       FALSE, etINT,  {&nnodes},
+ +          "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
+ +        { "-npstring", FALSE, etENUM, {procstring},
+ +          "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
+ +        { "-ntmpi",    FALSE, etINT,  {&nthreads},
+ +          "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
+ +        { "-r",        FALSE, etINT,  {&repeats},
+ +          "Repeat each test this often" },
+ +        { "-max",      FALSE, etREAL, {&maxPMEfraction},
+ +          "Max fraction of PME nodes to test with" },
+ +        { "-min",      FALSE, etREAL, {&minPMEfraction},
+ +          "Min fraction of PME nodes to test with" },
+ +        { "-npme",     FALSE, etENUM, {npmevalues_opt},
+ +          "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
+ +          "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
+ +        { "-fix",      FALSE, etINT,  {&npme_fixed},
+ +          "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
+ +        { "-rmax",     FALSE, etREAL, {&rmax},
+ +          "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
+ +        { "-rmin",     FALSE, etREAL, {&rmin},
+ +          "If >0, minimal rcoulomb for -ntpr>1" },
+ +        { "-scalevdw",  FALSE, etBOOL, {&bScaleRvdw},
+ +          "Scale rvdw along with rcoulomb"},
+ +        { "-ntpr",     FALSE, etINT,  {&ntprs},
+ +          "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
+ +          "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
+ +        { "-steps",    FALSE, etGMX_LARGE_INT, {&bench_nsteps},
+ +          "Take timings for this many steps in the benchmark runs" },
+ +        { "-resetstep", FALSE, etINT,  {&presteps},
+ +          "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
+ +        { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
+ +          "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
+ +        { "-launch",   FALSE, etBOOL, {&bLaunch},
+ +          "Launch the real simulation after optimization" },
+ +        { "-bench",    FALSE, etBOOL, {&bBenchmark},
+ +          "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
+ +        /******************/
+ +        /* mdrun options: */
+ +        /******************/
+ +        /* We let g_tune_pme parse and understand these options, because we need to
+ +         * prevent that they appear on the mdrun command line for the benchmarks */
+ +        { "-append",   FALSE, etBOOL, {&bAppendFiles},
+ +          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
+ +        { "-cpnum",    FALSE, etBOOL, {&bKeepAndNumCPT},
+ +          "Keep and number checkpoint files (launch only)" },
+ +        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
+ +          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
+ +    };
+ +
+ +#define NFILE asize(fnm)
+ +
+ +    seconds = gettime();
+ +
+ +    parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
+ +                      NFILE, fnm, asize(pa), pa, asize(desc), desc,
+ +                      0, NULL, &oenv);
+ +
+ +    /* Store the remaining unparsed command line entries in a string which
+ +     * is then attached to the mdrun command line */
+ +    snew(ExtraArgs, 1);
+ +    ExtraArgs[0] = '\0';
+ +    for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
+ +    {
+ +        add_to_string(&ExtraArgs, argv[i]);
+ +        add_to_string(&ExtraArgs, " ");
+ +    }
+ +
+ +    if (opt2parg_bSet("-ntmpi", asize(pa), pa))
+ +    {
+ +        bThreads = TRUE;
+ +        if (opt2parg_bSet("-npstring", asize(pa), pa))
+ +        {
+ +            fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
+ +        }
+ +
+ +        if (nnodes > 1)
+ +        {
+ +            gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
+ +        }
+ +        /* and now we just set this; a bit of an ugly hack*/
+ +        nnodes = nthreads;
+ +    }
+ +    /* Check for PME:PP ratio and whether tpr triggers additional output files */
+ +    guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
+ +
+ +    /* Automatically set -beo options if -eo is set etc. */
+ +    couple_files_options(NFILE, fnm);
+ +
+ +    /* Construct the command line arguments for benchmark runs
+ +     * as well as for the simulation run */
+ +    if (bThreads)
+ +    {
+ +        sprintf(bbuf, " -ntmpi %d ", nthreads);
+ +    }
+ +    else
+ +    {
++        /* This string will be used for MPI runs and will appear after the
++         * mpirun command. */
++        if (strcmp(procstring[0], "none") != 0)
++        {
++            sprintf(bbuf, " %s %d ", procstring[0], nnodes);
++        }
++        else
++        {
++            sprintf(bbuf, " ");
++        }
+ +    }
+ +
+ +    cmd_np = bbuf;
+ +
+ +    create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
+ +                                 NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
+ +
+ +    /* Read in checkpoint file if requested */
+ +    sim_part = 1;
+ +    if (opt2bSet("-cpi", NFILE, fnm))
+ +    {
+ +        snew(cr, 1);
+ +        cr->duty = DUTY_PP; /* makes the following routine happy */
+ +        read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
+ +                                        &sim_part, &cpt_steps, cr,
+ +                                        FALSE, NFILE, fnm, NULL, NULL);
+ +        sfree(cr);
+ +        sim_part++;
+ +        /* sim_part will now be 1 if no checkpoint file was found */
+ +        if (sim_part <= 1)
+ +        {
+ +            gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
+ +        }
+ +    }
+ +
+ +    /* Open performance output file and write header info */
+ +    fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
+ +
+ +    /* Make a quick consistency check of command line parameters */
+ +    check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
+ +                maxPMEfraction, minPMEfraction, npme_fixed,
+ +                bench_nsteps, fnm, NFILE, sim_part, presteps,
+ +                asize(pa), pa);
+ +
+ +    /* Determine the maximum and minimum number of PME nodes to test,
+ +     * the actual list of settings is build in do_the_tests(). */
+ +    if ((nnodes > 2) && (npme_fixed < -1))
+ +    {
+ +        if (0 == strcmp(npmevalues_opt[0], "auto"))
+ +        {
+ +            /* Determine the npme range automatically based on the PME:PP load guess */
+ +            if (guessPMEratio > 1.0)
+ +            {
+ +                /* More PME than PP work, probably we do not need separate PME nodes at all! */
+ +                maxPMEnodes = nnodes/2;
+ +                minPMEnodes = nnodes/2;
+ +            }
+ +            else
+ +            {
+ +                /* PME : PP load is in the range 0..1, let's test around the guess */
+ +                guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
+ +                minPMEnodes   = floor(0.7*guessPMEnodes);
+ +                maxPMEnodes   =  ceil(1.6*guessPMEnodes);
+ +                maxPMEnodes   = min(maxPMEnodes, nnodes/2);
+ +            }
+ +        }
+ +        else
+ +        {
+ +            /* Determine the npme range based on user input */
+ +            maxPMEnodes = floor(maxPMEfraction*nnodes);
+ +            minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
+ +            fprintf(stdout, "Will try runs with %d ", minPMEnodes);
+ +            if (maxPMEnodes != minPMEnodes)
+ +            {
+ +                fprintf(stdout, "- %d ", maxPMEnodes);
+ +            }
+ +            fprintf(stdout, "PME-only nodes.\n  Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        maxPMEnodes = 0;
+ +        minPMEnodes = 0;
+ +    }
+ +
+ +    /* Get the commands we need to set up the runs from environment variables */
+ +    get_program_paths(bThreads, &cmd_mpirun, cmd_np, &cmd_mdrun, repeats);
+ +
+ +    /* Print some header info to file */
+ +    sep_line(fp);
+ +    fprintf(fp, "\n      P E R F O R M A N C E   R E S U L T S\n");
+ +    sep_line(fp);
+ +    fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
+ +    if (!bThreads)
+ +    {
+ +        fprintf(fp, "Number of nodes         : %d\n", nnodes);
+ +        fprintf(fp, "The mpirun command is   : %s\n", cmd_mpirun);
+ +        if (strcmp(procstring[0], "none") != 0)
+ +        {
+ +            fprintf(fp, "Passing # of nodes via  : %s\n", procstring[0]);
+ +        }
+ +        else
+ +        {
+ +            fprintf(fp, "Not setting number of nodes in system call\n");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        fprintf(fp, "Number of threads       : %d\n", nnodes);
+ +    }
+ +
+ +    fprintf(fp, "The mdrun  command is   : %s\n", cmd_mdrun);
+ +    fprintf(fp, "mdrun args benchmarks   : %s\n", cmd_args_bench);
+ +    fprintf(fp, "Benchmark steps         : ");
+ +    fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
+ +    fprintf(fp, "\n");
+ +    fprintf(fp, "dlb equilibration steps : %d\n", presteps);
+ +    if (sim_part > 1)
+ +    {
+ +        fprintf(fp, "Checkpoint time step    : ");
+ +        fprintf(fp, gmx_large_int_pfmt, cpt_steps);
+ +        fprintf(fp, "\n");
+ +    }
+ +    fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
+ +
+ +    if (new_sim_nsteps >= 0)
+ +    {
+ +        bOverwrite = TRUE;
+ +        fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
+ +        fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
+ +        fprintf(stderr, " steps.\n");
+ +        fprintf(fp, "Simulation steps        : ");
+ +        fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
+ +        fprintf(fp, "\n");
+ +    }
+ +    if (repeats > 1)
+ +    {
+ +        fprintf(fp, "Repeats for each test   : %d\n", repeats);
+ +    }
+ +
+ +    if (npme_fixed >= -1)
+ +    {
+ +        fprintf(fp, "Fixing -npme at         : %d\n", npme_fixed);
+ +    }
+ +
+ +    fprintf(fp, "Input file              : %s\n", opt2fn("-s", NFILE, fnm));
+ +    fprintf(fp, "   PME/PP load estimate : %g\n", guessPMEratio);
+ +
+ +    /* Allocate memory for the inputinfo struct: */
+ +    snew(info, 1);
+ +    info->nr_inputfiles = ntprs;
+ +    for (i = 0; i < ntprs; i++)
+ +    {
+ +        snew(info->rcoulomb, ntprs);
+ +        snew(info->rvdw, ntprs);
+ +        snew(info->rlist, ntprs);
+ +        snew(info->rlistlong, ntprs);
+ +        snew(info->nkx, ntprs);
+ +        snew(info->nky, ntprs);
+ +        snew(info->nkz, ntprs);
+ +        snew(info->fsx, ntprs);
+ +        snew(info->fsy, ntprs);
+ +        snew(info->fsz, ntprs);
+ +    }
+ +    /* Make alternative tpr files to test: */
+ +    snew(tpr_names, ntprs);
+ +    for (i = 0; i < ntprs; i++)
+ +    {
+ +        snew(tpr_names[i], STRLEN);
+ +    }
+ +
+ +    /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
+ +     * different grids could be found. */
+ +    make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
+ +                        cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
+ +
+ +    /********************************************************************************/
+ +    /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats  */
+ +    /********************************************************************************/
+ +    snew(perfdata, ntprs);
+ +    if (bBenchmark)
+ +    {
+ +        do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
+ +                     repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
+ +                     cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
+ +
+ +        fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0);
+ +
+ +        /* Analyse the results and give a suggestion for optimal settings: */
+ +        bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
+ +                                repeats, info, &best_tpr, &best_npme);
+ +
+ +        /* Take the best-performing tpr file and enlarge nsteps to original value */
+ +        if (bKeepTPR && !bOverwrite)
+ +        {
+ +            simulation_tpr = opt2fn("-s", NFILE, fnm);
+ +        }
+ +        else
+ +        {
+ +            simulation_tpr = opt2fn("-so", NFILE, fnm);
+ +            modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
+ +                               info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
+ +        }
+ +
+ +        /* Let's get rid of the temporary benchmark input files */
+ +        for (i = 0; i < ntprs; i++)
+ +        {
+ +            fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
+ +            remove(tpr_names[i]);
+ +        }
+ +
+ +        /* Now start the real simulation if the user requested it ... */
+ +        launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
+ +                          cmd_args_launch, simulation_tpr, best_npme);
+ +    }
+ +    ffclose(fp);
+ +
+ +    /* ... or simply print the performance results to screen: */
+ +    if (!bLaunch)
+ +    {
+ +        finalize(opt2fn("-p", NFILE, fnm));
+ +    }
+ +
+ +    return 0;
+ +}
diff --cc src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu

index 8c33a7e32474675d5996ae6271453d2f4adeb4ac,0000000000000000000000000000000000000000..a2d037449138c27e06404a936cd1e640043db56a

mode 100644,000000..100644
--- 1/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
--- /dev/null
+++ b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
@@@ -1,59 -1,0 +1,60 @@@
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <stdio.h>
+ +#include <cuda.h>
+ +#include <cuda_runtime_api.h>
+ +
+ +#include "buildinfo.h"
+ +
+ +/* Print the CUDA build and version information to fp: the nvcc compiler
+ + * description, the nvcc flags, and the CUDA driver and runtime versions.
+ + *
+ + * CUDA reports a version as one integer, 1000*major + 10*minor
+ + * (e.g. 5050 for CUDA 5.5). Both version variables are zeroed before the
+ + * queries and the return codes of the queries are not checked.
+ + */
+ +void gmx_print_version_info_gpu(FILE *fp)
+ +{
+ +    int cuda_driver,cuda_runtime;
+ +    fprintf(fp, "CUDA compiler:      %s\n",CUDA_NVCC_COMPILER_INFO);
++    fprintf(fp, "CUDA compiler flags:%s\n",CUDA_NVCC_COMPILER_FLAGS);
+ +    cuda_driver = 0;
+ +    cudaDriverGetVersion(&cuda_driver);
+ +    cuda_runtime = 0;
+ +    cudaRuntimeGetVersion(&cuda_runtime);
+ +    /* The minor version is the tens digit: a plain "%100" would print
+ +     * CUDA 5.5 (5050) as "5.50" instead of "5.5". */
+ +    fprintf(fp, "CUDA driver:        %d.%d\n",cuda_driver/1000, (cuda_driver%100)/10);
+ +    fprintf(fp, "CUDA runtime:       %d.%d\n",cuda_runtime/1000, (cuda_runtime%100)/10);
+ +}
diff --cc src/gromacs/gmxlib/gmx_detect_hardware.c

index 56f020014789cfe1dd9d87b39cd9ec0963ccc590,0000000000000000000000000000000000000000..2ce47d2ea252e17f157598b6acd606e93d4e63fd

mode 100644,000000..100644
--- 1/src/gromacs/gmxlib/gmx_detect_hardware.c
--- /dev/null
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.c
@@@ -1,621 -1,0 +1,718 @@@
- 
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + * This file is part of GROMACS.
+ + * Copyright (c) 2012-
+ + *
+ + * Written by the Gromacs development team under coordination of
+ + * David van der Spoel, Berk Hess, and Erik Lindahl.
+ + *
+ + * This library is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROup of MAchos and Cynical Suckers
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <stdlib.h>
+ +#include <assert.h>
+ +#include <string.h>
+ +
+ +#include "types/enums.h"
+ +#include "types/hw_info.h"
+ +#include "types/commrec.h"
+ +#include "gmx_fatal.h"
+ +#include "gmx_fatal_collective.h"
+ +#include "smalloc.h"
+ +#include "gpu_utils.h"
+ +#include "statutil.h"
+ +#include "gmx_detect_hardware.h"
+ +#include "main.h"
+ +#include "md_logging.h"
+ +
++#include "thread_mpi/threads.h"
++
+ +#ifdef HAVE_UNISTD_H
+ +#include <unistd.h>
+ +#endif
- void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+ +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
+ +#include "windows.h"
+ +#endif
+ +
+ +/* Although we can't have more than 10 different GPU ID-s passed by the user as
+ + * the id-s are assumed to be represented by single digits, as multiple
+ + * processes can share a GPU, we can end up with more than 10 IDs.
+ + * To account for potential extreme cases we'll set the limit to a pretty
+ + * ridiculous number. */
+ +static unsigned int max_gpu_ids_user = 64;
+ +
+ +static const char * invalid_gpuid_hint =
+ +    "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+ +
++/* The globally shared hwinfo structure. */
++static gmx_hw_info_t      *hwinfo_g;
++/* A reference counter for the hwinfo structure */
++static int                 n_hwinfo = 0;
++/* A lock to protect the hwinfo structure */
++static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
++
++
+ +/* FW decl. */
-     int      npppn, ntmpi_pp, ngpu;
-     char     sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
-     char     gpu_plural[2];
-     gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
++static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+ +
+ +/* Fill sbuf with one line per detected CUDA device: each line is two
+ + * spaces followed by the text from get_gpu_device_info_string(). Lines
+ + * are separated by '\n'; there is no newline after the last one.
+ + * The size of sbuf is not passed in, so the caller must supply a buffer
+ + * that is large enough. */
+ +static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
+ +{
+ +    const int ndetected = gpu_info->ncuda_dev;
+ +    char      devinfo[STRLEN];
+ +    int       dev;
+ +
+ +    sbuf[0] = '\0';
+ +    for (dev = 0; dev < ndetected; dev++)
+ +    {
+ +        /* Separator goes in front of every entry except the first */
+ +        if (dev > 0)
+ +        {
+ +            strcat(sbuf, "\n");
+ +        }
+ +        get_gpu_device_info_string(devinfo, gpu_info, dev);
+ +        strcat(sbuf, "  ");
+ +        strcat(sbuf, devinfo);
+ +    }
+ +}
+ +
+ +static void print_gpu_detection_stats(FILE                 *fplog,
+ +                                      const gmx_gpu_info_t *gpu_info,
+ +                                      const t_commrec      *cr)
+ +{
+ +    char onhost[266], stmp[STRLEN];
+ +    int  ngpu;
+ +
+ +    ngpu = gpu_info->ncuda_dev;
+ +
+ +#if defined GMX_MPI && !defined GMX_THREAD_MPI
+ +    /* We only print the detection on one, of possibly multiple, nodes */
+ +    strncpy(onhost, " on host ", 10);
+ +    gmx_gethostname(onhost+9, 256);
+ +#else
+ +    /* We detect all relevant GPUs */
+ +    strncpy(onhost, "", 1);
+ +#endif
+ +
+ +    if (ngpu > 0)
+ +    {
+ +        sprint_gpus(stmp, gpu_info);
+ +        md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
+ +                      ngpu, (ngpu > 1) ? "s" : "", onhost, stmp);
+ +    }
+ +    else
+ +    {
+ +        md_print_warn(cr, fplog, "No GPUs detected%s\n", onhost);
+ +    }
+ +}
+ +
+ +/* Report through md_print_info() which GPUs will be used for this run.
+ + * If GPUs were detected but none is in use, a hint to try GPU
+ + * acceleration is printed instead; otherwise the message lists the IDs
+ + * of the GPUs in use and whether they were user- or auto-selected. */
+ +static void print_gpu_use_stats(FILE                 *fplog,
+ +                                const gmx_gpu_info_t *gpu_info,
+ +                                const t_commrec      *cr)
+ +{
+ +    const int nused     = gpu_info->ncuda_dev_use;
+ +    const int ndetected = gpu_info->ncuda_dev;
+ +    char      msg[STRLEN];
+ +    int       k;
+ +
+ +    if (ndetected <= 0 || nused >= 1)
+ +    {
+ +        sprintf(msg, "%d GPU%s %sselected for this run: ",
+ +                nused, (nused > 1) ? "s" : "",
+ +                gpu_info->bUserSet ? "user-" : "auto-");
+ +        for (k = 0; k < nused; k++)
+ +        {
+ +            /* Append "#<id>" at the current end of the message */
+ +            sprintf(msg + strlen(msg), "#%d", get_gpu_device_id(gpu_info, k));
+ +            if (k < nused - 1)
+ +            {
+ +                strcat(msg, ", ");
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Issue note if GPUs are available but not used */
+ +        sprintf(msg,
+ +                "%d compatible GPU%s detected in the system, but none will be used.\n"
+ +                "Consider trying GPU acceleration with the Verlet scheme!",
+ +                ndetected, (ndetected > 1) ? "s" : "");
+ +    }
+ +    md_print_info(cr, fplog, "%s\n\n", msg);
+ +}
+ +
+ +/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding
+ + * to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment.
+ + *
+ + * idstr   string of single-digit GPU IDs without delimiters
+ + * nid     output: number of IDs parsed, i.e. the length of idstr
+ + * idlist  output: the parsed IDs; its capacity is not checked here, so the
+ + *         caller must provide room for strlen(idstr) entries
+ + *
+ + * gmx_fatal() is called when idstr is longer than max_gpu_ids_user or
+ + * contains a character that is not a decimal digit. */
+ +static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
+ +{
+ +    int    i;
+ +    size_t len_idstr;
+ +
+ +    len_idstr = strlen(idstr);
+ +
+ +    if (len_idstr > max_gpu_ids_user)
+ +    {
+ +        /* The format uses %d, so the arguments must be ints: passing a
+ +         * size_t or unsigned int for %d in a variadic call is undefined
+ +         * behavior and prints garbage where size_t is wider than int. */
+ +        gmx_fatal(FARGS, "%d GPU IDs provided, but only at most %d are supported",
+ +                  (int)len_idstr, (int)max_gpu_ids_user);
+ +    }
+ +
+ +    /* len_idstr was checked against max_gpu_ids_user above */
+ +    *nid = (int)len_idstr;
+ +
+ +    for (i = 0; i < *nid; i++)
+ +    {
+ +        if (idstr[i] < '0' || idstr[i] > '9')
+ +        {
+ +            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
+ +                      idstr[i], invalid_gpuid_hint);
+ +        }
+ +        /* Each character is one ID, so only IDs 0-9 can be expressed */
+ +        idlist[i] = idstr[i] - '0';
+ +    }
+ +}
+ +
+ +void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
+ +                                      const t_commrec *cr, int ntmpi_requested,
+ +                                      gmx_bool bUseGPU)
+ +{
-     btMPI         = bMPI = FALSE;
-     bNthreadsAuto = FALSE;
++    int                        npppn, ntmpi_pp, ngpu;
++    char                       sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
++    char                       gpu_plural[2];
++    gmx_bool                   bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
++    int                        ret;
++    static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER;
++
+ +
+ +    assert(hwinfo);
+ +    assert(cr);
+ +
-     btMPI         = TRUE;
-     bNthreadsAuto = (ntmpi_requested < 1);
++    /* Below we only do consistency checks for PP and GPUs,
++     * this is irrelevant for PME only nodes, so in that case we return
++     * here.
++     */
++    if (!(cr->duty & DUTY_PP))
++    {
++        return;
++    }
++
++    /* We run this function only once, but must make sure that all threads
++       that are alive run this function, so they get consistent data. We
++       achieve this by mutual exclusion and returning if the structure is
++       already properly checked & set */
++    ret = tMPI_Thread_mutex_lock(&cons_lock);
++    if (ret != 0)
++    {
++        gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
++    }
++
++    if (!hwinfo->bConsistencyChecked)
++    {
++        btMPI         = bMPI = FALSE;
++        bNthreadsAuto = FALSE;
+ +#if defined(GMX_THREAD_MPI)
-     bMPI  = TRUE;
++        btMPI         = TRUE;
++        bNthreadsAuto = (ntmpi_requested < 1);
+ +#elif defined(GMX_LIB_MPI)
-     bGPUBin      = TRUE;
++        bMPI  = TRUE;
+ +#endif
+ +
+ +#ifdef GMX_GPU
-     bGPUBin      = FALSE;
++        bGPUBin      = TRUE;
+ +#else
-     /* GPU emulation detection is done later, but we need here as well
-      * -- uncool, but there's no elegant workaround */
-     bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
-     bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
++        bGPUBin      = FALSE;
+ +#endif
+ +
-     if (SIMMASTER(cr))
-     {
-         /* check the acceleration mdrun is compiled with against hardware capabilities */
-         /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
-          *       Might not hurt to add an extra check over MPI. */
++        /* GPU emulation detection is done later, but we need here as well
++         * -- uncool, but there's no elegant workaround */
++        bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
++        bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+ +
-     }
- 
-     /* Below we only do consistency checks for PP and GPUs,
-      * this is irrelevant for PME only nodes, so in that case we return here.
-      */
-     if (!(cr->duty & DUTY_PP))
-     {
-         return;
-     }
++        /* check the acceleration mdrun is compiled with against hardware
++           capabilities */
++        /* TODO: Here we assume homogeneous hardware which is not necessarily
++                 the case! Might not hurt to add an extra check over MPI. */
+ +        gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
-     /* Need to ensure that we have enough GPUs:
-      * - need one GPU per PP node
-      * - no GPU oversubscription with tMPI
-      * => keep on the GPU support, otherwise turn off (or bail if forced)
-      * */
-     /* number of PP processes per node */
-     npppn = cr->nrank_pp_intranode;
- 
-     pernode[0]           = '\0';
-     th_or_proc_plural[0] = '\0';
-     if (btMPI)
-     {
-         sprintf(th_or_proc, "thread-MPI thread");
-         if (npppn > 1)
+ +
-             sprintf(th_or_proc_plural, "s");
++        /* Need to ensure that we have enough GPUs:
++         * - need one GPU per PP node
++         * - no GPU oversubscription with tMPI
++         * => keep on the GPU support, otherwise turn off (or bail if forced)
++         * */
++        /* number of PP processes per node */
++        npppn = cr->nrank_pp_intranode;
++
++        pernode[0]           = '\0';
++        th_or_proc_plural[0] = '\0';
++        if (btMPI)
+ +        {
-     }
-     else if (bMPI)
-     {
-         sprintf(th_or_proc, "MPI process");
-         if (npppn > 1)
++            sprintf(th_or_proc, "thread-MPI thread");
++            if (npppn > 1)
++            {
++                sprintf(th_or_proc_plural, "s");
++            }
+ +        }
-             sprintf(th_or_proc_plural, "es");
++        else if (bMPI)
+ +        {
-         sprintf(pernode, " per node");
-     }
-     else
-     {
-         /* neither MPI nor tMPI */
-         sprintf(th_or_proc, "process");
-     }
- 
-     if (bGPUBin)
-     {
-         print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
-     }
++            sprintf(th_or_proc, "MPI process");
++            if (npppn > 1)
++            {
++                sprintf(th_or_proc_plural, "es");
++            }
++            sprintf(pernode, " per node");
++        }
++        else
++        {
++            /* neither MPI nor tMPI */
++            sprintf(th_or_proc, "process");
+ +        }
-     if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
-     {
-         ngpu = hwinfo->gpu_info.ncuda_dev_use;
-         sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+ +
-         /* number of tMPI threads atuo-adjusted */
-         if (btMPI && bNthreadsAuto && SIMMASTER(cr))
++        if (bGPUBin)
++        {
++            print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
++        }
+ +
-             if (npppn < ngpu)
++        if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
+ +        {
-                 if (hwinfo->gpu_info.bUserSet)
++            ngpu = hwinfo->gpu_info.ncuda_dev_use;
++            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++
++            /* number of tMPI threads atuo-adjusted */
++            if (btMPI && bNthreadsAuto)
+ +            {
-                     /* The user manually provided more GPUs than threads we could
-                      * automatically start. */
-                     gmx_fatal(FARGS,
-                               "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
-                               "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
-                               ngpu, gpu_plural, npppn, th_or_proc_plural,
-                               ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
-                 }
-                 else
-                 {
-                     /* There are more GPUs than tMPI threads; we have to limit the number GPUs used. */
-                     md_print_warn(cr, fplog,
-                                   "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
-                                   "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
++                if (npppn < ngpu)
+ +                {
-                                   ShortProgram(), npppn, npppn > 1 ? "s" : "",
-                                   bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
- 
-                     if (cr->rank_pp_intranode == 0)
++                    if (hwinfo->gpu_info.bUserSet)
++                    {
++                        /* The user manually provided more GPUs than threads we
++                           could automatically start. */
++                        gmx_fatal(FARGS,
++                                  "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
++                                  "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
+ +                                  ngpu, gpu_plural, npppn, th_or_proc_plural,
-                         limit_num_gpus_used(hwinfo, npppn);
-                         ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                         sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++                                  ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
++                    }
++                    else
+ +                    {
-         }
++                        /* There are more GPUs than tMPI threads; we have to
++                           limit the number GPUs used. */
++                        md_print_warn(cr, fplog,
++                                      "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
++                                      "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
++                                      ngpu, gpu_plural, npppn,
++                                      th_or_proc_plural,
++                                      ShortProgram(), npppn,
++                                      npppn > 1 ? "s" : "",
++                                      bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
++
++                        if (cr->rank_pp_intranode == 0)
++                        {
++                            limit_num_gpus_used(hwinfo, npppn);
++                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
++                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++                        }
+ +                    }
+ +                }
+ +            }
-         if (ngpu != npppn)
-         {
-             if (hwinfo->gpu_info.bUserSet)
+ +
-                 gmx_fatal(FARGS,
-                           "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                           "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
-                           th_or_proc, btMPI ? "s" : "es", pernode,
-                           ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
-             }
-             else
-             {
-                 if (ngpu > npppn)
++            if (ngpu != npppn)
+ +            {
-                     md_print_warn(cr, fplog,
-                                   "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
-                                   "      PP %s%s%s than GPU%s available.\n"
-                                   "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
-                                   ShortProgram(),
-                                   th_or_proc, th_or_proc_plural, pernode, gpu_plural,
-                                   th_or_proc, npppn, gpu_plural, pernode);
- 
-                     if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
-                     {
-                         limit_num_gpus_used(hwinfo, npppn);
-                         ngpu = hwinfo->gpu_info.ncuda_dev_use;
-                         sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
-                     }
++                if (hwinfo->gpu_info.bUserSet)
+ +                {
-                     /* Avoid duplicate error messages.
-                      * Unfortunately we can only do this at the physical node
-                      * level, since the hardware setup and MPI process count
-                      * might be differ over physical nodes.
-                      */
-                     if (cr->rank_pp_intranode == 0)
++                    gmx_fatal(FARGS,
++                              "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
++                              "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
++                              th_or_proc, btMPI ? "s" : "es", pernode,
++                              ShortProgram(), npppn, th_or_proc,
++                              th_or_proc_plural, pernode, ngpu, gpu_plural);
+ +                }
+ +                else
+ +                {
-                         gmx_fatal(FARGS,
-                                   "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
-                                   "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
-                                   th_or_proc, btMPI ? "s" : "es", pernode,
-                                   ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
++                    if (ngpu > npppn)
+ +                    {
- #ifdef GMX_MPI
++                        md_print_warn(cr, fplog,
++                                      "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
++                                      "      PP %s%s%s than GPU%s available.\n"
++                                      "      Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
++                                      ShortProgram(), th_or_proc,
++                                      th_or_proc_plural, pernode, gpu_plural,
++                                      th_or_proc, npppn, gpu_plural, pernode);
++
++                        if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
++                        {
++                            limit_num_gpus_used(hwinfo, npppn);
++                            ngpu = hwinfo->gpu_info.ncuda_dev_use;
++                            sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++                        }
+ +                    }
-                         /* Avoid other ranks to continue after inconsistency */
-                         MPI_Barrier(cr->mpi_comm_mygroup);
+ +                    else
+ +                    {
- #endif
++                        /* Avoid duplicate error messages.
++                         * Unfortunately we can only do this at the physical node
++                         * level, since the hardware setup and MPI process count
++                         * might be differ over physical nodes.
++                         */
++                        if (cr->rank_pp_intranode == 0)
++                        {
++                            gmx_fatal(FARGS,
++                                      "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
++                                      "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
++                                      th_or_proc, btMPI ? "s" : "es", pernode,
++                                      ShortProgram(), npppn, th_or_proc,
++                                      th_or_proc_plural, pernode, ngpu,
++                                      gpu_plural);
++                        }
+ +                    }
-         }
+ +                }
+ +            }
-         hwinfo->gpu_info.bDevShare = FALSE;
-         if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
-         {
-             int      i, j, same_count;
-             gmx_bool bSomeSame, bAllDifferent;
+ +
-             same_count    = 0; /* number of GPUs shared among ranks */
-             bSomeSame     = FALSE;
-             bAllDifferent = TRUE;
++            {
++                int      same_count;
+ +
-             for (i = 0; i < ngpu - 1; i++)
-             {
-                 for (j = i + 1; j < ngpu; j++)
++                same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info));
+ +
-                     bSomeSame       |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
-                     bAllDifferent   &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
-                     same_count      += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
++                if (btMPI && same_count > 0)
+ +                {
-             /* store the number of shared/oversubscribed GPUs */
-             hwinfo->gpu_info.bDevShare = bSomeSame;
++                    gmx_fatal(FARGS,
++                              "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
++                              "Use MPI if you are sure that you want to assign GPU to multiple threads.");
++                }
++
++                if (same_count > 0)
++                {
++                    md_print_warn(cr, fplog,
++                                  "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
++                                  "      multiple %s%s; this should be avoided as it can cause\n"
++                                  "      performance loss.\n",
++                                  same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
+ +                }
+ +            }
++            print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
++        }
++        hwinfo->bConsistencyChecked = TRUE;
++    }
+ +
-             if (btMPI && !bAllDifferent)
-             {
-                 gmx_fatal(FARGS,
-                           "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
-                           "Use MPI if you are sure that you want to assign GPU to multiple threads.");
-             }
++    ret = tMPI_Thread_mutex_unlock(&cons_lock);
++    if (ret != 0)
++    {
++        gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno));
++    }
+ +
-             if (bSomeSame)
++#ifdef GMX_MPI
++    if (PAR(cr))
++    {
++        /* Avoid other ranks to continue after
++           inconsistency */
++        MPI_Barrier(cr->mpi_comm_mygroup);
++    }
++#endif
++
++}
++/* Count how often the same device ID occurs more than once among the GPUs
++ * selected for use.
++ *
++ * gpu_info  GPU description; only bUserSet, ncuda_dev_use and cuda_dev_use
++ *           are read
++ *
++ * Returns 0 when the selection was not set by the user (bUserSet false).
++ * Otherwise returns the number of index pairs i < j within the first
++ * ncuda_dev_use entries whose cuda_dev_use values are equal; a device that
++ * appears k times therefore contributes k*(k-1)/2 to the result.
++ */
++int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info)
++{
++    int      same_count    = 0;
++    int      ngpu          = gpu_info->ncuda_dev_use;
+ +
-                 md_print_warn(cr, fplog,
-                               "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                               "      multiple %s%s; this should be avoided as it can cause\n"
-                               "      performance loss.\n",
-                               same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
++    if (gpu_info->bUserSet)
++    {
++        int      i, j;
++        /* compare every unordered pair (i < j) of selected entries once */
++        for (i = 0; i < ngpu - 1; i++)
++        {
++            for (j = i + 1; j < ngpu; j++)
+ +            {
-         print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
++                same_count      += (gpu_info->cuda_dev_use[i] ==
++                                    gpu_info->cuda_dev_use[j]);
+ +            }
+ +        }
- void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
-                          const t_commrec *cr,
-                          gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
-                          const char *gpu_id)
+ +    }
++
++    return same_count;
+ +}
+ +
++
+ +/* Return the number of hardware threads supported by the current CPU.
+ + * We assume that this is equal with the number of CPUs reported to be
+ + * online by the OS at the time of the call.
+ + *
+ + * fplog, cr  only used for the OpenMP mismatch warning below
+ + *
+ + * The value comes from GetSystemInfo() on Windows and from sysconf() where
+ + * HAVE_SYSCONF is defined, using the first of the _SC_NPROC* names that is
+ + * available; the sysconf() result is returned unchecked. The result is 0
+ + * when HAVE_SYSCONF is defined but none of those names is, and -1 on any
+ + * other platform.
+ + */
+ +static int get_nthreads_hw_avail(FILE gmx_unused *fplog, const t_commrec gmx_unused *cr)
+ +{
+ +    int ret = 0;
+ +
+ +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
+ +    /* Windows */
+ +    SYSTEM_INFO sysinfo;
+ +    GetSystemInfo( &sysinfo );
+ +    ret = sysinfo.dwNumberOfProcessors;
+ +#elif defined HAVE_SYSCONF
+ +    /* We are probably on Unix.
+ +     * Now check if we have the argument to use before executing the call
+ +     */
+ +#if defined(_SC_NPROCESSORS_ONLN)
+ +    ret = sysconf(_SC_NPROCESSORS_ONLN);
+ +#elif defined(_SC_NPROC_ONLN)
+ +    ret = sysconf(_SC_NPROC_ONLN);
+ +#elif defined(_SC_NPROCESSORS_CONF)
+ +    ret = sysconf(_SC_NPROCESSORS_CONF);
+ +#elif defined(_SC_NPROC_CONF)
+ +    ret = sysconf(_SC_NPROC_CONF);
+ +#endif /* End of check for sysconf argument values */
+ +
+ +#else
+ +    /* Neither windows nor Unix. No fscking idea how many CPUs we have! */
+ +    ret = -1;
+ +#endif
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Detected %d processors, will use this as the number "
+ +                "of supported hardware threads.\n", ret);
+ +    }
+ +
+ +    /* Cross-check against OpenMP; the guard used to be spelled GMX_OMPENMP,
+ +     * which left this check permanently compiled out.
+ +     * NOTE(review): needs the declaration of gmx_omp_get_num_procs() to be
+ +     * visible in this file - confirm the corresponding header is included. */
+ +#ifdef GMX_OPENMP
+ +    if (ret != gmx_omp_get_num_procs())
+ +    {
+ +        md_print_warn(cr, fplog,
+ +                      "Number of CPUs detected (%d) does not match the number reported by OpenMP (%d).\n"
+ +                      "Consider setting the launch configuration manually!",
+ +                      ret, gmx_omp_get_num_procs());
+ +    }
+ +#endif
+ +
+ +    return ret;
+ +}
+ +
-     assert(hwinfo);
- 
-     /* detect CPUID info; no fuss, we don't detect system-wide
-      * -- sloppy, but that's it for now */
-     if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
++gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
++                                   gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
++                                   const char *gpu_id)
+ +{
+ +    int              i;
+ +    const char      *env;
+ +    char             sbuf[STRLEN], stmp[STRLEN];
+ +    gmx_hw_info_t   *hw;
+ +    gmx_gpu_info_t   gpuinfo_auto, gpuinfo_user;
+ +    gmx_bool         bGPUBin;
++    int              ret;
+ +    /* Overview: return the shared hardware description hwinfo_g, filling it
+ +     * in on the first call and taking one reference (n_hwinfo) on every call.
+ +     * hwinfo_g, n_hwinfo and hw_info_lock are defined outside this function.
+ +     *
+ +     * fplog, cr     passed on to the printing helpers
+ +     * bForceUseGPU  GPU use is mandatory: missing GPU support or no usable
+ +     *               GPU is reported through gmx_fatal()
+ +     * bTryUseGPU    select GPUs if possible
+ +     * gpu_id        optional string of GPU ID digits, may be NULL
+ +     */
-         gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
++    /* make sure no one else is doing the same thing */
++    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
++    if (ret != 0)
+ +    {
-     /* detect number of hardware threads */
-     hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
++        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
+ +    }
+ +
-     /* detect GPUs */
-     hwinfo->gpu_info.ncuda_dev_use  = 0;
-     hwinfo->gpu_info.cuda_dev_use   = NULL;
-     hwinfo->gpu_info.ncuda_dev      = 0;
-     hwinfo->gpu_info.cuda_dev       = NULL;
++    /* only initialize the hwinfo structure if it is not already initialized;
++     * when n_hwinfo is non-zero the whole detection block is skipped, so the
++     * bForceUseGPU, bTryUseGPU and gpu_id arguments of such a call are not
++     * looked at */
++    if (n_hwinfo == 0)
++    {
++        snew(hwinfo_g, 1);
++        hwinfo_g->bConsistencyChecked = FALSE;
+ +
-     bGPUBin      = TRUE;
++        /* detect CPUID info; no fuss, we don't detect system-wide
++         * -- sloppy, but that's it for now */
++        if (gmx_cpuid_init(&hwinfo_g->cpuid_info) != 0)
++        {
++            gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
++        }
++
++        /* detect number of hardware threads */
++        hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
++
++        /* detect GPUs */
++        hwinfo_g->gpu_info.ncuda_dev_use  = 0;
++        hwinfo_g->gpu_info.cuda_dev_use   = NULL;
++        hwinfo_g->gpu_info.ncuda_dev      = 0;
++        hwinfo_g->gpu_info.cuda_dev       = NULL;
+ +
+ +#ifdef GMX_GPU
-     bGPUBin      = FALSE;
++        bGPUBin      = TRUE;
+ +#else
-     /* Bail if binary is not compiled with GPU acceleration, but this is either
-      * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
-     if (bForceUseGPU && !bGPUBin)
-     {
-         gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
-     }
-     if (gpu_id != NULL && !bGPUBin)
-     {
-         gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
-     }
- 
-     /* run the detection if the binary was compiled with GPU support */
-     if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
-     {
-         char detection_error[STRLEN];
- 
-         if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
++        bGPUBin      = FALSE;
+ +#endif
+ +
-             if (detection_error != NULL && detection_error[0] != '\0')
-             {
-                 sprintf(sbuf, ":\n      %s\n", detection_error);
-             }
-             else
-             {
-                 sprintf(sbuf, ".");
-             }
-             md_print_warn(cr, fplog,
-                           "NOTE: Error occurred during GPU detection%s"
-                           "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
-                           sbuf);
++        /* Bail if binary is not compiled with GPU acceleration, but this is either
++         * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
++        if (bForceUseGPU && !bGPUBin)
+ +        {
-     }
- 
-     if (bForceUseGPU || bTryUseGPU)
-     {
-         env = getenv("GMX_GPU_ID");
-         if (env != NULL && gpu_id != NULL)
++            gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ +        }
-             gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
++        if (gpu_id != NULL && !bGPUBin)
+ +        {
-         if (env == NULL)
++            gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
+ +        }
-             env = gpu_id;
++
++        /* run the detection if the binary was compiled with GPU support */
++        if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
+ +        {
-         /* parse GPU IDs if the user passed any */
-         if (env != NULL)
++            char detection_error[STRLEN];
++            /* NOTE(review): detection_error is an array, so the != NULL test
++             * below is always true; only the first-character test decides. */
++            if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
++            {
++                if (detection_error != NULL && detection_error[0] != '\0')
++                {
++                    sprintf(sbuf, ":\n      %s\n", detection_error);
++                }
++                else
++                {
++                    sprintf(sbuf, ".");
++                }
++                md_print_warn(cr, fplog,
++                              "NOTE: Error occurred during GPU detection%s"
++                              "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
++                              sbuf);
++            }
+ +        }
+ +
-             int *gpuid, *checkres;
-             int  nid, res;
++        if (bForceUseGPU || bTryUseGPU)
+ +        {
-             snew(gpuid, max_gpu_ids_user);
-             snew(checkres, max_gpu_ids_user);
++            env = getenv("GMX_GPU_ID");
++            if (env != NULL && gpu_id != NULL)
++            {
++                gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
++            }
++            if (env == NULL)
++            {
++                env = gpu_id;
++            }
+ +
-             parse_gpu_id_plain_string(env, &nid, gpuid);
++            /* parse GPU IDs if the user passed any; at this point env holds
++             * the GMX_GPU_ID value or, if that is unset, the gpu_id argument */
++            if (env != NULL)
++            {
++                int *gpuid, *checkres;
++                int  nid, res;
+ +
-             if (nid == 0)
-             {
-                 gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
-             }
++                snew(gpuid, max_gpu_ids_user);
++                snew(checkres, max_gpu_ids_user);
+ +
-             res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
++                parse_gpu_id_plain_string(env, &nid, gpuid);
+ +
-             if (!res)
-             {
-                 print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
++                if (nid == 0)
++                {
++                    gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
++                              invalid_gpuid_hint);
++                }
+ +
-                 sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
-                 for (i = 0; i < nid; i++)
++                res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info,
++                                             gpuid, nid);
+ +
-                     if (checkres[i] != egpuCompatible)
++                if (!res)
+ +                {
-                         sprintf(stmp, "    GPU #%d: %s\n",
-                                 gpuid[i], gpu_detect_res_str[checkres[i]]);
-                         strcat(sbuf, stmp);
++                    print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr);
++
++                    sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
++                    for (i = 0; i < nid; i++)
+ +                    {
-                 gmx_fatal(FARGS, "%s", sbuf);
-             }
++                        if (checkres[i] != egpuCompatible)
++                        {
++                            sprintf(stmp, "    GPU #%d: %s\n",
++                                    gpuid[i], gpu_detect_res_str[checkres[i]]);
++                            strcat(sbuf, stmp);
++                        }
+ +                    }
++                    gmx_fatal(FARGS, "%s", sbuf);
+ +                }
-             hwinfo->gpu_info.bUserSet = TRUE;
+ +
-             sfree(gpuid);
-             sfree(checkres);
-         }
-         else
-         {
-             pick_compatible_gpus(&hwinfo->gpu_info);
-             hwinfo->gpu_info.bUserSet = FALSE;
-         }
++                hwinfo_g->gpu_info.bUserSet = TRUE;
+ +
-         /* decide whether we can use GPU */
-         hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
-         if (!hwinfo->bCanUseGPU && bForceUseGPU)
-         {
-             gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
++                sfree(gpuid);
++                sfree(checkres);
++            }
++            else
++            {
++                pick_compatible_gpus(&hwinfo_g->gpu_info);
++                hwinfo_g->gpu_info.bUserSet = FALSE;
++            }
+ +
- void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
++            /* decide whether we can use GPU: at least one device must have
++             * been selected above */
++            hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0);
++            if (!hwinfo_g->bCanUseGPU && bForceUseGPU)
++            {
++                gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
++            }
+ +        }
+ +    }
++    /* increase the reference counter; this happens on every call, also when
++     * the detection above was skipped */
++    n_hwinfo++;
++
++    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
++    if (ret != 0)
++    {
++        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
++    }
++    /* every caller receives the same shared structure */
++    return hwinfo_g;
+ +}
+ +
-     if (hwinfo)
++static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
+ +{
+ +    int ndev_use;
+ +    /* Lower hwinfo->gpu_info.ncuda_dev_use to count. Only this counter is
+ +     * written; nothing else in gpu_info is modified here.
+ +     *
+ +     * hwinfo  hardware description, must not be NULL
+ +     * count   new upper limit for the number of GPUs in use
+ +     */
+ +    assert(hwinfo);
+ +
+ +    ndev_use = hwinfo->gpu_info.ncuda_dev_use;
+ +
+ +    if (count > ndev_use)
+ +    {
+ +        /* won't increase the # of GPUs: a limit above the current count
+ +         * leaves everything unchanged */
+ +        return;
+ +    }
+ +
+ +    if (count < 1)
+ +    {
+ +        /* NOTE(review): presumably gmx_incons() does not return; otherwise
+ +         * the non-positive count would still be stored below - confirm. */
+ +        char sbuf[STRLEN];
+ +        sprintf(sbuf, "Limiting the number of GPUs to <1 doesn't make sense (detected %d, %d requested)!",
+ +                ndev_use, count);
+ +        gmx_incons(sbuf);
+ +    }
+ +
+ +    /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
+ +    hwinfo->gpu_info.ncuda_dev_use = count;
+ +}
+ +
+ +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
+ +{
-         gmx_cpuid_done(hwinfo->cpuid_info);
-         free_gpu_info(&hwinfo->gpu_info);
-         sfree(hwinfo);
++    int ret;
++
++    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
++    if (ret != 0)
++    {
++        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
++    }
++
++    /* decrease the reference counter */
++    n_hwinfo--;
++
++
++    if (hwinfo != hwinfo_g)
++    {
++        gmx_incons("hwinfo < hwinfo_g");
++    }
++
++    if (n_hwinfo < 0)
++    {
++        gmx_incons("n_hwinfo < 0");
++    }
++
++    if (n_hwinfo == 0)
++    {
++        gmx_cpuid_done(hwinfo_g->cpuid_info);
++        free_gpu_info(&hwinfo_g->gpu_info);
++        sfree(hwinfo_g);
++    }
++
++    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
++    if (ret != 0)
+ +    {
++        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+ +    }
+ +}
diff --cc src/gromacs/gmxlib/gmx_thread_affinity.c

index ce89e4130f6a251dc31b787c0b25134ced6e5976,0000000000000000000000000000000000000000..5d1b3659ab361bc33fd0c9dcbb92eeefe339326e

mode 100644,000000..100644
--- 1/src/gromacs/gmxlib/gmx_thread_affinity.c
--- /dev/null
+++ b/src/gromacs/gmxlib/gmx_thread_affinity.c
@@@ -1,530 -1,0 +1,459 @@@
-     /* these are inherently global properties that are shared among all threads
-      */
-     static const int          *locality_order;
-     static int                 rc;
-     static gmx_bool            have_locality_order = FALSE;
-     static tMPI_Thread_mutex_t locality_order_mtx  =
-         TMPI_THREAD_MUTEX_INITIALIZER;
-     static tMPI_Thread_cond_t  locality_order_cond =
-         TMPI_THREAD_COND_INITIALIZER;
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012, by the GROMACS development team, led by
+ + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ + * others, as listed in the AUTHORS file in the top-level source
+ + * directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
+ +#define _GNU_SOURCE
+ +#include <sched.h>
+ +#include <sys/syscall.h>
+ +#endif
+ +#include <string.h>
+ +#include <errno.h>
+ +#include <assert.h>
+ +#include <stdio.h>
+ +#include "typedefs.h"
+ +#include "types/commrec.h"
+ +#include "types/hw_info.h"
+ +#include "gmx_cpuid.h"
+ +#include "gmx_omp.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "mdrun.h"
+ +#include "md_logging.h"
+ +#include "statutil.h"
+ +#include "gmx_thread_affinity.h"
+ +
+ +#include "thread_mpi/threads.h"
+ +
+ +
+ +/* Validates the requested pinning offset/stride against the hardware thread
+ + * count and, when *pin_stride is 0 on entry, selects a stride.
+ + *
+ + * The hardware thread count comes from gmx_cpuid_topology(); when that call
+ + * fails, hwinfo->nthreads_hw_avail is used instead and *locality_order is
+ + * set to NULL.
+ + *
+ + * Returns 0 when pinning can proceed (with *pin_stride and *locality_order
+ + * set), or -1 after printing a warning when it can not. A negative offset
+ + * or stride is reported through gmx_fatal().
+ + */
+ +static int
+ +get_thread_affinity_layout(FILE *fplog,
+ +                           const t_commrec *cr,
+ +                           const gmx_hw_info_t * hwinfo,
+ +                           int nthreads,
+ +                           int pin_offset, int * pin_stride,
+ +                           const int **locality_order)
+ +{
+ +    int         nhwthreads, npkg, ncores, nhwthreads_per_core, rc;
+ +    const int * pkg_id;
+ +    const int * core_id;
+ +    const int * hwthread_id;
+ +    gmx_bool    bPickPinStride;
+ +
+ +    if (pin_offset < 0)
+ +    {
+ +        gmx_fatal(FARGS, "Negative thread pinning offset requested");
+ +    }
+ +    if (*pin_stride < 0)
+ +    {
+ +        gmx_fatal(FARGS, "Negative thread pinning stride requested");
+ +    }
+ +
+ +    rc = gmx_cpuid_topology(hwinfo->cpuid_info, &nhwthreads, &npkg, &ncores,
+ +                            &nhwthreads_per_core,
+ +                            &pkg_id, &core_id, &hwthread_id, locality_order);
+ +
+ +    if (rc != 0)
+ +    {
+ +        /* topology information not available or invalid, ignore it */
+ +        nhwthreads      = hwinfo->nthreads_hw_avail;
+ +        *locality_order = NULL;
+ +
+ +        if (nhwthreads <= 0)
+ +        {
+ +            /* We don't know anything about the hardware, don't pin */
+ +            md_print_warn(cr, fplog,
+ +                          "NOTE: We don't know how many logical cores we have, will not pin threads");
+ +
+ +            return -1;
+ +        }
+ +    }
+ +
+ +    if (nthreads > nhwthreads)
+ +    {
+ +        /* We are oversubscribing, don't pin */
+ +        md_print_warn(NULL, fplog,
+ +                      "WARNING: Oversubscribing the CPU, will not pin threads");
+ +
+ +        return -1;
+ +    }
+ +
+ +    if (pin_offset + nthreads > nhwthreads)
+ +    {
+ +        /* The offset leaves too few hardware threads, don't pin */
+ +        md_print_warn(NULL, fplog,
+ +                      "WARNING: The requested pin offset is too large for the available logical cores,\n"
+ +                      "         will not pin threads");
+ +
+ +        return -1;
+ +    }
+ +
+ +
+ +    /* do we need to choose the pinning stride? */
+ +    bPickPinStride = (*pin_stride == 0);
+ +
+ +    if (bPickPinStride)
+ +    {
+ +        /* nhwthreads_per_core is only read when the topology query
+ +         * succeeded (rc == 0).
+ +         */
+ +        if (rc == 0 && pin_offset + nthreads*nhwthreads_per_core <= nhwthreads)
+ +        {
+ +            /* Put one thread on each physical core */
+ +            *pin_stride = nhwthreads_per_core;
+ +        }
+ +        else
+ +        {
+ +            /* We don't know if we have SMT, and if we do, we don't know
+ +             * if hw threads in the same physical core are consecutive.
+ +             * Without SMT the pinning layout should not matter too much,
+ +             * so we assume a consecutive layout and maximally spread out
+ +             * the threads at equal threads per core.
+ +             * Note that IBM is the major non-x86 case with cpuid support
+ +             * and probably threads are already pinned by the queuing system,
+ +             * so we wouldn't end up here in the first place.
+ +             */
+ +            *pin_stride = (nhwthreads - pin_offset)/nthreads;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Check the placement of the thread with the largest index to make sure
+ +         * that the offset & stride doesn't cause pinning beyond the last hardware thread. */
+ +        if (pin_offset + (nthreads-1)*(*pin_stride) >= nhwthreads)
+ +        {
+ +            /* The last thread would land beyond the hardware threads, don't pin */
+ +            md_print_warn(NULL, fplog,
+ +                          "WARNING: The requested pinning stride is too large for the available logical cores,\n"
+ +                          "         will not pin threads");
+ +
+ +            return -1;
+ +        }
+ +    }
+ +
+ +    if (fplog != NULL)
+ +    {
+ +        fprintf(fplog, "Pinning threads with a%s logical core stride of %d\n",
+ +                bPickPinStride ? "n auto-selected" : " user-specified",
+ +                *pin_stride);
+ +    }
+ +
+ +    return 0;
+ +}
+ +
+ +/* Set CPU affinity. Can be important for performance.
+ +   On some systems (e.g. Cray) CPU Affinity is set by default.
+ +   But default assigning doesn't work (well) with only some ranks
+ +   having threads. This causes very low performance.
+ +   External tools have cumbersome syntax for setting affinity
+ +   in the case that only some ranks have threads.
+ +   Thus it is important that GROMACS sets the affinity internally
+ +   if only PME is using threads.
+ +
+ +   Nothing is done when hw_opt->thread_affinity is threadaffOFF, when the
+ +   platform does not support setting affinities, when the mode is
+ +   threadaffAUTO and the thread count on the node differs from
+ +   hwinfo->nthreads_hw_avail, or when get_thread_affinity_layout() rejects
+ +   the requested layout. Failure to pin individual threads only produces
+ +   a warning.
+ + */
+ +void
+ +gmx_set_thread_affinity(FILE                *fplog,
+ +                        const t_commrec     *cr,
+ +                        gmx_hw_opt_t        *hw_opt,
+ +                        const gmx_hw_info_t *hwinfo)
+ +{
+ +    int        nth_affinity_set, thread_id_node, thread_id,
+ +               nthread_local, nthread_node, nthread_hw_max, nphyscore;
+ +    int        offset;
- #endif /* __APPLE__ */
++    const int *locality_order;
++    int        rc;
+ +
+ +    if (hw_opt->thread_affinity == threadaffOFF)
+ +    {
+ +        /* Nothing to do */
+ +        return;
+ +    }
+ +
+ +    /* If the tMPI thread affinity setting is not supported encourage the user
+ +     * to report it as it's either a bug or an exotic platform which we might
+ +     * want to support. */
+ +    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
+ +    {
+ +        /* we know Mac OS doesn't support setting thread affinity, so there's
+ +           no point in warning the user in that case. In any other case
+ +           the user might be able to do something about it. */
+ +#ifndef __APPLE__
+ +        md_print_warn(NULL, fplog,
+ +                      "Can not set thread affinities on the current platform. On NUMA systems this\n"
+ +                      "can cause performance degradation. If you think your platform should support\n"
+ +                      "setting affinities, contact the GROMACS developers.");
-     /* hw_opt is shared among tMPI threads, so for thread safety we need to do
-      * the layout detection only on master as core_pinning_stride is an in-out
-      * parameter and gets auto-set depending on its initial value.
-      * This
-      * This is not thread-safe with multi-simulations, but that's anyway not
-      * supported by tMPI. */
-     if (SIMMASTER(cr))
-     {
-         int ret;
-         int i;
- 
-         ret = tMPI_Thread_mutex_lock(&locality_order_mtx);
-         if (ret != 0)
-         {
-             goto locality_order_err;
-         }
-         rc = get_thread_affinity_layout(fplog, cr, hwinfo,
-                                         nthread_node,
-                                         offset, &hw_opt->core_pinning_stride,
-                                         &locality_order);
-         have_locality_order = TRUE;
-         ret                 = tMPI_Thread_cond_broadcast(&locality_order_cond);
-         if (ret != 0)
-         {
-             tMPI_Thread_mutex_unlock(&locality_order_mtx);
-             goto locality_order_err;
-         }
-         ret = tMPI_Thread_mutex_unlock(&locality_order_mtx);
-         if (ret != 0)
-         {
-             goto locality_order_err;
-         }
-     }
-     else
-     {
-         int ret;
-         /* all other threads wait for the locality order data. */
-         ret = tMPI_Thread_mutex_lock(&locality_order_mtx);
-         if (ret != 0)
-         {
-             goto locality_order_err;
-         }
- 
-         while (!have_locality_order)
-         {
-             ret = tMPI_Thread_cond_wait(&locality_order_cond,
-                                         &locality_order_mtx);
-             if (ret != 0)
-             {
-                 tMPI_Thread_mutex_unlock(&locality_order_mtx);
-                 goto locality_order_err;
-             }
-         }
-         ret = tMPI_Thread_mutex_unlock(&locality_order_mtx);
-         if (ret != 0)
-         {
-             goto locality_order_err;
-         }
-     }
++#endif  /* __APPLE__ */
+ +        return;
+ +    }
+ +
+ +    /* threads on this MPI process or TMPI thread */
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
+ +    }
+ +    else
+ +    {
+ +        nthread_local = gmx_omp_nthreads_get(emntPME);
+ +    }
+ +
+ +    /* map the current process to cores */
+ +    thread_id_node = 0;
+ +    nthread_node   = nthread_local;
+ +#ifdef GMX_MPI
+ +    if (PAR(cr) || MULTISIM(cr))
+ +    {
+ +        /* We need to determine a scan of the thread counts in this
+ +         * compute node.
+ +         */
+ +        MPI_Comm comm_intra;
+ +
+ +        MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->rank_intranode,
+ +                       &comm_intra);
+ +        MPI_Scan(&nthread_local, &thread_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
+ +        /* MPI_Scan is inclusive, but here we need exclusive */
+ +        thread_id_node -= nthread_local;
+ +        /* Get the total number of threads on this physical node */
+ +        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
+ +        MPI_Comm_free(&comm_intra);
+ +    }
+ +#endif
+ +
+ +    if (hw_opt->thread_affinity == threadaffAUTO &&
+ +        nthread_node != hwinfo->nthreads_hw_avail)
+ +    {
+ +        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
+ +        {
+ +            md_print_warn(cr, fplog,
+ +                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
+ +                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
+ +                          "      This can lead to significant performance degradation.\n"
+ +                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
+ +        }
+ +
+ +        return;
+ +    }
+ +
+ +    offset = 0;
+ +    if (hw_opt->core_pinning_offset != 0)
+ +    {
+ +        offset = hw_opt->core_pinning_offset;
+ +        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
+ +    }
+ +
- 
- locality_order_err:
-     /* any error in affinity setting shouldn't be fatal, but should generate
-        a warning */
-     md_print_warn(NULL, fplog,
-                   "WARNING: Obtaining affinity information failed due to a basic system error: %s.\n"
-                   "         This can cause performance degradation! ",
-                   strerror(errno));
-     return;
++    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
++                                    nthread_node,
++                                    offset, &hw_opt->core_pinning_stride,
++                                    &locality_order);
+ +
+ +    if (rc != 0)
+ +    {
+ +        /* Incompatible layout, don't pin, warning was already issued */
+ +        return;
+ +    }
+ +
+ +    /* Set the per-thread affinity. In order to be able to check the success
+ +     * of affinity settings, we will set nth_affinity_set to 1 on threads
+ +     * where the affinity setting succeeded and to 0 where it failed.
+ +     * Reducing these 0/1 values over the threads will give the total number
+ +     * of threads on which we succeeded.
+ +     *
+ +     * thread_id is declared at function scope, so it has to be listed as
+ +     * private: left shared, all OpenMP threads would write the same variable
+ +     * and a thread could compute its core index from another thread's id.
+ +     */
+ +    nth_affinity_set = 0;
+ +#pragma omp parallel firstprivate(thread_id_node) private(thread_id) \
+ +    num_threads(nthread_local) reduction(+:nth_affinity_set)
+ +    {
+ +        int      index, core;
+ +        gmx_bool setaffinity_ret;
+ +
+ +        thread_id       = gmx_omp_get_thread_num();
+ +        thread_id_node += thread_id;
+ +        index           = offset + thread_id_node*hw_opt->core_pinning_stride;
+ +        if (locality_order != NULL)
+ +        {
+ +            core = locality_order[index];
+ +        }
+ +        else
+ +        {
+ +            core = index;
+ +        }
+ +
+ +        setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
+ +
+ +        /* store the per-thread success-values of the setaffinity */
+ +        nth_affinity_set = (setaffinity_ret == 0);
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
+ +                    cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
+ +        }
+ +    }
+ +
+ +    if (nth_affinity_set > nthread_local)
+ +    {
+ +        char msg[STRLEN];
+ +
+ +        sprintf(msg, "Looks like we have set affinity for more threads than "
+ +                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
+ +        gmx_incons(msg);
+ +    }
+ +    else
+ +    {
+ +        /* check & warn if some threads failed to set their affinities */
+ +        if (nth_affinity_set != nthread_local)
+ +        {
+ +            char sbuf1[STRLEN], sbuf2[STRLEN];
+ +
+ +            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
+ +            sbuf1[0] = sbuf2[0] = '\0';
+ +            /* Only add rank info if we have more than one rank. */
+ +            if (cr->nnodes > 1)
+ +            {
+ +#ifdef GMX_MPI
+ +#ifdef GMX_THREAD_MPI
+ +                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
+ +#else           /* GMX_LIB_MPI */
+ +                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
+ +#endif
+ +#endif          /* GMX_MPI */
+ +            }
+ +
+ +            if (nthread_local > 1)
+ +            {
+ +                sprintf(sbuf2, "for %d/%d thread%s ",
+ +                        nthread_local - nth_affinity_set, nthread_local,
+ +                        nthread_local > 1 ? "s" : "");
+ +            }
+ +
+ +            md_print_warn(NULL, fplog,
+ +                          "WARNING: %sAffinity setting %sfailed.\n"
+ +                          "         This can cause performance degradation! If you think your setting are\n"
+ +                          "         correct, contact the GROMACS developers.",
+ +                          sbuf1, sbuf2);
+ +        }
+ +    }
+ +    return;
+ +}
+ +
+ +/* Check the process affinity mask and if it is found to be non-zero,
+ + * will honor it and disable mdrun internal affinity setting.
+ + * Note that this will only work on Linux as we use a GNU feature.
+ + *
+ + * The mask counts as default when the first ncpus CPUs are all set in it.
+ + * With a non-default mask and hw_opt->thread_affinity == threadaffAUTO,
+ + * hw_opt->thread_affinity is switched to threadaffOFF; with any other
+ + * setting only a warning is printed, and only when bAfterOpenmpInit is TRUE.
+ + * Without HAVE_SCHED_GETAFFINITY the function body is empty.
+ + */
+ +void
+ +gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
+ +                              gmx_hw_opt_t *hw_opt, int ncpus,
+ +                              gmx_bool bAfterOpenmpInit)
+ +{
+ +#ifdef HAVE_SCHED_GETAFFINITY
+ +    cpu_set_t mask_current;
+ +    int       i, ret, cpu_count, cpu_set;
+ +    gmx_bool  bAllSet;
+ +
+ +    assert(hw_opt);
+ +    if (hw_opt->thread_affinity == threadaffOFF)
+ +    {
+ +        /* internal affinity setting is off, don't bother checking process affinity */
+ +        return;
+ +    }
+ +
+ +    CPU_ZERO(&mask_current);
+ +    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
+ +    {
+ +        /* failed to query affinity mask, will just return */
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
+ +        }
+ +        return;
+ +    }
+ +
+ +    /* Before proceeding with the actual check, make sure that the number of
+ +     * detected CPUs is >= the CPUs in the current set.
+ +     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
+ +#ifdef CPU_COUNT
+ +    if (ncpus < CPU_COUNT(&mask_current))
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
+ +                    ncpus, CPU_COUNT(&mask_current));
+ +        }
+ +        return;
+ +    }
+ +#endif /* CPU_COUNT */
+ +
+ +    /* the mask is the default one only if every one of the first ncpus
+ +     * CPUs (capped at CPU_SETSIZE) is part of it */
+ +    bAllSet = TRUE;
+ +    for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
+ +    {
+ +        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
+ +    }
+ +
+ +    if (!bAllSet)
+ +    {
+ +        if (hw_opt->thread_affinity == threadaffAUTO)
+ +        {
+ +            if (!bAfterOpenmpInit)
+ +            {
+ +                md_print_warn(cr, fplog,
+ +                              "Non-default thread affinity set, disabling internal thread affinity");
+ +            }
+ +            else
+ +            {
+ +                md_print_warn(cr, fplog,
+ +                              "Non-default thread affinity set probably by the OpenMP library,\n"
+ +                              "disabling internal thread affinity");
+ +            }
+ +            hw_opt->thread_affinity = threadaffOFF;
+ +        }
+ +        else
+ +        {
+ +            /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
+ +            if (bAfterOpenmpInit)
+ +            {
+ +                md_print_warn(cr, fplog,
+ +                              "Overriding thread affinity set outside %s\n",
+ +                              ShortProgram());
+ +            }
+ +        }
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Non-default affinity mask found\n");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Default affinity mask found\n");
+ +        }
+ +    }
+ +#endif /* HAVE_SCHED_GETAFFINITY */
+ +}
diff --cc src/gromacs/gmxlib/string2.c

index 50222122892428d889aaa3c3aa71c016ab19cdcd,0000000000000000000000000000000000000000..1fba7cef9a065f7a0b946061518427d12ab92d83

mode 100644,000000..100644
--- 1/src/gromacs/gmxlib/string2.c
--- /dev/null
+++ b/src/gromacs/gmxlib/string2.c
@@@ -1,703 -1,0 +1,704 @@@
-     user = pw->pw_name;
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROningen Mixture of Alchemy and Childrens' Stories
+ + */
+ +/* This file is completely threadsafe - keep it that way! */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +#include "gromacs/utility/gmx_header_config.h"
+ +
+ +#ifdef GMX_CRAY_XT3
+ +#undef HAVE_PWD_H
+ +#endif
+ +
+ +#include <stdio.h>
+ +#include <ctype.h>
+ +#include <stdlib.h>
+ +#include <errno.h>
+ +#include <sys/types.h>
+ +#include <time.h>
+ +
+ +#ifdef HAVE_SYS_TIME_H
+ +#include <sys/time.h>
+ +#endif
+ +
+ +#ifdef HAVE_UNISTD_H
+ +#include <unistd.h>
+ +#endif
+ +
+ +#ifdef HAVE_PWD_H
+ +#include <pwd.h>
+ +#endif
+ +#include <time.h>
+ +#include <assert.h>
+ +
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "gmx_fatal.h"
+ +#include "macros.h"
+ +#include "string2.h"
+ +#include "futil.h"
+ +
+ +/* Strips trailing blanks from s with rtrim() and checks whether the last
+ + * remaining character is CONTINUE. If so, that character is replaced by a
+ + * terminating zero and TRUE is returned; otherwise FALSE is returned.
+ + */
+ +int continuing(char *s)
+ +{
+ +    int sl;
+ +    assert(s);
+ +
+ +    rtrim(s);
+ +    sl = strlen(s);
+ +    if ((sl > 0) && (s[sl-1] == CONTINUE))
+ +    {
+ +        s[sl-1] = 0;
+ +        return TRUE;
+ +    }
+ +    else
+ +    {
+ +        return FALSE;
+ +    }
+ +}
+ +
+ +
+ +
+ +/* Reads one line from stream into line (buffer size n) with fgets() and
+ + * cuts it at the first '\n' and at the first '\r'.
+ + * Returns line, or NULL when fgets() returns NULL. A line without a newline
+ + * that is not at the end of the file is reported through gmx_fatal().
+ + */
+ +char *fgets2(char *line, int n, FILE *stream)
+ +{
+ +    char *c;
+ +    if (fgets(line, n, stream) == NULL)
+ +    {
+ +        return NULL;
+ +    }
+ +    if ((c = strchr(line, '\n')) != NULL)
+ +    {
+ +        *c = '\0';
+ +    }
+ +    else
+ +    {
+ +        /* A line not ending in a newline can only occur at the end of a file,
+ +         * or because of n being too small.
+ +         * Since both cases occur very infrequently, we can check for EOF.
+ +         */
+ +        if (!gmx_eof(stream))
+ +        {
+ +            gmx_fatal(FARGS, "An input file contains a line longer than %d characters, while the buffer passed to fgets2 has size %d. The line starts with: '%20.20s'", n, n, line);
+ +        }
+ +    }
+ +    /* also drop a carriage return, e.g. from files with CR-LF line ends */
+ +    if ((c = strchr(line, '\r')) != NULL)
+ +    {
+ +        *c = '\0';
+ +    }
+ +
+ +    return line;
+ +}
+ +
+ +/* Cuts line at the first COMMENTSIGN character, in place.
+ + * A NULL line is accepted and ignored.
+ + */
+ +void strip_comment (char *line)
+ +{
+ +    char *c;
+ +
+ +    if (!line)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* search for a comment mark and replace it by a zero */
+ +    if ((c = strchr(line, COMMENTSIGN)) != NULL)
+ +    {
+ +        (*c) = 0;
+ +    }
+ +}
+ +
+ +/* Converts str to upper case in place using toupper(). */
+ +void upstring (char *str)
+ +{
+ +    int i;
+ +
+ +    /* Stop at the terminating zero instead of calling strlen() on every
+ +     * iteration. The cast keeps the toupper() argument in the range of
+ +     * unsigned char, as <ctype.h> requires.
+ +     */
+ +    for (i = 0; str[i] != '\0'; i++)
+ +    {
+ +        str[i] = toupper((unsigned char)str[i]);
+ +    }
+ +}
+ +
+ +/* Removes leading whitespace (as classified by isspace()) from str by
+ + * shifting the remaining characters to the front. A NULL str is ignored.
+ + */
+ +void ltrim (char *str)
+ +{
+ +    int   i, c;
+ +
+ +    if (NULL == str)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* count the leading whitespace; the cast keeps the isspace() argument
+ +     * in the range of unsigned char, as <ctype.h> requires */
+ +    c = 0;
+ +    while (('\0' != str[c]) && isspace((unsigned char)str[c]))
+ +    {
+ +        c++;
+ +    }
+ +    if (c > 0)
+ +    {
+ +        for (i = c; ('\0' != str[i]); i++)
+ +        {
+ +            str[i-c] = str[i];
+ +        }
+ +        str[i-c] = '\0';
+ +    }
+ +}
+ +
+ +/* Removes trailing spaces and tabs from str in place.
+ + * A NULL str is ignored. The loop stops at index 0, so the first character
+ + * of the string is never removed, even when it is a blank.
+ + */
+ +void rtrim (char *str)
+ +{
+ +    int nul;
+ +
+ +    if (NULL == str)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* nul is the index of the last character, or -1 for an empty string */
+ +    nul = strlen(str)-1;
+ +    while ((nul > 0) && ((str[nul] == ' ') || (str[nul] == '\t')) )
+ +    {
+ +        str[nul] = '\0';
+ +        nul--;
+ +    }
+ +}
+ +
+ +/* Trims str in place: first ltrim(), then rtrim(). */
+ +void trim (char *str)
+ +{
+ +    ltrim (str);
+ +    rtrim (str);
+ +}
+ +
+ +/* Writes the textual form of *clock into buf, which holds n characters,
+ + * using the platform's reentrant ctime variant. The result is truncated to
+ + * n-1 characters and always zero-terminated. Returns buf; with n <= 0
+ + * nothing is written.
+ + */
+ +char *
+ +gmx_ctime_r(const time_t *clock, char *buf, int n)
+ +{
+ +    char tmpbuf[STRLEN];
+ +
+ +    if (n <= 0)
+ +    {
+ +        /* no room for even the terminating zero */
+ +        return buf;
+ +    }
+ +
+ +#ifdef GMX_NATIVE_WINDOWS
+ +    /* Windows */
+ +    ctime_s( tmpbuf, STRLEN, clock );
+ +#elif (defined(__sun))
+ +    /*Solaris*/
+ +    /* the length argument describes tmpbuf, not the caller's buffer */
+ +    ctime_r(clock, tmpbuf, STRLEN);
+ +#else
+ +    ctime_r(clock, tmpbuf);
+ +#endif
+ +    strncpy(buf, tmpbuf, n-1);
+ +    buf[n-1] = '\0';
+ +
+ +    return buf;
+ +}
+ +
+ +/* Writes a comment header to out, each line starting with COMMENTSIGN:
+ + * the file name fn, the user name and uid, the host name and the current
+ + * date. Items that can not be determined are printed as "onbekend"
+ + * (Dutch for "unknown"). Without HAVE_PWD_H the uid is printed as 0 and
+ + * user and host stay unknown.
+ + */
+ +void nice_header (FILE *out, const char *fn)
+ +{
+ +    const char    *unk = "onbekend";
+ +    time_t         clock;
+ +    const char    *user = unk;
+ +    int            gh;
+ +#ifdef HAVE_PWD_H
+ +    uid_t          uid;
+ +#else
+ +    int            uid;
+ +#endif
+ +    char           buf[256] = "";
+ +    char           timebuf[STRLEN];
+ +#ifdef HAVE_PWD_H
+ +    struct passwd *pw;
+ +#endif
+ +
+ +    /* Print a nice header above the file */
+ +    time(&clock);
+ +    fprintf (out, "%c\n", COMMENTSIGN);
+ +    fprintf (out, "%c\tFile '%s' was generated\n", COMMENTSIGN, fn ? fn : unk);
+ +
+ +#ifdef HAVE_PWD_H
+ +    uid  = getuid();
+ +    pw   = getpwuid(uid);
+ +    /* buf holds 256 zero-initialized bytes, so a 255-byte name stays terminated */
+ +    gh   = gethostname(buf, 255);
++    /* pw returns null on error (e.g. compute nodes lack /etc/passwd) */
++    user = pw ? pw->pw_name : unk;
+ +#else
+ +    uid = 0;
+ +    gh  = -1;
+ +#endif
+ +
+ +    gmx_ctime_r(&clock, timebuf, STRLEN);
+ +    fprintf (out, "%c\tBy user: %s (%d)\n", COMMENTSIGN,
+ +             user ? user : unk, (int) uid);
+ +    fprintf(out, "%c\tOn host: %s\n", COMMENTSIGN, (gh == 0) ? buf : unk);
+ +
+ +    /* no "\n" here: presumably timebuf already ends in a newline, as ctime
+ +     * output normally does - TODO confirm for all platforms */
+ +    fprintf (out, "%c\tAt date: %s", COMMENTSIGN, timebuf);
+ +    fprintf (out, "%c\n", COMMENTSIGN);
+ +}
+ +
+ +
+ +/* Compares str1 and str2 ignoring case and skipping every '-' and '_'.
+ + * Returns 0 when the strings are equal under these rules, otherwise the
+ + * difference of the first pair of upper-cased characters that differ.
+ + */
+ +int gmx_strcasecmp_min(const char *str1, const char *str2)
+ +{
+ +    char ch1, ch2;
+ +
+ +    do
+ +    {
+ +        /* fetch the next significant character of each string; the casts
+ +         * keep the toupper() argument in the range of unsigned char */
+ +        do
+ +        {
+ +            ch1 = toupper((unsigned char)*(str1++));
+ +        }
+ +        while ((ch1 == '-') || (ch1 == '_'));
+ +        do
+ +        {
+ +            ch2 = toupper((unsigned char)*(str2++));
+ +        }
+ +        while ((ch2 == '-') || (ch2 == '_'));
+ +
+ +        if (ch1 != ch2)
+ +        {
+ +            return (ch1-ch2);
+ +        }
+ +    }
+ +    while (ch1);
+ +    return 0;
+ +}
+ +
+ +/* Like gmx_strcasecmp_min(), but stops comparing once n characters
+ + * (including skipped '-' and '_') have been read from either string.
+ + * Returns 0 when no difference was found up to that point, otherwise the
+ + * difference of the first pair of upper-cased characters that differ.
+ + */
+ +int gmx_strncasecmp_min(const char *str1, const char *str2, int n)
+ +{
+ +    char        ch1, ch2;
+ +    const char *stri1, *stri2;
+ +
+ +    /* remember the start of both strings to measure how far we have read */
+ +    stri1 = str1;
+ +    stri2 = str2;
+ +    do
+ +    {
+ +        /* the casts keep the toupper() argument in the range of
+ +         * unsigned char */
+ +        do
+ +        {
+ +            ch1 = toupper((unsigned char)*(str1++));
+ +        }
+ +        while ((ch1 == '-') || (ch1 == '_'));
+ +        do
+ +        {
+ +            ch2 = toupper((unsigned char)*(str2++));
+ +        }
+ +        while ((ch2 == '-') || (ch2 == '_'));
+ +
+ +        if (ch1 != ch2)
+ +        {
+ +            return (ch1-ch2);
+ +        }
+ +    }
+ +    while (ch1 && (str1-stri1 < n) && (str2-stri2 < n));
+ +    return 0;
+ +}
+ +
+ +/* Case-insensitive comparison of str1 and str2.
+ + * Returns 0 when the strings are equal, otherwise the difference of the
+ + * first pair of upper-cased characters that differ.
+ + */
+ +int gmx_strcasecmp(const char *str1, const char *str2)
+ +{
+ +    char ch1, ch2;
+ +
+ +    do
+ +    {
+ +        /* the casts keep the toupper() argument in the range of
+ +         * unsigned char */
+ +        ch1 = toupper((unsigned char)*(str1++));
+ +        ch2 = toupper((unsigned char)*(str2++));
+ +        if (ch1 != ch2)
+ +        {
+ +            return (ch1-ch2);
+ +        }
+ +    }
+ +    while (ch1);
+ +    return 0;
+ +}
+ +
+ +/* Case-insensitive comparison of at most the first n characters of str1
+ + * and str2. Returns 0 when they are equal (always for n == 0), otherwise
+ + * the difference of the first pair of upper-cased characters that differ.
+ + */
+ +int gmx_strncasecmp(const char *str1, const char *str2, int n)
+ +{
+ +    char ch1, ch2;
+ +
+ +    if (n == 0)
+ +    {
+ +        return 0;
+ +    }
+ +
+ +    do
+ +    {
+ +        /* the casts keep the toupper() argument in the range of
+ +         * unsigned char */
+ +        ch1 = toupper((unsigned char)*(str1++));
+ +        ch2 = toupper((unsigned char)*(str2++));
+ +        if (ch1 != ch2)
+ +        {
+ +            return (ch1-ch2);
+ +        }
+ +        n--;
+ +    }
+ +    while (ch1 && n);
+ +    return 0;
+ +}
+ +
+ +/* Returns a copy of src in memory newly allocated with snew().
+ + * The caller owns the copy (presumably to be released with sfree() -
+ + * TODO confirm). src must not be NULL, since strlen() is called on it.
+ + */
+ +char *gmx_strdup(const char *src)
+ +{
+ +    char *dest;
+ +
+ +    snew(dest, strlen(src)+1);
+ +    strcpy(dest, src);
+ +
+ +    return dest;
+ +}
+ +
+ +/* Returns a zero-terminated copy of at most the first n characters of src
+ + * in memory newly allocated with snew(). The caller owns the copy.
+ + * NOTE(review): a negative n is not checked for - confirm callers never
+ + * pass one.
+ + */
+ +char *
+ +gmx_strndup(const char *src, int n)
+ +{
+ +    int   len;
+ +    char *dest;
+ +
+ +    len = strlen(src);
+ +    if (len > n)
+ +    {
+ +        len = n;
+ +    }
+ +    snew(dest, len+1);
+ +    /* strncpy() does not terminate here when src is longer than len,
+ +     * so the terminating zero is written explicitly */
+ +    strncpy(dest, src, len);
+ +    dest[len] = 0;
+ +    return dest;
+ +}
+ +
+ +/* Magic initial hash value for Dan J. Bernstein's algorithm.
+ + * Do NOT use any other value unless you really know what you are doing.
+ + */
+ +const unsigned int
+ +    gmx_string_hash_init = 5381;
+ +
+ +
+ +/* Folds the string s into the running hash value hash_init and returns the
+ + * result. Characters are upper-cased first and only alphanumeric ones
+ + * contribute, so the hash ignores case and all other characters.
+ + */
+ +unsigned int
+ +gmx_string_hash_func(const char *s, unsigned int hash_init)
+ +{
+ +    int c;
+ +
+ +    /* the cast keeps the toupper() argument in the range of unsigned char,
+ +     * as <ctype.h> requires */
+ +    while ((c = toupper((unsigned char)*s++)) != '\0')
+ +    {
+ +        if (isalnum(c))
+ +        {
+ +            hash_init = ((hash_init << 5) + hash_init) ^ c;            /* (hash * 33) xor c */
+ +        }
+ +    }
+ +    return hash_init;
+ +}
+ +
+ +/* Matches str against pattern, in which '*' stands for any (possibly empty)
+ + * sequence of characters and '?' for exactly one character. The whole of
+ + * str has to be matched.
+ + * Returns 0 on a match and GMX_NO_WCMATCH otherwise. The function calls
+ + * itself for the part of the pattern that follows a '*'.
+ + */
+ +int
+ +gmx_wcmatch(const char *pattern, const char *str)
+ +{
+ +    while (*pattern)
+ +    {
+ +        if (*pattern == '*')
+ +        {
+ +            /* Skip multiple wildcards in a sequence */
+ +            while (*pattern == '*' || *pattern == '?')
+ +            {
+ +                ++pattern;
+ +                /* For ?, we need to check that there are characters left
+ +                 * in str. */
+ +                if (*pattern == '?')
+ +                {
+ +                    if (*str == 0)
+ +                    {
+ +                        return GMX_NO_WCMATCH;
+ +                    }
+ +                    else
+ +                    {
+ +                        ++str;
+ +                    }
+ +                }
+ +            }
+ +            /* If the pattern ends after the star, we have a match */
+ +            if (*pattern == 0)
+ +            {
+ +                return 0;
+ +            }
+ +            /* Match the rest against each possible suffix of str */
+ +            while (*str)
+ +            {
+ +                /* Only do the recursive call if the first character
+ +                 * matches. We don't have to worry about wildcards here,
+ +                 * since we have processed them above. */
+ +                if (*pattern == *str)
+ +                {
+ +                    int rc;
+ +                    /* Match the suffix, and return if a match or an error */
+ +                    rc = gmx_wcmatch(pattern, str);
+ +                    if (rc != GMX_NO_WCMATCH)
+ +                    {
+ +                        return rc;
+ +                    }
+ +                }
+ +                ++str;
+ +            }
+ +            /* If no suffix of str matches, we don't have a match */
+ +            return GMX_NO_WCMATCH;
+ +        }
+ +        else if ((*pattern == '?' && *str != 0) || *pattern == *str)
+ +        {
+ +            /* a single character matched, move on in both strings */
+ +            ++str;
+ +        }
+ +        else
+ +        {
+ +            return GMX_NO_WCMATCH;
+ +        }
+ +        ++pattern;
+ +    }
+ +    /* When the pattern runs out, we have a match if the string has ended. */
+ +    return (*str == 0) ? 0 : GMX_NO_WCMATCH;
+ +}
+ +/* Returns a newly allocated (snew) copy of buf, re-flowed for display:
+ + * a space is turned into a newline where the text would otherwise run past
+ + * line_width, and indent spaces are inserted after every newline (the ones
+ + * added here as well as those already present in buf, except a newline
+ + * that ends the string). The first line is indented only when bIndentFirst
+ + * is set. A word longer than line_width is never broken; it stays on the
+ + * current line. Once the first wrap with a non-zero indent has happened,
+ + * line_width is reduced by indent for the rest of the text.
+ + * NOTE(review): the caller presumably owns the result and releases it
+ + * with sfree - confirm against callers.
+ + */
+ +char *wrap_lines(const char *buf, int line_width, int indent, gmx_bool bIndentFirst)
+ +{
+ +    char    *b2;
+ +    int      i, i0, i2, j, b2len, lspace = 0, l2space = 0;
+ +    gmx_bool bFirst, bFitsOnLine;
+ +
+ +    /* characters are copied from buf to b2 with possible spaces changed
+ +     * into newlines and extra space added for indentation.
+ +     * i indexes buf (source buffer) and i2 indexes b2 (destination buffer)
+ +     * i0 points to the beginning of the current line (in buf, source)
+ +     * lspace and l2space point to the last space on the current line
+ +     * bFirst is set to prevent indentation of first line
+ +     * bFitsOnLine says if the first space occurred before line_width, if
+ +     * that is not the case, we have a word longer than line_width which
+ +     * will also not fit on the next line, so we might as well keep it on
+ +     * the current line (where it also won't fit, but looks better)
+ +     */
+ +
+ +    b2    = NULL;
+ +    b2len = strlen(buf)+1+indent;
+ +    snew(b2, b2len);
+ +    i0 = i2 = 0;
+ +    if (bIndentFirst)
+ +    {
+ +        for (i2 = 0; (i2 < indent); i2++)
+ +        {
+ +            b2[i2] = ' ';
+ +        }
+ +    }
+ +    bFirst = TRUE;
+ +    do
+ +    {
+ +        l2space = -1;
+ +        /* find the last space before end of line */
+ +        for (i = i0; ((i-i0 < line_width) || (l2space == -1)) && (buf[i]); i++)
+ +        {
+ +            b2[i2++] = buf[i];
+ +            /* remember the position of a space */
+ +            if (buf[i] == ' ')
+ +            {
+ +                lspace  = i;
+ +                l2space = i2-1;
+ +            }
+ +            /* if we have a newline before the line is full, reset counters */
+ +            if (buf[i] == '\n' && buf[i+1])
+ +            {
+ +                i0     = i+1;
+ +                b2len += indent;
+ +                srenew(b2, b2len);
+ +                /* add indentation after the newline */
+ +                for (j = 0; (j < indent); j++)
+ +                {
+ +                    b2[i2++] = ' ';
+ +                }
+ +            }
+ +        }
+ +        /* If we are at the last newline, copy it */
+ +        if (buf[i] == '\n' && !buf[i+1])
+ +        {
+ +            b2[i2++] = buf[i++];
+ +        }
+ +        /* if we're not at the end of the string */
+ +        if (buf[i])
+ +        {
+ +            /* check if one word does not fit on the line */
+ +            bFitsOnLine = (i-i0 <= line_width);
+ +            /* reset line counters to just after the space */
+ +            i0 = lspace+1;
+ +            i2 = l2space+1;
+ +            /* if the words fit on the line, and we're beyond the indentation part */
+ +            if ( (bFitsOnLine) && (l2space >= indent) )
+ +            {
+ +                /* start a new line */
+ +                b2[l2space] = '\n';
+ +                /* and add indentation */
+ +                if (indent)
+ +                {
+ +                    if (bFirst)
+ +                    {
+ +                        line_width -= indent;
+ +                        bFirst      = FALSE;
+ +                    }
+ +                    b2len += indent;
+ +                    srenew(b2, b2len);
+ +                    for (j = 0; (j < indent); j++)
+ +                    {
+ +                        b2[i2++] = ' ';
+ +                    }
+ +                    /* no extra spaces after indent; */
+ +                    while (buf[i0] == ' ')
+ +                    {
+ +                        i0++;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +    while (buf[i]);
+ +    b2[i2] = '\0';
+ +
+ +    return b2;
+ +}
+ +/* Splits str at every occurrence of sep and returns a NULL-terminated array
+ + * of newly allocated (snew) copies of the tokens. Leading, trailing and
+ + * repeated separators are skipped, so no empty tokens are produced.
+ + * Returns NULL when str is NULL; str itself is not modified.
+ + * NOTE(review): the caller presumably releases each token and the array
+ + * with sfree - confirm against callers.
+ + */
+ +char **split(char sep, const char *str)
+ +{
+ +    char **ptr = NULL;
+ +    int    n, nn, nptr = 0;
+ +
+ +    if (str == NULL)
+ +    {
+ +        return NULL;
+ +    }
+ +    nn = strlen(str);
+ +    for (n = 0; (n < nn); n++)
+ +    {
+ +        if (str[n] == sep)
+ +        {
+ +            nptr++;
+ +        }
+ +    }
+ +    snew(ptr, nptr+2); /* nptr separators give at most nptr+1 tokens, plus the NULL terminator */
+ +    nptr = 0;
+ +    while (*str != '\0')
+ +    {
+ +        while ((*str != '\0') && (*str == sep))
+ +        {
+ +            str++;
+ +        }
+ +        if (*str != '\0')
+ +        {
+ +            snew(ptr[nptr], 1+strlen(str)); /* sized for the whole remainder of str, an upper bound for this token */
+ +            n = 0;
+ +            while ((*str != '\0') && (*str != sep))
+ +            {
+ +                ptr[nptr][n] = *str;
+ +                str++;
+ +                n++;
+ +            }
+ +            ptr[nptr][n] = '\0';
+ +            nptr++;
+ +        }
+ +    }
+ +    ptr[nptr] = NULL;
+ +
+ +    return ptr;
+ +}
+ +
+ +
+ +/*! \brief Convert a decimal string to a gmx_large_int_t, strtol()-style.
+ + *
+ + *  Leading white space is skipped and an optional '-' is accepted,
+ + *  followed by decimal digits.
+ + *
+ + *  \param str     String to parse; may be NULL.
+ + *  \param endptr  If not NULL, receives a pointer to the first character
+ + *                 after the number. It is set to \p str when no digits
+ + *                 were found or on overflow, and to NULL when \p str is NULL.
+ + *
+ + *  \return The parsed value, or 0 when nothing could be converted. On
+ + *          overflow 0 is returned and errno is set to ERANGE.
+ + */
+ +gmx_large_int_t
+ +str_to_large_int_t(const char *str, char **endptr)
+ +{
+ +    int              sign = 1;
+ +    int              digit;
+ +    gmx_large_int_t  val  = 0;
+ +    const char      *p;
+ +    const char      *first_digit;
+ +
+ +    if (str == NULL)
+ +    {
+ +        if (endptr != NULL)
+ +        {
+ +            *endptr = NULL;
+ +        }
+ +        return 0;
+ +    }
+ +
+ +    /* Strip off initial white space. The <ctype.h> functions require a
+ +     * value representable as unsigned char, hence the casts. */
+ +    p = str;
+ +    while (isspace((unsigned char)*p))
+ +    {
+ +        p++;
+ +    }
+ +
+ +    if (*p == '-')
+ +    {
+ +        p++;
+ +        sign = -1;
+ +    }
+ +
+ +    first_digit = p;
+ +    while (isdigit((unsigned char)*p))
+ +    {
+ +        /* Important to add sign here, so we don't overflow in final
+ +         * multiplication. The digit is kept in an int: a plain char may be
+ +         * unsigned, which would turn a negated digit into a large positive
+ +         * number. */
+ +        digit = (*p - '0')*sign;
+ +        val   = val*10 + digit;
+ +        /* NOTE(review): this detects overflow after the fact and therefore
+ +         * relies on the arithmetic wrapping around; a pre-check against
+ +         * the limits of gmx_large_int_t would be stricter. */
+ +        if (digit != val%10)
+ +        {
+ +            /* Some sort of overflow has occurred, set endptr to original string */
+ +            if (endptr != NULL)
+ +            {
+ +                *endptr = (char *)str;
+ +            }
+ +            errno = ERANGE;
+ +            return 0;
+ +        }
+ +        p++;
+ +    }
+ +
+ +    /* Conform to ISO C99 - return original pointer if string does not
+ +     * contain a number (empty, white space only, or a lone sign). */
+ +    if (p == first_digit)
+ +    {
+ +        p = str;
+ +    }
+ +
+ +    if (endptr != NULL)
+ +    {
+ +        *endptr = (char *)p;
+ +    }
+ +
+ +    return val;
+ +}
+ +/* Tokenizer with strsep()-like semantics: returns the token that starts at
+ + * *stringp and ends at the first character that occurs in delim, overwrites
+ + * that delimiter with '\0' and advances *stringp to the character after it.
+ + * When no delimiter is left, the remainder of the string is returned and
+ + * *stringp is set to NULL. Returns NULL if *stringp is already NULL.
+ + * Adjacent delimiters yield empty tokens. The input string is modified.
+ + */
+ +char *gmx_strsep(char **stringp, const char *delim)
+ +{
+ +    char *ret;
+ +    int   len = strlen(delim);
+ +    int   i, j = 0;
+ +    int   found = 0;
+ +
+ +    if (!*stringp)
+ +    {
+ +        return NULL;
+ +    }
+ +    ret = *stringp;
+ +    do
+ +    {
+ +        if ( (*stringp)[j] == '\0')
+ +        {
+ +            found    = 1;
+ +            *stringp = NULL;
+ +            break;
+ +        }
+ +        for (i = 0; i < len; i++)
+ +        {
+ +            if ( (*stringp)[j] == delim[i])
+ +            {
+ +                (*stringp)[j] = '\0';
+ +                *stringp      = *stringp+j+1;
+ +                found         = 1;
+ +                break;
+ +            }
+ +        }
+ +        j++;
+ +    }
+ +    while (!found);
+ +
+ +    return ret;
+ +}
diff --cc src/gromacs/legacyheaders/gmx_detect_hardware.h

index 763de7b3ca4c226bec60ac29d853cb9df1ff7de6,0000000000000000000000000000000000000000..f2dcb107d74758d801660633c6b1a9d5b8351b8a

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/gmx_detect_hardware.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/gmx_detect_hardware.h
@@@ -1,50 -1,0 +1,68 @@@
- void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
-                          const t_commrec *cr,
-                          gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
-                          const char *gpu_id);
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + * This file is part of GROMACS.
+ + * Copyright (c) 2012-
+ + *
+ + * Written by the Gromacs development team under coordination of
+ + * David van der Spoel, Berk Hess, and Erik Lindahl.
+ + *
+ + * This library is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROup of MAchos and Cynical Suckers
+ + */
+ +
+ +#ifndef GMX_HARDWARE_DETECT_H
+ +#define GMX_HARDWARE_DETECT_H
+ +
+ +#include "types/hw_info.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +#if 0
+ +} /* fixes auto-indentation problems */
+ +#endif
+ +
++/* the init and consistency functions depend on commrec that may not be 
++   consistent in cuda because MPI types don't exist there.  */
++#ifndef __CUDACC__
++#include "types/commrec.h"
++/* return a pointer to a global hwinfo structure. */
++gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
++                                   gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
++                                   const char *gpu_id);
+ +
+ +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);
+ +
++/* Check the thread count + GPU assignment. This function must
++   either be run by all threads that persist (i.e. all tmpi threads),
++   or be run before they are created.  */
+ +void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
+ +                                      const t_commrec *cr, int ntmpi_requsted,
+ +                                      gmx_bool bUseGPU);
++#endif
++
++
++/* Check whether a GPU is shared among ranks, and return the number of shared
++   GPUs
++
++   gpu_info      = the GPU info struct
++
++   returns: The number of GPUs shared among ranks, or 0 */
++int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info);
++
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +
+ +#endif /* GMX_HARDWARE_DETECT_H */
diff --cc src/gromacs/legacyheaders/maths.h

index 854f10df33aa1c7c97c755fd1a5ac771b5ac4ac8,0000000000000000000000000000000000000000..fe5b18e6391842505f6b1db2c139b21e254bb219

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/maths.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/maths.h
@@@ -1,176 -1,0 +1,188 @@@
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gromacs Runs On Most of All Computer Systems
+ + */
+ +
+ +#ifndef _maths_h
+ +#define _maths_h
+ +
+ +#include <math.h>
+ +#include "types/simple.h"
+ +#include "typedefs.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +#ifndef M_PI
+ +#define M_PI        3.14159265358979323846
+ +#endif
+ +
+ +#ifndef M_PI_2
+ +#define M_PI_2      1.57079632679489661923
+ +#endif
+ +
+ +#ifndef M_2PI
+ +#define M_2PI       6.28318530717958647692
+ +#endif
+ +
+ +#ifndef M_SQRT2
+ +#define M_SQRT2 sqrt(2.0)
+ +#endif
+ +
+ +#ifndef M_1_PI
+ +#define M_1_PI      0.31830988618379067154
+ +#endif
+ +
+ +#ifndef M_FLOAT_1_SQRTPI /* used in CUDA kernels */
+ +/* 1.0 / sqrt(M_PI) */
+ +#define M_FLOAT_1_SQRTPI 0.564189583547756f
+ +#endif
+ +
+ +#ifndef M_1_SQRTPI
+ +/* 1.0 / sqrt(M_PI) */
+ +#define M_1_SQRTPI 0.564189583547756
+ +#endif
+ +
+ +#ifndef M_2_SQRTPI
+ +/* 2.0 / sqrt(M_PI) */
+ +#define M_2_SQRTPI  1.128379167095513
+ +#endif
+ +
+ +int     gmx_nint(real a);
+ +real    sign(real x, real y);
+ +
+ +real    cuberoot (real a);
+ +double  gmx_erfd(double x);
+ +double  gmx_erfcd(double x);
+ +float   gmx_erff(float x);
+ +float   gmx_erfcf(float x);
+ +#ifdef GMX_DOUBLE
+ +#define gmx_erf(x)   gmx_erfd(x)
+ +#define gmx_erfc(x)  gmx_erfcd(x)
+ +#else
+ +#define gmx_erf(x)   gmx_erff(x)
+ +#define gmx_erfc(x)  gmx_erfcf(x)
+ +#endif
+ +
+ +gmx_bool gmx_isfinite(real x);
+ +
+ +/*! \brief Check if two numbers are within a tolerance
+ + *
+ + *  This routine checks if the relative difference between two numbers is
+ + *  approximately within the given tolerance, defined as
+ + *  fabs(f1-f2) <= tol*0.5*(fabs(f1)+fabs(f2)),
+ + *  i.e. relative to the mean magnitude of the two numbers.
+ + *
+ + *  Because the test is purely relative, comparing against exactly 0.0
+ + *  only succeeds when the other number is 0.0 as well (for tol < 2).
+ + *
+ + *  To check if two floating-point numbers are almost identical, use this routine
+ + *  with the tolerance GMX_REAL_EPS, or GMX_DOUBLE_EPS if the check should be
+ + *  done in double regardless of Gromacs precision.
+ + *
+ + *  To check if two algorithms produce similar results you will normally need
+ + *  to relax the tolerance significantly since many operations (e.g. summation)
+ + *  accumulate floating point errors.
+ + *
+ + *  \param f1  First number to compare
+ + *  \param f2  Second number to compare
+ + *  \param tol Tolerance to use
+ + *
+ + *  \return 1 if the relative difference is within tolerance, 0 if not.
+ + */
+ +static int
+ +gmx_within_tol(double   f1,
+ +               double   f2,
+ +               double   tol)
+ +{
+ +    /* The or-equal is important - otherwise we return false if f1==f2==0 */
+ +    if (fabs(f1-f2) <= tol*0.5*(fabs(f1)+fabs(f2)) )
+ +    {
+ +        return 1;
+ +    }
+ +    else
+ +    {
+ +        return 0;
+ +    }
+ +}
+ +
+ +
+ +
+ +/**
+ + * Check if the magnitude of a number is smaller than some preset safe
+ + * minimum value, currently defined as GMX_REAL_MIN/GMX_REAL_EPS.
+ + *
+ + * If a number is smaller than this value we risk numerical overflow
+ + * if any number larger than 1.0/GMX_REAL_EPS is divided by it.
+ + *
+ + * The comparison is absolute: a relative test against 0.0 (as done by
+ + * gmx_within_tol) would only accept a value that is exactly zero.
+ + *
+ + * \param a  Number to check
+ + *
+ + * \return 1  if 'almost' numerically zero, 0 otherwise.
+ + */
+ +static int
+ +gmx_numzero(double a)
+ +{
+ +    return (fabs(a) <= GMX_REAL_MIN/GMX_REAL_EPS) ? 1 : 0;
+ +}
+ +
+ +/* Base-2 logarithm of x, computed as log(x) multiplied by 1/log(2). */
+ +static real
+ +gmx_log2(real x)
+ +{
+ +    const real iclog2 = 1.0/log( 2.0 );
+ +
+ +    return log( x ) * iclog2;
+ +}
+ +
+ +/*! \brief Multiply two large ints
+ + *
+ + *  Returns true when overflow did not occur.
+ + */
+ +gmx_bool
+ +check_int_multiply_for_overflow(gmx_large_int_t  a,
+ +                                gmx_large_int_t  b,
+ +                                gmx_large_int_t *result);
+ +/* Euclidean algorithm: greatest common divisor of p and q (p itself when q == 0). */
++static int gmx_greatest_common_divisor(int p, int q)
++{
++    int tmp;
++    while (q != 0)
++    {
++        tmp = q;
++        q = p % q;
++        p = tmp;
++    }
++    return p;
++}
++
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif  /* _maths_h */
diff --cc src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h

index ed3bae4ef27296b67fe8636054267b081851667c,0000000000000000000000000000000000000000..03868818f58259cde916d5fa680ef63c97951d3c

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h
@@@ -1,135 -1,0 +1,146 @@@
-                      gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef NBNXN_CUDA_DATA_MGMT_H
+ +#define NBNXN_CUDA_DATA_MGMT_H
+ +
+ +#include "types/simple.h"
+ +#include "types/interaction_const.h"
+ +#include "types/nbnxn_cuda_types_ext.h"
+ +#include "types/hw_info.h"
+ +
+ +#ifdef GMX_GPU
+ +#define FUNC_TERM ;
+ +#define FUNC_QUALIFIER
+ +#else
+ +#define FUNC_TERM {}
+ +#define FUNC_QUALIFIER static
+ +#endif
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +/*! Initializes the data structures related to CUDA nonbonded calculations. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_init(FILE *fplog,
+ +                     nbnxn_cuda_ptr_t *p_cu_nb,
++                     const gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ +                     /* true if both local and non-local are done on GPU */
+ +                     gmx_bool bLocalAndNonlocal) FUNC_TERM
+ +
+ +/*! Initializes simulation constant data. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
+ +                           const interaction_const_t      *ic,
+ +                           const nonbonded_verlet_group_t *nbv_group) FUNC_TERM
+ +
+ +/*! Initializes pair-list data for GPU, called at every pair search step. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
+ +                              const nbnxn_pairlist_t *h_nblist,
+ +                              int                     iloc) FUNC_TERM
+ +
+ +/*! Initializes atom-data on the GPU, called at every pair search step. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
+ +                              const nbnxn_atomdata_t *atomdata) FUNC_TERM
+ +
+ +/*! \brief Update parameters during PP-PME load balancing. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
+ +                                         const interaction_const_t *ic) FUNC_TERM
+ +
+ +/*! Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
+ +                                const nbnxn_atomdata_t *nbatom) FUNC_TERM
+ +
+ +/*! Clears GPU outputs: nonbonded force, shift force and energy. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb,
+ +                              int              flags) FUNC_TERM
+ +
+ +/*! Frees all GPU resources used for the nonbonded calculations. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_free(FILE            *fplog,
+ +                     nbnxn_cuda_ptr_t cu_nb) FUNC_TERM
+ +
+ +/*! Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
+ +FUNC_QUALIFIER
+ +wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+ +#ifdef GMX_GPU
+ +;
+ +#else
+ +{
+ +    return NULL;
+ +}
+ +#endif
+ +
+ +/*! Resets nonbonded GPU timings. */
+ +FUNC_QUALIFIER
+ +void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb) FUNC_TERM
+ +
+ +/*! Calculates the minimum size of proximity lists to improve SM load balance
+ +    with CUDA non-bonded kernels. */
+ +FUNC_QUALIFIER
+ +int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+ +#ifdef GMX_GPU
+ +;
+ +#else
+ +{
+ +    return -1;
+ +}
+ +#endif
+ +
++/*! Returns if analytical Ewald CUDA kernels are used. */
++FUNC_QUALIFIER
++gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
++#ifdef GMX_GPU
++;
++#else
++{
++    return FALSE;
++}
++#endif
++
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#undef FUNC_TERM
+ +#undef FUNC_QUALIFIER
+ +
+ +#endif /* NBNXN_CUDA_DATA_MGMT_H */
diff --cc src/gromacs/legacyheaders/thread_mpi/atomic.h
Simple merge
diff --cc src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h
Simple merge
diff --cc src/gromacs/legacyheaders/types/enums.h

index 80c85ef065e2a6347d0d9c764ab820df4c17daad,0000000000000000000000000000000000000000..6187c927f480b56446b7dcea3c141781c78ae1d2

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/types/enums.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/types/enums.h
@@@ -1,449 -1,0 +1,449 @@@
- #define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMESWITCH))
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GRoups of Organic Molecules in ACtion for Science
+ + */
+ +
+ +#ifndef ENUMS_H_
+ +#define ENUMS_H_
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +#if 0
+ +} /* fixes auto-indentation problems */
+ +#endif
+ +
+ +/* note: these enums should correspond to the names in gmxlib/names.c */
+ +
+ +enum {
+ +    epbcXYZ, epbcNONE, epbcXY, epbcSCREW, epbcNR
+ +};
+ +
+ +enum {
+ +    etcNO, etcBERENDSEN, etcNOSEHOOVER, etcYES, etcANDERSEN, etcANDERSENMASSIVE, etcVRESCALE, etcNR
+ +}; /* yes is an alias for berendsen */
+ +
+ +#define ETC_ANDERSEN(e) (((e) == etcANDERSENMASSIVE) || ((e) == etcANDERSEN))
+ +
+ +enum {
+ +    epcNO, epcBERENDSEN, epcPARRINELLORAHMAN, epcISOTROPIC, epcMTTK, epcNR
+ +}; /* isotropic is an alias for berendsen */
+ +
+ +/* trotter decomposition extended variable parts */
+ +enum {
+ +    etrtNONE, etrtNHC, etrtBAROV, etrtBARONHC, etrtNHC2, etrtBAROV2, etrtBARONHC2,
+ +    etrtVELOCITY1, etrtVELOCITY2, etrtPOSITION, etrtSKIPALL, etrtNR
+ +};
+ +
+ +/* sequenced parts of the trotter decomposition */
+ +enum {
+ +    ettTSEQ0,  ettTSEQ1,  ettTSEQ2,  ettTSEQ3,  ettTSEQ4, ettTSEQMAX
+ +};
+ +
+ +enum {
+ +    epctISOTROPIC, epctSEMIISOTROPIC, epctANISOTROPIC,
+ +    epctSURFACETENSION, epctNR
+ +};
+ +
+ +enum {
+ +    erscNO, erscALL, erscCOM, erscNR
+ +};
+ +
+ +enum {
+ +    ecutsGROUP, ecutsVERLET, ecutsNR
+ +};
+ +
+ +/* Coulomb / VdW interaction modifiers.
+ + * grompp replaces eintmodPOTSHIFT_VERLET by eintmodPOTSHIFT or eintmodNONE.
+ + * Exactcutoff is only used by Reaction-field-zero, and is not user-selectable.
+ + */
+ +enum eintmod {
+ +    eintmodPOTSHIFT_VERLET, eintmodPOTSHIFT, eintmodNONE, eintmodPOTSWITCH, eintmodEXACTCUTOFF, eintmodNR
+ +};
+ +
+ +/*
+ + * eelNOTUSED1 used to be GB, but to enable generalized born with different
+ + * forms of electrostatics (RF, switch, etc.) in the future it is now selected
+ + * separately (through the implicit_solvent option).
+ + */
+ +enum {
+ +    eelCUT,     eelRF,     eelGRF,   eelPME,  eelEWALD,  eelP3M_AD,
+ +    eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB_NOTUSED, eelRF_NEC, eelENCADSHIFT,
+ +    eelPMEUSER, eelPMESWITCH, eelPMEUSERSWITCH, eelRF_ZERO, eelNR
+ +};
+ +
+ +/* Ewald geometry */
+ +enum {
+ +    eewg3D, eewg3DC, eewgNR
+ +};
+ +
+ +#define EEL_RF(e) ((e) == eelRF || (e) == eelGRF || (e) == eelRF_NEC || (e) == eelRF_ZERO )
+ +
+ +#define EEL_PME(e)  ((e) == eelPME || (e) == eelPMESWITCH || (e) == eelPMEUSER || (e) == eelPMEUSERSWITCH || (e) == eelP3M_AD)
+ +#define EEL_FULL(e) (EEL_PME(e) || (e) == eelPOISSON || (e) == eelEWALD)
+ +
+ +#define EEL_SWITCHED(e) ((e) == eelSWITCH || (e) == eelSHIFT || (e) == eelENCADSHIFT || (e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
+ +
++#define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMEUSERSWITCH))
+ +
+ +#define EEL_IS_ZERO_AT_CUTOFF(e) (EEL_SWITCHED(e) || (e) == eelRF_ZERO)
+ +
+ +#define EEL_MIGHT_BE_ZERO_AT_CUTOFF(e) (EEL_IS_ZERO_AT_CUTOFF(e) || (e) == eelUSER || (e) == eelPMEUSER)
+ +
+ +enum {
+ +    evdwCUT, evdwSWITCH, evdwSHIFT, evdwUSER, evdwENCADSHIFT, evdwNR
+ +};
+ +
+ +#define EVDW_SWITCHED(e) ((e) == evdwSWITCH || (e) == evdwSHIFT || (e) == evdwENCADSHIFT)
+ +
+ +#define EVDW_IS_ZERO_AT_CUTOFF(e) EVDW_SWITCHED(e)
+ +
+ +#define EVDW_MIGHT_BE_ZERO_AT_CUTOFF(e) (EVDW_IS_ZERO_AT_CUTOFF(e) || (e) == evdwUSER)
+ +
+ +enum {
+ +    ensGRID, ensSIMPLE, ensNR
+ +};
+ +
+ +/* eiVV is normal velocity verlet -- eiVVAK uses 1/2*(KE(t-dt/2)+KE(t+dt/2)) as the kinetic energy, and the half step kinetic
+ +   energy for temperature control */
+ +
+ +enum {
+ +    eiMD, eiSteep, eiCG, eiBD, eiSD2, eiNM, eiLBFGS, eiTPI, eiTPIC, eiSD1, eiVV, eiVVAK, eiNR
+ +};
+ +#define EI_VV(e) ((e) == eiVV || (e) == eiVVAK)
+ +#define EI_MD(e) ((e) == eiMD || EI_VV(e))
+ +#define EI_SD(e) ((e) == eiSD1 || (e) == eiSD2)
+ +#define EI_RANDOM(e) (EI_SD(e) || (e) == eiBD)
+ +/*above integrators may not conserve momenta*/
+ +#define EI_DYNAMICS(e) (EI_MD(e) || EI_SD(e) || (e) == eiBD)
+ +#define EI_ENERGY_MINIMIZATION(e) ((e) == eiSteep || (e) == eiCG || (e) == eiLBFGS)
+ +#define EI_TPI(e) ((e) == eiTPI || (e) == eiTPIC)
+ +
+ +#define EI_STATE_VELOCITY(e) (EI_MD(e) || EI_SD(e))
+ +
+ +enum {
+ +    econtLINCS, econtSHAKE, econtNR
+ +};
+ +
+ +enum {
+ +    edrNone, edrSimple, edrEnsemble, edrNR
+ +};
+ +
+ +enum {
+ +    edrwConservative, edrwEqual, edrwNR
+ +};
+ +
+ +/* Combination rule things */
+ +enum {
+ +    eCOMB_NONE, eCOMB_GEOMETRIC, eCOMB_ARITHMETIC, eCOMB_GEOM_SIG_EPS, eCOMB_NR
+ +};
+ +
+ +/* NBF selection */
+ +enum {
+ +    eNBF_NONE, eNBF_LJ, eNBF_BHAM, eNBF_NR
+ +};
+ +
+ +/* simulated tempering methods */
+ +enum {
+ +    esimtempGEOMETRIC, esimtempEXPONENTIAL, esimtempLINEAR, esimtempNR
+ +};
+ +/* FEP selection */
+ +enum {
+ +    efepNO, efepYES, efepSTATIC, efepSLOWGROWTH, efepEXPANDED, efepNR
+ +};
+ +/* if efepNO, there are no evaluations at other states.
+ +   if efepYES, treated equivalently to efepSTATIC.
+ +   if efepSTATIC, then lambdas do not change during the simulation.
+ +   if efepSLOWGROWTH, then the states change monotonically throughout the simulation.
+ +   if efepEXPANDED, then expanded ensemble simulations are occurring.
+ + */
+ +
+ +/* FEP coupling types */
+ +enum {
+ +    efptFEP, efptMASS, efptCOUL, efptVDW, efptBONDED, efptRESTRAINT, efptTEMPERATURE, efptNR
+ +};
+ +
+ +/* How the lambda weights are calculated:
+ +   elamstatsMETROPOLIS = using the metropolis criteria
+ +   elamstatsBARKER = using the Barker criteria for transition weights - also called unoptimized Bennett
+ +   elamstatsMINVAR = using Barker + minimum variance for weights
+ +   elamstatsWL = Wang-Landau (using visitation counts)
+ +   elamstatsWWL = Weighted Wang-Landau (using optimized gibbs weighted visitation counts)
+ + */
+ +enum {
+ +    elamstatsNO, elamstatsMETROPOLIS, elamstatsBARKER, elamstatsMINVAR, elamstatsWL, elamstatsWWL, elamstatsNR
+ +};
+ +
+ +#define ELAMSTATS_EXPANDED(e) ((e) > elamstatsNO)
+ +
+ +#define EWL(e) ((e) == elamstatsWL || (e) == elamstatsWWL)
+ +
+ +/* How moves in lambda are calculated:
+ +   elmcmoveMETROPOLIS - using the Metropolis criteria, and 50% up and down
+ +   elmcmoveBARKER - using the Barker criteria, and 50% up and down
+ +   elmcmoveGIBBS - computing the transition using the marginalized probabilities of the lambdas
+ +   elmcmoveMETGIBBS - computing the transition using the metropolized version of Gibbs (Monte Carlo Strategies in Scientific computing, Liu, p. 134)
+ + */
+ +enum {
+ +    elmcmoveNO, elmcmoveMETROPOLIS, elmcmoveBARKER, elmcmoveGIBBS, elmcmoveMETGIBBS, elmcmoveNR
+ +};
+ +
+ +/* how we decide whether weights have reached equilibrium
+ +   elmceqNO - never stop, weights keep going
+ +   elmceqYES - fix the weights from the beginning; no movement
+ +   elmceqWLDELTA - stop when the WL-delta falls below a certain level
+ +   elmceqNUMATLAM - stop when we have a certain number of samples at every step
+ +   elmceqSTEPS - stop when we've run a certain total number of steps
+ +   elmceqSAMPLES - stop when we've run a certain total number of samples
+ +   elmceqRATIO - stop when the ratio of samples (lowest to highest) is sufficiently large
+ + */
+ +enum {
+ +    elmceqNO, elmceqYES, elmceqWLDELTA, elmceqNUMATLAM, elmceqSTEPS, elmceqSAMPLES, elmceqRATIO, elmceqNR
+ +};
+ +
+ +/* separate_dhdl_file selection */
+ +enum
+ +{
+ +    /* NOTE: YES is the first one. Do NOT interpret this one as a gmx_bool */
+ +    esepdhdlfileYES, esepdhdlfileNO, esepdhdlfileNR
+ +};
+ +
+ +/* dhdl_derivatives selection */
+ +enum
+ +{
+ +    /* NOTE: YES is the first one. Do NOT interpret this one as a gmx_bool */
+ +    edhdlderivativesYES, edhdlderivativesNO, edhdlderivativesNR
+ +};
+ +
+ +/* Solvent model */
+ +enum {
+ +    esolNO, esolSPC, esolTIP4P, esolNR
+ +};
+ +
+ +/* Dispersion correction */
+ +enum {
+ +    edispcNO, edispcEnerPres, edispcEner, edispcAllEnerPres, edispcAllEner, edispcNR
+ +};
+ +
+ +/* Shell types, for completion stuff */
+ +enum {
+ +    eshellCSH, eshellBASH, eshellZSH, eshellNR
+ +};
+ +
+ +/* Center of mass motion selection */
+ +enum {
+ +    ecmLINEAR, ecmANGULAR, ecmNO, ecmNR
+ +};
+ +
+ +/* New version of simulated annealing */
+ +enum {
+ +    eannNO, eannSINGLE, eannPERIODIC, eannNR
+ +};
+ +
+ +/* Implicit solvent algorithms */
+ +enum {
+ +    eisNO, eisGBSA, eisNR
+ +};
+ +
+ +/* Algorithms for calculating GB radii */
+ +enum {
+ +    egbSTILL, egbHCT, egbOBC, egbNR
+ +};
+ +
+ +enum {
+ +    esaAPPROX, esaNO, esaSTILL, esaNR
+ +};
+ +
+ +/* Wall types */
+ +enum {
+ +    ewt93, ewt104, ewtTABLE, ewt126, ewtNR
+ +};
+ +
+ +/* Pull stuff */
+ +enum {
+ +    epullNO, epullUMBRELLA, epullCONSTRAINT, epullCONST_F, epullNR
+ +};
+ +
+ +enum {
+ +    epullgDIST, epullgDIR, epullgCYL, epullgPOS, epullgDIRPBC, epullgNR
+ +};
+ +
+ +#define PULL_CYL(pull) ((pull)->eGeom == epullgCYL)
+ +
+ +/* Enforced rotation groups */
+ +enum {
+ +    erotgISO, erotgISOPF,
+ +    erotgPM, erotgPMPF,
+ +    erotgRM, erotgRMPF,
+ +    erotgRM2, erotgRM2PF,
+ +    erotgFLEX, erotgFLEXT,
+ +    erotgFLEX2, erotgFLEX2T,
+ +    erotgNR
+ +};
+ +
+ +enum {
+ +    erotgFitRMSD, erotgFitNORM, erotgFitPOT, erotgFitNR
+ +};
+ +
+ +/* QMMM */
+ +enum {
+ +    eQMmethodAM1, eQMmethodPM3, eQMmethodRHF,
+ +    eQMmethodUHF, eQMmethodDFT, eQMmethodB3LYP, eQMmethodMP2, eQMmethodCASSCF, eQMmethodB3LYPLAN,
+ +    eQMmethodDIRECT, eQMmethodNR
+ +};
+ +
+ +enum {
+ +    eQMbasisSTO3G, eQMbasisSTO3G2, eQMbasis321G,
+ +    eQMbasis321Gp, eQMbasis321dGp, eQMbasis621G,
+ +    eQMbasis631G, eQMbasis631Gp, eQMbasis631dGp,
+ +    eQMbasis6311G, eQMbasisNR
+ +};
+ +
+ +enum {
+ +    eQMMMschemenormal, eQMMMschemeoniom, eQMMMschemeNR
+ +};
+ +
+ +enum {
+ +    eMultentOptName, eMultentOptNo, eMultentOptLast, eMultentOptNR
+ +};
+ +
+ +/* flat-bottom posres geometries */
+ +enum {
+ +    efbposresZERO, efbposresSPHERE, efbposresCYLINDER, efbposresX, efbposresY, efbposresZ,
+ +    efbposresNR
+ +};
+ +
+ +enum {
+ +    eAdressOff, eAdressConst, eAdressXSplit, eAdressSphere, eAdressNR
+ +};
+ +
+ +enum {
+ +    eAdressICOff, eAdressICThermoForce, eAdressICNR
+ +};
+ +
+ +enum {
+ +    eAdressSITEcom, eAdressSITEcog, eAdressSITEatom, eAdressSITEatomatom, eAdressSITENR
+ +};
+ +
+ +
+ +/* The interactions contained in a (possibly merged) table
+ + * for computing electrostatic, VDW repulsion and/or VDW dispersion
+ + * contributions.
+ + */
+ +enum gmx_table_interaction
+ +{
+ +    GMX_TABLE_INTERACTION_ELEC,
+ +    GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
+ +    GMX_TABLE_INTERACTION_VDWEXPREP_VDWDISP,
+ +    GMX_TABLE_INTERACTION_VDWDISP,
+ +    GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP,
+ +    GMX_TABLE_INTERACTION_ELEC_VDWEXPREP_VDWDISP,
+ +    GMX_TABLE_INTERACTION_ELEC_VDWDISP,
+ +    GMX_TABLE_INTERACTION_NR
+ +};
+ +
+ +/* Different formats for table data. Cubic spline tables are typically stored
+ + * with the four Y,F,G,H intermediate values (check tables.c for format), which
+ + * makes it easy to load with a single 4-way SIMD instruction too.
+ + * Linear tables only need one value per table point, or two if both V and F
+ + * are calculated. However, with SIMD instructions this makes the loads unaligned,
+ + * and in that case we store the data as F, D=F(i+1)-F(i), V, and then a blank value,
+ + * which again makes it possible to load as a single instruction.
+ + */
+ +enum gmx_table_format
+ +{
+ +    GMX_TABLE_FORMAT_CUBICSPLINE_YFGH,
+ +    GMX_TABLE_FORMAT_LINEAR_VF,
+ +    GMX_TABLE_FORMAT_LINEAR_V,
+ +    GMX_TABLE_FORMAT_LINEAR_F,
+ +    GMX_TABLE_FORMAT_LINEAR_FDV0,
+ +    GMX_TABLE_FORMAT_NR
+ +};
+ +
+ +/* Neighborlist geometry type.
+ + * Kernels will compute interactions between two particles,
+ + * 3-center water, 4-center water or coarse-grained beads.
+ + */
+ +enum gmx_nblist_kernel_geometry
+ +{
+ +    GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE,
+ +    GMX_NBLIST_GEOMETRY_WATER3_PARTICLE,
+ +    GMX_NBLIST_GEOMETRY_WATER3_WATER3,
+ +    GMX_NBLIST_GEOMETRY_WATER4_PARTICLE,
+ +    GMX_NBLIST_GEOMETRY_WATER4_WATER4,
+ +    GMX_NBLIST_GEOMETRY_CG_CG,
+ +    GMX_NBLIST_GEOMETRY_NR
+ +};
+ +
+ +/* Types of electrostatics calculations available inside nonbonded kernels.
+ + * Note that these do NOT necessarily correspond to the user selections in the MDP file;
+ + * many interactions for instance map to tabulated kernels.
+ + */
+ +enum gmx_nbkernel_elec
+ +{
+ +    GMX_NBKERNEL_ELEC_NONE,
+ +    GMX_NBKERNEL_ELEC_COULOMB,
+ +    GMX_NBKERNEL_ELEC_REACTIONFIELD,
+ +    GMX_NBKERNEL_ELEC_CUBICSPLINETABLE,
+ +    GMX_NBKERNEL_ELEC_GENERALIZEDBORN,
+ +    GMX_NBKERNEL_ELEC_EWALD,
+ +    GMX_NBKERNEL_ELEC_NR
+ +};
+ +
+ +/* Types of vdw calculations available inside nonbonded kernels.
+ + * Note that these do NOT necessarily correspond to the user selections in the MDP file;
+ + * many interactions for instance map to tabulated kernels.
+ + */
+ +enum gmx_nbkernel_vdw
+ +{
+ +    GMX_NBKERNEL_VDW_NONE,
+ +    GMX_NBKERNEL_VDW_LENNARDJONES,
+ +    GMX_NBKERNEL_VDW_BUCKINGHAM,
+ +    GMX_NBKERNEL_VDW_CUBICSPLINETABLE,
+ +    GMX_NBKERNEL_VDW_NR
+ +};
+ +/* Types of interactions inside the neighborlist
+ + */
+ +enum gmx_nblist_interaction_type
+ +{
+ +    GMX_NBLIST_INTERACTION_STANDARD,
+ +    GMX_NBLIST_INTERACTION_FREE_ENERGY,
+ +    GMX_NBLIST_INTERACTION_ADRESS,
+ +    GMX_NBLIST_INTERACTION_NR
+ +};
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif /* ENUMS_H_ */
diff --cc src/gromacs/legacyheaders/types/forcerec.h

index d920cc493af6faca086df6b89303c2028948a9fe,0000000000000000000000000000000000000000..15291dac15d6c38b69e72d21864391ae2a7a4fbf

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/types/forcerec.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/types/forcerec.h
@@@ -1,483 -1,0 +1,483 @@@
-     int            ePBC;
-     gmx_bool       bMolPBC;
-     int            rc_scaling;
-     rvec           posres_com;
-     rvec           posres_comB;
- 
-     gmx_hw_info_t *hwinfo;
-     gmx_bool       use_cpu_acceleration;
+ +/*
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GRoups of Organic Molecules in ACtion for Science
+ + */
+ +
+ +#include "ns.h"
+ +#include "genborn.h"
+ +#include "qmmmrec.h"
+ +#include "idef.h"
+ +#include "nb_verlet.h"
+ +#include "interaction_const.h"
+ +#include "hw_info.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +#if 0
+ +} /* fixes auto-indentation problems */
+ +#endif
+ +
+ +/* Abstract type for PME that is defined only in the routines that use it. */
+ +typedef struct gmx_pme *gmx_pme_t;
+ +
+ +
+ +
+ +/* Structure describing the data in a single table */
+ +typedef struct
+ +{
+ +    enum gmx_table_interaction  interaction; /* Types of interactions stored in this table */
+ +    enum gmx_table_format       format;      /* Interpolation type and data format */
+ +
+ +    real                        r;           /* range of the table */
+ +    int                         n;           /* n+1 is the number of table points */
+ +    real                        scale;       /* distance (nm) between two table points */
+ +    real                        scale_exp;   /* distance for exponential part of VdW table, not always used */
+ +    real *                      data;        /* the actual table data */
+ +
+ +    /* Some information about the table layout. This can also be derived from the interpolation
+ +     * type and the table interactions, but it is convenient to have here for sanity checks, and it makes it
+ +     * much easier to access the tables in the nonbonded kernels when we can set the data from variables.
+ +     * It is always true that stride = formatsize*ninteractions
+ +     */
+ +    int                         formatsize;    /* Number of fp variables for each table point (1 for F, 2 for VF, 4 for YFGH, etc.) */
+ +    int                         ninteractions; /* Number of interactions in table, 1 for coul-only, 3 for coul+rep+disp. */
+ +    int                         stride;        /* Distance to next table point (number of fp variables per table point in total) */
+ +} t_forcetable;
+ +
+ +typedef struct
+ +{
+ +    t_forcetable   table_elec;
+ +    t_forcetable   table_vdw;
+ +    t_forcetable   table_elec_vdw;
+ +
+ +    /* The actual neighbor lists, short and long range, see enum above
+ +     * for definition of neighborlist indices.
+ +     */
+ +    t_nblist nlist_sr[eNL_NR];
+ +    t_nblist nlist_lr[eNL_NR];
+ +} t_nblists;
+ +
+ +/* macros for the cginfo data in forcerec */
+ +/* Bit layout of one cginfo entry, as encoded by the macros below:
+ + *   bits  0-15  GID         (mask 65535)
+ + *   bit   16    EXCL_INTRA
+ + *   bit   17    EXCL_INTER
+ + *   bits 18-19  SOLOPT      (2-bit value)
+ + *   bit   20    CONSTR
+ + *   bit   21    SETTLE
+ + *   bit   22    BOND_INTER
+ + *   bit   23    HAS_VDW
+ + *   bit   24    HAS_Q
+ + *   bits 25-30  NATOMS      (6-bit value)
+ + *
+ + * The SET_* macros expand to an assignment to their cgi argument, so cgi
+ + * must be a modifiable lvalue and the macro should be used as a statement.
+ + * SET_CGINFO_GID, SET_CGINFO_SOLOPT and SET_CGINFO_NATOMS clear the old
+ + * field but do not mask the new value: the caller must pass a value that
+ + * fits in the field, otherwise neighbouring bits are overwritten.
+ + * The single-bit GET_* macros return the masked bit (0 or 1<<k), not 0/1;
+ + * GET_CGINFO_SOLOPT and GET_CGINFO_NATOMS return the field shifted down.
+ + *
+ + * The maximum cg size in cginfo is 63
+ + * because we only have space for 6 bits in cginfo,
+ + * this cg size entry is actually only read with domain decomposition.
+ + * But there is a smaller limit due to the t_excl data structure
+ + * which is defined in nblist.h.
+ + */
+ +#define SET_CGINFO_GID(cgi, gid)      (cgi) = (((cgi)  &  ~65535)  |  (gid)   )
+ +#define GET_CGINFO_GID(cgi)        ( (cgi)            &   65535)
+ +#define SET_CGINFO_EXCL_INTRA(cgi)   (cgi) =  ((cgi)  |  (1<<16))
+ +#define GET_CGINFO_EXCL_INTRA(cgi) ( (cgi)            &  (1<<16))
+ +#define SET_CGINFO_EXCL_INTER(cgi)   (cgi) =  ((cgi)  |  (1<<17))
+ +#define GET_CGINFO_EXCL_INTER(cgi) ( (cgi)            &  (1<<17))
+ +#define SET_CGINFO_SOLOPT(cgi, opt)   (cgi) = (((cgi)  & ~(3<<18)) | ((opt)<<18))
+ +#define GET_CGINFO_SOLOPT(cgi)     (((cgi)>>18)       &   3)
+ +#define SET_CGINFO_CONSTR(cgi)       (cgi) =  ((cgi)  |  (1<<20))
+ +#define GET_CGINFO_CONSTR(cgi)     ( (cgi)            &  (1<<20))
+ +#define SET_CGINFO_SETTLE(cgi)       (cgi) =  ((cgi)  |  (1<<21))
+ +#define GET_CGINFO_SETTLE(cgi)     ( (cgi)            &  (1<<21))
+ +/* This bit is only used with bBondComm in the domain decomposition */
+ +#define SET_CGINFO_BOND_INTER(cgi)   (cgi) =  ((cgi)  |  (1<<22))
+ +#define GET_CGINFO_BOND_INTER(cgi) ( (cgi)            &  (1<<22))
+ +#define SET_CGINFO_HAS_VDW(cgi)      (cgi) =  ((cgi)  |  (1<<23))
+ +#define GET_CGINFO_HAS_VDW(cgi)    ( (cgi)            &  (1<<23))
+ +#define SET_CGINFO_HAS_Q(cgi)        (cgi) =  ((cgi)  |  (1<<24))
+ +#define GET_CGINFO_HAS_Q(cgi)      ( (cgi)            &  (1<<24))
+ +#define SET_CGINFO_NATOMS(cgi, opt)   (cgi) = (((cgi)  & ~(63<<25)) | ((opt)<<25))
+ +#define GET_CGINFO_NATOMS(cgi)     (((cgi)>>25)       &   63)
+ +
+ +
+ +/* Value to be used in mdrun for an infinite cut-off.
+ + * Since we need to compare with the cut-off squared,
+ + * this value should be slightly smaller than sqrt(GMX_FLOAT_MAX).
+ + */
+ +#define GMX_CUTOFF_INF 1E+18
+ +
+ +/* enums for the neighborlist type: Van der Waals variants (enbvdw*);
+ + * enbvdwNR is the number of variants */
+ +enum {
+ +    enbvdwNONE, enbvdwLJ, enbvdwBHAM, enbvdwTAB, enbvdwNR
+ +};
+ +/* Coulomb variants (enbcoul*); enbcoulNR is the number of variants.
+ + * OOR is "one over r" -- standard coul */
+ +enum {
+ +    enbcoulNONE, enbcoulOOR, enbcoulRF, enbcoulTAB, enbcoulGB, enbcoulFEWALD, enbcoulNR
+ +};
+ +
+ +enum { /* Indices of the group-pair energy terms; egNR is the length of the ener[] array in gmx_grppairener_t */
+ +    egCOULSR, egLJSR, egBHAMSR, egCOULLR, egLJLR, egBHAMLR,
+ +    egCOUL14, egLJ14, egGB, egNR
+ +};
+ +
+ +typedef struct {
+ +    int   nener;      /* The number of energy group pairs     */
+ +    real *ener[egNR]; /* Energy terms for each pair of groups */
+ +} gmx_grppairener_t;
+ +
+ +typedef struct {
+ +    real              term[F_NRE];         /* The energies for all different interaction types */
+ +    gmx_grppairener_t grpp;
+ +    double            dvdl_lin[efptNR];    /* Contributions to dvdl with linear lam-dependence */
+ +    double            dvdl_nonlin[efptNR]; /* Idem, but non-linear dependence                  */
+ +    int               n_lambda;
+ +    int               fep_state;           /*current fep state -- just for printing */
+ +    double           *enerpart_lambda;     /* Partial energy for lambda and flambda[] */
+ +    real              foreign_term[F_NRE]; /* alternate array for storing foreign lambda energies */
+ +    gmx_grppairener_t foreign_grpp;        /* alternate array for storing foreign lambda energies */
+ +} gmx_enerdata_t;
+ +/* The idea is that dvdl terms with linear lambda dependence will be added
+ + * automatically to enerpart_lambda. Terms with non-linear lambda dependence
+ + * should explicitly determine the energies at foreign lambda points
+ + * when n_lambda > 0.
+ + */
+ +
+ +typedef struct {
+ +    int  cg_start;
+ +    int  cg_end;
+ +    int  cg_mod;
+ +    int *cginfo;
+ +} cginfo_mb_t;
+ +
+ +
+ +/* ewald table type */
+ +typedef struct ewald_tab *ewald_tab_t;
+ +
+ +typedef struct {
+ +    rvec             *f;
+ +    int               f_nalloc;
+ +    unsigned          red_mask; /* Mask for marking which parts of f are filled */
+ +    rvec             *fshift;
+ +    real              ener[F_NRE];
+ +    gmx_grppairener_t grpp;
+ +    real              Vcorr;
+ +    real              dvdl[efptNR];
+ +    tensor            vir;
+ +} f_thread_t;
+ +
+ +typedef struct {
+ +    interaction_const_t *ic;
+ +
+ +    /* Domain Decomposition */
+ +    gmx_bool bDomDec;
+ +
+ +    /* PBC stuff */
++    int                  ePBC;
++    gmx_bool             bMolPBC;
++    int                  rc_scaling;
++    rvec                 posres_com;
++    rvec                 posres_comB;
++
++    const gmx_hw_info_t *hwinfo;
++    gmx_bool             use_cpu_acceleration;
+ +
+ +    /* Interactions calculated in the kernels. In many cases this is similar to
+ +     * the electrostatics settings in the inputrecord, but the difference is that
+ +     * these variables always specify the actual interaction in the kernel - if
+ +     * we are tabulating reaction-field the inputrec will say reaction-field, but
+ +     * the kernel interaction will say cubic-spline-table. To be safe we also
+ +     * have a kernel-specific setting for the modifiers - if the interaction is
+ +     * tabulated we already included the inputrec modification there, so the kernel
+ +     * modification setting will say 'none' in that case.
+ +     */
+ +    int nbkernel_elec_interaction;
+ +    int nbkernel_vdw_interaction;
+ +    int nbkernel_elec_modifier;
+ +    int nbkernel_vdw_modifier;
+ +
+ +    /* Use special N*N kernels? */
+ +    gmx_bool bAllvsAll;
+ +    /* Private work data */
+ +    void    *AllvsAll_work;
+ +    void    *AllvsAll_workgb;
+ +
+ +    /* Cut-Off stuff.
+ +     * Infinite cut-offs will be GMX_CUTOFF_INF (unlike in t_inputrec: 0).
+ +     */
+ +    real rlist, rlistlong;
+ +
+ +    /* Dielectric constant resp. multiplication factor for charges */
+ +    real zsquare, temp;
+ +    real epsilon_r, epsilon_rf, epsfac;
+ +
+ +    /* Constants for reaction fields */
+ +    real kappa, k_rf, c_rf;
+ +
+ +    /* Charge sum and dipole for topology A/B ([0]/[1]) for Ewald corrections */
+ +    double qsum[2];
+ +    double q2sum[2];
+ +    rvec   mu_tot[2];
+ +
+ +    /* Dispersion correction stuff */
+ +    int  eDispCorr;
+ +
+ +    /* The shift of the shift or user potentials */
+ +    real enershiftsix;
+ +    real enershifttwelve;
+ +    /* Integrated differences for energy and virial with cut-off functions */
+ +    real enerdiffsix;
+ +    real enerdifftwelve;
+ +    real virdiffsix;
+ +    real virdifftwelve;
+ +    /* Constant for long range dispersion correction (average dispersion)
+ +     * for topology A/B ([0]/[1]) */
+ +    real avcsix[2];
+ +    /* Constant for long range repulsion term. Relative difference of about
+ +     * 0.1 percent with 0.8 nm cutoffs. But hey, it's cheap anyway...
+ +     */
+ +    real avctwelve[2];
+ +
+ +    /* Fudge factors */
+ +    real fudgeQQ;
+ +
+ +    /* Table stuff */
+ +    gmx_bool     bcoultab;
+ +    gmx_bool     bvdwtab;
+ +    /* The normal tables are in the nblists struct(s) below */
+ +    t_forcetable tab14; /* for 1-4 interactions only */
+ +
+ +    /* PPPM & Shifting stuff */
+ +    int   coulomb_modifier;
+ +    real  rcoulomb_switch, rcoulomb;
+ +    real *phi;
+ +
+ +    /* VdW stuff */
+ +    int    vdw_modifier;
+ +    double reppow;
+ +    real   rvdw_switch, rvdw;
+ +    real   bham_b_max;
+ +
+ +    /* Free energy */
+ +    int      efep;
+ +    real     sc_alphavdw;
+ +    real     sc_alphacoul;
+ +    int      sc_power;
+ +    real     sc_r_power;
+ +    real     sc_sigma6_def;
+ +    real     sc_sigma6_min;
+ +    gmx_bool bSepDVDL;
+ +
+ +    /* NS Stuff */
+ +    int  eeltype;
+ +    int  vdwtype;
+ +    int  cg0, hcg;
+ +    /* solvent_opt contains the enum for the most common solvent
+ +     * in the system, which will be optimized.
+ +     * It can be set to esolNO to disable all water optimization */
+ +    int          solvent_opt;
+ +    int          nWatMol;
+ +    gmx_bool     bGrid;
+ +    gmx_bool     bExcl_IntraCGAll_InterCGNone;
+ +    cginfo_mb_t *cginfo_mb;
+ +    int         *cginfo;
+ +    rvec        *cg_cm;
+ +    int          cg_nalloc;
+ +    rvec        *shift_vec;
+ +
+ +    /* The neighborlists including tables */
+ +    int                 nnblists;
+ +    int                *gid2nblists;
+ +    t_nblists          *nblists;
+ +
+ +    int                 cutoff_scheme; /* group- or Verlet-style cutoff */
+ +    gmx_bool            bNonbonded;    /* true if nonbonded calculations are *not* turned off */
+ +    nonbonded_verlet_t *nbv;
+ +
+ +    /* The wall tables (if used) */
+ +    int            nwall;
+ +    t_forcetable **wall_tab;
+ +
+ +    /* The number of charge groups participating in do_force_lowlevel */
+ +    int ncg_force;
+ +    /* The number of atoms participating in do_force_lowlevel */
+ +    int natoms_force;
+ +    /* The number of atoms participating in force and constraints */
+ +    int natoms_force_constr;
+ +    /* The allocation size of vectors of size natoms_force */
+ +    int nalloc_force;
+ +
+ +    /* Twin Range stuff, f_twin has size natoms_force */
+ +    gmx_bool bTwinRange;
+ +    int      nlr;
+ +    rvec    *f_twin;
+ +
+ +    /* Forces that should not enter into the virial summation:
+ +     * PPPM/PME/Ewald/posres
+ +     */
+ +    gmx_bool bF_NoVirSum;
+ +    int      f_novirsum_n;
+ +    int      f_novirsum_nalloc;
+ +    rvec    *f_novirsum_alloc;
+ +    /* Pointer that points to f_novirsum_alloc when pressure is calculated,
+ +     * points to the normal force vectors when pressure is not requested.
+ +     */
+ +    rvec *f_novirsum;
+ +
+ +    /* Long-range forces and virial for PPPM/PME/Ewald */
+ +    gmx_pme_t pmedata;
+ +    tensor    vir_el_recip;
+ +
+ +    /* PME/Ewald stuff */
+ +    gmx_bool    bEwald;
+ +    real        ewaldcoeff;
+ +    ewald_tab_t ewald_table;
+ +
+ +    /* Virial Stuff */
+ +    rvec *fshift;
+ +    rvec  vir_diag_posres;
+ +    dvec  vir_wall_z;
+ +
+ +    /* Non bonded Parameter lists */
+ +    int      ntype; /* Number of atom types */
+ +    gmx_bool bBHAM;
+ +    real    *nbfp;
+ +
+ +    /* Energy group pair flags */
+ +    int *egp_flags;
+ +
+ +    /* xmdrun flexible constraints */
+ +    real fc_stepsize;
+ +
+ +    /* Generalized born implicit solvent */
+ +    gmx_bool       bGB;
+ +    /* Generalized born stuff */
+ +    real           gb_epsilon_solvent;
+ +    /* Table data for GB */
+ +    t_forcetable   gbtab;
+ +    /* VdW radius for each atomtype (dim is thus ntype) */
+ +    real          *atype_radius;
+ +    /* Effective radius (derived from effective volume) for each type */
+ +    real          *atype_vol;
+ +    /* Implicit solvent - surface tension for each atomtype */
+ +    real          *atype_surftens;
+ +    /* Implicit solvent - radius for GB calculation */
+ +    real          *atype_gb_radius;
+ +    /* Implicit solvent - overlap for HCT model */
+ +    real          *atype_S_hct;
+ +    /* Generalized born interaction data */
+ +    gmx_genborn_t *born;
+ +
+ +    /* Table scale for GB */
+ +    real gbtabscale;
+ +    /* Table range for GB */
+ +    real gbtabr;
+ +    /* GB neighborlists (the sr list will contain for each atom all other atoms
+ +     * (for use in the SA calculation) and the lr list will contain
+ +     * for each atom all atoms 1-4 or greater (for use in the GB calculation)
+ +     */
+ +    t_nblist gblist_sr;
+ +    t_nblist gblist_lr;
+ +    t_nblist gblist;
+ +
+ +    /* Inverse square root of the Born radii for implicit solvent */
+ +    real *invsqrta;
+ +    /* Derivatives of the potential with respect to the Born radii */
+ +    real *dvda;
+ +    /* Derivatives of the Born radii with respect to coordinates */
+ +    real *dadx;
+ +    real *dadx_rawptr;
+ +    int   nalloc_dadx; /* Allocated size of dadx */
+ +
+ +    /* If > 0 signals Test Particle Insertion,
+ +     * the value is the number of atoms of the molecule to insert.
+ +     * Only the energy difference due to the addition of the last molecule
+ +     * should be calculated.
+ +     * NOTE(review): declared as gmx_bool although it is documented as a count -- confirm the intended type.
+ +     */
+ +    gmx_bool n_tpi;
+ +
+ +    /* Neighbor searching stuff */
+ +    gmx_ns_t ns;
+ +
+ +    /* QMMM stuff */
+ +    gmx_bool         bQMMM;
+ +    t_QMMMrec       *qr;
+ +
+ +    /* QM-MM neighborlists */
+ +    t_nblist QMMMlist;
+ +
+ +    /* Limit for printing large forces, negative is don't print */
+ +    real print_force;
+ +
+ +    /* coarse load balancing time measurement */
+ +    double t_fnbf;
+ +    double t_wait;
+ +    int    timesteps;
+ +
+ +    /* parameter needed for AdResS simulation */
+ +    int             adress_type;
+ +    gmx_bool        badress_tf_full_box;
+ +    real            adress_const_wf;
+ +    real            adress_ex_width;
+ +    real            adress_hy_width;
+ +    int             adress_icor;
+ +    int             adress_site;
+ +    rvec            adress_refs;
+ +    int             n_adress_tf_grps;
+ +    int           * adress_tf_table_index;
+ +    int            *adress_group_explicit;
+ +    t_forcetable *  atf_tabs;
+ +    real            adress_ex_forcecap;
+ +    gmx_bool        adress_do_hybridpairs;
+ +
+ +    /* User determined parameters, copied from the inputrec */
+ +    int  userint1;
+ +    int  userint2;
+ +    int  userint3;
+ +    int  userint4;
+ +    real userreal1;
+ +    real userreal2;
+ +    real userreal3;
+ +    real userreal4;
+ +
+ +    /* Thread local force and energy data */
+ +    /* FIXME move to bonded_thread_data_t */
+ +    int         nthreads;
+ +    int         red_ashift;
+ +    int         red_nblock;
+ +    f_thread_t *f_t;
+ +
+ +    /* Exclusion load distribution over the threads */
+ +    int  *excl_load;
+ +} t_forcerec;
+ +
+ +/* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
+ + * been scaled by 6.0 or 12.0 to save flops in the kernels. We have corrected this everywhere
+ + * in the code, but beware if you are using these macros externally.
+ + *
+ + * Indexing: nbfp is addressed as a flat array over (ai, aj) atom-type pairs
+ + * with row length ntp, i.e. pair index = ntp*ai + aj.
+ + * C6/C12 assume 2 consecutive values per pair (C6 at offset 0, C12 at offset 1).
+ + * BHAMC/BHAMA/BHAMB assume 3 consecutive values per pair (offsets 0, 1 and 2).
+ + * The two layouts are mutually exclusive for a given nbfp array.
+ + * Each macro expands to an array element expression, so it can be read or assigned.
+ + */
+ +#define C6(nbfp, ntp, ai, aj)     (nbfp)[2*((ntp)*(ai)+(aj))]
+ +#define C12(nbfp, ntp, ai, aj)    (nbfp)[2*((ntp)*(ai)+(aj))+1]
+ +#define BHAMC(nbfp, ntp, ai, aj)  (nbfp)[3*((ntp)*(ai)+(aj))]
+ +#define BHAMA(nbfp, ntp, ai, aj)  (nbfp)[3*((ntp)*(ai)+(aj))+1]
+ +#define BHAMB(nbfp, ntp, ai, aj)  (nbfp)[3*((ntp)*(ai)+(aj))+2]
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
diff --cc src/gromacs/legacyheaders/types/hw_info.h

index e054b021b559193f745d91dde7c1ba5d5ebc88b8,0000000000000000000000000000000000000000..1aa0c32c4803c85f48407e2215b5e6a961863c22

mode 100644,000000..100644
--- 1/src/gromacs/legacyheaders/types/hw_info.h
--- /dev/null
+++ b/src/gromacs/legacyheaders/types/hw_info.h
@@@ -1,85 -1,0 +1,88 @@@
-     gmx_bool            bUserSet;       /* true if the GPUs in cuda_dev_use are manually provided by the user */
-     gmx_bool            bDevShare;      /* true if any of the devices is shared by
-                                            (t)MPI ranks, with auto-detection always FALSE */
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + * This file is part of GROMACS.
+ + * Copyright (c) 2012-
+ + *
+ + * Written by the Gromacs development team under coordination of
+ + * David van der Spoel, Berk Hess, and Erik Lindahl.
+ + *
+ + * This library is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gromacs Runs On Most of All Computer Systems
+ + */
+ +
+ +#ifndef HWINFO_H
+ +#define HWINFO_H
+ +
+ +#include "simple.h"
+ +#include "nbnxn_cuda_types_ext.h"
+ +#include "../gmx_cpuid.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +#if 0
+ +} /* fixes auto-indentation problems */
+ +#endif
+ +
+ +/* Possible results of the GPU detection/check.
+ + *
+ + * The egpuInsane value means that during the sanity checks an error
+ + * occurred that indicates malfunctioning of the device, driver, or
+ + * incompatible driver/runtime.
+ + *
+ + * The enumerators are consecutive and start at 0 (egpuCompatible), so a
+ + * value can be used as an index into the gpu_detect_res_str name table;
+ + * keep the order of the two in sync when adding a result. */
+ +typedef enum
+ +{
+ +    egpuCompatible = 0,  egpuNonexistent,  egpuIncompatible, egpuInsane
+ +} e_gpu_detect_res_t;
+ +
+ +/* Textual names of the GPU detection/check results (see e_gpu_detect_res_t).
+ + * There is one string per enumerator, listed in enumerator order, so the
+ + * array is indexed directly by an e_gpu_detect_res_t value.
+ + * The array is static and defined in this header, so every translation
+ + * unit that includes the header gets its own copy. */
+ +static const char * const gpu_detect_res_str[] =
+ +{
+ +    "compatible", "inexistent", "incompatible", "insane"
+ +};
+ +
+ +/* GPU device information -- for now with only CUDA devices.
+ + * The gmx_hardware_detect module initializes it. */
+ +typedef struct
+ +{
-  * It is initialized by gmx_detect_hardware(). */
++    gmx_bool             bUserSet;      /* true if the GPUs in cuda_dev_use are manually provided by the user */
+ +
+ +    int                  ncuda_dev_use; /* number of devices selected to be used */
+ +    int                 *cuda_dev_use;  /* index of the devices selected to be used */
+ +    int                  ncuda_dev;     /* total number of devices detected */
+ +    cuda_dev_info_ptr_t  cuda_dev;      /* devices detected in the system (per node) */
+ +} gmx_gpu_info_t;
+ +
+ +/* Hardware information structure with CPU and GPU information.
-     gmx_bool        bCanUseGPU;        /* True if compatible GPUs are detected during hardware detection */
-     gmx_gpu_info_t  gpu_info;          /* Information about GPUs detected in the system */
++ * It is initialized by gmx_detect_hardware().
++ * NOTE: this structure may only contain structures that are globally valid
++ *       (i.e. must be able to be shared among all threads) */
+ +typedef struct
+ +{
-     gmx_cpuid_t     cpuid_info;        /* CPUID information about CPU detected;
-                                           NOTE: this will only detect the CPU thread 0 of the
-                                           current process runs on. */
-     int             nthreads_hw_avail; /* Number of hardware threads available; this number
-                                           is based on the number of CPUs reported as available
-                                           by the OS at the time of detection. */
++    gmx_bool        bCanUseGPU;          /* True if compatible GPUs are detected during hardware detection */
++    gmx_gpu_info_t  gpu_info;            /* Information about GPUs detected in the system */
+ +
++    gmx_cpuid_t     cpuid_info;          /* CPUID information about CPU detected;
++                                            NOTE: this will only detect the CPU that thread 0 of the
++                                            current process runs on. */
++    int             nthreads_hw_avail;   /* Number of hardware threads available; this number
++                                            is based on the number of CPUs reported as available
++                                            by the OS at the time of detection. */
++    gmx_bool        bConsistencyChecked; /* whether
++                                            gmx_check_hw_runconf_consistency()
++                                            has been run with this hw_info */
+ +} gmx_hw_info_t;
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif /* HWINFO_H */
diff --cc src/gromacs/libgromacs.pc.cmakein

index 5cc1da7b318dbab3381d50a4178d99e02d4fc012,0000000000000000000000000000000000000000..b36896a0e7cbc795fc4997d3264e0abfd51e6997

mode 100644,000000..100644
--- 1/src/gromacs/libgromacs.pc.cmakein
--- /dev/null
+++ b/src/gromacs/libgromacs.pc.cmakein
@@@ -1,12 -1,0 +1,12 @@@
- Name: libgromacs
+ +libdir=@LIB_INSTALL_DIR@
+ +includedir=@INCL_INSTALL_DIR@
+ +
++Name: libgromacs@GMX_LIBS_SUFFIX@
+ +Description: Gromacs library
+ +URL: http://www.gromacs.org
+ +Version: @PROJECT_VERSION@
+ +Requires: @PKG_FFT@ @PKG_XML@ @PKG_GSL@
+ +Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
+ +Libs: -L${libdir} -lgromacs@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@ -lm
+ +Cflags: -I${includedir} @PKG_CFLAGS@
+ +
diff --cc src/gromacs/mdlib/forcerec.c

index 6d0c97c46679eb03b7f45dcbe252249474e08448,0000000000000000000000000000000000000000..2a1ff5bcd15c74bce7548a83fd362642aa0990e0

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/forcerec.c
--- /dev/null
+++ b/src/gromacs/mdlib/forcerec.c
@@@ -1,2977 -1,0 +1,2975 @@@
-         snew(fr->hwinfo, 1);
-         gmx_detect_hardware(fp, fr->hwinfo, cr,
-                             FALSE, FALSE, NULL);
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROwing Monsters And Cloning Shrimps
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include <string.h>
+ +#include <assert.h>
+ +#include "sysstuff.h"
+ +#include "typedefs.h"
+ +#include "vec.h"
+ +#include "maths.h"
+ +#include "macros.h"
+ +#include "smalloc.h"
+ +#include "macros.h"
+ +#include "gmx_fatal.h"
+ +#include "gmx_fatal_collective.h"
+ +#include "physics.h"
+ +#include "force.h"
+ +#include "tables.h"
+ +#include "nonbonded.h"
+ +#include "invblock.h"
+ +#include "names.h"
+ +#include "network.h"
+ +#include "pbc.h"
+ +#include "ns.h"
+ +#include "mshift.h"
+ +#include "txtdump.h"
+ +#include "coulomb.h"
+ +#include "md_support.h"
+ +#include "md_logging.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "qmmm.h"
+ +#include "copyrite.h"
+ +#include "mtop_util.h"
+ +#include "nbnxn_search.h"
+ +#include "nbnxn_atomdata.h"
+ +#include "nbnxn_consts.h"
+ +#include "statutil.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "gmx_detect_hardware.h"
+ +
+ +#ifdef _MSC_VER
+ +/* MSVC definition for __cpuid() */
+ +#include <intrin.h>
+ +#endif
+ +
+ +#include "types/nbnxn_cuda_types_ext.h"
+ +#include "gpu_utils.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +#include "pmalloc_cuda.h"
+ +
+ +/* Allocate one t_forcerec with snew() and hand it to the caller. */
+ +t_forcerec *mk_forcerec(void)
+ +{
+ +    t_forcerec *forcerec;
+ +
+ +    snew(forcerec, 1);
+ +    return forcerec;
+ +}
+ +
+ +#ifdef DEBUG
+ +/* Debug helper: write the nonbonded parameters of every (i, j) pair of the
+ + * atnr atom types to fp, one pair per line. Buckingham c and LJ c6 are
+ + * divided by 6.0 and LJ c12 by 12.0 before printing.
+ + */
+ +static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
+ +{
+ +    int ti, tj;
+ +
+ +    for (ti = 0; ti < atnr; ti++)
+ +    {
+ +        for (tj = 0; tj < atnr; tj++)
+ +        {
+ +            fprintf(fp, "%2d - %2d", ti, tj);
+ +            if (!bBHAM)
+ +            {
+ +                /* Lennard-Jones pair */
+ +                fprintf(fp, "  c6=%10g, c12=%10g\n", C6(nbfp, atnr, ti, tj)/6.0,
+ +                        C12(nbfp, atnr, ti, tj)/12.0);
+ +                continue;
+ +            }
+ +            /* Buckingham pair */
+ +            fprintf(fp, "  a=%10g, b=%10g, c=%10g\n", BHAMA(nbfp, atnr, ti, tj),
+ +                    BHAMB(nbfp, atnr, ti, tj), BHAMC(nbfp, atnr, ti, tj)/6.0);
+ +        }
+ +    }
+ +}
+ +#endif
+ +
+ +/* Build the nonbonded parameter table for all atnr x atnr type pairs from
+ + * idef->iparams, which is read in row-major (i, j) order.
+ + * With bBHAM the table holds 3 values per pair (a, b, c*6.0), otherwise
+ + * 2 values per pair (c6*6.0, c12*12.0). The table is allocated with snew()
+ + * and returned to the caller.
+ + */
+ +static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
+ +{
+ +    const int ntypes = idef->atnr;
+ +    real     *table;
+ +    int       ti, tj, ip;
+ +
+ +    ip = 0;
+ +    if (!bBHAM)
+ +    {
+ +        snew(table, 2*ntypes*ntypes);
+ +        for (ti = 0; ti < ntypes; ti++)
+ +        {
+ +            for (tj = 0; tj < ntypes; tj++)
+ +            {
+ +                /* Stored values include the 6.0/12.0 derivative prefactors */
+ +                C6(table, ntypes, ti, tj)   = idef->iparams[ip].lj.c6*6.0;
+ +                C12(table, ntypes, ti, tj)  = idef->iparams[ip].lj.c12*12.0;
+ +                ip++;
+ +            }
+ +        }
+ +
+ +        return table;
+ +    }
+ +
+ +    snew(table, 3*ntypes*ntypes);
+ +    for (ti = 0; ti < ntypes; ti++)
+ +    {
+ +        for (tj = 0; tj < ntypes; tj++)
+ +        {
+ +            BHAMA(table, ntypes, ti, tj) = idef->iparams[ip].bham.a;
+ +            BHAMB(table, ntypes, ti, tj) = idef->iparams[ip].bham.b;
+ +            /* Stored value includes the 6.0 derivative prefactor */
+ +            BHAMC(table, ntypes, ti, tj) = idef->iparams[ip].bham.c*6.0;
+ +            ip++;
+ +        }
+ +    }
+ +
+ +    return table;
+ +}
+ +
+ +/* This routine sets fr->solvent_opt to the most common solvent in the
+ + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
+ + * the fr->solvent_type array with the correct type (or esolNO).
+ + *
+ + * Charge groups that fulfill the conditions but are not identical to the
+ + * most common one will be marked as esolNO in the solvent_type array.
+ + *
+ + * TIP3p is identical to SPC for these purposes, so we call it
+ + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
+ + *
+ + * NOTE: QM particles should not become an optimized solvent,
+ + * not even if there is only one charge group in the QM
+ + * subsystem.
+ + */
+ +
+ +/* One candidate solvent type collected by check_solvent_cg() */
+ +typedef struct
+ +{
+ +    int    model;      /* esolSPC or esolTIP4P */
+ +    int    count;      /* sum of nmol over all charge groups that matched this entry */
+ +    int    vdwtype[4]; /* atom type per atom; 3 entries filled for SPC, 4 for TIP4P */
+ +    real   charge[4];  /* charge per atom, filled like vdwtype */
+ +} solvent_parameters_t;
+ +
+ +/* Decide whether charge group cg0 of molecule type molt qualifies for the
+ + * optimized solvent loops and, if so, register it in the solvent list.
+ + *
+ + * *cg_sp is first set to -1 (not optimized). The charge group is rejected
+ + * (early return, list untouched) when it has inter charge group exclusions,
+ + * does not exclude all intra charge group pairs, does not have 3 or 4 atoms,
+ + * contains an atom for which qm_grpnr[] < qm_grps->nr - 1, or contains a
+ + * perturbed atom.
+ + *
+ + * Otherwise, if atom types and charges match an existing entry of
+ + * *solvent_parameters_p, *cg_sp is set to that entry's index and its count is
+ + * increased by nmol. If nothing matches and the group fulfills the SPC
+ + * (3 atoms) or TIP4P (4 atoms) requirements, a new entry is appended,
+ + * *n_solvent_parameters is incremented and *solvent_parameters_p is updated
+ + * to the (possibly reallocated) array.
+ + *
+ + * qm_grpnr may be NULL, in which case no QM check is done.
+ + */
+ +static void
+ +check_solvent_cg(const gmx_moltype_t    *molt,
+ +                 int                     cg0,
+ +                 int                     nmol,
+ +                 const unsigned char    *qm_grpnr,
+ +                 const t_grps           *qm_grps,
+ +                 t_forcerec   *          fr,
+ +                 int                    *n_solvent_parameters,
+ +                 solvent_parameters_t  **solvent_parameters_p,
+ +                 int                     cginfo,
+ +                 int                    *cg_sp)
+ +{
+ +    /* NOTE(review): excl is declared but never used in this function */
+ +    const t_blocka     *  excl;
+ +    t_atom               *atom;
+ +    int                   j, k;
+ +    int                   j0, j1, nj;
+ +    gmx_bool              perturbed;
+ +    gmx_bool              has_vdw[4];
+ +    gmx_bool              match;
+ +    real                  tmp_charge[4];
+ +    int                   tmp_vdwtype[4];
+ +    int                   tjA;
+ +    gmx_bool              qm;
+ +    solvent_parameters_t *solvent_parameters;
+ +
+ +    /* We use a list with parameters for each solvent type.
+ +     * Every time we discover a new molecule that fulfills the basic
+ +     * conditions for a solvent we compare with the previous entries
+ +     * in these lists. If the parameters are the same we just increment
+ +     * the counter for that type, and otherwise we create a new type
+ +     * based on the current molecule.
+ +     *
+ +     * Once we've finished going through all molecules we check which
+ +     * solvent is most common, and mark all those molecules while we
+ +     * clear the flag on all others.
+ +     */
+ +
+ +    solvent_parameters = *solvent_parameters_p;
+ +
+ +    /* Mark the cg first as non optimized */
+ +    *cg_sp = -1;
+ +
+ +    /* Check if this cg has no exclusions with atoms in other charge groups
+ +     * and all atoms inside the charge group excluded.
+ +     * We only have 3 or 4 atom solvent loops.
+ +     */
+ +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
+ +        !GET_CGINFO_EXCL_INTRA(cginfo))
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Get the index of the first atom in this charge group (j0)
+ +     * and one past its last atom (j1)
+ +     */
+ +    j0     = molt->cgs.index[cg0];
+ +    j1     = molt->cgs.index[cg0+1];
+ +
+ +    /* Number of atoms in our molecule */
+ +    nj     = j1 - j0;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug,
+ +                "Moltype '%s': there are %d atoms in this charge group\n",
+ +                *molt->name, nj);
+ +    }
+ +
+ +    /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
+ +     * otherwise skip it.
+ +     */
+ +    if (nj < 3 || nj > 4)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Check if we are doing QM on this group */
+ +    qm = FALSE;
+ +    if (qm_grpnr != NULL)
+ +    {
+ +        for (j = j0; j < j1 && !qm; j++)
+ +        {
+ +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
+ +        }
+ +    }
+ +    /* Cannot use solvent optimization with QM */
+ +    if (qm)
+ +    {
+ +        return;
+ +    }
+ +
+ +    atom = molt->atoms.atom;
+ +
+ +    /* Still looks like a solvent, time to check parameters */
+ +
+ +    /* If it is perturbed (free energy) we can't use the solvent loops,
+ +     * so then we just skip to the next molecule.
+ +     */
+ +    perturbed = FALSE;
+ +
+ +    for (j = j0; j < j1 && !perturbed; j++)
+ +    {
+ +        perturbed = PERTURBED(atom[j]);
+ +    }
+ +
+ +    if (perturbed)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Now it's only a question if the VdW and charge parameters
+ +     * are OK. Before doing the check we compare and see if they are
+ +     * identical to a possible previous solvent type.
+ +     * First we assign the current types and charges.
+ +     */
+ +    for (j = 0; j < nj; j++)
+ +    {
+ +        tmp_vdwtype[j] = atom[j0+j].type;
+ +        tmp_charge[j]  = atom[j0+j].q;
+ +    }
+ +
+ +    /* Does it match any previous solvent type? */
+ +    for (k = 0; k < *n_solvent_parameters; k++)
+ +    {
+ +        match = TRUE;
+ +
+ +
+ +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
+ +        if ( (solvent_parameters[k].model == esolSPC   && nj != 3)  ||
+ +             (solvent_parameters[k].model == esolTIP4P && nj != 4) )
+ +        {
+ +            match = FALSE;
+ +        }
+ +
+ +        /* Check that types & charges match for all atoms in molecule */
+ +        for (j = 0; j < nj && match == TRUE; j++)
+ +        {
+ +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
+ +            {
+ +                match = FALSE;
+ +            }
+ +            if (tmp_charge[j] != solvent_parameters[k].charge[j])
+ +            {
+ +                match = FALSE;
+ +            }
+ +        }
+ +        if (match == TRUE)
+ +        {
+ +            /* Congratulations! We have a matched solvent.
+ +             * Flag it with this type for later processing.
+ +             */
+ +            *cg_sp = k;
+ +            solvent_parameters[k].count += nmol;
+ +
+ +            /* We are done with this charge group */
+ +            return;
+ +        }
+ +    }
+ +
+ +    /* If we get here, we have a tentative new solvent type.
+ +     * Before we add it we must check that it fulfills the requirements
+ +     * of the solvent optimized loops. First determine which atoms have
+ +     * VdW interactions.
+ +     */
+ +    for (j = 0; j < nj; j++)
+ +    {
+ +        has_vdw[j] = FALSE;
+ +        tjA        = tmp_vdwtype[j];
+ +
+ +        /* Go through all other types and see if any have non-zero
+ +         * VdW parameters when combined with this one.
+ +         */
+ +        for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
+ +        {
+ +            /* We already checked that the atoms weren't perturbed,
+ +             * so we only need to check state A now.
+ +             */
+ +            if (fr->bBHAM)
+ +            {
+ +                has_vdw[j] = (has_vdw[j] ||
+ +                              (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ +                              (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ +                              (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ +            }
+ +            else
+ +            {
+ +                /* Standard LJ */
+ +                has_vdw[j] = (has_vdw[j] ||
+ +                              (C6(fr->nbfp, fr->ntype, tjA, k)  != 0.0) ||
+ +                              (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Now we know all we need to make the final check and assignment. */
+ +    if (nj == 3)
+ +    {
+ +        /* So, is it an SPC?
+ +         * For this we require that all atoms have charge,
+ +         * the charges on atom 2 & 3 should be the same, and only
+ +         * atom 1 might have VdW.
+ +         */
+ +        if (has_vdw[1] == FALSE &&
+ +            has_vdw[2] == FALSE &&
+ +            tmp_charge[0]  != 0 &&
+ +            tmp_charge[1]  != 0 &&
+ +            tmp_charge[2]  == tmp_charge[1])
+ +        {
+ +            srenew(solvent_parameters, *n_solvent_parameters+1);
+ +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
+ +            solvent_parameters[*n_solvent_parameters].count = nmol;
+ +            for (k = 0; k < 3; k++)
+ +            {
+ +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
+ +            }
+ +
+ +            *cg_sp = *n_solvent_parameters;
+ +            (*n_solvent_parameters)++;
+ +        }
+ +    }
+ +    else if (nj == 4)
+ +    {
+ +        /* Or could it be a TIP4P?
+ +         * For this we require that atoms 2,3,4 have charge, but not atom 1.
+ +         * Only atom 1 might have VdW.
+ +         */
+ +        if (has_vdw[1] == FALSE &&
+ +            has_vdw[2] == FALSE &&
+ +            has_vdw[3] == FALSE &&
+ +            tmp_charge[0]  == 0 &&
+ +            tmp_charge[1]  != 0 &&
+ +            tmp_charge[2]  == tmp_charge[1] &&
+ +            tmp_charge[3]  != 0)
+ +        {
+ +            srenew(solvent_parameters, *n_solvent_parameters+1);
+ +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
+ +            solvent_parameters[*n_solvent_parameters].count = nmol;
+ +            for (k = 0; k < 4; k++)
+ +            {
+ +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
+ +            }
+ +
+ +            *cg_sp = *n_solvent_parameters;
+ +            (*n_solvent_parameters)++;
+ +        }
+ +    }
+ +
+ +    /* Hand the (possibly reallocated) list back to the caller */
+ +    *solvent_parameters_p = solvent_parameters;
+ +}
+ +
+ +/* Find the most common optimizable solvent model in the topology.
+ + *
+ + * Every charge group stored in cginfo_mb is classified with
+ + * check_solvent_cg(). The solvent type with the highest count wins; its
+ + * model is stored in fr->solvent_opt and set with SET_CGINFO_SOLOPT on all
+ + * charge groups of that type, while all other charge groups get esolNO.
+ + * fr->nWatMol receives the number of molecules of the winning type.
+ + * When fp is non-NULL and a solvent model was selected, a note is printed.
+ + */
+ +static void
+ +check_solvent(FILE  *                fp,
+ +              const gmx_mtop_t  *    mtop,
+ +              t_forcerec  *          fr,
+ +              cginfo_mb_t           *cginfo_mb)
+ +{
+ +    const t_block     *   cgs;
+ +    const gmx_moltype_t  *molt;
+ +    int                   mb, mol, cg_mol, at_offset, am, cgm, i, nmol_ch, nmol;
+ +    int                   n_solvent_parameters;
+ +    solvent_parameters_t *solvent_parameters;
+ +    int                 **cg_sp;
+ +    int                   bestsp, bestsol;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Going to determine what solvent types we have.\n");
+ +    }
+ +
+ +    n_solvent_parameters = 0;
+ +    solvent_parameters   = NULL;
+ +    /* Allocate temporary array for solvent type */
+ +    snew(cg_sp, mtop->nmolblock);
+ +
+ +    /* Global index of the first atom of the current molecule block */
+ +    at_offset = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        molt = &mtop->moltype[mtop->molblock[mb].type];
+ +        cgs  = &molt->cgs;
+ +        /* Here we have to loop over all individual molecules
+ +         * because we need to check for QMMM particles.
+ +         */
+ +        snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
+ +        /* nmol_ch: number of molecules with their own cginfo entries,
+ +         * nmol:    number of real molecules each such entry stands for.
+ +         */
+ +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
+ +        nmol    = mtop->molblock[mb].nmol/nmol_ch;
+ +        for (mol = 0; mol < nmol_ch; mol++)
+ +        {
+ +            cgm = mol*cgs->nr;
+ +            am  = mol*cgs->index[cgs->nr];
+ +            for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
+ +            {
+ +                check_solvent_cg(molt, cg_mol, nmol,
+ +                                 mtop->groups.grpnr[egcQMMM] ?
+ +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
+ +                                 &mtop->groups.grps[egcQMMM],
+ +                                 fr,
+ +                                 &n_solvent_parameters, &solvent_parameters,
+ +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
+ +                                 &cg_sp[mb][cgm+cg_mol]);
+ +            }
+ +        }
+ +        /* Skip over ALL molecules of this block, not just one: the QMMM
+ +         * group array is indexed by global atom number, as in
+ +         * init_cginfo_mb().
+ +         */
+ +        at_offset += mtop->molblock[mb].nmol*cgs->index[cgs->nr];
+ +    }
+ +
+ +    /* Puh! We finished going through all charge groups.
+ +     * Now find the most common solvent model.
+ +     */
+ +
+ +    /* Most common solvent this far; -2 can never equal a cg_sp entry,
+ +     * which is -1 for charge groups that are not optimizable.
+ +     */
+ +    bestsp = -2;
+ +    for (i = 0; i < n_solvent_parameters; i++)
+ +    {
+ +        if (bestsp == -2 ||
+ +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
+ +        {
+ +            bestsp = i;
+ +        }
+ +    }
+ +
+ +    if (bestsp >= 0)
+ +    {
+ +        bestsol = solvent_parameters[bestsp].model;
+ +    }
+ +    else
+ +    {
+ +        bestsol = esolNO;
+ +    }
+ +
+ +#ifdef DISABLE_WATER_NLIST
+ +    bestsol = esolNO;
+ +#endif
+ +
+ +    fr->nWatMol = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        cgs  = &mtop->moltype[mtop->molblock[mb].type].cgs;
+ +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
+ +        for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
+ +        {
+ +            if (cg_sp[mb][i] == bestsp)
+ +            {
+ +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
+ +                fr->nWatMol += nmol;
+ +            }
+ +            else
+ +            {
+ +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
+ +            }
+ +        }
+ +        sfree(cg_sp[mb]);
+ +    }
+ +    sfree(cg_sp);
+ +
+ +    if (bestsol != esolNO && fp != NULL)
+ +    {
+ +        fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
+ +                esol_names[bestsol],
+ +                solvent_parameters[bestsp].count);
+ +    }
+ +
+ +    sfree(solvent_parameters);
+ +    fr->solvent_opt = bestsol;
+ +}
+ +
+ +/* Per-atom constraint classification used by init_cginfo_mb():
+ + * not constrained, part of a regular constraint, or part of a SETTLE.
+ + */
+ +enum {
+ +    acNONE = 0, acCONSTRAINT, acSETTLE
+ +};
+ +
+ +/* Build the charge group info (cginfo) arrays for every molecule block.
+ + *
+ + * For each molecule block an entry of the returned array (allocated with
+ + * snew(), mtop->nmolblock entries) is filled with the charge group range
+ + * cg_start/cg_end, the period cg_mod, and a cginfo array of cg_mod entries.
+ + * cg_mod is one molecule's worth of charge groups when energy group and QMMM
+ + * group assignment are identical for all molecules in the block, otherwise
+ + * it covers all molecules.
+ + *
+ + * Each cginfo entry gets the energy group id, constraint/SETTLE flags,
+ + * intra/inter charge group exclusion flags, VdW/charge presence flags and
+ + * the charge group size. Afterwards check_solvent() assigns the solvent
+ + * optimization type, which is cleared again when the environment variable
+ + * GMX_NO_SOLV_OPT is set or bNoSolvOpt is true.
+ + *
+ + * *bExcl_IntraCGAll_InterCGNone is set to TRUE only if every processed
+ + * charge group excludes all its intra-group pairs and has no inter-group
+ + * exclusions.
+ + *
+ + * Calls gmx_fatal() if a charge group exceeds MAX_CHARGEGROUP_SIZE atoms.
+ + */
+ +static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
+ +                                   t_forcerec *fr, gmx_bool bNoSolvOpt,
+ +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
+ +{
+ +    const t_block        *cgs;
+ +    const t_blocka       *excl;
+ +    const gmx_moltype_t  *molt;
+ +    const gmx_molblock_t *molb;
+ +    cginfo_mb_t          *cginfo_mb;
+ +    gmx_bool             *type_VDW;
+ +    int                  *cginfo;
+ +    int                   cg_offset, a_offset, cgm, am;
+ +    int                   mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
+ +    int                  *a_con;
+ +    int                   ftype;
+ +    int                   ia;
+ +    gmx_bool              bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
+ +
+ +    /* NOTE(review): ncg_tot is assigned here but not read again in this function */
+ +    ncg_tot = ncg_mtop(mtop);
+ +    snew(cginfo_mb, mtop->nmolblock);
+ +
+ +    /* type_VDW[t] is TRUE when atom type t has a non-zero C6 or C12 with any
+ +     * type; with Buckingham (fr->bBHAM) every type is marked TRUE.
+ +     */
+ +    snew(type_VDW, fr->ntype);
+ +    for (ai = 0; ai < fr->ntype; ai++)
+ +    {
+ +        type_VDW[ai] = FALSE;
+ +        for (j = 0; j < fr->ntype; j++)
+ +        {
+ +            type_VDW[ai] = type_VDW[ai] ||
+ +                fr->bBHAM ||
+ +                C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
+ +                C12(fr->nbfp, fr->ntype, ai, j) != 0;
+ +        }
+ +    }
+ +
+ +    *bExcl_IntraCGAll_InterCGNone = TRUE;
+ +
+ +    /* bExcl is a scratch buffer, grown on demand to the largest charge group */
+ +    excl_nalloc = 10;
+ +    snew(bExcl, excl_nalloc);
+ +    cg_offset = 0;
+ +    a_offset  = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        molb = &mtop->molblock[mb];
+ +        molt = &mtop->moltype[molb->type];
+ +        cgs  = &molt->cgs;
+ +        excl = &molt->excls;
+ +
+ +        /* Check if the cginfo is identical for all molecules in this block.
+ +         * If so, we only need an array of the size of one molecule.
+ +         * Otherwise we make an array of #mol times #cgs per molecule.
+ +         */
+ +        bId = TRUE;
+ +        am  = 0;
+ +        for (m = 0; m < molb->nmol; m++)
+ +        {
+ +            am = m*cgs->index[cgs->nr];
+ +            for (cg = 0; cg < cgs->nr; cg++)
+ +            {
+ +                a0 = cgs->index[cg];
+ +                a1 = cgs->index[cg+1];
+ +                if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
+ +                    ggrpnr(&mtop->groups, egcENER, a_offset   +a0))
+ +                {
+ +                    bId = FALSE;
+ +                }
+ +                if (mtop->groups.grpnr[egcQMMM] != NULL)
+ +                {
+ +                    for (ai = a0; ai < a1; ai++)
+ +                    {
+ +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
+ +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
+ +                        {
+ +                            bId = FALSE;
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        cginfo_mb[mb].cg_start = cg_offset;
+ +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
+ +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr;
+ +        snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
+ +        cginfo = cginfo_mb[mb].cginfo;
+ +
+ +        /* Set constraints flags for constrained atoms */
+ +        snew(a_con, molt->atoms.nr);
+ +        for (ftype = 0; ftype < F_NRE; ftype++)
+ +        {
+ +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
+ +            {
+ +                int nral;
+ +
+ +                nral = NRAL(ftype);
+ +                /* Each ilist entry is 1 + nral ints; the atom indices
+ +                 * are read from offsets ia+1 .. ia+nral.
+ +                 */
+ +                for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
+ +                {
+ +                    int a;
+ +
+ +                    for (a = 0; a < nral; a++)
+ +                    {
+ +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
+ +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        for (m = 0; m < (bId ? 1 : molb->nmol); m++)
+ +        {
+ +            cgm = m*cgs->nr;
+ +            am  = m*cgs->index[cgs->nr];
+ +            for (cg = 0; cg < cgs->nr; cg++)
+ +            {
+ +                a0 = cgs->index[cg];
+ +                a1 = cgs->index[cg+1];
+ +
+ +                /* Store the energy group in cginfo */
+ +                gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
+ +                SET_CGINFO_GID(cginfo[cgm+cg], gid);
+ +
+ +                /* Check the intra/inter charge group exclusions */
+ +                if (a1-a0 > excl_nalloc)
+ +                {
+ +                    excl_nalloc = a1 - a0;
+ +                    srenew(bExcl, excl_nalloc);
+ +                }
+ +                /* bExclIntraAll: all intra cg interactions excluded
+ +                 * bExclInter:    any inter cg interactions excluded
+ +                 */
+ +                bExclIntraAll = TRUE;
+ +                bExclInter    = FALSE;
+ +                bHaveVDW      = FALSE;
+ +                bHaveQ        = FALSE;
+ +                for (ai = a0; ai < a1; ai++)
+ +                {
+ +                    /* Check VDW and electrostatic interactions */
+ +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
+ +                                            type_VDW[molt->atoms.atom[ai].typeB]);
+ +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
+ +                                            molt->atoms.atom[ai].qB != 0);
+ +
+ +                    /* Clear the exclusion list for atom ai */
+ +                    for (aj = a0; aj < a1; aj++)
+ +                    {
+ +                        bExcl[aj-a0] = FALSE;
+ +                    }
+ +                    /* Loop over all the exclusions of atom ai */
+ +                    for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
+ +                    {
+ +                        aj = excl->a[j];
+ +                        if (aj < a0 || aj >= a1)
+ +                        {
+ +                            bExclInter = TRUE;
+ +                        }
+ +                        else
+ +                        {
+ +                            bExcl[aj-a0] = TRUE;
+ +                        }
+ +                    }
+ +                    /* Check if ai excludes a0 to a1 */
+ +                    for (aj = a0; aj < a1; aj++)
+ +                    {
+ +                        if (!bExcl[aj-a0])
+ +                        {
+ +                            bExclIntraAll = FALSE;
+ +                        }
+ +                    }
+ +
+ +                    switch (a_con[ai])
+ +                    {
+ +                        case acCONSTRAINT:
+ +                            SET_CGINFO_CONSTR(cginfo[cgm+cg]);
+ +                            break;
+ +                        case acSETTLE:
+ +                            SET_CGINFO_SETTLE(cginfo[cgm+cg]);
+ +                            break;
+ +                        default:
+ +                            break;
+ +                    }
+ +                }
+ +                if (bExclIntraAll)
+ +                {
+ +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
+ +                }
+ +                if (bExclInter)
+ +                {
+ +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
+ +                }
+ +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
+ +                {
+ +                    /* The size in cginfo is currently only read with DD */
+ +                    gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
+ +                }
+ +                if (bHaveVDW)
+ +                {
+ +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
+ +                }
+ +                if (bHaveQ)
+ +                {
+ +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
+ +                }
+ +                /* Store the charge group size */
+ +                SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
+ +
+ +                if (!bExclIntraAll || bExclInter)
+ +                {
+ +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
+ +                }
+ +            }
+ +        }
+ +
+ +        sfree(a_con);
+ +
+ +        /* Advance the global charge group and atom offsets past this block */
+ +        cg_offset += molb->nmol*cgs->nr;
+ +        a_offset  += molb->nmol*cgs->index[cgs->nr];
+ +    }
+ +    sfree(bExcl);
+ +
+ +    /* the solvent optimizer is called after the QM is initialized,
+ +     * because we don't want to have the QM subsystem to become an
+ +     * optimized solvent
+ +     */
+ +
+ +    check_solvent(fplog, mtop, fr, cginfo_mb);
+ +
+ +    if (getenv("GMX_NO_SOLV_OPT"))
+ +    {
+ +        if (fplog)
+ +        {
+ +            fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
+ +                    "Disabling all solvent optimization\n");
+ +        }
+ +        fr->solvent_opt = esolNO;
+ +    }
+ +    if (bNoSolvOpt)
+ +    {
+ +        fr->solvent_opt = esolNO;
+ +    }
+ +    /* With solvent optimization off, clear the per charge group type too */
+ +    if (!fr->solvent_opt)
+ +    {
+ +        for (mb = 0; mb < mtop->nmolblock; mb++)
+ +        {
+ +            for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
+ +            {
+ +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
+ +            }
+ +        }
+ +    }
+ +
+ +    return cginfo_mb;
+ +}
+ +
+ +/* Expand the per-molblock, periodic cginfo storage into one flat array with
+ + * an entry for every charge group, i.e. cgi_mb[nmb-1].cg_end entries.
+ + * Entry cg is taken from the block whose range contains cg, at position
+ + * (cg - cg_start) modulo cg_mod. The array is allocated with snew() and
+ + * returned to the caller.
+ + */
+ +static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
+ +{
+ +    const int    total = cgi_mb[nmb-1].cg_end;
+ +    cginfo_mb_t *block = cgi_mb;
+ +    int         *expanded;
+ +    int          icg;
+ +
+ +    snew(expanded, total);
+ +    for (icg = 0; icg < total; icg++)
+ +    {
+ +        /* Move on to the block that contains charge group icg */
+ +        while (icg >= block->cg_end)
+ +        {
+ +            block++;
+ +        }
+ +        expanded[icg] = block->cginfo[(icg - block->cg_start) % block->cg_mod];
+ +    }
+ +
+ +    return expanded;
+ +}
+ +
+ +/* Compute the total charge and the total squared charge of the system and
+ + * store them in fr->qsum[] and fr->q2sum[].
+ + * Index 0 uses the state A charges (q). Index 1 uses the state B charges
+ + * (qB) when fr->efep != efepNO and is a copy of index 0 otherwise.
+ + * When log is non-NULL the total charge(s) are printed to it.
+ + */
+ +static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
+ +{
+ +    double         qsum, q2sum, q;
+ +    int            mb, nmol, i;
+ +    const t_atoms *atoms;
+ +
+ +    qsum  = 0;
+ +    q2sum = 0;
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        nmol  = mtop->molblock[mb].nmol;
+ +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +        for (i = 0; i < atoms->nr; i++)
+ +        {
+ +            q      = atoms->atom[i].q;
+ +            qsum  += nmol*q;
+ +            q2sum += nmol*q*q;
+ +        }
+ +    }
+ +    fr->qsum[0]  = qsum;
+ +    fr->q2sum[0] = q2sum;
+ +    if (fr->efep != efepNO)
+ +    {
+ +        qsum  = 0;
+ +        q2sum = 0;
+ +        for (mb = 0; mb < mtop->nmolblock; mb++)
+ +        {
+ +            nmol  = mtop->molblock[mb].nmol;
+ +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +            for (i = 0; i < atoms->nr; i++)
+ +            {
+ +                q      = atoms->atom[i].qB;
+ +                qsum  += nmol*q;
+ +                q2sum += nmol*q*q;
+ +            }
+ +        }
+ +        /* Store once after the loop, mirroring state A, so the state B
+ +         * sums are also set when there are no molecule blocks.
+ +         */
+ +        fr->qsum[1]  = qsum;
+ +        fr->q2sum[1] = q2sum;
+ +    }
+ +    else
+ +    {
+ +        fr->qsum[1]  = fr->qsum[0];
+ +        fr->q2sum[1] = fr->q2sum[0];
+ +    }
+ +    if (log)
+ +    {
+ +        if (fr->efep == efepNO)
+ +        {
+ +            fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
+ +        }
+ +        else
+ +        {
+ +            fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
+ +                    fr->qsum[0], fr->qsum[1]);
+ +        }
+ +    }
+ +}
+ +
+ +/* Recompute the reaction-field parameters (fr->kappa, fr->k_rf, fr->c_rf)
+ + * for the given box when fr->eeltype is eelGRF; otherwise do nothing.
+ + * The log argument is not used.
+ + */
+ +void update_forcerec(FILE *log, t_forcerec *fr, matrix box)
+ +{
+ +    if (fr->eeltype != eelGRF)
+ +    {
+ +        return;
+ +    }
+ +
+ +    calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+ +               fr->rcoulomb, fr->temp, fr->zsquare, box,
+ +               &fr->kappa, &fr->k_rf, &fr->c_rf);
+ +}
+ +
+ +void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
+ +{
+ +    const t_atoms  *atoms, *atoms_tpi;
+ +    const t_blocka *excl;
+ +    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
+ +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
+ +    long long int   npair, npair_ij, tmpi, tmpj;
+ +#else
+ +    double          npair, npair_ij, tmpi, tmpj;
+ +#endif
+ +    double          csix, ctwelve;
+ +    int             ntp, *typecount;
+ +    gmx_bool        bBHAM;
+ +    real           *nbfp;
+ +
+ +    ntp   = fr->ntype;
+ +    bBHAM = fr->bBHAM;
+ +    nbfp  = fr->nbfp;
+ +
+ +    for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
+ +    {
+ +        csix    = 0;
+ +        ctwelve = 0;
+ +        npair   = 0;
+ +        nexcl   = 0;
+ +        if (!fr->n_tpi)
+ +        {
+ +            /* Count the types so we avoid natoms^2 operations */
+ +            snew(typecount, ntp);
+ +            for (mb = 0; mb < mtop->nmolblock; mb++)
+ +            {
+ +                nmol  = mtop->molblock[mb].nmol;
+ +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +                for (i = 0; i < atoms->nr; i++)
+ +                {
+ +                    if (q == 0)
+ +                    {
+ +                        tpi = atoms->atom[i].type;
+ +                    }
+ +                    else
+ +                    {
+ +                        tpi = atoms->atom[i].typeB;
+ +                    }
+ +                    typecount[tpi] += nmol;
+ +                }
+ +            }
+ +            for (tpi = 0; tpi < ntp; tpi++)
+ +            {
+ +                for (tpj = tpi; tpj < ntp; tpj++)
+ +                {
+ +                    tmpi = typecount[tpi];
+ +                    tmpj = typecount[tpj];
+ +                    if (tpi != tpj)
+ +                    {
+ +                        npair_ij = tmpi*tmpj;
+ +                    }
+ +                    else
+ +                    {
+ +                        npair_ij = tmpi*(tmpi - 1)/2;
+ +                    }
+ +                    if (bBHAM)
+ +                    {
+ +                        /* nbfp now includes the 6.0 derivative prefactor */
+ +                        csix    += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ +                    }
+ +                    else
+ +                    {
+ +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                        csix    += npair_ij*   C6(nbfp, ntp, tpi, tpj)/6.0;
+ +                        ctwelve += npair_ij*  C12(nbfp, ntp, tpi, tpj)/12.0;
+ +                    }
+ +                    npair += npair_ij;
+ +                }
+ +            }
+ +            sfree(typecount);
+ +            /* Subtract the excluded pairs.
+ +             * The main reason for substracting exclusions is that in some cases
+ +             * some combinations might never occur and the parameters could have
+ +             * any value. These unused values should not influence the dispersion
+ +             * correction.
+ +             */
+ +            for (mb = 0; mb < mtop->nmolblock; mb++)
+ +            {
+ +                nmol  = mtop->molblock[mb].nmol;
+ +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
+ +                for (i = 0; (i < atoms->nr); i++)
+ +                {
+ +                    if (q == 0)
+ +                    {
+ +                        tpi = atoms->atom[i].type;
+ +                    }
+ +                    else
+ +                    {
+ +                        tpi = atoms->atom[i].typeB;
+ +                    }
+ +                    j1  = excl->index[i];
+ +                    j2  = excl->index[i+1];
+ +                    for (j = j1; j < j2; j++)
+ +                    {
+ +                        k = excl->a[j];
+ +                        if (k > i)
+ +                        {
+ +                            if (q == 0)
+ +                            {
+ +                                tpj = atoms->atom[k].type;
+ +                            }
+ +                            else
+ +                            {
+ +                                tpj = atoms->atom[k].typeB;
+ +                            }
+ +                            if (bBHAM)
+ +                            {
+ +                                /* nbfp now includes the 6.0 derivative prefactor */
+ +                                csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ +                            }
+ +                            else
+ +                            {
+ +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                                csix    -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
+ +                                ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
+ +                            }
+ +                            nexcl += nmol;
+ +                        }
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            /* Only correct for the interaction of the test particle
+ +             * with the rest of the system.
+ +             */
+ +            atoms_tpi =
+ +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
+ +
+ +            npair = 0;
+ +            for (mb = 0; mb < mtop->nmolblock; mb++)
+ +            {
+ +                nmol  = mtop->molblock[mb].nmol;
+ +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+ +                for (j = 0; j < atoms->nr; j++)
+ +                {
+ +                    nmolc = nmol;
+ +                    /* Remove the interaction of the test charge group
+ +                     * with itself.
+ +                     */
+ +                    if (mb == mtop->nmolblock-1)
+ +                    {
+ +                        nmolc--;
+ +
+ +                        if (mb == 0 && nmol == 1)
+ +                        {
+ +                            gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
+ +                        }
+ +                    }
+ +                    if (q == 0)
+ +                    {
+ +                        tpj = atoms->atom[j].type;
+ +                    }
+ +                    else
+ +                    {
+ +                        tpj = atoms->atom[j].typeB;
+ +                    }
+ +                    for (i = 0; i < fr->n_tpi; i++)
+ +                    {
+ +                        if (q == 0)
+ +                        {
+ +                            tpi = atoms_tpi->atom[i].type;
+ +                        }
+ +                        else
+ +                        {
+ +                            tpi = atoms_tpi->atom[i].typeB;
+ +                        }
+ +                        if (bBHAM)
+ +                        {
+ +                            /* nbfp now includes the 6.0 derivative prefactor */
+ +                            csix    += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+ +                        }
+ +                        else
+ +                        {
+ +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
+ +                            csix    += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
+ +                            ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
+ +                        }
+ +                        npair += nmolc;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        if (npair - nexcl <= 0 && fplog)
+ +        {
+ +            fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
+ +            csix     = 0;
+ +            ctwelve  = 0;
+ +        }
+ +        else
+ +        {
+ +            csix    /= npair - nexcl;
+ +            ctwelve /= npair - nexcl;
+ +        }
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Counted %d exclusions\n", nexcl);
+ +            fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
+ +            fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
+ +        }
+ +        fr->avcsix[q]    = csix;
+ +        fr->avctwelve[q] = ctwelve;
+ +    }
+ +    if (fplog != NULL)
+ +    {
+ +        if (fr->eDispCorr == edispcAllEner ||
+ +            fr->eDispCorr == edispcAllEnerPres)
+ +        {
+ +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+ +                    fr->avcsix[0], fr->avctwelve[0]);
+ +        }
+ +        else
+ +        {
+ +            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Scan every pair of atoms taken from the molecule types in mtop, look up
+ + * the Buckingham b parameter for their atom types and store the largest
+ + * value in fr->bham_b_max. The smallest value is tracked only for logging.
+ + * Aborts through gmx_fatal when an atom type is not below fr->ntype.
+ + */
+ +static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
+ +                           const gmx_mtop_t *mtop)
+ +{
+ +    const t_atoms *atoms_a, *atoms_b;
+ +    int            mol_a, mol_b, ia, ib, type_a, type_b, ntypes;
+ +    real           b, b_lowest;
+ +    real          *nbfp;
+ +
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
+ +    }
+ +    ntypes = fr->ntype;
+ +    nbfp   = fr->nbfp;
+ +
+ +    /* -1 marks "no value seen yet" for the minimum */
+ +    b_lowest       = -1;
+ +    fr->bham_b_max = 0;
+ +    for (mol_a = 0; mol_a < mtop->nmoltype; mol_a++)
+ +    {
+ +        atoms_a = &mtop->moltype[mol_a].atoms;
+ +        for (ia = 0; ia < atoms_a->nr; ia++)
+ +        {
+ +            type_a = atoms_a->atom[ia].type;
+ +            if (type_a >= ntypes)
+ +            {
+ +                gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", ia, type_a, ntypes);
+ +            }
+ +
+ +            /* Start at mol_a: earlier molecule types were already paired */
+ +            for (mol_b = mol_a; mol_b < mtop->nmoltype; mol_b++)
+ +            {
+ +                atoms_b = &mtop->moltype[mol_b].atoms;
+ +                for (ib = 0; ib < atoms_b->nr; ib++)
+ +                {
+ +                    type_b = atoms_b->atom[ib].type;
+ +                    if (type_b >= ntypes)
+ +                    {
+ +                        gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", ib, type_b, ntypes);
+ +                    }
+ +                    b = BHAMB(nbfp, ntypes, type_a, type_b);
+ +                    if (fr->bham_b_max < b)
+ +                    {
+ +                        fr->bham_b_max = b;
+ +                    }
+ +                    if (b_lowest == -1 || b < b_lowest)
+ +                    {
+ +                        b_lowest = b;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
+ +                b_lowest, fr->bham_b_max);
+ +    }
+ +}
+ +
+ +/* Build the non-bonded tables of nbl from the table file tabfn.
+ + *
+ + * The combined table is created with make_tables() and stored in
+ + * nbl->table_elec_vdw; its contents are then split into a separate
+ + * electrostatics table (nbl->table_elec) and a separate Van der Waals
+ + * table (nbl->table_vdw).
+ + * When both eg1 and eg2 are given, the extension of tabfn is replaced by
+ + * "_<eg1>_<eg2>.<ext>" to form the file name that is read.
+ + * When tabfn is NULL nothing is built (a note goes to the debug file).
+ + */
+ +static void make_nbf_tables(FILE *fp, const output_env_t oenv,
+ +                            t_forcerec *fr, real rtab,
+ +                            const t_commrec *cr,
+ +                            const char *tabfn, char *eg1, char *eg2,
+ +                            t_nblists *nbl)
+ +{
+ +    char buf[STRLEN];
+ +    int  i, j;
+ +
+ +    if (tabfn == NULL)
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
+ +        }
+ +        return;
+ +    }
+ +
+ +    /* NOTE(review): buf is STRLEN bytes and the writes below are unbounded;
+ +     * this relies on tabfn plus the group names fitting - confirm at callers.
+ +     */
+ +    sprintf(buf, "%s", tabfn);
+ +    if (eg1 && eg2)
+ +    {
+ +        /* Append the two energy group names, overwriting ".<ext>" */
+ +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
+ +                eg1, eg2, ftp2ext(efXVG));
+ +    }
+ +    nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
+ +    /* Copy the contents of the table to separate coulomb and LJ tables too,
+ +     * to improve cache performance.
+ +     */
+ +    /* For performance reasons we want the table data to be aligned
+ +     * (an alignment of 32 is requested from snew_aligned below).
+ +     * The pointers could be freed but currently aren't.
+ +     */
+ +    nbl->table_elec.interaction   = GMX_TABLE_INTERACTION_ELEC;
+ +    nbl->table_elec.format        = nbl->table_elec_vdw.format;
+ +    nbl->table_elec.r             = nbl->table_elec_vdw.r;
+ +    nbl->table_elec.n             = nbl->table_elec_vdw.n;
+ +    nbl->table_elec.scale         = nbl->table_elec_vdw.scale;
+ +    nbl->table_elec.scale_exp     = nbl->table_elec_vdw.scale_exp;
+ +    nbl->table_elec.formatsize    = nbl->table_elec_vdw.formatsize;
+ +    nbl->table_elec.ninteractions = 1;
+ +    nbl->table_elec.stride        = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
+ +    snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
+ +
+ +    nbl->table_vdw.interaction   = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
+ +    nbl->table_vdw.format        = nbl->table_elec_vdw.format;
+ +    nbl->table_vdw.r             = nbl->table_elec_vdw.r;
+ +    nbl->table_vdw.n             = nbl->table_elec_vdw.n;
+ +    nbl->table_vdw.scale         = nbl->table_elec_vdw.scale;
+ +    nbl->table_vdw.scale_exp     = nbl->table_elec_vdw.scale_exp;
+ +    nbl->table_vdw.formatsize    = nbl->table_elec_vdw.formatsize;
+ +    nbl->table_vdw.ninteractions = 2;
+ +    nbl->table_vdw.stride        = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
+ +    snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
+ +
+ +    /* Per table point the combined table holds 12 values: the first 4 go
+ +     * to the electrostatics table, the remaining 8 to the VdW table.
+ +     * NOTE(review): the literal 4/8/12 only match the strides computed
+ +     * above when formatsize is 4 - confirm.
+ +     */
+ +    for (i = 0; i <= nbl->table_elec_vdw.n; i++)
+ +    {
+ +        for (j = 0; j < 4; j++)
+ +        {
+ +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
+ +        }
+ +        for (j = 0; j < 8; j++)
+ +        {
+ +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
+ +        }
+ +    }
+ +}
+ +
+ +/* Count, per bonded table number, how many interactions of function type
+ + * ftype1 or ftype2 reference that table, over all molecule types in mtop.
+ + *
+ + * *count is grown with srenew (new entries zeroed) whenever a table number
+ + * beyond the current *ncount is found; *ncount is updated to match.
+ + * A negative table number is a fatal error.
+ + */
+ +static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
+ +                         int *ncount, int **count)
+ +{
+ +    int mt, ftype;
+ +
+ +    for (mt = 0; mt < mtop->nmoltype; mt++)
+ +    {
+ +        const gmx_moltype_t *molt = &mtop->moltype[mt];
+ +
+ +        for (ftype = 0; ftype < F_NRE; ftype++)
+ +        {
+ +            const t_ilist *il;
+ +            int            stride, i;
+ +
+ +            if (ftype != ftype1 && ftype != ftype2)
+ +            {
+ +                continue;
+ +            }
+ +
+ +            il     = &molt->ilist[ftype];
+ +            /* Each entry is one parameter index followed by NRAL atoms */
+ +            stride = 1 + NRAL(ftype);
+ +            for (i = 0; i < il->nr; i += stride)
+ +            {
+ +                int tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
+ +
+ +                if (tabnr < 0)
+ +                {
+ +                    gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
+ +                }
+ +                if (tabnr >= *ncount)
+ +                {
+ +                    int j;
+ +
+ +                    srenew(*count, tabnr+1);
+ +                    for (j = *ncount; j <= tabnr; j++)
+ +                    {
+ +                        (*count)[j] = 0;
+ +                    }
+ +                    *ncount = tabnr+1;
+ +                }
+ +                (*count)[tabnr]++;
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Read the bonded tables referenced by interactions of type ftype1 or
+ + * ftype2. For every table number i that is actually used, the file name is
+ + * formed by replacing the extension of basefn with "_<tabext><i>.<ext>".
+ + *
+ + * Returns an array indexed by table number (entries for unused numbers are
+ + * left as allocated by snew), or NULL when no table is referenced.
+ + */
+ +static bondedtable_t *make_bonded_tables(FILE *fplog,
+ +                                         int ftype1, int ftype2,
+ +                                         const gmx_mtop_t *mtop,
+ +                                         const char *basefn, const char *tabext)
+ +{
+ +    bondedtable_t *tab    = NULL;
+ +    int           *count  = NULL;
+ +    int            ncount = 0;
+ +    int            i;
+ +    char           tabfn[STRLEN];
+ +
+ +    count_tables(ftype1, ftype2, mtop, &ncount, &count);
+ +
+ +    if (ncount <= 0)
+ +    {
+ +        return tab;
+ +    }
+ +
+ +    snew(tab, ncount);
+ +    for (i = 0; i < ncount; i++)
+ +    {
+ +        if (count[i] <= 0)
+ +        {
+ +            continue;
+ +        }
+ +        sprintf(tabfn, "%s", basefn);
+ +        /* Overwrite ".<ext>" at the end of the base name */
+ +        sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
+ +                tabext, i, ftp2ext(efXVG));
+ +        tab[i] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
+ +    }
+ +    sfree(count);
+ +
+ +    return tab;
+ +}
+ +
+ +/* Store the charge-group and atom counts in fr and grow the force buffers
+ + * that depend on them: fr->f_twin (only with fr->bTwinRange) and
+ + * fr->f_novirsum_alloc (only with fr->bF_NoVirSum).
+ + */
+ +void forcerec_set_ranges(t_forcerec *fr,
+ +                         int ncg_home, int ncg_force,
+ +                         int natoms_force,
+ +                         int natoms_force_constr, int natoms_f_novirsum)
+ +{
+ +    fr->cg0 = 0;
+ +    fr->hcg = ncg_home;
+ +
+ +    /* fr->ncg_force is unused in the standard code,
+ +     * but it can be useful for modified code dealing with charge groups.
+ +     */
+ +    fr->ncg_force           = ncg_force;
+ +    fr->natoms_force        = natoms_force;
+ +    fr->natoms_force_constr = natoms_force_constr;
+ +
+ +    if (fr->nalloc_force < fr->natoms_force_constr)
+ +    {
+ +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
+ +
+ +        if (fr->bTwinRange)
+ +        {
+ +            srenew(fr->f_twin, fr->nalloc_force);
+ +        }
+ +    }
+ +
+ +    if (!fr->bF_NoVirSum)
+ +    {
+ +        fr->f_novirsum_n = 0;
+ +        return;
+ +    }
+ +
+ +    fr->f_novirsum_n = natoms_f_novirsum;
+ +    if (fr->f_novirsum_nalloc < fr->f_novirsum_n)
+ +    {
+ +        fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
+ +        srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
+ +    }
+ +}
+ +
+ +/* Map a cut-off of exactly 0 to GMX_CUTOFF_INF; any other value is
+ + * returned unchanged.
+ + */
+ +static real cutoff_inf(real cutoff)
+ +{
+ +    if (cutoff != 0)
+ +    {
+ +        return cutoff;
+ +    }
+ +
+ +    cutoff = GMX_CUTOFF_INF;
+ +    return cutoff;
+ +}
+ +
+ +/* Allocate fr->atf_tabs and fill one entry per thermoforce group in
+ + * ir->adress via make_atf_table().
+ + *
+ + * The file name for each group is formed by overwriting the extension of
+ + * tabfn with "tf_<energy group name>.<ext>". A NULL tabfn is a fatal error.
+ + */
+ +static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
+ +                                  t_forcerec *fr, const t_inputrec *ir,
+ +                                  const char *tabfn, const gmx_mtop_t *mtop,
+ +                                  matrix     box)
+ +{
+ +    char buf[STRLEN];
+ +    int  grp, egrp;
+ +
+ +    if (tabfn == NULL)
+ +    {
+ +        gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
+ +        return;
+ +    }
+ +
+ +    snew(fr->atf_tabs, ir->adress->n_tf_grps);
+ +
+ +    /* The base name is written once; only the suffix changes per group */
+ +    sprintf(buf, "%s", tabfn);
+ +    for (grp = 0; grp < ir->adress->n_tf_grps; grp++)
+ +    {
+ +        /* Energy group index belonging to this thermoforce group */
+ +        egrp = ir->adress->tf_table_index[grp];
+ +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
+ +                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[egrp]]), ftp2ext(efXVG));
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[grp], buf);
+ +        }
+ +        fr->atf_tabs[grp] = make_atf_table(fp, oenv, fr, buf, box);
+ +    }
+ +
+ +}
+ +
+ +/* Decide whether the all-vs-all kernels can be used for this input.
+ + *
+ + * Requires: all of rlist, rcoulomb and rvdw equal to 0, no PBC, plain
+ + * cut-off VdW and Coulomb, no free-energy perturbation, either no implicit
+ + * solvent or GBSA with the Still, HCT or OBC algorithm, and the environment
+ + * variable GMX_NO_ALLVSALL not being set. More than one energy monitor
+ + * group disables it as well; with bPrintNote a note about that is written
+ + * to stderr (master only) and to fp (when non-NULL).
+ + */
+ +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
+ +                          gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
+ +{
+ +    gmx_bool bNoCutoffs, bPlainInteractions, bSolventOk, bAllvsAll;
+ +
+ +    bNoCutoffs = (ir->rlist == 0 &&
+ +                  ir->rcoulomb == 0 &&
+ +                  ir->rvdw == 0);
+ +
+ +    bPlainInteractions = (ir->ePBC == epbcNONE &&
+ +                          ir->vdwtype == evdwCUT &&
+ +                          ir->coulombtype == eelCUT &&
+ +                          ir->efep == efepNO);
+ +
+ +    bSolventOk = (ir->implicit_solvent == eisNO ||
+ +                  (ir->implicit_solvent == eisGBSA &&
+ +                   (ir->gb_algorithm == egbSTILL ||
+ +                    ir->gb_algorithm == egbHCT ||
+ +                    ir->gb_algorithm == egbOBC)));
+ +
+ +    bAllvsAll = (bNoCutoffs &&
+ +                 bPlainInteractions &&
+ +                 bSolventOk &&
+ +                 getenv("GMX_NO_ALLVSALL") == NULL);
+ +
+ +    if (!bAllvsAll)
+ +    {
+ +        return bAllvsAll;
+ +    }
+ +
+ +    if (ir->opts.ngener > 1)
+ +    {
+ +        const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
+ +
+ +        if (bPrintNote)
+ +        {
+ +            if (MASTER(cr))
+ +            {
+ +                fprintf(stderr, "\n%s\n", note);
+ +            }
+ +            if (fp != NULL)
+ +            {
+ +                fprintf(fp, "\n%s\n", note);
+ +            }
+ +        }
+ +        return FALSE;
+ +    }
+ +
+ +    if (fp && MASTER(cr))
+ +    {
+ +        fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+ +    }
+ +
+ +    return bAllvsAll;
+ +}
+ +
+ +
+ +/* Set fr->nthreads from gmx_omp_nthreads_get(emntBonded) and, when more
+ + * than one thread is used, allocate the per-thread force/energy buffers in
+ + * fr->f_t. Entry 0 is allocated but not initialised here, since thread 0
+ + * works on the global arrays.
+ + */
+ +static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
+ +{
+ +    int thread, ener;
+ +
+ +    /* These thread local data structures are used for bondeds only */
+ +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+ +
+ +    if (fr->nthreads <= 1)
+ +    {
+ +        return;
+ +    }
+ +
+ +    snew(fr->f_t, fr->nthreads);
+ +    /* Thread 0 uses the global force and energy arrays */
+ +    for (thread = 1; thread < fr->nthreads; thread++)
+ +    {
+ +        fr->f_t[thread].f        = NULL;
+ +        fr->f_t[thread].f_nalloc = 0;
+ +        snew(fr->f_t[thread].fshift, SHIFTS);
+ +        /* One energy entry per ordered pair of energy groups */
+ +        fr->f_t[thread].grpp.nener = nenergrp*nenergrp;
+ +        for (ener = 0; ener < egNR; ener++)
+ +        {
+ +            snew(fr->f_t[thread].grpp.ener[ener], fr->f_t[thread].grpp.nener);
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/* Choose the CPU nbnxn kernel type and the Ewald exclusion treatment.
+ + *
+ + * Without GMX_NBNXN_SIMD the result is always the plain-C 4x4 kernel with
+ + * tabulated Ewald exclusions. With GMX_NBNXN_SIMD the compile-time
+ + * defaults below apply and can be overridden at run time through the
+ + * environment variables GMX_NBNXN_SIMD_4XN, GMX_NBNXN_SIMD_2XNN,
+ + * GMX_NBNXN_EWALD_TABLE and GMX_NBNXN_EWALD_ANALYTICAL; in each pair the
+ + * variable checked last wins when both are set.
+ + * The fp, cr and cpuid_info arguments are not used in this function.
+ + */
+ +static void pick_nbnxn_kernel_cpu(FILE             *fp,
+ +                                  const t_commrec  *cr,
+ +                                  const gmx_cpuid_t cpuid_info,
+ +                                  const t_inputrec *ir,
+ +                                  int              *kernel_type,
+ +                                  int              *ewald_excl)
+ +{
+ +    *kernel_type = nbnxnk4x4_PlainC;
+ +    *ewald_excl  = ewaldexclTable;
+ +
+ +#ifdef GMX_NBNXN_SIMD
+ +    {
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +        *kernel_type = nbnxnk4xN_SIMD_4xN;
+ +#endif
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +        /* We expect the 2xNN kernels to be faster in most cases */
+ +        *kernel_type = nbnxnk4xN_SIMD_2xNN;
+ +#endif
+ +
+ +#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+ +        if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+ +        {
+ +            /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ +             * 10% with HT, 50% without HT, but extra zeros interactions
+ +             * can compensate. As we currently don't detect the actual use
+ +             * of HT, switch to 4x8 to avoid a potential performance hit.
+ +             */
+ +            *kernel_type = nbnxnk4xN_SIMD_4xN;
+ +        }
+ +#endif
+ +        /* Run-time override: fatal when the requested kernel was not built */
+ +        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
+ +        {
+ +#ifdef GMX_NBNXN_SIMD_4XN
+ +            *kernel_type = nbnxnk4xN_SIMD_4xN;
+ +#else
+ +            gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+ +#endif
+ +        }
+ +        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
+ +        {
+ +#ifdef GMX_NBNXN_SIMD_2XNN
+ +            *kernel_type = nbnxnk4xN_SIMD_2xNN;
+ +#else
+ +            gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
+ +#endif
+ +        }
+ +
+ +        /* Analytical Ewald exclusion correction is only an option in the
+ +         * x86 SIMD kernel. This is faster in single precision
+ +         * on Bulldozer and slightly faster on Sandy Bridge.
+ +         */
+ +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+ +        *ewald_excl = ewaldexclAnalytical;
+ +#endif
+ +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
+ +        {
+ +            *ewald_excl = ewaldexclTable;
+ +        }
+ +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
+ +        {
+ +            *ewald_excl = ewaldexclAnalytical;
+ +        }
+ +
+ +    }
+ +#endif /* GMX_NBNXN_SIMD */
+ +}
+ +
+ +
+ +/* Return a short human-readable name for an nbnxn kernel type.
+ + *
+ + * For the SIMD kernel types the name reflects the instruction set selected
+ + * at compile time ("SSE2", "SSE4.1", "AVX-128", "AVX-256", or "SIMD" on
+ + * non-x86 builds), or "not available" when built without GMX_NBNXN_SIMD.
+ + * nbnxnkNR and any unknown value are a fatal error.
+ + */
+ +const char *lookup_nbnxn_kernel_name(int kernel_type)
+ +{
+ +    const char *returnvalue = NULL;
+ +    switch (kernel_type)
+ +    {
+ +        case nbnxnkNotSet: returnvalue     = "not set"; break;
+ +        case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
+ +#ifndef GMX_NBNXN_SIMD
+ +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "not available"; break;
+ +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
+ +#else
+ +#ifdef GMX_X86_SSE2
+ +#if GMX_NBNXN_SIMD_BITWIDTH == 128
+ +            /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
+ +             * on compiler flags. As we use nearly identical intrinsics, using an AVX
+ +             * compiler flag without an AVX macro effectively results in AVX kernels.
+ +             * For gcc we check for __AVX__
+ +             * At least a check for icc should be added (if there is a macro)
+ +             */
+ +#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
+ +#ifndef GMX_X86_SSE4_1
+ +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE2"; break;
+ +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
+ +#else
+ +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE4.1"; break;
+ +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
+ +#endif
+ +#else
+ +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-128"; break;
+ +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
+ +#endif
+ +#endif
+ +#if GMX_NBNXN_SIMD_BITWIDTH == 256
+ +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-256"; break;
+ +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
+ +#endif
+ +#else   /* not GMX_X86_SSE2 */
+ +        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SIMD"; break;
+ +        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
+ +#endif  /* GMX_X86_SSE2 */
+ +#endif  /* GMX_NBNXN_SIMD */
+ +        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
+ +        case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
+ +
+ +        case nbnxnkNR:
+ +        default:
+ +            gmx_fatal(FARGS, "Illegal kernel type selected");
+ +            returnvalue = NULL;
+ +            break;
+ +    }
+ +    return returnvalue;
+ +}
+ +
+ +/* Select the nbnxn kernel type and the Ewald exclusion treatment.
+ + *
+ + * Priority: GPU emulation (plain-C 8x8x8), then a real GPU (CUDA), then the
+ + * CPU choice made by pick_nbnxn_kernel_cpu() when use_cpu_acceleration is
+ + * set, and finally the plain-C 4x4 kernel. Only the CPU picker may change
+ + * *ewald_excl away from ewaldexclTable.
+ + * With bDoNonbonded the choice is reported to fp (when non-NULL).
+ + */
+ +static void pick_nbnxn_kernel(FILE                *fp,
+ +                              const t_commrec     *cr,
+ +                              const gmx_hw_info_t *hwinfo,
+ +                              gmx_bool             use_cpu_acceleration,
+ +                              gmx_bool             bUseGPU,
+ +                              gmx_bool             bEmulateGPU,
+ +                              const t_inputrec    *ir,
+ +                              int                 *kernel_type,
+ +                              int                 *ewald_excl,
+ +                              gmx_bool             bDoNonbonded)
+ +{
+ +    assert(kernel_type);
+ +
+ +    *kernel_type = nbnxnkNotSet;
+ +    *ewald_excl  = ewaldexclTable;
+ +
+ +    if (bEmulateGPU)
+ +    {
+ +        *kernel_type = nbnxnk8x8x8_PlainC;
+ +
+ +        if (bDoNonbonded)
+ +        {
+ +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
+ +        }
+ +    }
+ +    else if (bUseGPU)
+ +    {
+ +        *kernel_type = nbnxnk8x8x8_CUDA;
+ +    }
+ +    else if (use_cpu_acceleration)
+ +    {
+ +        pick_nbnxn_kernel_cpu(fp, cr, hwinfo->cpuid_info, ir,
+ +                              kernel_type, ewald_excl);
+ +    }
+ +    else
+ +    {
+ +        *kernel_type = nbnxnk4x4_PlainC;
+ +    }
+ +
+ +    if (!bDoNonbonded || fp == NULL)
+ +    {
+ +        return;
+ +    }
+ +
+ +    fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
+ +            lookup_nbnxn_kernel_name(*kernel_type),
+ +            nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+ +            nbnxn_kernel_to_cj_size(*kernel_type));
+ +}
+ +
+ +/* Decide whether the non-bonded work runs on a real GPU or in GPU
+ + * emulation mode, and initialise the GPU in the former case.
+ + *
+ + * On return *bEmulateGPU is set when GMX_EMULATE_GPU is in the environment,
+ + * or when non-bondeds are switched off while a GPU could be used.
+ + * *bUseGPU is set only when hwinfo->bCanUseGPU holds, emulation is off and
+ + * init_gpu() succeeded; a failing init_gpu() is a fatal error.
+ + * The fp argument is not used in this function.
+ + */
+ +static void pick_nbnxn_resources(FILE                *fp,
+ +                                 const t_commrec     *cr,
+ +                                 const gmx_hw_info_t *hwinfo,
+ +                                 gmx_bool             bDoNonbonded,
+ +                                 gmx_bool            *bUseGPU,
+ +                                 gmx_bool            *bEmulateGPU)
+ +{
+ +    gmx_bool bEmulateGPUEnvVarSet;
+ +    char     gpu_err_str[STRLEN];
+ +
+ +    *bUseGPU = FALSE;
+ +
+ +    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
+ +
+ +    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
+ +     * GPUs (currently) only handle non-bonded calculations, we will
+ +     * automatically switch to emulation if non-bonded calculations are
+ +     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
+ +     * way to turn off GPU initialization, data movement, and cleanup.
+ +     *
+ +     * GPU emulation can be useful to assess the performance one can expect by
+ +     * adding GPU(s) to the machine. The conditional below allows this even
+ +     * if mdrun is compiled without GPU acceleration support.
+ +     * Note that you should freeze the system as otherwise it will explode.
+ +     */
+ +    *bEmulateGPU = (bEmulateGPUEnvVarSet ||
+ +                    (!bDoNonbonded && hwinfo->bCanUseGPU));
+ +
+ +    /* Enable GPU mode when GPUs are available and no GPU emulation is
+ +     * requested.
+ +     */
+ +    if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
+ +    {
+ +        /* Each PP node will use the intra-node id-th device from the
+ +         * list of detected/selected GPUs. */
+ +        if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+ +        {
+ +            /* At this point the init should never fail as we made sure that
+ +             * we have all the GPUs we need. If it still does, we'll bail. */
+ +            gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
+ +                      cr->nodeid,
+ +                      get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+ +                      gpu_err_str);
+ +        }
+ +
+ +        /* Here we actually turn on hardware GPU acceleration */
+ +        *bUseGPU = TRUE;
+ +    }
+ +}
+ +
+ +/* Tell whether the simple table layout is used for the given cut-off
+ + * scheme.
+ + *
+ + * The group scheme always answers TRUE. For the Verlet scheme the answer
+ + * comes from nbnxn_kernel_pairlist_simple() for the kernel of one group in
+ + * nbv: group 0 when the group argument is negative, otherwise the last
+ + * group (nbv->ngrp - 1). Any other scheme triggers gmx_incons().
+ + */
+ +gmx_bool uses_simple_tables(int                 cutoff_scheme,
+ +                            nonbonded_verlet_t *nbv,
+ +                            int                 group)
+ +{
+ +    int grp_index;
+ +
+ +    if (cutoff_scheme == ecutsGROUP)
+ +    {
+ +        return TRUE;
+ +    }
+ +
+ +    if (cutoff_scheme == ecutsVERLET)
+ +    {
+ +        assert(NULL != nbv && NULL != nbv->grp);
+ +        if (group < 0)
+ +        {
+ +            grp_index = 0;
+ +        }
+ +        else
+ +        {
+ +            grp_index = nbv->ngrp - 1;
+ +        }
+ +        return nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
+ +    }
+ +
+ +    gmx_incons("unimplemented");
+ +    return TRUE;
+ +}
+ +
+ +/* (Re)build the Ewald correction tables in ic.
+ + *
+ + * Sets ic->tabq_scale and ic->tabq_size, frees the previous
+ + * tabq_coul_FDV0/F/V buffers, allocates new ones and fills them through
+ + * table_spline3_fill_ewald_lr(). With simple tables the size follows from
+ + * the larger of rtab and ic->rcoulomb; otherwise the fixed size
+ + * GPU_EWALD_COULOMB_FORCE_TABLE_SIZE is used.
+ + */
+ +static void init_ewald_f_table(interaction_const_t *ic,
+ +                               gmx_bool             bUsesSimpleTables,
+ +                               real                 rtab)
+ +{
+ +    if (!bUsesSimpleTables)
+ +    {
+ +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
+ +        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
+ +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+ +    }
+ +    else
+ +    {
+ +        real table_range;
+ +
+ +        /* With a spacing of 0.0005 we are at the force summation accuracy
+ +         * for the SSE kernels for "normal" atomistic simulations.
+ +         */
+ +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
+ +                                                   ic->rcoulomb);
+ +
+ +        /* The table has to reach the larger of rtab and rcoulomb */
+ +        table_range = ic->rcoulomb;
+ +        if (rtab > ic->rcoulomb)
+ +        {
+ +            table_range = rtab;
+ +        }
+ +        ic->tabq_size = (int)(table_range*ic->tabq_scale) + 2;
+ +    }
+ +
+ +    /* Drop the old buffers before allocating at the new size */
+ +    sfree_aligned(ic->tabq_coul_FDV0);
+ +    sfree_aligned(ic->tabq_coul_F);
+ +    sfree_aligned(ic->tabq_coul_V);
+ +
+ +    /* Create the original table data in FDV0 */
+ +    snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
+ +    snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
+ +    snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
+ +    table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
+ +                                ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
+ +}
+ +
+ +/* Initialise the Ewald correction tables of ic when the electrostatics
+ + * type is eelEWALD or one of the PME types; for every other type nothing
+ + * is done. The resulting spacing and size are reported to fp when it is
+ + * non-NULL.
+ + */
+ +void init_interaction_const_tables(FILE                *fp,
+ +                                   interaction_const_t *ic,
+ +                                   gmx_bool             bUsesSimpleTables,
+ +                                   real                 rtab)
+ +{
+ +    if (ic->eeltype != eelEWALD && !EEL_PME(ic->eeltype))
+ +    {
+ +        return;
+ +    }
+ +
+ +    init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+ +
+ +    if (fp != NULL)
+ +    {
+ +        fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
+ +                1/ic->tabq_scale, ic->tabq_size);
+ +    }
+ +}
+ +
+ +/* Allocate an interaction_const_t, fill it from the force record fr and
+ + * return it through *interaction_const.
+ + *
+ + * Copies the cut-offs and electrostatics settings, derives the potential
+ + * shift constants, reports the shifts to fp (when non-NULL), passes the
+ + * constants to the CUDA module when fr->nbv uses a GPU, and finally
+ + * initialises the Ewald tables with table range rtab.
+ + */
+ +void init_interaction_const(FILE                 *fp,
+ +                            interaction_const_t **interaction_const,
+ +                            const t_forcerec     *fr,
+ +                            real                  rtab)
+ +{
+ +    interaction_const_t *ic;
+ +    gmx_bool             bUsesSimpleTables = TRUE;
+ +
+ +    snew(ic, 1);
+ +
+ +    /* Just allocate something so we can free it */
+ +    snew_aligned(ic->tabq_coul_FDV0, 16, 32);
+ +    snew_aligned(ic->tabq_coul_F, 16, 32);
+ +    snew_aligned(ic->tabq_coul_V, 16, 32);
+ +
+ +    ic->rlist       = fr->rlist;
+ +    ic->rlistlong   = fr->rlistlong;
+ +
+ +    /* Lennard-Jones */
+ +    ic->rvdw        = fr->rvdw;
+ +    /* The shift is rvdw^-6 with a potential-shift modifier, else zero */
+ +    if (fr->vdw_modifier == eintmodPOTSHIFT)
+ +    {
+ +        ic->sh_invrc6 = pow(ic->rvdw, -6.0);
+ +    }
+ +    else
+ +    {
+ +        ic->sh_invrc6 = 0;
+ +    }
+ +
+ +    /* Electrostatics */
+ +    ic->eeltype     = fr->eeltype;
+ +    ic->rcoulomb    = fr->rcoulomb;
+ +    ic->epsilon_r   = fr->epsilon_r;
+ +    ic->epsfac      = fr->epsfac;
+ +
+ +    /* Ewald */
+ +    ic->ewaldcoeff  = fr->ewaldcoeff;
+ +    /* The shift is erfc(ewaldcoeff*rcoulomb) with a potential-shift
+ +     * modifier, else zero.
+ +     */
+ +    if (fr->coulomb_modifier == eintmodPOTSHIFT)
+ +    {
+ +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
+ +    }
+ +    else
+ +    {
+ +        ic->sh_ewald = 0;
+ +    }
+ +
+ +    /* Reaction-field */
+ +    if (EEL_RF(ic->eeltype))
+ +    {
+ +        ic->epsilon_rf = fr->epsilon_rf;
+ +        ic->k_rf       = fr->k_rf;
+ +        ic->c_rf       = fr->c_rf;
+ +    }
+ +    else
+ +    {
+ +        /* For plain cut-off we might use the reaction-field kernels */
+ +        ic->epsilon_rf = ic->epsilon_r;
+ +        ic->k_rf       = 0;
+ +        if (fr->coulomb_modifier == eintmodPOTSHIFT)
+ +        {
+ +            ic->c_rf   = 1/ic->rcoulomb;
+ +        }
+ +        else
+ +        {
+ +            ic->c_rf   = 0;
+ +        }
+ +    }
+ +
+ +    if (fp != NULL)
+ +    {
+ +        /* The r^-12 shift is printed as the square of the r^-6 shift */
+ +        fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
+ +                sqr(ic->sh_invrc6), ic->sh_invrc6);
+ +        if (ic->eeltype == eelCUT)
+ +        {
+ +            fprintf(fp, ", Coulomb %.3f", ic->c_rf);
+ +        }
+ +        else if (EEL_PME(ic->eeltype))
+ +        {
+ +            fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
+ +        }
+ +        fprintf(fp, "\n");
+ +    }
+ +
+ +    /* Hand the result to the caller before the table setup below */
+ +    *interaction_const = ic;
+ +
+ +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
+ +    {
+ +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+ +    }
+ +
+ +    /* group == -1 makes uses_simple_tables() look at nbv group 0 */
+ +    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
+ +    init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
+ +}
+ +
+ +/* Allocate and initialize the Verlet-scheme non-bonded data structure
+ + * and return it through *nb_verlet.
+ + *
+ + * fp        log file, may be NULL where the callees accept that
+ + * nb_verlet output: receives the newly allocated nonbonded_verlet_t
+ + * ir        input record, used for kernel selection and the number of
+ + *           energy groups
+ + * fr        force record supplying hardware info, atom types and the
+ + *           non-bonded parameter matrix
+ + * cr        communication record; with domain decomposition two
+ + *           interaction groups (local and non-local) are set up,
+ + *           otherwise one
+ + * nbpu_opt  when equal to "gpu_cpu", a separate CPU kernel is picked
+ + *           for the non-local group instead of reusing the local kernel
+ + *
+ + * When a GPU is used, the pair-list balancing parameter is read from the
+ + * environment variable GMX_NB_MIN_CI if set (a positive integer is
+ + * required), and otherwise obtained from nbnxn_cuda_min_ci_balanced().
+ + */
+ +static void init_nb_verlet(FILE                *fp,
+ +                           nonbonded_verlet_t **nb_verlet,
+ +                           const t_inputrec    *ir,
+ +                           const t_forcerec    *fr,
+ +                           const t_commrec     *cr,
+ +                           const char          *nbpu_opt)
+ +{
+ +    nonbonded_verlet_t *nbv;
+ +    int                 i;
+ +    char               *env;
+ +    gmx_bool            bEmulateGPU, bHybridGPURun = FALSE;
+ +
+ +    nbnxn_alloc_t      *nb_alloc;
+ +    nbnxn_free_t       *nb_free;
+ +
+ +    snew(nbv, 1);
+ +
+ +    pick_nbnxn_resources(fp, cr, fr->hwinfo,
+ +                         fr->bNonbonded,
+ +                         &nbv->bUseGPU,
+ +                         &bEmulateGPU);
+ +
+ +    nbv->nbs = NULL;
+ +
+ +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
+ +    for (i = 0; i < nbv->ngrp; i++)
+ +    {
+ +        nbv->grp[i].nbl_lists.nnbl = 0;
+ +        nbv->grp[i].nbat           = NULL;
+ +        nbv->grp[i].kernel_type    = nbnxnkNotSet;
+ +
+ +        if (i == 0) /* local */
+ +        {
+ +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
+ +                              nbv->bUseGPU, bEmulateGPU,
+ +                              ir,
+ +                              &nbv->grp[i].kernel_type,
+ +                              &nbv->grp[i].ewald_excl,
+ +                              fr->bNonbonded);
+ +        }
+ +        else /* non-local */
+ +        {
+ +            if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
+ +            {
+ +                /* Use GPU for local, select a CPU kernel for non-local */
+ +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
+ +                                  FALSE, FALSE,
+ +                                  ir,
+ +                                  &nbv->grp[i].kernel_type,
+ +                                  &nbv->grp[i].ewald_excl,
+ +                                  fr->bNonbonded);
+ +
+ +                bHybridGPURun = TRUE;
+ +            }
+ +            else
+ +            {
+ +                /* Use the same kernel for local and non-local interactions */
+ +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
+ +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (nbv->bUseGPU)
+ +    {
+ +        /* init the NxN GPU data; the last argument tells whether we'll have
+ +         * both local and non-local NB calculation on GPU */
+ +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
+ +                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ +                        (nbv->ngrp > 1) && !bHybridGPURun);
+ +
+ +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ +        {
+ +            char *end;
+ +            long  min_ci;
+ +
+ +            /* Parse into a long first, so that a value too large for an
+ +             * int cannot be truncated into a plausible positive number.
+ +             */
+ +            min_ci               = strtol(env, &end, 10);
+ +            nbv->min_ci_balanced = (int)min_ci;
+ +            if ((*end != 0) || min_ci <= 0 ||
+ +                (long)nbv->min_ci_balanced != min_ci)
+ +            {
+ +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ +            }
+ +
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ +                        nbv->min_ci_balanced);
+ +            }
+ +        }
+ +        else
+ +        {
+ +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ +                        nbv->min_ci_balanced);
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        nbv->min_ci_balanced = 0;
+ +    }
+ +
+ +    *nb_verlet = nbv;
+ +
+ +    nbnxn_init_search(&nbv->nbs,
+ +                      DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
+ +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
+ +                      gmx_omp_nthreads_get(emntNonbonded));
+ +
+ +    /* The allocator choice depends only on the local (group 0) kernel,
+ +     * so it is made once for all groups: pmalloc/pfree when the local
+ +     * kernel is the CUDA one, the default (NULL) otherwise.
+ +     */
+ +    if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
+ +    {
+ +        nb_alloc = &pmalloc;
+ +        nb_free  = &pfree;
+ +    }
+ +    else
+ +    {
+ +        nb_alloc = NULL;
+ +        nb_free  = NULL;
+ +    }
+ +
+ +    for (i = 0; i < nbv->ngrp; i++)
+ +    {
+ +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
+ +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ +                                /* 8x8x8 "non-simple" lists are ATM always combined */
+ +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ +                                nb_alloc, nb_free);
+ +
+ +        if (i == 0 ||
+ +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
+ +        {
+ +            /* This group needs its own atom data */
+ +            snew(nbv->grp[i].nbat, 1);
+ +            nbnxn_atomdata_init(fp,
+ +                                nbv->grp[i].nbat,
+ +                                nbv->grp[i].kernel_type,
+ +                                fr->ntype, fr->nbfp,
+ +                                ir->opts.ngener,
+ +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+ +                                nb_alloc, nb_free);
+ +        }
+ +        else
+ +        {
+ +            /* Same kernel type as the local group: share its atom data */
+ +            nbv->grp[i].nbat = nbv->grp[0].nbat;
+ +        }
+ +    }
+ +}
+ +
+ +void init_forcerec(FILE              *fp,
+ +                   const output_env_t oenv,
+ +                   t_forcerec        *fr,
+ +                   t_fcdata          *fcd,
+ +                   const t_inputrec  *ir,
+ +                   const gmx_mtop_t  *mtop,
+ +                   const t_commrec   *cr,
+ +                   matrix             box,
+ +                   gmx_bool           bMolEpot,
+ +                   const char        *tabfn,
+ +                   const char        *tabafn,
+ +                   const char        *tabpfn,
+ +                   const char        *tabbfn,
+ +                   const char        *nbpu_opt,
+ +                   gmx_bool           bNoSolvOpt,
+ +                   real               print_force)
+ +{
+ +    int            i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
+ +    real           rtab;
+ +    char          *env;
+ +    double         dbl;
+ +    rvec           box_size;
+ +    const t_block *cgs;
+ +    gmx_bool       bGenericKernelOnly;
+ +    gmx_bool       bTab, bSep14tab, bNormalnblists;
+ +    t_nblists     *nbl;
+ +    int           *nm_ind, egp_flags;
+ +
+ +    if (fr->hwinfo == NULL)
+ +    {
+ +        /* Detect hardware, gather information.
+ +         * In mdrun, hwinfo has already been set before calling init_forcerec.
+ +         * Here we ignore GPUs, as tools will not use them anyhow.
+ +         */
-         fr->bAllvsAll = FALSE;
++        fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ +    }
+ +
+ +    /* By default we turn acceleration on, but it might be turned off further down... */
+ +    fr->use_cpu_acceleration = TRUE;
+ +
+ +    fr->bDomDec = DOMAINDECOMP(cr);
+ +
+ +    natoms = mtop->natoms;
+ +
+ +    if (check_box(ir->ePBC, box))
+ +    {
+ +        gmx_fatal(FARGS, check_box(ir->ePBC, box));
+ +    }
+ +
+ +    /* Test particle insertion ? */
+ +    if (EI_TPI(ir->eI))
+ +    {
+ +        /* Set to the size of the molecule to be inserted (the last one) */
+ +        /* Because of old style topologies, we have to use the last cg
+ +         * instead of the last molecule type.
+ +         */
+ +        cgs       = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
+ +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
+ +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
+ +        {
+ +            gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        fr->n_tpi = 0;
+ +    }
+ +
+ +    /* Copy AdResS parameters */
+ +    if (ir->bAdress)
+ +    {
+ +        fr->adress_type           = ir->adress->type;
+ +        fr->adress_const_wf       = ir->adress->const_wf;
+ +        fr->adress_ex_width       = ir->adress->ex_width;
+ +        fr->adress_hy_width       = ir->adress->hy_width;
+ +        fr->adress_icor           = ir->adress->icor;
+ +        fr->adress_site           = ir->adress->site;
+ +        fr->adress_ex_forcecap    = ir->adress->ex_forcecap;
+ +        fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
+ +
+ +
+ +        snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
+ +        for (i = 0; i < ir->adress->n_energy_grps; i++)
+ +        {
+ +            fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
+ +        }
+ +
+ +        fr->n_adress_tf_grps = ir->adress->n_tf_grps;
+ +        snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
+ +        for (i = 0; i < fr->n_adress_tf_grps; i++)
+ +        {
+ +            fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
+ +        }
+ +        copy_rvec(ir->adress->refs, fr->adress_refs);
+ +    }
+ +    else
+ +    {
+ +        fr->adress_type           = eAdressOff;
+ +        fr->adress_do_hybridpairs = FALSE;
+ +    }
+ +
+ +    /* Copy the user determined parameters */
+ +    fr->userint1  = ir->userint1;
+ +    fr->userint2  = ir->userint2;
+ +    fr->userint3  = ir->userint3;
+ +    fr->userint4  = ir->userint4;
+ +    fr->userreal1 = ir->userreal1;
+ +    fr->userreal2 = ir->userreal2;
+ +    fr->userreal3 = ir->userreal3;
+ +    fr->userreal4 = ir->userreal4;
+ +
+ +    /* Shell stuff */
+ +    fr->fc_stepsize = ir->fc_stepsize;
+ +
+ +    /* Free energy */
+ +    fr->efep        = ir->efep;
+ +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
+ +    if (ir->fepvals->bScCoul)
+ +    {
+ +        fr->sc_alphacoul  = ir->fepvals->sc_alpha;
+ +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
+ +    }
+ +    else
+ +    {
+ +        fr->sc_alphacoul  = 0;
+ +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
+ +    }
+ +    fr->sc_power      = ir->fepvals->sc_power;
+ +    fr->sc_r_power    = ir->fepvals->sc_r_power;
+ +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
+ +
+ +    env = getenv("GMX_SCSIGMA_MIN");
+ +    if (env != NULL)
+ +    {
+ +        dbl = 0;
+ +        sscanf(env, "%lf", &dbl);
+ +        fr->sc_sigma6_min = pow(dbl, 6);
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
+ +        }
+ +    }
+ +
+ +    fr->bNonbonded = TRUE;
+ +    if (getenv("GMX_NO_NONBONDED") != NULL)
+ +    {
+ +        /* turn off non-bonded calculations */
+ +        fr->bNonbonded = FALSE;
+ +        md_print_warn(cr, fp,
+ +                      "Found environment variable GMX_NO_NONBONDED.\n"
+ +                      "Disabling nonbonded calculations.\n");
+ +    }
+ +
+ +    bGenericKernelOnly = FALSE;
+ +
+ +    /* We now check in the NS code whether a particular combination of interactions
+ +     * can be used with water optimization, and disable it if that is not the case.
+ +     */
+ +
+ +    if (getenv("GMX_NB_GENERIC") != NULL)
+ +    {
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,
+ +                    "Found environment variable GMX_NB_GENERIC.\n"
+ +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
+ +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
+ +        }
+ +        bGenericKernelOnly = TRUE;
+ +    }
+ +
+ +    if (bGenericKernelOnly == TRUE)
+ +    {
+ +        bNoSolvOpt         = TRUE;
+ +    }
+ +
+ +    if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+ +    {
+ +        fr->use_cpu_acceleration = FALSE;
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,
+ +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
+ +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+ +        }
+ +    }
+ +
+ +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
+ +
+ +    /* Check if we can/should do all-vs-all kernels */
+ +    fr->bAllvsAll       = can_use_allvsall(ir, mtop, FALSE, NULL, NULL);
+ +    fr->AllvsAll_work   = NULL;
+ +    fr->AllvsAll_workgb = NULL;
+ +
+ +    /* All-vs-all kernels have not been implemented in 4.6, and
+ +     * the SIMD group kernels are also buggy in this case. Non-accelerated
+ +     * group kernels are OK. See Redmine #1249. */
+ +    if (fr->bAllvsAll)
+ +    {
++        fr->bAllvsAll            = FALSE;
+ +        fr->use_cpu_acceleration = FALSE;
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,
+ +                    "\nYour simulation settings would have triggered the efficient all-vs-all\n"
+ +                    "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
+ +                    "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
+ +                    "of an unfixed bug. The reference C kernels are correct, though, so\n"
+ +                    "we are proceeding by disabling all CPU architecture-specific\n"
+ +                    "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
+ +                    "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
+ +        }
+ +    }
+ +
+ +    /* Neighbour searching stuff */
+ +    fr->cutoff_scheme = ir->cutoff_scheme;
+ +    fr->bGrid         = (ir->ns_type == ensGRID);
+ +    fr->ePBC          = ir->ePBC;
+ +
+ +    /* Determine if we will do PBC for distances in bonded interactions */
+ +    if (fr->ePBC == epbcNONE)
+ +    {
+ +        fr->bMolPBC = FALSE;
+ +    }
+ +    else
+ +    {
+ +        if (!DOMAINDECOMP(cr))
+ +        {
+ +            /* The group cut-off scheme and SHAKE assume charge groups
+ +             * are whole, but not using molpbc is faster in most cases.
+ +             */
+ +            if (fr->cutoff_scheme == ecutsGROUP ||
+ +                (ir->eConstrAlg == econtSHAKE &&
+ +                 (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
+ +                  gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
+ +            {
+ +                fr->bMolPBC = ir->bPeriodicMols;
+ +            }
+ +            else
+ +            {
+ +                fr->bMolPBC = TRUE;
+ +                if (getenv("GMX_USE_GRAPH") != NULL)
+ +                {
+ +                    fr->bMolPBC = FALSE;
+ +                    if (fp)
+ +                    {
+ +                        fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
+ +        }
+ +    }
+ +    fr->bGB = (ir->implicit_solvent == eisGBSA);
+ +
+ +    fr->rc_scaling = ir->refcoord_scaling;
+ +    copy_rvec(ir->posres_com, fr->posres_com);
+ +    copy_rvec(ir->posres_comB, fr->posres_comB);
+ +    fr->rlist      = cutoff_inf(ir->rlist);
+ +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
+ +    fr->eeltype    = ir->coulombtype;
+ +    fr->vdwtype    = ir->vdwtype;
+ +
+ +    fr->coulomb_modifier = ir->coulomb_modifier;
+ +    fr->vdw_modifier     = ir->vdw_modifier;
+ +
+ +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
+ +    switch (fr->eeltype)
+ +    {
+ +        case eelCUT:
+ +            fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
+ +            break;
+ +
+ +        case eelRF:
+ +        case eelGRF:
+ +        case eelRF_NEC:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ +            break;
+ +
+ +        case eelRF_ZERO:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
+ +            break;
+ +
+ +        case eelSWITCH:
+ +        case eelSHIFT:
+ +        case eelUSER:
+ +        case eelENCADSHIFT:
+ +        case eelPMESWITCH:
+ +        case eelPMEUSER:
+ +        case eelPMEUSERSWITCH:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ +            break;
+ +
+ +        case eelPME:
+ +        case eelEWALD:
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
+ +            break;
+ +
+ +        default:
+ +            gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
+ +            break;
+ +    }
+ +
+ +    /* Vdw: Translate from mdp settings to kernel format */
+ +    switch (fr->vdwtype)
+ +    {
+ +        case evdwCUT:
+ +            if (fr->bBHAM)
+ +            {
+ +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
+ +            }
+ +            else
+ +            {
+ +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
+ +            }
+ +            break;
+ +
+ +        case evdwSWITCH:
+ +        case evdwSHIFT:
+ +        case evdwUSER:
+ +        case evdwENCADSHIFT:
+ +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ +            break;
+ +
+ +        default:
+ +            gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
+ +            break;
+ +    }
+ +
+ +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
+ +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
+ +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
+ +
+ +    fr->bTwinRange = fr->rlistlong > fr->rlist;
+ +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
+ +
+ +    fr->reppow     = mtop->ffparams.reppow;
+ +
+ +    if (ir->cutoff_scheme == ecutsGROUP)
+ +    {
+ +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
+ +                          !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+ +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
+ +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
+ +                           fr->eeltype == eelEWALD ||
+ +                           fr->eeltype == eelPME ||
+ +                           fr->eeltype == eelRF ||
+ +                           fr->eeltype == eelRF_ZERO);
+ +
+ +        /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
+ +         * going to be faster to tabulate the interaction than calling the generic kernel.
+ +         */
+ +        if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
+ +        {
+ +            if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
+ +            {
+ +                fr->bcoultab = TRUE;
+ +            }
+ +        }
+ +        else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
+ +                 ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
+ +                   fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
+ +                   (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
+ +        {
+ +            if (fr->rcoulomb != fr->rvdw)
+ +            {
+ +                fr->bcoultab = TRUE;
+ +            }
+ +        }
+ +
+ +        if (getenv("GMX_REQUIRE_TABLES"))
+ +        {
+ +            fr->bvdwtab  = TRUE;
+ +            fr->bcoultab = TRUE;
+ +        }
+ +
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
+ +            fprintf(fp, "Table routines are used for vdw:     %s\n", bool_names[fr->bvdwtab ]);
+ +        }
+ +
+ +        if (fr->bvdwtab == TRUE)
+ +        {
+ +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ +            fr->nbkernel_vdw_modifier    = eintmodNONE;
+ +        }
+ +        if (fr->bcoultab == TRUE)
+ +        {
+ +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ +            fr->nbkernel_elec_modifier    = eintmodNONE;
+ +        }
+ +    }
+ +
+ +    if (ir->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+ +        {
+ +            gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
+ +        }
+ +        fr->bvdwtab  = FALSE;
+ +        fr->bcoultab = FALSE;
+ +    }
+ +
+ +    /* Tables are used for direct ewald sum */
+ +    if (fr->bEwald)
+ +    {
+ +        if (EEL_PME(ir->coulombtype))
+ +        {
+ +            if (fp)
+ +            {
+ +                fprintf(fp, "Will do PME sum in reciprocal space.\n");
+ +            }
+ +            if (ir->coulombtype == eelP3M_AD)
+ +            {
+ +                please_cite(fp, "Hockney1988");
+ +                please_cite(fp, "Ballenegger2012");
+ +            }
+ +            else
+ +            {
+ +                please_cite(fp, "Essmann95a");
+ +            }
+ +
+ +            if (ir->ewald_geometry == eewg3DC)
+ +            {
+ +                if (fp)
+ +                {
+ +                    fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
+ +                }
+ +                please_cite(fp, "In-Chul99a");
+ +            }
+ +        }
+ +        fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ +        init_ewald_tab(&(fr->ewald_table), ir, fp);
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
+ +                    1/fr->ewaldcoeff);
+ +        }
+ +    }
+ +
+ +    /* Electrostatics */
+ +    fr->epsilon_r       = ir->epsilon_r;
+ +    fr->epsilon_rf      = ir->epsilon_rf;
+ +    fr->fudgeQQ         = mtop->ffparams.fudgeQQ;
+ +    fr->rcoulomb_switch = ir->rcoulomb_switch;
+ +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
+ +
+ +    /* Parameters for generalized RF */
+ +    fr->zsquare = 0.0;
+ +    fr->temp    = 0.0;
+ +
+ +    if (fr->eeltype == eelGRF)
+ +    {
+ +        init_generalized_rf(fp, mtop, ir, fr);
+ +    }
+ +    else if (fr->eeltype == eelSHIFT)
+ +    {
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            box_size[m] = box[m][m];
+ +        }
+ +
+ +        if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
+ +        {
+ +            set_shift_consts(fr->rcoulomb_switch, fr->rcoulomb, box_size);
+ +        }
+ +    }
+ +
+ +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
+ +                       gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
+ +                       gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
+ +                       IR_ELEC_FIELD(*ir) ||
+ +                       (fr->adress_icor != eAdressICOff)
+ +                       );
+ +
+ +    if (fr->cutoff_scheme == ecutsGROUP &&
+ +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
+ +    {
+ +        /* Count the total number of charge groups */
+ +        fr->cg_nalloc = ncg_mtop(mtop);
+ +        srenew(fr->cg_cm, fr->cg_nalloc);
+ +    }
+ +    if (fr->shift_vec == NULL)
+ +    {
+ +        snew(fr->shift_vec, SHIFTS);
+ +    }
+ +
+ +    if (fr->fshift == NULL)
+ +    {
+ +        snew(fr->fshift, SHIFTS);
+ +    }
+ +
+ +    if (fr->nbfp == NULL)
+ +    {
+ +        fr->ntype = mtop->ffparams.atnr;
+ +        fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+ +    }
+ +
+ +    /* Copy the energy group exclusions */
+ +    fr->egp_flags = ir->opts.egp_flags;
+ +
+ +    /* Van der Waals stuff */
+ +    fr->rvdw        = cutoff_inf(ir->rvdw);
+ +    fr->rvdw_switch = ir->rvdw_switch;
+ +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
+ +    {
+ +        if (fr->rvdw_switch >= fr->rvdw)
+ +        {
+ +            gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
+ +                      fr->rvdw_switch, fr->rvdw);
+ +        }
+ +        if (fp)
+ +        {
+ +            fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
+ +                    (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
+ +                    fr->rvdw_switch, fr->rvdw);
+ +        }
+ +    }
+ +
+ +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
+ +    {
+ +        gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
+ +    }
+ +
+ +    if (fp)
+ +    {
+ +        fprintf(fp, "Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
+ +                fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
+ +    }
+ +
+ +    fr->eDispCorr = ir->eDispCorr;
+ +    if (ir->eDispCorr != edispcNO)
+ +    {
+ +        set_avcsixtwelve(fp, fr, mtop);
+ +    }
+ +
+ +    if (fr->bBHAM)
+ +    {
+ +        set_bham_b_max(fp, fr, mtop);
+ +    }
+ +
+ +    fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
+ +
+ +    /* Copy the GBSA data (radius, volume and surftens for each
+ +     * atomtype) from the topology atomtype section to forcerec.
+ +     */
+ +    snew(fr->atype_radius, fr->ntype);
+ +    snew(fr->atype_vol, fr->ntype);
+ +    snew(fr->atype_surftens, fr->ntype);
+ +    snew(fr->atype_gb_radius, fr->ntype);
+ +    snew(fr->atype_S_hct, fr->ntype);
+ +
+ +    if (mtop->atomtypes.nr > 0)
+ +    {
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_radius[i] = mtop->atomtypes.radius[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
+ +        }
+ +        for (i = 0; i < fr->ntype; i++)
+ +        {
+ +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
+ +        }
+ +    }
+ +
+ +    /* Generate the GB table if needed */
+ +    if (fr->bGB)
+ +    {
+ +#ifdef GMX_DOUBLE
+ +        fr->gbtabscale = 2000;
+ +#else
+ +        fr->gbtabscale = 500;
+ +#endif
+ +
+ +        fr->gbtabr = 100;
+ +        fr->gbtab  = make_gb_table(fp, oenv, fr, tabpfn, fr->gbtabscale);
+ +
+ +        init_gb(&fr->born, cr, fr, ir, mtop, ir->rgbradii, ir->gb_algorithm);
+ +
+ +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
+ +        if (!DOMAINDECOMP(cr))
+ +        {
+ +            make_local_gb(cr, fr->born, ir->gb_algorithm);
+ +        }
+ +    }
+ +
+ +    /* Set the charge scaling */
+ +    if (fr->epsilon_r != 0)
+ +    {
+ +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
+ +    }
+ +    else
+ +    {
+ +        /* eps = 0 is infinite dielectric: no coulomb interactions */
+ +        fr->epsfac = 0;
+ +    }
+ +
+ +    /* Reaction field constants */
+ +    if (EEL_RF(fr->eeltype))
+ +    {
+ +        calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+ +                   fr->rcoulomb, fr->temp, fr->zsquare, box,
+ +                   &fr->kappa, &fr->k_rf, &fr->c_rf);
+ +    }
+ +
+ +    set_chargesum(fp, fr, mtop);
+ +
+ +    /* if we are using LR electrostatics, and they are tabulated,
+ +     * the tables will contain modified coulomb interactions.
+ +     * Since we want to use the non-shifted ones for 1-4
+ +     * coulombic interactions, we must have an extra set of tables.
+ +     */
+ +
+ +    /* Construct tables.
+ +     * A little unnecessary to make both vdw and coul tables sometimes,
+ +     * but what the heck... */
+ +
+ +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
+ +
+ +    bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
+ +                  fr->bBHAM || fr->bEwald) &&
+ +                 (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
+ +                  gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
+ +                  gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
+ +
+ +    negp_pp   = ir->opts.ngener - ir->nwall;
+ +    negptable = 0;
+ +    if (!bTab)
+ +    {
+ +        bNormalnblists = TRUE;
+ +        fr->nnblists   = 1;
+ +    }
+ +    else
+ +    {
+ +        bNormalnblists = (ir->eDispCorr != edispcNO);
+ +        for (egi = 0; egi < negp_pp; egi++)
+ +        {
+ +            for (egj = egi; egj < negp_pp; egj++)
+ +            {
+ +                egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ +                if (!(egp_flags & EGP_EXCL))
+ +                {
+ +                    if (egp_flags & EGP_TABLE)
+ +                    {
+ +                        negptable++;
+ +                    }
+ +                    else
+ +                    {
+ +                        bNormalnblists = TRUE;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +        if (bNormalnblists)
+ +        {
+ +            fr->nnblists = negptable + 1;
+ +        }
+ +        else
+ +        {
+ +            fr->nnblists = negptable;
+ +        }
+ +        if (fr->nnblists > 1)
+ +        {
+ +            snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
+ +        }
+ +    }
+ +
+ +    if (ir->adress)
+ +    {
+ +        fr->nnblists *= 2;
+ +    }
+ +
+ +    snew(fr->nblists, fr->nnblists);
+ +
+ +    /* This code automatically gives table length tabext without cut-off's,
+ +     * in that case grompp should already have checked that we do not need
+ +     * normal tables and we only generate tables for 1-4 interactions.
+ +     */
+ +    rtab = ir->rlistlong + ir->tabext;
+ +
+ +    if (bTab)
+ +    {
+ +        /* make tables for ordinary interactions */
+ +        if (bNormalnblists)
+ +        {
+ +            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
+ +            if (ir->adress)
+ +            {
+ +                make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
+ +            }
+ +            if (!bSep14tab)
+ +            {
+ +                fr->tab14 = fr->nblists[0].table_elec_vdw;
+ +            }
+ +            m = 1;
+ +        }
+ +        else
+ +        {
+ +            m = 0;
+ +        }
+ +        if (negptable > 0)
+ +        {
+ +            /* Read the special tables for certain energy group pairs */
+ +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
+ +            for (egi = 0; egi < negp_pp; egi++)
+ +            {
+ +                for (egj = egi; egj < negp_pp; egj++)
+ +                {
+ +                    egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
+ +                    {
+ +                        nbl = &(fr->nblists[m]);
+ +                        if (fr->nnblists > 1)
+ +                        {
+ +                            fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
+ +                        }
+ +                        /* Read the table file with the two energy groups names appended */
+ +                        make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ +                                        *mtop->groups.grpname[nm_ind[egi]],
+ +                                        *mtop->groups.grpname[nm_ind[egj]],
+ +                                        &fr->nblists[m]);
+ +                        if (ir->adress)
+ +                        {
+ +                            make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ +                                            *mtop->groups.grpname[nm_ind[egi]],
+ +                                            *mtop->groups.grpname[nm_ind[egj]],
+ +                                            &fr->nblists[fr->nnblists/2+m]);
+ +                        }
+ +                        m++;
+ +                    }
+ +                    else if (fr->nnblists > 1)
+ +                    {
+ +                        fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +    }
+ +    if (bSep14tab)
+ +    {
+ +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
+ +        fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
+ +                                GMX_MAKETABLES_14ONLY);
+ +    }
+ +
+ +    /* Read AdResS Thermo Force table if needed */
+ +    if (fr->adress_icor == eAdressICThermoForce)
+ +    {
+ +        /* old todo replace */
+ +
+ +        if (ir->adress->n_tf_grps > 0)
+ +        {
+ +            make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
+ +
+ +        }
+ +        else
+ +        {
+ +            /* load the default table */
+ +            snew(fr->atf_tabs, 1);
+ +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
+ +        }
+ +    }
+ +
+ +    /* Wall stuff */
+ +    fr->nwall = ir->nwall;
+ +    if (ir->nwall && ir->wall_type == ewtTABLE)
+ +    {
+ +        make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
+ +    }
+ +
+ +    if (fcd && tabbfn)
+ +    {
+ +        fcd->bondtab  = make_bonded_tables(fp,
+ +                                           F_TABBONDS, F_TABBONDSNC,
+ +                                           mtop, tabbfn, "b");
+ +        fcd->angletab = make_bonded_tables(fp,
+ +                                           F_TABANGLES, -1,
+ +                                           mtop, tabbfn, "a");
+ +        fcd->dihtab   = make_bonded_tables(fp,
+ +                                           F_TABDIHS, -1,
+ +                                           mtop, tabbfn, "d");
+ +    }
+ +    else
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
+ +        }
+ +    }
+ +
+ +    /* QM/MM initialization if requested
+ +     */
+ +    if (ir->bQMMM)
+ +    {
+ +        fprintf(stderr, "QM/MM calculation requested.\n");
+ +    }
+ +
+ +    fr->bQMMM      = ir->bQMMM;
+ +    fr->qr         = mk_QMMMrec();
+ +
+ +    /* Set all the static charge group info */
+ +    fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
+ +                                   &fr->bExcl_IntraCGAll_InterCGNone);
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        fr->cginfo = NULL;
+ +    }
+ +    else
+ +    {
+ +        fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
+ +    }
+ +
+ +    if (!DOMAINDECOMP(cr))
+ +    {
+ +        /* When using particle decomposition, the effect of the second argument,
+ +         * which sets fr->hcg, is corrected later in do_md and init_em.
+ +         */
+ +        forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
+ +                            mtop->natoms, mtop->natoms, mtop->natoms);
+ +    }
+ +
+ +    fr->print_force = print_force;
+ +
+ +
+ +    /* coarse load balancing vars */
+ +    fr->t_fnbf    = 0.;
+ +    fr->t_wait    = 0.;
+ +    fr->timesteps = 0;
+ +
+ +    /* Initialize neighbor search */
+ +    init_ns(fp, cr, &fr->ns, fr, mtop, box);
+ +
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        gmx_nonbonded_setup(fp, fr, bGenericKernelOnly);
+ +        /*
+ +           if (ir->bAdress)
+ +            {
+ +                gmx_setup_adress_kernels(fp,bGenericKernelOnly);
+ +            }
+ +         */
+ +    }
+ +
+ +    /* Initialize the thread working data for bonded interactions */
+ +    init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+ +
+ +    snew(fr->excl_load, fr->nthreads+1);
+ +
+ +    if (fr->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        if (ir->rcoulomb != ir->rvdw)
+ +        {
+ +            gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
+ +        }
+ +
+ +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
+ +    }
+ +
+ +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ +    init_interaction_const(fp, &fr->ic, fr, rtab);
+ +    if (ir->eDispCorr != edispcNO)
+ +    {
+ +        calc_enervirdiff(fp, ir->eDispCorr, fr);
+ +    }
+ +}
+ +
+ +/* Debug-print helpers: each prints the argument expression exactly as it is
+ + * spelled at the call site (stringified with #), followed by its value.
+ + * All macro parameters are parenthesized where they are evaluated so that
+ + * compound expressions can be passed safely.
+ + */
+ +#define pr_real(fp, r) fprintf((fp), "%s: %e\n",#r, (r))
+ +#define pr_int(fp, i)  fprintf((fp), "%s: %d\n",#i, (i))
+ +#define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[b])
+ +/* Print selected force-record settings to fp, one "expression: value" line
+ + * per item, then flush fp. The pr_* macros stringify their argument, so the
+ + * label that is printed is the source expression as written below.
+ + * cr is not used in this function.
+ + */
+ +void pr_forcerec(FILE *fp, t_forcerec *fr, t_commrec *cr)
+ +{
+ +    int i;
+ +
+ +    pr_real(fp, fr->rlist);
+ +    pr_real(fp, fr->rcoulomb);
+ +    pr_real(fp, fr->fudgeQQ);
+ +    pr_bool(fp, fr->bGrid);
+ +    pr_bool(fp, fr->bTwinRange);
+ +    /*pr_int(fp,fr->cg0);
+ +       pr_int(fp,fr->hcg);*/
+ +    for (i = 0; i < fr->nnblists; i++) /* every iteration prints the same label text, containing a literal "[i]" */
+ +    {
+ +        pr_int(fp, fr->nblists[i].table_elec_vdw.n);
+ +    }
+ +    pr_real(fp, fr->rcoulomb_switch);
+ +    pr_real(fp, fr->rcoulomb); /* NOTE(review): fr->rcoulomb is already printed above */
+ +
+ +    fflush(fp);
+ +}
+ +/* Fill fr->excl_load with the index boundaries that divide the exclusion
+ + * list over fr->nthreads threads.
+ + *
+ + * With particle decomposition (cr != NULL && PARTDECOMP(cr)) only
+ + * excl_load[0] and excl_load[1] are set, through pd_at_range().
+ + * Otherwise the entries of top->excls with a[j] > i are counted, and
+ + * excl_load[t] becomes the index one past the last i processed when the
+ + * running count first reaches (ntot*t)/nthreads, or top->excls.nr when the
+ + * list is exhausted first; excl_load[0] is 0.
+ + * fr->excl_load must have room for fr->nthreads+1 entries.
+ + */
+ +void forcerec_set_excl_load(t_forcerec *fr,
+ +                            const gmx_localtop_t *top, const t_commrec *cr)
+ +{
+ +    const int *ind, *a;
+ +    int        t, i, j, ntot, n, ntarget;
+ +
+ +    if (cr != NULL && PARTDECOMP(cr))
+ +    {
+ +        /* No OpenMP with particle decomposition */
+ +        pd_at_range(cr,
+ +                    &fr->excl_load[0],
+ +                    &fr->excl_load[1]);
+ +
+ +        return;
+ +    }
+ +
+ +    ind = top->excls.index;
+ +    a   = top->excls.a;
+ +
+ +    ntot = 0; /* first pass: total number of counted exclusion entries */
+ +    for (i = 0; i < top->excls.nr; i++)
+ +    {
+ +        for (j = ind[i]; j < ind[i+1]; j++)
+ +        {
+ +            if (a[j] > i) /* only entries whose partner index exceeds i are counted */
+ +            {
+ +                ntot++;
+ +            }
+ +        }
+ +    }
+ +
+ +    fr->excl_load[0] = 0;
+ +    n                = 0;
+ +    i                = 0;
+ +    for (t = 1; t <= fr->nthreads; t++) /* second pass: n and i carry over between threads */
+ +    {
+ +        ntarget = (ntot*t)/fr->nthreads; /* NOTE(review): ntot*t is an int product - confirm it cannot overflow for very large systems */
+ +        while (i < top->excls.nr && n < ntarget)
+ +        {
+ +            for (j = ind[i]; j < ind[i+1]; j++)
+ +            {
+ +                if (a[j] > i)
+ +                {
+ +                    n++;
+ +                }
+ +            }
+ +            i++;
+ +        }
+ +        fr->excl_load[t] = i;
+ +    }
+ +}
diff --cc src/gromacs/mdlib/nbnxn_atomdata.h

index a027460f1444e3e16ab877309a983c7b0816470c,0000000000000000000000000000000000000000..15d5e6af53bfa2c22af3f4caf420785a9d2e9ba3

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_atomdata.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_atomdata.h
@@@ -1,119 -1,0 +1,119 @@@
- #define _nsnxn_atomdata_h
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef _nbnxn_atomdata_h
++#define _nbnxn_atomdata_h
+ +
+ +#include "typedefs.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +
+ +/* Default nbnxn allocation routine, allocates 32 byte aligned,
+ + * which works for plain C and aligned SSE and AVX loads/stores.
+ + */
+ +void nbnxn_alloc_aligned(void **ptr, size_t nbytes);
+ +
+ +/* Free function for memory allocated with nbnxn_alloc_aligned */
+ +void nbnxn_free_aligned(void *ptr);
+ +
+ +/* Reallocation wrapper function for nbnxn data structures.
+ + * ma and mf are the allocation and free routines to use.
+ + * NOTE(review): nbytes_copy is presumably the number of bytes preserved
+ + * from the old buffer and nbytes_new the new size - confirm against the
+ + * definition.
+ + */
+ +void nbnxn_realloc_void(void **ptr,
+ +                        int nbytes_copy, int nbytes_new,
+ +                        nbnxn_alloc_t *ma,
+ +                        nbnxn_free_t  *mf);
+ +
+ +/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
+ +void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n);
+ +
+ +/* Copy na rvec elements from x to xnb using nbatFormat, starting at
+ + * destination index a0, and fill up to na_round using cx,cy,cz.
+ + */
+ +void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
+ +                            rvec *x, int nbatFormat, real *xnb, int a0,
+ +                            int cx, int cy, int cz);
+ +
+ +/* Initialize the non-bonded atom data structure.
+ + * The enum for nbatXFormat is in the file defining nbnxn_atomdata_t.
+ + * Copy the ntypes*ntypes*2 sized nbfp non-bonded parameter list
+ + * to the atom data structure.
+ + */
+ +void nbnxn_atomdata_init(FILE *fp,
+ +                         nbnxn_atomdata_t *nbat,
+ +                         int nb_kernel_type,
+ +                         int ntype, const real *nbfp,
+ +                         int n_energygroups,
+ +                         int nout,
+ +                         nbnxn_alloc_t *alloc,
+ +                         nbnxn_free_t  *free);
+ +
+ +/* Copy the atom data to the non-bonded atom data structure */
+ +void nbnxn_atomdata_set(nbnxn_atomdata_t    *nbat,
+ +                        int                  locality,
+ +                        const nbnxn_search_t nbs,
+ +                        const t_mdatoms     *mdatoms,
+ +                        const int           *atinfo);
+ +
+ +/* Copy the shift vectors to nbat */
+ +void nbnxn_atomdata_copy_shiftvec(gmx_bool          dynamic_box,
+ +                                  rvec             *shift_vec,
+ +                                  nbnxn_atomdata_t *nbat);
+ +
+ +/* Copy x to nbat->x.
+ + * FillLocal tells if the local filler particle coordinates should be zeroed.
+ + */
+ +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
+ +                                     int                  locality,
+ +                                     gmx_bool             FillLocal,
+ +                                     rvec                *x,
+ +                                     nbnxn_atomdata_t    *nbat);
+ +
+ +/* Add the forces stored in nbat to f and zero the forces in nbat.
+ + * NOTE(review): nbat is const-qualified here although its forces are said
+ + * to be zeroed - confirm against the definition.
+ + */
+ +void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
+ +                                    int                     locality,
+ +                                    const nbnxn_atomdata_t *nbat,
+ +                                    rvec                   *f);
+ +
+ +/* Add the fshift force stored in nbat to fshift */
+ +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
+ +                                              rvec                   *fshift);
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif /* __cplusplus */
+ +
+ +#endif /* _nbnxn_atomdata_h */
diff --cc src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index dc089c0bed9aeea55efdcab0e28d52cb8c8a8320,0000000000000000000000000000000000000000..0a6c17ac6154f5f76f495fcae53d3362f3ebed36

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@@ -1,963 -1,0 +1,971 @@@
-                      gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <stdlib.h>
+ +#include <stdio.h>
+ +#include <assert.h>
+ +
+ +#include <cuda.h>
+ +
+ +#include "gmx_fatal.h"
+ +#include "smalloc.h"
+ +#include "tables.h"
+ +#include "typedefs.h"
+ +#include "types/nb_verlet.h"
+ +#include "types/interaction_const.h"
+ +#include "types/force_flags.h"
+ +#include "../nbnxn_consts.h"
++#include "gmx_detect_hardware.h"
+ +
+ +#include "nbnxn_cuda_types.h"
+ +#include "../../gmxlib/cuda_tools/cudautils.cuh"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +#include "pmalloc_cuda.h"
+ +#include "gpu_utils.h"
+ +
+ +static bool bUseCudaEventBlockingSync = false; /* when true, init_timers creates its events with cudaEventBlockingSync; makes the CPU thread block */
+ +
+ +/* Heuristically determined parameter for the Fermi architecture: the
+ + * minimum size of ci lists is obtained by multiplying this constant by the
+ + * number of multiprocessors on the current device.
+ + */
+ +static unsigned int gpu_min_ci_balanced_factor = 40;
+ +
+ +/* Functions from nbnxn_cuda.cu: cache configuration setter and accessors
+ + * for the texture references that this file binds (nbfp parameters and the
+ + * Ewald Coulomb force table).
+ + */
+ +extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
+ +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
+ +extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
+ +
+ +/* We should actually be using md_print_warn in md_logging.c,
+ + * but we can't include mpi.h in CUDA code.
+ + *
+ + * Prints the printf-style message, surrounded by blank lines, to both
+ + * stderr and fplog. When fplog is NULL nothing is printed at all, not even
+ + * to stderr.
+ + */
+ +static void md_print_warn(FILE       *fplog,
+ +                          const char *fmt, ...)
+ +{
+ +    va_list ap;
+ +
+ +    if (fplog != NULL)
+ +    {
+ +        /* We should only print to stderr on the master node,
+ +         * in most cases fplog is only set on the master node, so this works.
+ +         */
+ +        va_start(ap, fmt);
+ +        fprintf(stderr, "\n");
+ +        vfprintf(stderr, fmt, ap);
+ +        fprintf(stderr, "\n");
+ +        va_end(ap);
+ +
+ +        va_start(ap, fmt); /* the argument list is restarted for the second pass */
+ +        fprintf(fplog, "\n");
+ +        vfprintf(fplog, fmt, ap);
+ +        fprintf(fplog, "\n");
+ +        va_end(ap);
+ +    }
+ +}
+ +
+ +
+ +/* Forward declaration */
+ +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
+ +
+ +
+ +/*! Tabulates the Ewald Coulomb force and initializes the size/scale
+ +    and the table GPU array. If called with an already allocated table,
+ +    it just re-uploads the table.
+ +
+ +    Reads nbp->rcoulomb_sq and nbp->ewald_beta, so these must be set before
+ +    the call. The table is generated in a temporary host buffer, copied to
+ +    the device, and nbp->coulomb_tab_size and nbp->coulomb_tab_scale are
+ +    updated on every call.
+ + */
+ +static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
+ +{
+ +    float       *ftmp, *coul_tab;
+ +    int         tabsize;
+ +    double      tabscale;
+ +    cudaError_t stat;
+ +
+ +    tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
+ +    /* Subtract 2 instead of 1 to avoid access out of range due to rounding */
+ +    tabscale    = (tabsize - 2) / sqrt(nbp->rcoulomb_sq);
+ +
+ +    pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp));
+ +
+ +    table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize,
+ +                                1/tabscale, nbp->ewald_beta);
+ +
+ +    /* If the table pointer == NULL the table is generated the first time =>
+ +       the array pointer will be saved to nbparam and the texture is bound.
+ +     */
+ +    coul_tab = nbp->coulomb_tab;
+ +    if (coul_tab == NULL)
+ +    {
+ +        stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab));
+ +        CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
+ +
+ +        nbp->coulomb_tab = coul_tab;
+ +
+ +        cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
+ +        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
+ +                               coul_tab, &cd, tabsize*sizeof(*coul_tab));
+ +        CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed");
+ +    }
+ +
+ +    cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab));
+ +
+ +    nbp->coulomb_tab_size     = tabsize;
+ +    nbp->coulomb_tab_scale    = tabscale;
+ +
+ +    pfree(ftmp);
+ +}
+ +
+ +
+ +/*! Initializes the atomdata structure first time, it only gets filled at
+ +    pair-search.
+ +
+ +    Stores ntypes and allocates the device buffers for the shift vectors and
+ +    shift forces (SHIFTS elements each) and for the two energy outputs
+ +    (one element each). */
+ +static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
+ +{
+ +    cudaError_t stat;
+ +
+ +    ad->ntypes  = ntypes;
+ +    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
+ +    ad->bShiftVecUploaded = false;
+ +
+ +    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
+ +
+ +    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
+ +    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
+ +
+ +    /* initialize to NULL pointers to data that is not allocated here and will
+ +       need reallocation in nbnxn_cuda_init_atomdata */
+ +    ad->xq = NULL;
+ +    ad->f  = NULL;
+ +
+ +    /* size -1 indicates that the respective array hasn't been initialized yet */
+ +    ad->natoms = -1;
+ +    ad->nalloc = -1;
+ +}
+ +
+ +/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
+ +    earlier GPUs, single or twin cut-off.
+ +
+ +    Returns one of eelCuEWALD_ANA, eelCuEWALD_TAB, eelCuEWALD_ANA_TWIN or
+ +    eelCuEWALD_TAB_TWIN. The environment variables GMX_CUDA_NB_ANA_EWALD and
+ +    GMX_CUDA_NB_TAB_EWALD override the architecture-based choice (setting
+ +    both is reported through gmx_incons), and GMX_CUDA_NB_EWALD_TWINCUT
+ +    selects the twin cut-off variant even when bTwinCut is false. */
+ +static int pick_ewald_kernel_type(bool                   bTwinCut,
+ +                                  const cuda_dev_info_t *dev_info)
+ +{
+ +    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+ +    int  kernel_type;
+ +
+ +    /* Benchmarking/development environment variables to force the use of
+ +       analytical or tabulated Ewald kernel. */
+ +    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
+ +    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+ +
+ +    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+ +    {
+ +        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
+ +                   "requested through environment variables.");
+ +    }
+ +
+ +    /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+ +    if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+ +    {
+ +        bUseAnalyticalEwald = true;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
+ +        }
+ +    }
+ +    else
+ +    {
+ +        bUseAnalyticalEwald = false;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
+ +        }
+ +    }
+ +
+ +    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+ +       forces it (use it for debugging/benchmarking only). */
+ +    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
+ +    {
+ +        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
+ +    }
+ +    else
+ +    {
+ +        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
+ +    }
+ +
+ +    return kernel_type;
+ +}
+ +
+ +
+ +/*! Initializes the nonbonded parameter data structure.
+ +
+ +    Copies the interaction constants from ic into nbp, selects the
+ +    electrostatics kernel type, generates the Ewald force table when a
+ +    tabulated Ewald kernel was selected, and uploads the 2*ntype*ntype nbfp
+ +    parameters from nbat to the device, binding them to a texture. */
+ +static void init_nbparam(cu_nbparam_t *nbp,
+ +                         const interaction_const_t *ic,
+ +                         const nbnxn_atomdata_t *nbat,
+ +                         const cuda_dev_info_t *dev_info)
+ +{
+ +    cudaError_t stat;
+ +    int         ntypes, nnbfp;
+ +
+ +    ntypes  = nbat->ntype;
+ +
+ +    nbp->ewald_beta = ic->ewaldcoeff;
+ +    nbp->sh_ewald   = ic->sh_ewald;
+ +    nbp->epsfac     = ic->epsfac;
+ +    nbp->two_k_rf   = 2.0 * ic->k_rf;
+ +    nbp->c_rf       = ic->c_rf;
+ +    nbp->rvdw_sq    = ic->rvdw * ic->rvdw;
+ +    nbp->rcoulomb_sq= ic->rcoulomb * ic->rcoulomb;
+ +    nbp->rlist_sq   = ic->rlist * ic->rlist;
+ +    nbp->sh_invrc6  = ic->sh_invrc6;
+ +
+ +    if (ic->eeltype == eelCUT)
+ +    {
+ +        nbp->eeltype = eelCuCUT;
+ +    }
+ +    else if (EEL_RF(ic->eeltype))
+ +    {
+ +        nbp->eeltype = eelCuRF;
+ +    }
+ +    else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
+ +    {
+ +        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+ +        nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
+ +    }
+ +    else
+ +    {
+ +        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+ +        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
+ +    }
+ +
+ +    /* generate table for PME; the pointer is reset to NULL first so that
+ +       init_ewald_coulomb_force_table allocates the device table */
+ +    nbp->coulomb_tab = NULL;
+ +    if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
+ +    {
+ +        init_ewald_coulomb_force_table(nbp);
+ +    }
+ +
+ +    nnbfp = 2*ntypes*ntypes;
+ +    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
+ +    cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
+ +
+ +    cudaChannelFormatDesc cd   = cudaCreateChannelDesc<float>();
+ +    stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+ +                           nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
+ +    CU_RET_ERR(stat, "cudaBindTexture on nbfp failed");
+ +}
+ +
+ +/*! Re-generate the GPU Ewald force table, resets rlist, and update the
+ + *  electrostatic type switching to twin cut-off (or back) if needed.
+ + *
+ + *  rlist_sq, rcoulomb_sq and ewald_beta are refreshed from ic; the twin
+ + *  cut-off variant is requested when ic->rcoulomb differs from ic->rvdw.
+ + *  NOTE(review): the table is regenerated unconditionally, also when an
+ + *  analytical Ewald kernel type was picked - confirm this is intended. */
+ +void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
+ +                                         const interaction_const_t *ic)
+ +{
+ +    cu_nbparam_t *nbp = cu_nb->nbparam;
+ +
+ +    nbp->rlist_sq       = ic->rlist * ic->rlist;
+ +    nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
+ +    nbp->ewald_beta     = ic->ewaldcoeff;
+ +
+ +    nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
+ +                                                 cu_nb->dev_info);
+ +
+ +    init_ewald_coulomb_force_table(cu_nb->nbparam);
+ +}
+ +
+ +/*! Initializes the pair list data structure: all array pointers are set to
+ +    NULL, all sizes to -1 and the pruning flag to false. Nothing is
+ +    allocated here. */
+ +static void init_plist(cu_plist_t *pl)
+ +{
+ +    /* initialize to NULL pointers to data that is not allocated here and will
+ +       need reallocation in nbnxn_cuda_init_pairlist */
+ +    pl->sci     = NULL;
+ +    pl->cj4     = NULL;
+ +    pl->excl    = NULL;
+ +
+ +    /* size -1 indicates that the respective array hasn't been initialized yet */
+ +    pl->na_c        = -1;
+ +    pl->nsci        = -1;
+ +    pl->sci_nalloc  = -1;
+ +    pl->ncj4        = -1;
+ +    pl->cj4_nalloc  = -1;
+ +    pl->nexcl       = -1;
+ +    pl->excl_nalloc = -1;
+ +    pl->bDoPrune    = false;
+ +}
+ +
+ +/*! Initializes the timer data structure.
+ +
+ +    Creates the start/stop event pair for atom data and, per stream, the
+ +    start/stop pairs for the kernel, pair-list H2D, nonbonded H2D and
+ +    nonbonded D2H timers. Index 1 of the per-stream arrays is only created
+ +    when bUseTwoStreams is true. All events are created with
+ +    cudaEventCreateWithFlags (the error strings say "cudaEventCreate"). */
+ +static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
+ +{
+ +    cudaError_t stat;
+ +    int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault );
+ +
+ +    stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
+ +    CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
+ +    stat = cudaEventCreateWithFlags(&(t->stop_atdat), eventflags);
+ +    CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
+ +
+ +    /* The non-local counters/stream (second in the array) are needed only with DD. */
+ +    for (int i = 0; i <= (bUseTwoStreams ? 1 : 0); i++)
+ +    {
+ +        stat = cudaEventCreateWithFlags(&(t->start_nb_k[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
+ +        stat = cudaEventCreateWithFlags(&(t->stop_nb_k[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
+ +
+ +
+ +        stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
+ +        stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
+ +
+ +        stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
+ +        stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
+ +
+ +        stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
+ +        stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[i]), eventflags);
+ +        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
+ +    }
+ +}
+ +
+ +/*! Initializes the timings data structure: every accumulated time is set to
+ +    0.0 and every call counter to 0, including all 2x2 entries of ktime. */
+ +static void init_timings(wallclock_gpu_t *t)
+ +{
+ +    int i, j;
+ +
+ +    t->nb_h2d_t = 0.0;
+ +    t->nb_d2h_t = 0.0;
+ +    t->nb_c    = 0;
+ +    t->pl_h2d_t = 0.0;
+ +    t->pl_h2d_c = 0;
+ +    for (i = 0; i < 2; i++)
+ +    {
+ +        for(j = 0; j < 2; j++)
+ +        {
+ +            t->ktime[i][j].t = 0.0;
+ +            t->ktime[i][j].c = 0;
+ +        }
+ +    }
+ +}
+ +
+ +/* Decide which kernel version to use (default or legacy) based on:
+ + *  - CUDA version used for compilation
+ + *  - non-bonded kernel selector environment variables
+ + *  - GPU architecture version
+ + *
+ + * Returns eNbnxnCuKDefault or eNbnxnCuKLegacy. Notes about a forced,
+ + * non-recommended choice are printed through md_print_warn, i.e. only when
+ + * fplog is non-NULL. Setting both GMX_CUDA_NB_LEGACY and GMX_CUDA_NB_DEFAULT
+ + * is a fatal error.
+ + */
+ +static int pick_nbnxn_kernel_version(FILE            *fplog,
+ +                                     cuda_dev_info_t *devinfo)
+ +{
+ +    bool bForceLegacyKernel, bForceDefaultKernel, bCUDA40, bCUDA32;
+ +    char sbuf[STRLEN];
+ +    int  kver;
+ +
+ +    /* Legacy kernel (former k2), kept for backward compatibility as it is
+ +       faster than the default with CUDA 3.2/4.0 on Fermi (not on Kepler). */
+ +    bForceLegacyKernel  = (getenv("GMX_CUDA_NB_LEGACY") != NULL);
+ +    /* default kernel (former k3). */
+ +    bForceDefaultKernel = (getenv("GMX_CUDA_NB_DEFAULT") != NULL);
+ +
+ +    if ((unsigned)(bForceLegacyKernel + bForceDefaultKernel) > 1)
+ +    {
+ +        gmx_fatal(FARGS, "Multiple CUDA non-bonded kernels requested; to manually pick a kernel set only one \n"
+ +                  "of the following environment variables: \n"
+ +                  "GMX_CUDA_NB_DEFAULT, GMX_CUDA_NB_LEGACY");
+ +    }
+ +
+ +    /* sbuf is only written, and only read below, when one of these two
+ +       flags is set. */
+ +    bCUDA32 = bCUDA40 = false;
+ +#if CUDA_VERSION == 3200
+ +    bCUDA32 = true;
+ +    sprintf(sbuf, "3.2");
+ +#elif CUDA_VERSION == 4000
+ +    bCUDA40 = true;
+ +    sprintf(sbuf, "4.0");
+ +#endif
+ +
+ +    /* default is default ;) */
+ +    kver = eNbnxnCuKDefault;
+ +
+ +    /* Consider switching to legacy kernels only on Fermi */
+ +    if (devinfo->prop.major < 3 && (bCUDA32 || bCUDA40))
+ +    {
+ +        /* use legacy kernel unless something else is forced by an env. var */
+ +        if (bForceDefaultKernel)
+ +        {
+ +            md_print_warn(fplog,
+ +                          "NOTE: CUDA %s compilation detected; with this compiler version the legacy\n"
+ +                          "      non-bonded kernels perform best. However, the default kernels were\n"
+ +                          "      selected by the GMX_CUDA_NB_DEFAULT environment variable.\n"
+ +                          "      For best performance upgrade your CUDA toolkit.\n",
+ +                          sbuf);
+ +        }
+ +        else
+ +        {
+ +            kver = eNbnxnCuKLegacy;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* issue note if the non-default kernel is forced by an env. var */
+ +        if (bForceLegacyKernel)
+ +        {
+ +            md_print_warn(fplog,
+ +                    "NOTE: Legacy non-bonded CUDA kernels selected by the GMX_CUDA_NB_LEGACY\n"
+ +                    "      env. var. Consider using the default kernels which should be faster!\n");
+ +
+ +            kver = eNbnxnCuKLegacy;
+ +        }
+ +    }
+ +
+ +    return kver;
+ +}
+ +
+ +void nbnxn_cuda_init(FILE *fplog,
+ +                     nbnxn_cuda_ptr_t *p_cu_nb,
-         bool bShouldUsePollSync = (bX86 && bTMPIAtomics && !gpu_info->bDevShare);
++                     const gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ +                     gmx_bool bLocalAndNonlocal)
+ +{
+ +    cudaError_t stat;
+ +    nbnxn_cuda_ptr_t  nb;
+ +    char sbuf[STRLEN];
+ +    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+ +    int cuda_drv_ver;
+ +
+ +    assert(gpu_info);
+ +
+ +    if (p_cu_nb == NULL) return;
+ +
+ +    snew(nb, 1);
+ +    snew(nb->atdat, 1);
+ +    snew(nb->nbparam, 1);
+ +    snew(nb->plist[eintLocal], 1);
+ +    if (bLocalAndNonlocal)
+ +    {
+ +        snew(nb->plist[eintNonlocal], 1);
+ +    }
+ +
+ +    nb->bUseTwoStreams = bLocalAndNonlocal;
+ +
+ +    snew(nb->timers, 1);
+ +    snew(nb->timings, 1);
+ +
+ +    /* init nbst */
+ +    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
+ +    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
+ +    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+ +
+ +    init_plist(nb->plist[eintLocal]);
+ +
+ +    /* local/non-local GPU streams */
+ +    stat = cudaStreamCreate(&nb->stream[eintLocal]);
+ +    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        init_plist(nb->plist[eintNonlocal]);
+ +        stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
+ +        CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
+ +    }
+ +
+ +    /* init events for synchronization (timing disabled for performance reasons!) */
+ +    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
+ +    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
+ +    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
+ +    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_one failed");
+ +
+ +    /* set device info, just point it to the right GPU among the detected ones */
+ +    nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)];
+ +
+ +    /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
+ +     * (which increases with shorter time/step) caused by a known CUDA driver bug.
+ +     * To work around the issue we'll use an (admittedly fragile) memory polling
+ +     * waiting to preserve performance. This requires support for atomic
+ +     * operations and only works on x86/x86_64.
+ +     * With polling wait event-timing also needs to be disabled.
+ +     *
+ +     * The overhead is greatly reduced in API v5.0 drivers and the improvement
+ +     * is independent of runtime version. Hence, with API v5.0 drivers and later
+ +     * we won't switch to polling.
+ +     *
+ +     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
+ +     * ranks, so we will also disable it in that case.
+ +     */
+ +
+ +    bStreamSync    = getenv("GMX_CUDA_STREAMSYNC") != NULL;
+ +    bNoStreamSync  = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
+ +
+ +#ifdef TMPI_ATOMICS
+ +    bTMPIAtomics = true;
+ +#else
+ +    bTMPIAtomics = false;
+ +#endif
+ +
+ +#if defined(i386) || defined(__x86_64__)
+ +    bX86 = true;
+ +#else
+ +    bX86 = false;
+ +#endif
+ +
+ +    if (bStreamSync && bNoStreamSync)
+ +    {
+ +        gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
+ +    }
+ +
+ +    stat = cudaDriverGetVersion(&cuda_drv_ver);
+ +    CU_RET_ERR(stat, "cudaDriverGetVersion failed");
+ +
+ +    bOldDriver = (cuda_drv_ver < 5000);
+ +
+ +    if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
+ +    {
+ +        /* Polling wait should be used instead of cudaStreamSynchronize only if:
+ +         *   - ECC is ON & driver is old (checked above),
+ +         *   - we're on x86/x86_64,
+ +         *   - atomics are available, and
+ +         *   - GPUs are not being shared.
+ +         */
++        bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
++                                   (gmx_count_gpu_dev_shared(gpu_info) < 1));
+ +
+ +        if (bStreamSync)
+ +        {
+ +            nb->bUseStreamSync = true;
+ +
+ +            /* only warn if polling should be used */
+ +            if (bShouldUsePollSync)
+ +            {
+ +                md_print_warn(fplog,
+ +                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
+ +                              "      cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
+ +            }
+ +        }
+ +        else
+ +        {
+ +            nb->bUseStreamSync = !bShouldUsePollSync;
+ +
+ +            if (bShouldUsePollSync)
+ +            {
+ +                md_print_warn(fplog,
+ +                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
+ +                              "      cause performance loss. Switching to the alternative polling GPU wait.\n"
+ +                              "      If you encounter issues, switch back to standard GPU waiting by setting\n"
+ +                              "      the GMX_CUDA_STREAMSYNC environment variable.\n");
+ +            }
+ +            else
+ +            {
+ +                /* Tell the user that the ECC+old driver combination can be bad */
+ +                sprintf(sbuf,
+ +                        "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
+ +                        "      A known bug in this driver version can cause performance loss.\n"
+ +                        "      However, the polling wait workaround can not be used because\n%s\n"
+ +                        "      Consider updating the driver or turning ECC off.",
+ +                        (bX86 && bTMPIAtomics) ?
+ +                            "      GPU(s) are being oversubscribed." :
+ +                            "      atomic operations are not supported by the platform/CPU+compiler.");
+ +                md_print_warn(fplog, sbuf);
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        if (bNoStreamSync)
+ +        {
+ +            nb->bUseStreamSync = false;
+ +
+ +            md_print_warn(fplog,
+ +                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
+ +        }
+ +        else
+ +        {
+ +            /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
+ +            nb->bUseStreamSync = true;
+ +        }
+ +    }
+ +
+ +    /* CUDA timing disabled as event timers don't work:
+ +       - with multiple streams = domain-decomposition;
+ +       - with the polling waiting hack (without cudaStreamSynchronize);
+ +       - when turned off by GMX_DISABLE_CUDA_TIMING.
+ +     */
+ +    nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
+ +                   (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
+ +
+ +    if (nb->bDoTime)
+ +    {
+ +        init_timers(nb->timers, nb->bUseTwoStreams);
+ +        init_timings(nb->timings);
+ +    }
+ +
+ +    /* set the kernel type for the current GPU */
+ +    nb->kernel_ver = pick_nbnxn_kernel_version(fplog, nb->dev_info);
+ +    /* pick L1 cache configuration */
+ +    nbnxn_cuda_set_cacheconfig(nb->dev_info);
+ +
+ +    *p_cu_nb = nb;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Initialized CUDA data structures.\n");
+ +    }
+ +}
+ +
+ +void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
+ +                           const interaction_const_t      *ic,
+ +                           const nonbonded_verlet_group_t *nbv_group)
+ +{
+ +    init_atomdata_first(cu_nb->atdat, nbv_group[0].nbat->ntype);
+ +    init_nbparam(cu_nb->nbparam, ic, nbv_group[0].nbat, cu_nb->dev_info);
+ +
+ +    /* clear energy and shift force outputs */
+ +    nbnxn_cuda_clear_e_fshift(cu_nb);
+ +}
+ +
+ +void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
+ +                              const nbnxn_pairlist_t *h_plist,
+ +                              int iloc)
+ +{
+ +    char         sbuf[STRLEN];
+ +    cudaError_t  stat;
+ +    bool         bDoTime    = cu_nb->bDoTime;
+ +    cudaStream_t stream     = cu_nb->stream[iloc];
+ +    cu_plist_t   *d_plist   = cu_nb->plist[iloc];
+ +
+ +    if (d_plist->na_c < 0)
+ +    {
+ +        d_plist->na_c = h_plist->na_ci;
+ +    }
+ +    else
+ +    {
+ +        if (d_plist->na_c != h_plist->na_ci)
+ +        {
+ +            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+ +                    d_plist->na_c, h_plist->na_ci);
+ +            gmx_incons(sbuf);
+ +        }
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
+ +                         &d_plist->nsci, &d_plist->sci_nalloc,
+ +                         h_plist->nsci,
+ +                         stream, true);
+ +
+ +    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
+ +                         &d_plist->ncj4, &d_plist->cj4_nalloc,
+ +                         h_plist->ncj4,
+ +                         stream, true);
+ +
+ +    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
+ +                         &d_plist->nexcl, &d_plist->excl_nalloc,
+ +                         h_plist->nexcl,
+ +                         stream, true);
+ +
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    /* need to prune the pair list during the next step */
+ +    d_plist->bDoPrune = true;
+ +}
+ +
+ +void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
+ +                                const nbnxn_atomdata_t *nbatom)
+ +{
+ +    cu_atomdata_t *adat = cu_nb->atdat;
+ +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+ +
+ +    /* only if we have a dynamic box */
+ +    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+ +    {
+ +        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 
+ +                          SHIFTS * sizeof(*adat->shift_vec), ls);
+ +        adat->bShiftVecUploaded = true;
+ +    }
+ +}
+ +
+ +/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
+ +static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
+ +{
+ +    cudaError_t   stat;
+ +    cu_atomdata_t *adat = cu_nb->atdat;
+ +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+ +
+ +    stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
+ +}
+ +
+ +/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
+ +static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
+ +{
+ +    cudaError_t   stat;
+ +    cu_atomdata_t *adat = cu_nb->atdat;
+ +    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+ +
+ +    stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
+ +    stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
+ +    stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
+ +}
+ +
+ +void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
+ +{
+ +    nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
+ +    /* clear shift force array and energies if the outputs were 
+ +       used in the current step */
+ +    if (flags & GMX_FORCE_VIRIAL)
+ +    {
+ +        nbnxn_cuda_clear_e_fshift(cu_nb);
+ +    }
+ +}
+ +
+ +void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
+ +                              const nbnxn_atomdata_t *nbat)
+ +{
+ +    cudaError_t   stat;
+ +    int           nalloc, natoms;
+ +    bool          realloced;
+ +    bool          bDoTime   = cu_nb->bDoTime;
+ +    cu_timers_t   *timers   = cu_nb->timers;
+ +    cu_atomdata_t *d_atdat  = cu_nb->atdat;
+ +    cudaStream_t  ls        = cu_nb->stream[eintLocal];
+ +
+ +    natoms = nbat->natoms;
+ +    realloced = false;
+ +
+ +    if (bDoTime)
+ +    {
+ +        /* time async copy */
+ +        stat = cudaEventRecord(timers->start_atdat, ls);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +
+ +    /* need to reallocate if we have to copy more atoms than the amount of space
+ +       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+ +    if (natoms > d_atdat->nalloc)
+ +    {
+ +        nalloc = over_alloc_small(natoms);
+ +
+ +        /* free up first if the arrays have already been initialized */
+ +        if (d_atdat->nalloc != -1)
+ +        {
+ +            cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
+ +            cu_free_buffered(d_atdat->xq);
+ +            cu_free_buffered(d_atdat->atom_types);
+ +        }
+ +
+ +        stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
+ +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
+ +        stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
+ +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
+ +
+ +        stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
+ +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
+ +
+ +        d_atdat->nalloc = nalloc;
+ +        realloced = true;
+ +    }
+ +
+ +    d_atdat->natoms = natoms;
+ +    d_atdat->natoms_local = nbat->natoms_local;
+ +
+ +    /* need to clear GPU f output if realloc happened */
+ +    if (realloced)
+ +    {
+ +        nbnxn_cuda_clear_f(cu_nb, nalloc);
+ +    }
+ +
+ +    cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
+ +                      natoms*sizeof(*d_atdat->atom_types), ls);
+ +
+ +    if (bDoTime)
+ +    {
+ +        stat = cudaEventRecord(timers->stop_atdat, ls);
+ +        CU_RET_ERR(stat, "cudaEventRecord failed");
+ +    }
+ +}
+ +
+ +void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
+ +{
+ +    cudaError_t     stat;
+ +    cu_atomdata_t   *atdat;
+ +    cu_nbparam_t    *nbparam;
+ +    cu_plist_t      *plist, *plist_nl;
+ +    cu_timers_t     *timers;
+ +
+ +    if (cu_nb == NULL) return;
+ +
+ +    atdat       = cu_nb->atdat;
+ +    nbparam     = cu_nb->nbparam;
+ +    plist       = cu_nb->plist[eintLocal];
+ +    plist_nl    = cu_nb->plist[eintNonlocal];
+ +    timers      = cu_nb->timers;
+ +
+ +    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
+ +    {
+ +      stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+ +      CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
+ +      cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
+ +    }
+ +
+ +    stat = cudaEventDestroy(cu_nb->nonlocal_done);
+ +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
+ +    stat = cudaEventDestroy(cu_nb->misc_ops_done);
+ +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
+ +
+ +    if (cu_nb->bDoTime)
+ +    {
+ +        stat = cudaEventDestroy(timers->start_atdat);
+ +        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
+ +        stat = cudaEventDestroy(timers->stop_atdat);
+ +        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
+ +
+ +        /* The non-local counters/stream (second in the array) are needed only with DD. */
+ +        for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
+ +        {
+ +            stat = cudaEventDestroy(timers->start_nb_k[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
+ +            stat = cudaEventDestroy(timers->stop_nb_k[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
+ +
+ +            stat = cudaEventDestroy(timers->start_pl_h2d[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
+ +            stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
+ +
+ +            stat = cudaStreamDestroy(cu_nb->stream[i]);
+ +            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
+ +
+ +            stat = cudaEventDestroy(timers->start_nb_h2d[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
+ +            stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
+ +
+ +            stat = cudaEventDestroy(timers->start_nb_d2h[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
+ +            stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
+ +            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
+ +        }
+ +    }
+ +
+ +    stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
+ +    CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
+ +    cu_free_buffered(nbparam->nbfp);
+ +
+ +    stat = cudaFree(atdat->shift_vec);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
+ +    stat = cudaFree(atdat->fshift);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
+ +
+ +    stat = cudaFree(atdat->e_lj);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
+ +    stat = cudaFree(atdat->e_el);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
+ +
+ +    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
+ +    cu_free_buffered(atdat->xq);
+ +    cu_free_buffered(atdat->atom_types, &atdat->ntypes);
+ +
+ +    cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
+ +    cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
+ +    cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
+ +    if (cu_nb->bUseTwoStreams)
+ +    {
+ +        cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
+ +        cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
+ +        cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist->excl_nalloc);
+ +    }
+ +
+ +    sfree(atdat);
+ +    sfree(nbparam);
+ +    sfree(plist);
+ +    if (cu_nb->bUseTwoStreams)
+ +    {
+ +        sfree(plist_nl);
+ +    }
+ +    sfree(timers);
+ +    sfree(cu_nb->timings);
+ +    sfree(cu_nb);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Cleaned up CUDA data structures.\n");
+ +    }
+ +}
+ +
+ +void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
+ +{
+ +    cudaError_t stat;
+ +    cudaStream_t stream = cu_nb->stream[iloc];
+ +
+ +    stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
+ +    CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
+ +}
+ +
+ +wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+ +{
+ +    return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
+ +}
+ +
+ +void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
+ +{
+ +    if (cu_nb->bDoTime)
+ +    {
+ +        init_timings(cu_nb->timings);
+ +    }
+ +}
+ +
+ +int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+ +{
+ +    return cu_nb != NULL ?
+ +        gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
+ +
+ +}
++
++gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
++{
++    return ((cu_nb->nbparam->eeltype == eelCuEWALD_ANA) ||
++            (cu_nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
++}
diff --cc src/gromacs/mdlib/nbnxn_search.h

index e97218213ea16da5ab1df2a2d126c283ba84f9b8,0000000000000000000000000000000000000000..a5117ccc4768b8a1811a752246aa8ac3ef5773da

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/nbnxn_search.h
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_search.h
@@@ -1,136 -1,0 +1,136 @@@
- #define _nsnxn_search_h
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2012, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ + *
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +
+ +#ifndef _nbnxn_search_h
++#define _nbnxn_search_h
+ +
+ +#include "typedefs.h"
+ +
+ +#ifdef __cplusplus
+ +extern "C" {
+ +#endif
+ +
+ +
+ +/* Returns the j-cluster size for kernel of type nb_kernel_type */
+ +int nbnxn_kernel_to_cj_size(int nb_kernel_type);
+ +
+ +/* Tells if the pair-list corresponding to nb_kernel_type is simple.
+ + * Returns FALSE for super-sub type pair-list.
+ + */
+ +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type);
+ +
+ +/* Due to the cluster size the effective pair-list is longer than
+ + * that of a simple atom pair-list. This function gives the extra distance.
+ + */
+ +real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density);
+ +
+ +/* Allocates and initializes a pair search data structure */
+ +void nbnxn_init_search(nbnxn_search_t    * nbs_ptr,
+ +                       ivec               *n_dd_cells,
+ +                       gmx_domdec_zones_t *zones,
+ +                       int                 nthread_max);
+ +
+ +/* Put the atoms on the pair search grid.
+ + * Only atoms a0 to a1 in x are put on the grid.
+ + * The atom_density is used to determine the grid size.
+ + * When atom_density=-1, the density is determined from a1-a0 and the corners.
+ + * With domain decomposition part of the n particles might have migrated,
+ + * but have not been removed yet. This count is given by nmoved.
+ + * When move[i] < 0 particle i has migrated and will not be put on the grid.
+ + * Without domain decomposition move will be NULL.
+ + */
+ +void nbnxn_put_on_grid(nbnxn_search_t nbs,
+ +                       int ePBC, matrix box,
+ +                       int dd_zone,
+ +                       rvec corner0, rvec corner1,
+ +                       int a0, int a1,
+ +                       real atom_density,
+ +                       const int *atinfo,
+ +                       rvec *x,
+ +                       int nmoved, int *move,
+ +                       int nb_kernel_type,
+ +                       nbnxn_atomdata_t *nbat);
+ +
+ +/* As nbnxn_put_on_grid, but for the non-local atoms
+ + * with domain decomposition. Should be called after calling
+ + * nbnxn_put_on_grid for the local atoms / home zone.
+ + */
+ +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t            nbs,
+ +                                const gmx_domdec_zones_t *zones,
+ +                                const int                *atinfo,
+ +                                rvec                     *x,
+ +                                int                       nb_kernel_type,
+ +                                nbnxn_atomdata_t         *nbat);
+ +
+ +/* Add simple grid type information to the local super/sub grid */
+ +void nbnxn_grid_add_simple(nbnxn_search_t    nbs,
+ +                           nbnxn_atomdata_t *nbat);
+ +
+ +/* Return the number of x and y cells in the local grid */
+ +void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy);
+ +
+ +/* Return the order indices *a of the atoms on the ns grid, size n */
+ +void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n);
+ +
+ +/* Renumber the atom indices on the grid to consecutive order */
+ +void nbnxn_set_atomorder(nbnxn_search_t nbs);
+ +
+ +/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */
+ +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
+ +                             gmx_bool simple, gmx_bool combined,
+ +                             nbnxn_alloc_t *alloc,
+ +                             nbnxn_free_t  *free);
+ +
+ +/* Make a pair-list with radius rlist, store it in nbl.
+ + * The parameter min_ci_balanced sets the minimum required
+ + * number of roughly equally sized ci blocks in nbl.
+ + * When set >0 ci lists will be chopped up when the estimate
+ + * for the number of equally sized lists is below min_ci_balanced.
+ + */
+ +void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
+ +                         nbnxn_atomdata_t     *nbat,
+ +                         const t_blocka       *excl,
+ +                         real                  rlist,
+ +                         int                   min_ci_balanced,
+ +                         nbnxn_pairlist_set_t *nbl_list,
+ +                         int                   iloc,
+ +                         int                   nb_kernel_type,
+ +                         t_nrnb               *nrnb);
+ +
+ +#ifdef __cplusplus
+ +}
+ +#endif
+ +
+ +#endif
diff --cc src/gromacs/mdlib/qm_orca.c

index a8ee061002abc6cb2d8a26e28e7a3fbcef054aab,0000000000000000000000000000000000000000..d33f231b7d71a47341873207a1f611aeb38579fe

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/qm_orca.c
--- /dev/null
+++ b/src/gromacs/mdlib/qm_orca.c
@@@ -1,527 -1,0 +1,528 @@@
-     char
-     *buf;
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 4.5
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2008, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Groningen Machine for Chemical Simulation
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include <math.h>
+ +#include "sysstuff.h"
+ +#include "typedefs.h"
+ +#include "macros.h"
+ +#include "smalloc.h"
+ +#include "physics.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "force.h"
+ +#include "invblock.h"
+ +#include "confio.h"
+ +#include "names.h"
+ +#include "network.h"
+ +#include "pbc.h"
+ +#include "ns.h"
+ +#include "nrnb.h"
+ +#include "bondf.h"
+ +#include "mshift.h"
+ +#include "txtdump.h"
+ +#include "qmmm.h"
+ +#include <stdio.h>
+ +#include <string.h>
+ +#include "gmx_fatal.h"
+ +#include "typedefs.h"
+ +#include <stdlib.h>
+ +
+ +/* ORCA interface routines */
+ +
+ +void init_orca(t_commrec *cr, t_QMrec *qm, t_MMrec *mm)
+ +{
-         gmx_fatal(FARGS, "no $BASENAME\n");
++    char *buf;
+ +    snew(buf, 200);
++
+ +    /* ORCA settings on the system */
+ +    buf = getenv("BASENAME");
+ +    if (buf)
+ +    {
+ +        snew(qm->orca_basename, 200);
+ +        sscanf(buf, "%s", qm->orca_basename);
+ +    }
+ +    else
+ +    {
-     fprintf(stderr, "%s", buf);
++        gmx_fatal(FARGS, "$BASENAME not set\n");
+ +    }
+ +
+ +    /* ORCA directory on the system */
+ +    snew(buf, 200);
+ +    buf = getenv("ORCA_PATH");
-         gmx_fatal(FARGS, "no $ORCA_PATH, check manual\n");
+ +
+ +    if (buf)
+ +    {
+ +        snew(qm->orca_dir, 200);
+ +        sscanf(buf, "%s", qm->orca_dir);
+ +    }
+ +    else
+ +    {
-     fprintf(stderr, "%s...\n", qm->orca_dir);
-     fprintf(stderr, "orca initialised...\n");
++        gmx_fatal(FARGS, "$ORCA_PATH not set, check manual\n");
+ +    }
+ +
-     int
-         i;
-     t_QMMMrec
-        *QMMMrec;
-     FILE
-        *out, *pcFile, *addInputFile, *LJCoeff;
-     char
-        *buf, *orcaInput, *addInputFilename, *LJCoeffFilename,
-     *pcFilename, *exclInName, *exclOutName;
++    fprintf(stderr, "Setting ORCA path to: %s...\n", qm->orca_dir);
++    fprintf(stderr, "ORCA initialised...\n\n");
+ +    /* since we append the output to the BASENAME.out file,
+ +       we should delete an existent old out-file here. */
+ +    sprintf(buf, "%s.out", qm->orca_basename);
+ +    remove(buf);
+ +}
+ +
+ +
+ +void write_orca_input(int step, t_forcerec *fr, t_QMrec *qm, t_MMrec *mm)
+ +{
-     fprintf(out, "#input-file generated by gromacs\n");
++    int i;
++    t_QMMMrec *QMMMrec;
++    FILE *out, *pcFile, *addInputFile, *LJCoeff;
++    char *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
++
+ +    QMMMrec = fr->qr;
++
+ +    /* write the first part of the input-file */
+ +    snew(orcaInput, 200);
+ +    sprintf(orcaInput, "%s.inp", qm->orca_basename);
+ +    out = fopen(orcaInput, "w");
++
+ +    snew(addInputFilename, 200);
+ +    sprintf(addInputFilename, "%s.ORCAINFO", qm->orca_basename);
+ +    addInputFile = fopen(addInputFilename, "r");
-         fprintf(stderr, "No information on the calculation given in <%s>\n", addInputFilename);
-         gmx_call("qm_orca.c");
++
++    fprintf(out, "#input-file generated by GROMACS\n");
++
+ +    if (qm->bTS)
+ +    {
+ +        fprintf(out, "!QMMMOpt TightSCF\n");
+ +        fprintf(out, "%s\n", "%geom TS_Search EF end");
+ +    }
+ +    else if (qm->bOPT)
+ +    {
+ +        fprintf(out, "!QMMMOpt TightSCF\n");
+ +    }
+ +    else
+ +    {
+ +        fprintf(out, "!EnGrad TightSCF\n");
+ +    }
++
+ +    /* here we include the insertion of the additional orca-input */
+ +    snew(buf, 200);
+ +    if (addInputFile != NULL)
+ +    {
+ +        while (!feof(addInputFile))
+ +        {
+ +            if (fgets(buf, 200, addInputFile) != NULL)
+ +            {
+ +                fputs(buf, out);
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
-     /* write charge and multiplicity
-      */
++        gmx_fatal(FARGS, "No information on the calculation given in %s\n", addInputFilename);
+ +    }
++
+ +    fclose(addInputFile);
++
+ +    if (qm->bTS || qm->bOPT)
+ +    {
+ +        /* freeze the frontier QM atoms and Link atoms. This is
+ +         * important only if a full QM subsystem optimization is done
+ +         * with a frozen MM environment. For dynamics, or gromacs's own
+ +         * optimization routines this is not important.
+ +         */
+ +        /* ORCA reads the exclusions from LJCoeffFilename.Excl,
+ +         * so we have to rename the file
+ +         */
+ +        int didStart = 0;
+ +        for (i = 0; i < qm->nrQMatoms; i++)
+ +        {
+ +            if (qm->frontatoms[i])
+ +            {
+ +                if (!didStart)
+ +                {
+ +                    fprintf(out, "%s\n", "%geom");
+ +                    fprintf(out, "   Constraints \n");
+ +                    didStart = 1;
+ +                }
+ +                fprintf(out, "        {C %d C}\n", i); /* counting from 0 */
+ +            }
+ +        }
+ +        if (didStart)
+ +        {
+ +            fprintf(out, "     end\n   end\n");
+ +        }
+ +        /* make a file with information on the C6 and C12 coefficients */
+ +        if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
+ +        {
+ +            snew(exclInName, 200);
+ +            snew(exclOutName, 200);
+ +            sprintf(exclInName, "QMMMexcl.dat");
+ +            sprintf(exclOutName, "%s.LJ.Excl", qm->orca_basename);
+ +            rename(exclInName, exclOutName);
+ +            snew(LJCoeffFilename, 200);
+ +            sprintf(LJCoeffFilename, "%s.LJ", qm->orca_basename);
+ +            fprintf(out, "%s%s%s\n", "%LJCOEFFICIENTS \"", LJCoeffFilename, "\"");
+ +            /* make a file with information on the C6 and C12 coefficients */
+ +            LJCoeff = fopen(LJCoeffFilename, "w");
+ +            fprintf(LJCoeff, "%d\n", qm->nrQMatoms);
+ +            for (i = 0; i < qm->nrQMatoms; i++)
+ +            {
+ +#ifdef GMX_DOUBLE
+ +                fprintf(LJCoeff, "%10.7lf  %10.7lf\n", qm->c6[i], qm->c12[i]);
+ +#else
+ +                fprintf(LJCoeff, "%10.7f  %10.7f\n", qm->c6[i], qm->c12[i]);
+ +#endif
+ +            }
+ +            fprintf(LJCoeff, "%d\n", mm->nrMMatoms);
+ +            for (i = 0; i < mm->nrMMatoms; i++)
+ +            {
+ +#ifdef GMX_DOUBLE
+ +                fprintf(LJCoeff, "%10.7lf  %10.7lf\n", mm->c6[i], mm->c12[i]);
+ +#else
+ +                fprintf(LJCoeff, "%10.7f  %10.7f\n", mm->c6[i], mm->c12[i]);
+ +#endif
+ +            }
+ +            fclose(LJCoeff);
+ +        }
+ +    }
-     /* write the QM coordinates
-      */
++
++    /* write charge and multiplicity */
+ +    fprintf(out, "*xyz %2d%2d\n", qm->QMcharge, qm->multiplicity);
-     /* write the MM point charge data
-      */
++
++    /* write the QM coordinates */
+ +    for (i = 0; i < qm->nrQMatoms; i++)
+ +    {
+ +        int atomNr;
+ +        if (qm->atomicnumberQM[i] == 0)
+ +        {
+ +            atomNr = 1;
+ +        }
+ +        else
+ +        {
+ +            atomNr = qm->atomicnumberQM[i];
+ +        }
+ +#ifdef GMX_DOUBLE
+ +        fprintf(out, "%3d %10.7lf  %10.7lf  %10.7lf\n",
+ +                atomNr,
+ +                qm->xQM[i][XX]/0.1,
+ +                qm->xQM[i][YY]/0.1,
+ +                qm->xQM[i][ZZ]/0.1);
+ +#else
+ +        fprintf(out, "%3d %10.7f  %10.7f  %10.7f\n",
+ +                atomNr,
+ +                qm->xQM[i][XX]/0.1,
+ +                qm->xQM[i][YY]/0.1,
+ +                qm->xQM[i][ZZ]/0.1);
+ +#endif
+ +    }
+ +    fprintf(out, "*\n");
++
++    /* write the MM point charge data */
+ +    if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
+ +    {
+ +        /* name of the point charge file */
+ +        snew(pcFilename, 200);
+ +        sprintf(pcFilename, "%s.pc", qm->orca_basename);
+ +        fprintf(out, "%s%s%s\n", "%pointcharges \"", pcFilename, "\"");
+ +        pcFile = fopen(pcFilename, "w");
+ +        fprintf(pcFile, "%d\n", mm->nrMMatoms);
+ +        for (i = 0; i < mm->nrMMatoms; i++)
+ +        {
+ +#ifdef GMX_DOUBLE
+ +            fprintf(pcFile, "%8.4lf %10.7lf  %10.7lf  %10.7lf\n",
+ +                    mm->MMcharges[i],
+ +                    mm->xMM[i][XX]/0.1,
+ +                    mm->xMM[i][YY]/0.1,
+ +                    mm->xMM[i][ZZ]/0.1);
+ +#else
+ +            fprintf(pcFile, "%8.4f %10.7f  %10.7f  %10.7f\n",
+ +                    mm->MMcharges[i],
+ +                    mm->xMM[i][XX]/0.1,
+ +                    mm->xMM[i][YY]/0.1,
+ +                    mm->xMM[i][ZZ]/0.1);
+ +#endif
+ +        }
+ +        fprintf(pcFile, "\n");
+ +        fclose(pcFile);
+ +    }
+ +    fprintf(out, "\n");
+ +
+ +    fclose(out);
+ +}  /* write_orca_input */
+ +
+ +/* Read the results of an ORCA run back from the files ORCA wrote.
+ + *
+ + * - If qm->bTS or qm->bOPT is set, the QM coordinates are read from
+ + *   <orca_basename>.xyz (two header lines are skipped) and scaled by 0.1
+ + *   into qm->xQM (presumably Angstrom to nm, mirroring the /0.1 applied
+ + *   when the input is written - confirm).
+ + * - The QM energy and the QM gradient are read from <orca_basename>.engrad;
+ + *   the gradient is stored in QMgrad, one component per line in the file.
+ + * - Unless the ONIOM scheme is used or there are no MM atoms, the gradient
+ + *   on the MM point charges is read from <orca_basename>.pcgrad into MMgrad.
+ + *
+ + * Returns the energy exactly as parsed from the engrad file; no unit
+ + * conversion is done here. A file that cannot be opened or that ends
+ + * prematurely is a fatal error. The step argument is not used.
+ + */
+ +real read_orca_output(rvec QMgrad[], rvec MMgrad[], int step, t_forcerec *fr,
+ +                      t_QMrec *qm, t_MMrec *mm)
+ +{
+ +    int
+ +        i, j, atnum;
+ +    char
+ +        buf[300], tmp[300], orca_xyzFilename[300], orca_pcgradFilename[300], orca_engradFilename[300];
+ +    real
+ +        QMener;
+ +    FILE
+ +       *xyz, *pcgrad, *engrad;
+ +    int k;
+ +    t_QMMMrec
+ +       *QMMMrec;
+ +    QMMMrec = fr->qr;
+ +    /* in case of an optimization, the coordinates are printed in the
+ +     * xyz file, the energy and gradients for the QM part are stored in the engrad file
+ +     * and the gradients for the point charges are stored in the pc file.
+ +     */
+ +
+ +    /* we need the new xyz coordinates of the QM atoms only for separate QM-optimization
+ +     */
+ +
+ +    if (qm->bTS || qm->bOPT)
+ +    {
+ +        sprintf(orca_xyzFilename, "%s.xyz", qm->orca_basename);
+ +        xyz = fopen(orca_xyzFilename, "r");
+ +        if (xyz == NULL)
+ +        {
+ +            /* Without this check the fgets() below would be handed a NULL stream */
+ +            gmx_fatal(FARGS, "Could not open ORCA output file %s", orca_xyzFilename);
+ +        }
+ +        /* skip the two header lines of the xyz file */
+ +        if (fgets(buf, 300, xyz) == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +        }
+ +        if (fgets(buf, 300, xyz) == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +        }
+ +        for (i = 0; i < qm->nrQMatoms; i++)
+ +        {
+ +            if (fgets(buf, 300, xyz) == NULL)
+ +            {
+ +                gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +            }
+ +            /* NOTE(review): the first field is parsed as a string in double
+ +             * precision but as an integer in single precision - confirm
+ +             * which one matches the xyz files ORCA writes.
+ +             */
+ +#ifdef GMX_DOUBLE
+ +            sscanf(buf, "%s%lf%lf%lf\n",
+ +                   tmp,
+ +                   &qm->xQM[i][XX],
+ +                   &qm->xQM[i][YY],
+ +                   &qm->xQM[i][ZZ]);
+ +#else
+ +            sscanf(buf, "%d%f%f%f\n",
+ +                   &atnum,
+ +                   &qm->xQM[i][XX],
+ +                   &qm->xQM[i][YY],
+ +                   &qm->xQM[i][ZZ]);
+ +#endif
+ +            for (j = 0; j < DIM; j++)
+ +            {
+ +                qm->xQM[i][j] *= 0.1;
+ +            }
+ +        }
+ +        fclose(xyz);
+ +    }
+ +    sprintf(orca_engradFilename, "%s.engrad", qm->orca_basename);
+ +    engrad = fopen(orca_engradFilename, "r");
+ +    if (engrad == NULL)
+ +    {
+ +        gmx_fatal(FARGS, "Could not open ORCA output file %s", orca_engradFilename);
+ +    }
+ +    /* we read the energy and the gradient for the qm-atoms from the engrad file
+ +     */
+ +    /* we can skip the first seven lines
+ +     */
+ +    for (j = 0; j < 7; j++)
+ +    {
+ +        if (fgets(buf, 300, engrad) == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +        }
+ +    }
+ +    /* now comes the energy
+ +     */
+ +    if (fgets(buf, 300, engrad) == NULL)
+ +    {
+ +        gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +    }
+ +#ifdef GMX_DOUBLE
+ +    sscanf(buf, "%lf\n", &QMener);
+ +#else
+ +    sscanf(buf, "%f\n", &QMener);
+ +#endif
+ +    /* we can skip the next three lines
+ +     */
+ +    for (j = 0; j < 3; j++)
+ +    {
+ +        if (fgets(buf, 300, engrad) == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +        }
+ +    }
+ +    /* next lines contain the gradients of the QM atoms
+ +     * now comes the gradient, one value per line:
+ +     * (atom1 x \n atom1 y \n atom1 z \n atom2 x ...
+ +     */
+ +
+ +    for (i = 0; i < 3*qm->nrQMatoms; i++)
+ +    {
+ +        /* k is the atom index, i%3 selects the component */
+ +        k = i/3;
+ +        if (fgets(buf, 300, engrad) == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +        }
+ +#ifdef GMX_DOUBLE
+ +        if (i%3 == 0)
+ +        {
+ +            sscanf(buf, "%lf\n", &QMgrad[k][XX]);
+ +        }
+ +        else if (i%3 == 1)
+ +        {
+ +            sscanf(buf, "%lf\n", &QMgrad[k][YY]);
+ +        }
+ +        else if (i%3 == 2)
+ +        {
+ +            sscanf(buf, "%lf\n", &QMgrad[k][ZZ]);
+ +        }
+ +#else
+ +        if (i%3 == 0)
+ +        {
+ +            sscanf(buf, "%f\n", &QMgrad[k][XX]);
+ +        }
+ +        else if (i%3 == 1)
+ +        {
+ +            sscanf(buf, "%f\n", &QMgrad[k][YY]);
+ +        }
+ +        else if (i%3 == 2)
+ +        {
+ +            sscanf(buf, "%f\n", &QMgrad[k][ZZ]);
+ +        }
+ +#endif
+ +    }
+ +    fclose(engrad);
+ +    /* read the gradient on the MM point charges
+ +     */
+ +    if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
+ +    {
+ +        sprintf(orca_pcgradFilename, "%s.pcgrad", qm->orca_basename);
+ +        pcgrad = fopen(orca_pcgradFilename, "r");
+ +        if (pcgrad == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Could not open ORCA output file %s", orca_pcgradFilename);
+ +        }
+ +
+ +        /* we read the gradient for the mm-atoms from the pcgrad file
+ +         */
+ +        /* we can skip the first line
+ +         */
+ +        if (fgets(buf, 300, pcgrad) == NULL)
+ +        {
+ +            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +        }
+ +        for (i = 0; i < mm->nrMMatoms; i++)
+ +        {
+ +            if (fgets(buf, 300, pcgrad) == NULL)
+ +            {
+ +                gmx_fatal(FARGS, "Unexpected end of ORCA output");
+ +            }
+ +#ifdef GMX_DOUBLE
+ +            sscanf(buf, "%lf%lf%lf\n",
+ +                   &MMgrad[i][XX],
+ +                   &MMgrad[i][YY],
+ +                   &MMgrad[i][ZZ]);
+ +#else
+ +            sscanf(buf, "%f%f%f\n",
+ +                   &MMgrad[i][XX],
+ +                   &MMgrad[i][YY],
+ +                   &MMgrad[i][ZZ]);
+ +#endif
+ +        }
+ +        fclose(pcgrad);
+ +    }
+ +    return(QMener);
+ +}
+ +
+ +/* Run the ORCA binary on <basename>.inp through system().
+ + *
+ + * The command is "<orca_dir>/orca <basename>.inp >> <basename>.out", so the
+ + * output of successive calls is appended to the same .out file.
+ + * A command line that does not fit the buffer, or a non-zero return from
+ + * system(), is a fatal error. The step and exe arguments are not used.
+ + */
+ +void do_orca(int step, char *exe, char *orca_dir, char *basename)
+ +{
+ +    /* Large enough for ordinary paths; longer ones are rejected below
+ +     * instead of overrunning the buffer.
+ +     */
+ +    char
+ +        buf[4096];
+ +    int
+ +        len;
+ +
+ +    len = snprintf(buf, sizeof(buf), "%s/%s %s.inp >> %s.out",
+ +                   orca_dir,
+ +                   "orca",
+ +                   basename,
+ +                   basename);
+ +    if (len < 0 || (size_t)len >= sizeof(buf))
+ +    {
+ +        gmx_fatal(FARGS, "The ORCA command line for directory '%s' and basename '%s' is too long",
+ +                  orca_dir, basename);
+ +    }
+ +    fprintf(stderr, "Calling '%s'\n", buf);
+ +    if (system(buf) != 0)
+ +    {
+ +        gmx_fatal(FARGS, "Call to '%s' failed\n", buf);
+ +    }
+ +}
+ +
+ +/* Perform one ORCA calculation: write the input, run ORCA, read the output.
+ + *
+ + * The gradients read back are multiplied by HARTREE_BOHR2MD and stored
+ + * (assigned, not accumulated) in f and fshift: entries 0..nrQMatoms-1 hold
+ + * the QM atoms, followed by the nrMMatoms MM atoms.
+ + * Returns the QM energy multiplied by HARTREE2KJ*AVOGADRO.
+ + * A static counter is incremented on every call and passed on as the step.
+ + * The cr argument is not used.
+ + */
+ +real call_orca(t_commrec *cr,  t_forcerec *fr,
+ +               t_QMrec *qm, t_MMrec *mm, rvec f[], rvec fshift[])
+ +{
+ +    /* normal orca jobs */
+ +    static int
+ +        step = 0;
+ +    int
+ +        i, j;
+ +    real
+ +        QMener;
+ +    rvec
+ +       *QMgrad, *MMgrad;
+ +    char
+ +       *exe;
+ +
+ +    snew(exe, 30);
+ +    sprintf(exe, "%s", "orca");
+ +    snew(QMgrad, qm->nrQMatoms);
+ +    snew(MMgrad, mm->nrMMatoms);
+ +
+ +    write_orca_input(step, fr, qm, mm);
+ +    do_orca(step, exe, qm->orca_dir, qm->orca_basename);
+ +    QMener = read_orca_output(QMgrad, MMgrad, step, fr, qm, mm);
+ +    /* put the QMMM forces in the force array and to the fshift
+ +     */
+ +    for (i = 0; i < qm->nrQMatoms; i++)
+ +    {
+ +        for (j = 0; j < DIM; j++)
+ +        {
+ +            f[i][j]      = HARTREE_BOHR2MD*QMgrad[i][j];
+ +            fshift[i][j] = HARTREE_BOHR2MD*QMgrad[i][j];
+ +        }
+ +    }
+ +    for (i = 0; i < mm->nrMMatoms; i++)
+ +    {
+ +        for (j = 0; j < DIM; j++)
+ +        {
+ +            f[i+qm->nrQMatoms][j]      = HARTREE_BOHR2MD*MMgrad[i][j];
+ +            fshift[i+qm->nrQMatoms][j] = HARTREE_BOHR2MD*MMgrad[i][j];
+ +        }
+ +    }
+ +    QMener = QMener*HARTREE2KJ*AVOGADRO;
+ +    step++;
+ +    /* Release the per-call work arrays; they used to be leaked on every
+ +     * call. free() is used to match the existing release of exe.
+ +     */
+ +    free(QMgrad);
+ +    free(MMgrad);
+ +    free(exe);
+ +    return(QMener);
+ +} /* call_orca */
+ +
+ +/* end of orca sub routines */
diff --cc src/gromacs/mdlib/sim_util.c

index c0fd4cd0078aeb570273dd2269eb679a6b61ac6c,0000000000000000000000000000000000000000..46ada2a183b202e27ffa55720f2459c5e0550c53

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/sim_util.c
--- /dev/null
+++ b/src/gromacs/mdlib/sim_util.c
@@@ -1,2747 -1,0 +1,2751 @@@
-     if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROwing Monsters And Cloning Shrimps
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#ifdef GMX_CRAY_XT3
+ +#include <catamount/dclock.h>
+ +#endif
+ +
+ +
+ +#include <stdio.h>
+ +#include <time.h>
+ +#ifdef HAVE_SYS_TIME_H
+ +#include <sys/time.h>
+ +#endif
+ +#include <math.h>
+ +#include "typedefs.h"
+ +#include "string2.h"
+ +#include "gmxfio.h"
+ +#include "smalloc.h"
+ +#include "names.h"
+ +#include "confio.h"
+ +#include "mvdata.h"
+ +#include "txtdump.h"
+ +#include "pbc.h"
+ +#include "chargegroup.h"
+ +#include "vec.h"
+ +#include <time.h>
+ +#include "nrnb.h"
+ +#include "mshift.h"
+ +#include "mdrun.h"
+ +#include "sim_util.h"
+ +#include "update.h"
+ +#include "physics.h"
+ +#include "main.h"
+ +#include "mdatoms.h"
+ +#include "force.h"
+ +#include "bondf.h"
+ +#include "pme.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "network.h"
+ +#include "calcmu.h"
+ +#include "constr.h"
+ +#include "xvgr.h"
+ +#include "trnio.h"
+ +#include "xtcio.h"
+ +#include "copyrite.h"
+ +#include "pull_rotation.h"
+ +#include "gmx_random.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "gmx_wallcycle.h"
+ +#include "genborn.h"
+ +#include "nbnxn_atomdata.h"
+ +#include "nbnxn_search.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_ref.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
+ +#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+ +
+ +#include "gromacs/utility/gmxmpi.h"
+ +
+ +#include "adress.h"
+ +#include "qmmm.h"
+ +
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +#include "nbnxn_cuda/nbnxn_cuda.h"
+ +
+ +#if 0
+ +typedef struct gmx_timeprint {
+ +
+ +} t_gmx_timeprint;
+ +#endif
+ +
+ +/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
+ +char *
+ +gmx_ctime_r(const time_t *clock, char *buf, int n);
+ +
+ +
+ +/* Return the current time in seconds as a double.
+ + * With HAVE_GETTIMEOFDAY the value includes the microsecond part reported
+ + * by gettimeofday(); otherwise it is the whole-second value of time().
+ + */
+ +double
+ +gmx_gettime()
+ +{
+ +#ifdef HAVE_GETTIMEOFDAY
+ +    struct timeval now;
+ +
+ +    gettimeofday(&now, NULL);
+ +
+ +    return (double) now.tv_sec + 1e-6*(double)now.tv_usec;
+ +#else
+ +    return (double) time(NULL);
+ +#endif
+ +}
+ +
+ +
+ +#define difftime(end, start) ((double)(end)-(double)(start))
+ +
+ +/* Print a one-line progress report for the given step to out.
+ + *
+ + * Once step has reached ir->nstlist, runtime->last and
+ + * runtime->time_per_step are updated and the line is extended with either
+ + * the expected finishing date (300 s or more left), the remaining run time
+ + * in seconds, or, when ir->nsteps is negative, the performance in ns/day.
+ + * The stream is flushed before returning.
+ + */
+ +void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
+ +                t_inputrec *ir, t_commrec *cr)
+ +{
+ +    time_t finish;
+ +    char   datebuf[STRLEN];
+ +    double elapsed, remaining;
+ +    char   buf[48];
+ +
+ +    /* Return to the start of the line; without thread-MPI this is only
+ +     * done for non-parallel runs.
+ +     */
+ +#ifndef GMX_THREAD_MPI
+ +    if (!PAR(cr))
+ +#endif
+ +    {
+ +        fprintf(out, "\r");
+ +    }
+ +    fprintf(out, "step %s", gmx_step_str(step, buf));
+ +    if (step >= ir->nstlist)
+ +    {
+ +        runtime->last          = gmx_gettime();
+ +        elapsed                = difftime(runtime->last, runtime->real);
+ +        runtime->time_per_step = elapsed/(step - ir->init_step + 1);
+ +
+ +        remaining = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
+ +
+ +        if (ir->nsteps < 0)
+ +        {
+ +            fprintf(out, " performance: %.1f ns/day    ",
+ +                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
+ +        }
+ +        else if (remaining >= 300)
+ +        {
+ +            finish = (time_t) (runtime->last + remaining);
+ +            gmx_ctime_r(&finish, datebuf, STRLEN);
+ +            sprintf(buf, "%s", datebuf);
+ +            /* drop the last character of the date string */
+ +            buf[strlen(buf)-1] = '\0';
+ +            fprintf(out, ", will finish %s", buf);
+ +        }
+ +        else
+ +        {
+ +            fprintf(out, ", remaining runtime: %5d s          ", (int)remaining);
+ +        }
+ +    }
+ +#ifndef GMX_THREAD_MPI
+ +    if (PAR(cr))
+ +    {
+ +        fprintf(out, "\n");
+ +    }
+ +#endif
+ +
+ +    fflush(out);
+ +}
+ +
+ +#ifdef NO_CLOCK
+ +#define clock() -1
+ +#endif
+ +
+ +/* Store the current processor clock in runtime->proc and return the time
+ + * in seconds elapsed since the previously stored value.
+ + * A negative difference is reported as 0.
+ + */
+ +static double set_proctime(gmx_runtime_t *runtime)
+ +{
+ +    double elapsed;
+ +#ifdef GMX_CRAY_XT3
+ +    double previous = runtime->proc;
+ +
+ +    runtime->proc = dclock();
+ +    elapsed       = runtime->proc - previous;
+ +#else
+ +    clock_t previous = runtime->proc;
+ +
+ +    runtime->proc = clock();
+ +    elapsed       = (double)(runtime->proc - previous)/(double)CLOCKS_PER_SEC;
+ +#endif
+ +
+ +    /* The counter has probably looped, ignore this data */
+ +    return (elapsed < 0) ? 0 : elapsed;
+ +}
+ +
+ +/* Reset all timers in runtime and record the current wall-clock and
+ + * processor time as the starting point.
+ + */
+ +void runtime_start(gmx_runtime_t *runtime)
+ +{
+ +    /* clear the accumulated values */
+ +    runtime->realtime      = 0;
+ +    runtime->proctime      = 0;
+ +    runtime->last          = 0;
+ +    runtime->time_per_step = 0;
+ +
+ +    /* take the reference readings; the return value of set_proctime()
+ +     * is deliberately discarded, the call only stores the current clock
+ +     */
+ +    runtime->real = gmx_gettime();
+ +    runtime->proc = 0;
+ +    set_proctime(runtime);
+ +}
+ +
+ +/* Close the current timing interval: add the processor time used since the
+ + * last reading to runtime->proctime, set runtime->realtime to the wall-clock
+ + * time since runtime->real, and move runtime->real to the current time.
+ + */
+ +void runtime_end(gmx_runtime_t *runtime)
+ +{
+ +    const double now       = gmx_gettime();
+ +    const double proc_used = set_proctime(runtime);
+ +
+ +    runtime->proctime += proc_used;
+ +    runtime->realtime  = now - runtime->real;
+ +    runtime->real      = now;
+ +}
+ +
+ +/* Add the processor time used since the last reading to runtime->proctime. */
+ +void runtime_upd_proc(gmx_runtime_t *runtime)
+ +{
+ +    const double proc_used = set_proctime(runtime);
+ +
+ +    runtime->proctime += proc_used;
+ +}
+ +
+ +/* Write "<title> on node <nodeid> <date>" to fplog; does nothing when fplog
+ + * is NULL. The date is taken from runtime->real when runtime is given and
+ + * from the current time otherwise.
+ + */
+ +void print_date_and_time(FILE *fplog, int nodeid, const char *title,
+ +                         const gmx_runtime_t *runtime)
+ +{
+ +    int    n;
+ +    char   raw_date[STRLEN];
+ +    char   clean_date[STRLEN];
+ +    time_t when;
+ +
+ +    if (!fplog)
+ +    {
+ +        return;
+ +    }
+ +
+ +    when = (runtime != NULL) ? (time_t) runtime->real : (time_t) gmx_gettime();
+ +    gmx_ctime_r(&when, raw_date, STRLEN);
+ +
+ +    /* copy up to the first character below ' ' (newline or terminator) */
+ +    n = 0;
+ +    while (raw_date[n] >= ' ')
+ +    {
+ +        clean_date[n] = raw_date[n];
+ +        n++;
+ +    }
+ +    clean_date[n] = '\0';
+ +
+ +    fprintf(fplog, "%s on node %d %s\n", title, nodeid, clean_date);
+ +}
+ +
+ +/* Add flr[i] to f[i] for start <= i < end.
+ + * With gmx_debug_at set, both ranges are dumped to the debug file first.
+ + */
+ +static void sum_forces(int start, int end, rvec f[], rvec flr[])
+ +{
+ +    int at;
+ +
+ +    if (gmx_debug_at)
+ +    {
+ +        const int count = end-start;
+ +
+ +        pr_rvecs(debug, 0, "fsr", f+start, count);
+ +        pr_rvecs(debug, 0, "flr", flr+start, count);
+ +    }
+ +
+ +    at = start;
+ +    while (at < end)
+ +    {
+ +        rvec_inc(f[at], flr[at]);
+ +        at++;
+ +    }
+ +}
+ +
+ +/*
+ + * calc_f_el adds the force due to an applied electric field to f.
+ + *
+ + * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
+ + *
+ + * For each dimension m the field is a time factor multiplied by
+ + * Ex[m].a[0]*FIELDFAC:
+ + *  - Et[m].n == 3: cos(a[0]*(t-a[1])) * exp(-(t-a[1])^2/(2*a[2]^2))
+ + *  - Et[m].n > 0:  cos(a[0]*t)
+ + *  - otherwise:    1
+ + * Only a spatially constant field (Ex[m].a[0]) is supported; when
+ + * Ex[m].n is not positive the field in that dimension is zero.
+ + * The force charge[i]*field is added for atoms start .. start+homenr-1.
+ + * When fp is not NULL, one line with t and the field (divided by FIELDFAC
+ + * again) is written to it. The x argument is not used.
+ + *
+ + * WARNING:
+ + * There can be problems with the virial.
+ + * Since the field is not self-consistent this is unavoidable.
+ + * For neutral molecules the virial is correct within this approximation.
+ + * For neutral systems with many charged molecules the error is small.
+ + * But for systems with a net charge or a few charged molecules
+ + * the error can be significant when the field is high.
+ + * Solution: implement a self-consistent electric field into PME.
+ + */
+ +static void calc_f_el(FILE *fp, int  start, int homenr,
+ +                      real charge[], rvec x[], rvec f[],
+ +                      t_cosines Ex[], t_cosines Et[], double t)
+ +{
+ +    rvec      Ext;
+ +    real      t0;
+ +    int       at, d;
+ +    const int end = start+homenr;
+ +
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        /* time-dependent factor */
+ +        if (Et[d].n == 3)
+ +        {
+ +            t0     = Et[d].a[1];
+ +            Ext[d] = cos(Et[d].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[d].a[2])));
+ +        }
+ +        else if (Et[d].n > 0)
+ +        {
+ +            Ext[d] = cos(Et[d].a[0]*t);
+ +        }
+ +        else
+ +        {
+ +            Ext[d] = 1.0;
+ +        }
+ +
+ +        if (Ex[d].n <= 0)
+ +        {
+ +            /* no field in this dimension */
+ +            Ext[d] = 0;
+ +            continue;
+ +        }
+ +
+ +        /* Convert the field strength from V/nm to MD-units */
+ +        Ext[d] *= Ex[d].a[0]*FIELDFAC;
+ +        for (at = start; at < end; at++)
+ +        {
+ +            f[at][d] += charge[at]*Ext[d];
+ +        }
+ +    }
+ +
+ +    if (fp != NULL)
+ +    {
+ +        fprintf(fp, "%10g  %10g  %10g  %10g #FIELD\n", t,
+ +                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
+ +    }
+ +}
+ +
+ +/* Compute the partial virial vir_part for the atoms start .. start+homenr-1.
+ + *
+ + * vir_part is cleared, then receives the shift-vector contribution
+ + * (calc_vir), the contribution of x and f for the given atoms (f_calc_vir),
+ + * the diagonal position-restraint term fr->vir_diag_posres and the wall term
+ + * fr->vir_wall_z in the ZZ column. The result is dumped when debug is set.
+ + */
+ +static void calc_virial(FILE *fplog, int start, int homenr, rvec x[], rvec f[],
+ +                        tensor vir_part, t_graph *graph, matrix box,
+ +                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
+ +{
+ +    int d;
+ +
+ +    clear_mat(vir_part);
+ +
+ +    /* The short-range virial from surrounding boxes */
+ +    calc_vir(fplog, SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
+ +    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
+ +
+ +    /* Calculate partial virial, for local atoms only, based on short range.
+ +     * Total virial is computed in global_stat, called from do_md
+ +     */
+ +    f_calc_vir(fplog, start, start+homenr, x, f, vir_part, graph, box);
+ +    inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+ +
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        /* position restraint contribution, diagonal only */
+ +        vir_part[d][d] += fr->vir_diag_posres[d];
+ +        /* wall contribution, ZZ column only */
+ +        vir_part[d][ZZ] += fr->vir_wall_z[d];
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+ +    }
+ +}
+ +
+ +/* Evaluate the position restraints and store the results.
+ + *
+ + * The forces go to fr->f_novirsum and the virial to fr->vir_diag_posres.
+ + * The energy is added to enerd->term[F_POSRES] and dV/dlambda to
+ + * enerd->dvdl_nonlin[efptRESTRAINT]. When foreign lambdas are present and
+ + * flags contains GMX_FORCE_DHDL, the energy is re-evaluated (without forces
+ + * or virial) at every lambda point and added to enerd->enerpart_lambda.
+ + * The f argument is not used.
+ + */
+ +static void posres_wrapper(FILE *fplog,
+ +                           int flags,
+ +                           gmx_bool bSepDVDL,
+ +                           t_inputrec *ir,
+ +                           t_nrnb *nrnb,
+ +                           gmx_localtop_t *top,
+ +                           matrix box, rvec x[],
+ +                           rvec f[],
+ +                           gmx_enerdata_t *enerd,
+ +                           real *lambda,
+ +                           t_forcerec *fr)
+ +{
+ +    t_pbc  pbc;
+ +    t_pbc *pbc_posres;
+ +    real   epos, dvdl;
+ +    int    ilam;
+ +
+ +    /* Position restraints always require full pbc */
+ +    set_pbc(&pbc, ir->ePBC, box);
+ +    pbc_posres = (ir->ePBC == epbcNONE) ? NULL : &pbc;
+ +
+ +    dvdl = 0;
+ +    epos = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+ +                  top->idef.iparams_posres,
+ +                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+ +                  pbc_posres,
+ +                  lambda[efptRESTRAINT], &dvdl,
+ +                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+ +    if (bSepDVDL)
+ +    {
+ +        fprintf(fplog, sepdvdlformat,
+ +                interaction_function[F_POSRES].longname, epos, dvdl);
+ +    }
+ +    enerd->term[F_POSRES] += epos;
+ +    /* The restraint dV/dlambda is accumulated with the non-linear terms */
+ +    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
+ +    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
+ +
+ +    if (ir->fepvals->n_lambda <= 0 || !(flags & GMX_FORCE_DHDL))
+ +    {
+ +        /* no foreign-lambda energies requested */
+ +        return;
+ +    }
+ +
+ +    for (ilam = 0; ilam < enerd->n_lambda; ilam++)
+ +    {
+ +        real lambda_eval;
+ +
+ +        /* entry 0 is the current lambda, entry i>0 is lambda point i-1 */
+ +        if (ilam == 0)
+ +        {
+ +            lambda_eval = lambda[efptRESTRAINT];
+ +        }
+ +        else
+ +        {
+ +            lambda_eval = ir->fepvals->all_lambda[efptRESTRAINT][ilam-1];
+ +        }
+ +        /* dvdl is only scratch space here, it has been stored above */
+ +        epos = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+ +                      top->idef.iparams_posres,
+ +                      (const rvec*)x, NULL, NULL,
+ +                      pbc_posres, lambda_eval, &dvdl,
+ +                      fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+ +        enerd->enerpart_lambda[ilam] += epos;
+ +    }
+ +}
+ +
+ +/* Evaluate the COM pulling potential.
+ + *
+ + * The energy returned by pull_potential() is added to
+ + * enerd->term[F_COM_PULL] and its dV/dlambda to
+ + * enerd->dvdl_lin[efptRESTRAINT]; x, f and vir_force are handed on to
+ + * pull_potential().
+ + */
+ +static void pull_potential_wrapper(FILE *fplog,
+ +                                   gmx_bool bSepDVDL,
+ +                                   t_commrec *cr,
+ +                                   t_inputrec *ir,
+ +                                   matrix box, rvec x[],
+ +                                   rvec f[],
+ +                                   tensor vir_force,
+ +                                   t_mdatoms *mdatoms,
+ +                                   gmx_enerdata_t *enerd,
+ +                                   real *lambda,
+ +                                   double t)
+ +{
+ +    real   dvdl_pull = 0;
+ +    t_pbc  pbc;
+ +
+ +    /* Calculate the center of mass forces, this requires communication,
+ +     * which is why pull_potential is called close to other communication.
+ +     * The virial contribution is calculated directly,
+ +     * which is why we call pull_potential after calc_virial.
+ +     */
+ +    set_pbc(&pbc, ir->ePBC, box);
+ +    enerd->term[F_COM_PULL] +=
+ +        pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
+ +                       cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl_pull);
+ +    if (bSepDVDL)
+ +    {
+ +        /* note: this prints the accumulated F_COM_PULL term */
+ +        fprintf(fplog, sepdvdlformat, "Com pull", enerd->term[F_COM_PULL], dvdl_pull);
+ +    }
+ +    enerd->dvdl_lin[efptRESTRAINT] += dvdl_pull;
+ +}
+ +
+ +/* Receive the long-range forces, virial and energy through
+ + * gmx_pme_receive_f().
+ + *
+ + * The forces go to fr->f_novirsum and the virial to fr->vir_el_recip; the
+ + * energy is added to enerd->term[F_COUL_RECIP] and dV/dlambda to
+ + * enerd->dvdl_lin[efptCOUL]. The ewcPPDURINGPME cycle counter is stopped on
+ + * entry and the wait is counted under ewcPP_PMEWAITRECVF.
+ + */
+ +static void pme_receive_force_ener(FILE           *fplog,
+ +                                   gmx_bool        bSepDVDL,
+ +                                   t_commrec      *cr,
+ +                                   gmx_wallcycle_t wcycle,
+ +                                   gmx_enerdata_t *enerd,
+ +                                   t_forcerec     *fr)
+ +{
+ +    real   e_recip, dvdl_recip;
+ +    float  cyc_pp_during_pme, cyc_pme;
+ +
+ +    cyc_pp_during_pme = wallcycle_stop(wcycle, ewcPPDURINGPME);
+ +    dd_cycles_add(cr->dd, cyc_pp_during_pme, ddCyclPPduringPME);
+ +
+ +    /* In case of node-splitting, the PP nodes receive the long-range
+ +     * forces, virial and energy from the PME nodes here.
+ +     */
+ +    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
+ +    dvdl_recip = 0;
+ +    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_recip, &dvdl_recip,
+ +                      &cyc_pme);
+ +    if (bSepDVDL)
+ +    {
+ +        fprintf(fplog, sepdvdlformat, "PME mesh", e_recip, dvdl_recip);
+ +    }
+ +    enerd->term[F_COUL_RECIP] += e_recip;
+ +    enerd->dvdl_lin[efptCOUL] += dvdl_recip;
+ +    if (wcycle)
+ +    {
+ +        /* the PME cycle count is only recorded when cycle counting is on */
+ +        dd_cycles_add(cr->dd, cyc_pme, ddCyclPME);
+ +    }
+ +    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
+ +}
+ +
+ +/* Report every atom in md->start .. md->start+md->homenr-1 whose force norm
+ + * is at least pforce, or is NaN, as one line on fp with the step, the atom
+ + * number from ddglatnr(), the position and the force norm.
+ + */
+ +static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
+ +                               gmx_large_int_t step, real pforce, rvec *x, rvec *f)
+ +{
+ +    int        at;
+ +    const int  end       = md->start+md->homenr;
+ +    const real limit_sqr = sqr(pforce);
+ +    real       force_sqr;
+ +    char       stepstr[STEPSTRSIZE];
+ +
+ +    for (at = md->start; at < end; at++)
+ +    {
+ +        gmx_bool bTooLarge, bIsNaN;
+ +
+ +        force_sqr = norm2(f[at]);
+ +        bTooLarge = (force_sqr >= limit_sqr);
+ +        /* We also catch NAN, if the compiler does not optimize this away. */
+ +        bIsNaN    = (force_sqr != force_sqr);
+ +        if (bTooLarge || bIsNaN)
+ +        {
+ +            fprintf(fp, "step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
+ +                    gmx_step_str(step, stepstr),
+ +                    ddglatnr(cr->dd, at), x[at][XX], x[at][YY], x[at][ZZ], sqrt(force_sqr));
+ +        }
+ +    }
+ +}
+ +
+ +/* Finish the force calculation for the separately stored forces.
+ + *
+ + * When fr->bF_NoVirSum is set: the forces in fr->f_novirsum on virtual
+ + * sites are spread (if vsite is given) and, when flags contains
+ + * GMX_FORCE_VIRIAL, fr->f_novirsum is added to f and, for full
+ + * electrostatics, fr->vir_el_recip is added to vir_force.
+ + * Finally large forces are printed to stderr when fr->print_force >= 0.
+ + */
+ +static void post_process_forces(FILE *fplog,
+ +                                t_commrec *cr,
+ +                                gmx_large_int_t step,
+ +                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +                                gmx_localtop_t *top,
+ +                                matrix box, rvec x[],
+ +                                rvec f[],
+ +                                tensor vir_force,
+ +                                t_mdatoms *mdatoms,
+ +                                t_graph *graph,
+ +                                t_forcerec *fr, gmx_vsite_t *vsite,
+ +                                int flags)
+ +{
+ +    const int virial_flag = (flags & GMX_FORCE_VIRIAL);
+ +
+ +    if (fr->bF_NoVirSum && vsite)
+ +    {
+ +        /* Spread the mesh force on virtual sites to the other particles...
+ +         * This is parallellized. MPI communication is performed
+ +         * if the constructing atoms aren't local.
+ +         */
+ +        wallcycle_start(wcycle, ewcVSITESPREAD);
+ +        spread_vsite_f(fplog, vsite, x, fr->f_novirsum, NULL,
+ +                       virial_flag, fr->vir_el_recip,
+ +                       nrnb,
+ +                       &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +        wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +    }
+ +
+ +    if (fr->bF_NoVirSum && virial_flag)
+ +    {
+ +        int first, last;
+ +
+ +        /* Now add the forces, this is local */
+ +        if (fr->bDomDec)
+ +        {
+ +            first = 0;
+ +            last  = fr->f_novirsum_n;
+ +        }
+ +        else
+ +        {
+ +            first = mdatoms->start;
+ +            last  = mdatoms->start+mdatoms->homenr;
+ +        }
+ +        sum_forces(first, last, f, fr->f_novirsum);
+ +
+ +        if (EEL_FULL(fr->eeltype))
+ +        {
+ +            /* Add the mesh contribution to the virial */
+ +            m_add(vir_force, fr->vir_el_recip, vir_force);
+ +        }
+ +        if (debug)
+ +        {
+ +            pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+ +        }
+ +    }
+ +
+ +    if (fr->print_force >= 0)
+ +    {
+ +        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+ +    }
+ +}
+ +
+ +static void do_nb_verlet(t_forcerec *fr,
+ +                         interaction_const_t *ic,
+ +                         gmx_enerdata_t *enerd,
+ +                         int flags, int ilocality,
+ +                         int clearF,
+ +                         t_nrnb *nrnb,
+ +                         gmx_wallcycle_t wcycle)
+ +{   /* Run the non-bonded kernel selected by fr->nbv->grp[ilocality].kernel_type
+ +     * and add the pair counts of that group's lists to nrnb.
+ +     * Returns immediately when flags lacks GMX_FORCE_NONBONDED.
+ +     * The ewcsNONBONDED sub-counter is started/stopped around every kernel
+ +     * except the CUDA one. For the CUDA kernel only the launch is issued
+ +     * here (nbnxn_cuda_launch_kernel); enerd and clearF are not passed to it.
+ +     */
+ +    int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
+ +    char                      *env; /* NOTE(review): nnbl, kernel_type and env are not referenced in this function */
+ +    nonbonded_verlet_group_t  *nbvg;
++    gmx_bool                  bCUDA;
+ +
+ +    if (!(flags & GMX_FORCE_NONBONDED))
+ +    {
+ +        /* skip non-bonded calculation */
+ +        return;
+ +    }
+ +
+ +    nbvg = &fr->nbv->grp[ilocality];
+ +
+ +    /* This routine is only valid for the Verlet cut-off scheme */
+ +    if (fr->cutoff_scheme != ecutsVERLET)
+ +    {
+ +        gmx_incons("Invalid cut-off scheme passed!");
+ +    }
+ +
-     if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
++    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
++
++    if (!bCUDA) /* CUDA kernel launch overhead is already timed separately */
+ +    {
+ +        wallcycle_sub_start(wcycle, ewcsNONBONDED);
+ +    }
+ +    switch (nbvg->kernel_type)
+ +    {
+ +        case nbnxnk4x4_PlainC:
+ +            nbnxn_kernel_ref(&nbvg->nbl_lists,
+ +                             nbvg->nbat, ic,
+ +                             fr->shift_vec,
+ +                             flags,
+ +                             clearF,
+ +                             fr->fshift[0],
+ +                             enerd->grpp.ener[egCOULSR],
+ +                             fr->bBHAM ?
+ +                             enerd->grpp.ener[egBHAMSR] :
+ +                             enerd->grpp.ener[egLJSR]);
+ +            break;
+ +
+ +        case nbnxnk4xN_SIMD_4xN:
+ +            nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+ +                                  nbvg->nbat, ic,
+ +                                  nbvg->ewald_excl,
+ +                                  fr->shift_vec,
+ +                                  flags,
+ +                                  clearF,
+ +                                  fr->fshift[0],
+ +                                  enerd->grpp.ener[egCOULSR],
+ +                                  fr->bBHAM ?
+ +                                  enerd->grpp.ener[egBHAMSR] :
+ +                                  enerd->grpp.ener[egLJSR]);
+ +            break;
+ +        case nbnxnk4xN_SIMD_2xNN:
+ +            nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+ +                                   nbvg->nbat, ic,
+ +                                   nbvg->ewald_excl,
+ +                                   fr->shift_vec,
+ +                                   flags,
+ +                                   clearF,
+ +                                   fr->fshift[0],
+ +                                   enerd->grpp.ener[egCOULSR],
+ +                                   fr->bBHAM ?
+ +                                   enerd->grpp.ener[egBHAMSR] :
+ +                                   enerd->grpp.ener[egLJSR]);
+ +            break;
+ +
+ +        case nbnxnk8x8x8_CUDA:
+ +            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ +            break;
+ +
+ +        case nbnxnk8x8x8_PlainC:
+ +            nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
+ +                                 nbvg->nbat, ic,
+ +                                 fr->shift_vec,
+ +                                 flags,
+ +                                 clearF,
+ +                                 nbvg->nbat->out[0].f,
+ +                                 fr->fshift[0],
+ +                                 enerd->grpp.ener[egCOULSR],
+ +                                 fr->bBHAM ?
+ +                                 enerd->grpp.ener[egBHAMSR] :
+ +                                 enerd->grpp.ener[egLJSR]);
+ +            break;
+ +
+ +        default:
+ +            gmx_incons("Invalid nonbonded kernel type passed!");
+ +
+ +    }
-     else if (nbvg->ewald_excl == ewaldexclTable)
++    if (!bCUDA)
+ +    {
+ +        wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+ +    }
+ +
+ +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ +    {
+ +        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
+ +    }
-         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
++    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
++             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+ +    {
-         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
++        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
+ +    }
+ +    else
+ +    {
++        enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB; /* neither RF/plain cut-off nor analytical Ewald */
+ +    }
+ +    enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
+ +    if (flags & GMX_FORCE_ENERGY)
+ +    {
+ +        /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
+ +        enr_nbnxn_kernel_ljc += 1;
+ +        enr_nbnxn_kernel_lj  += 1;
+ +    }
+ +
+ +    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
+ +             nbvg->nbl_lists.natpair_ljq);
+ +    inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
+ +             nbvg->nbl_lists.natpair_lj);
+ +    inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
+ +             nbvg->nbl_lists.natpair_q); /* NOTE(review): this index arithmetic presumably relies on the eNR_NBNXN_LJ_* and eNR_NBNXN_* entries having the same relative order - confirm against the eNR enum */
+ +}
+ +
+ +void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
+ +                         t_inputrec *inputrec,
+ +                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +                         gmx_localtop_t *top,
+ +                         gmx_mtop_t *mtop,
+ +                         gmx_groups_t *groups,
+ +                         matrix box, rvec x[], history_t *hist,
+ +                         rvec f[],
+ +                         tensor vir_force,
+ +                         t_mdatoms *mdatoms,
+ +                         gmx_enerdata_t *enerd, t_fcdata *fcd,
+ +                         real *lambda, t_graph *graph,
+ +                         t_forcerec *fr, interaction_const_t *ic,
+ +                         gmx_vsite_t *vsite, rvec mu_tot,
+ +                         double t, FILE *field, gmx_edsam_t ed,
+ +                         gmx_bool bBornRadii,
+ +                         int flags)
+ +{   /* Force driver for the Verlet cut-off scheme. In order, this routine:
+ +     *  - updates the forcerec, the local dipole and the shift vectors when
+ +     *    the state changed,
+ +     *  - on search steps (GMX_FORCE_NS) puts the atoms on the nbnxn grid and
+ +     *    builds the local (and, with domain decomposition, non-local) pair lists,
+ +     *  - launches the non-bonded kernels: on the GPU before the other force
+ +     *    work, or on the CPU through do_nb_verlet(),
+ +     *  - calls do_force_lowlevel(),
+ +     *  - adds the nbat force buffers to f, communicates the forces and
+ +     *    handles vsite spreading, virial, pull and enforced-rotation terms.
+ +     * vir_force is cleared on entry; mu_tot receives the total dipole
+ +     * (interpolated with lambda[efptCOUL] when fr->efep != efepNO).
+ +     */
+ +    int                 cg0, cg1, i, j;
+ +    int                 start, homenr;
+ +    int                 nb_kernel_type; /* NOTE(review): assigned below but never read in this function */
+ +    double              mu[2*DIM];
+ +    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
+ +    gmx_bool            bDiffKernels = FALSE;
+ +    matrix              boxs;
+ +    rvec                vzero, box_diag;
+ +    real                e, v, dvdl;
+ +    float               cycles_pme, cycles_force;
+ +    nonbonded_verlet_t *nbv;
+ +
+ +    cycles_force   = 0;
+ +    nbv            = fr->nbv;
+ +    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+ +
+ +    start  = mdatoms->start;
+ +    homenr = mdatoms->homenr;
+ +
+ +    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+ +
+ +    clear_mat(vir_force);
+ +
+ +    cg0 = 0; /* NOTE(review): cg0/cg1 are computed here but not used further in this function */
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        cg1 = cr->dd->ncg_tot;
+ +    }
+ +    else
+ +    {
+ +        cg1 = top->cgs.nr;
+ +    }
+ +    if (fr->n_tpi > 0)
+ +    {
+ +        cg1--;
+ +    }
+ +
+ +    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ +    bFillGrid     = (bNS && bStateChanged);
+ +    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
+ +    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
+ +    bDoForces     = (flags & GMX_FORCE_FORCES);
+ +    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
+ +    bUseGPU       = fr->nbv->bUseGPU;
+ +    bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
+ +
+ +    if (bStateChanged)
+ +    {
+ +        update_forcerec(fplog, fr, box);
+ +
+ +        if (NEED_MUTOT(*inputrec))
+ +        {
+ +            /* Calculate total (local) dipole moment in a temporary common array.
+ +             * This makes it possible to sum them over nodes faster.
+ +             */
+ +            calc_mu(start, homenr,
+ +                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ +                    mu, mu+DIM);
+ +        }
+ +    }
+ +
+ +    if (fr->ePBC != epbcNONE)
+ +    {
+ +        /* Compute shift vectors every step,
+ +         * because of pressure coupling or box deformation!
+ +         */
+ +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ +        {
+ +            calc_shifts(box, fr->shift_vec);
+ +        }
+ +
+ +        if (bCalcCGCM)
+ +        {
+ +            put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
+ +            inc_nrnb(nrnb, eNR_SHIFTX, homenr);
+ +        }
+ +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ +        {
+ +            unshift_self(graph, box, x);
+ +        }
+ +    }
+ +
+ +    nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
+ +                                 fr->shift_vec, nbv->grp[0].nbat);
+ +
+ +#ifdef GMX_MPI
+ +    if (!(cr->duty & DUTY_PME))
+ +    {
+ +        /* Send particle coordinates to the pme nodes.
+ +         * Since this is only implemented for domain decomposition
+ +         * and domain decomposition does not use the graph,
+ +         * we do not need to worry about shifting.
+ +         */
+ +
+ +        wallcycle_start(wcycle, ewcPP_PMESENDX);
+ +
+ +        bBS = (inputrec->nwall == 2);
+ +        if (bBS)
+ +        {
+ +            copy_mat(box, boxs);
+ +            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ +        }
+ +
+ +        gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ +                       mdatoms->nChargePerturbed, lambda[efptCOUL],
+ +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+ +
+ +        wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ +    }
+ +#endif /* GMX_MPI */
+ +
+ +    /* do gridding for pair search */
+ +    if (bNS)
+ +    {
+ +        if (graph && bStateChanged)
+ +        {
+ +            /* Calculate intramolecular shift vectors to make molecules whole */
+ +            mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        }
+ +
+ +        clear_rvec(vzero);
+ +        box_diag[XX] = box[XX][XX];
+ +        box_diag[YY] = box[YY][YY];
+ +        box_diag[ZZ] = box[ZZ][ZZ];
+ +
+ +        wallcycle_start(wcycle, ewcNS);
+ +        if (!fr->bDomDec)
+ +        {
+ +            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
+ +            nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
+ +                              0, vzero, box_diag,
+ +                              0, mdatoms->homenr, -1, fr->cginfo, x,
+ +                              0, NULL,
+ +                              nbv->grp[eintLocal].kernel_type,
+ +                              nbv->grp[eintLocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
+ +        }
+ +        else
+ +        {
+ +            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
+ +            nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
+ +                                       fr->cginfo, x,
+ +                                       nbv->grp[eintNonlocal].kernel_type,
+ +                                       nbv->grp[eintNonlocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
+ +        }
+ +
+ +        if (nbv->ngrp == 1 ||
+ +            nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
+ +        {
+ +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
+ +                               nbv->nbs, mdatoms, fr->cginfo);
+ +        }
+ +        else
+ +        {
+ +            nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
+ +                               nbv->nbs, mdatoms, fr->cginfo);
+ +            nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
+ +                               nbv->nbs, mdatoms, fr->cginfo);
+ +        }
+ +        wallcycle_stop(wcycle, ewcNS);
+ +    }
+ +
+ +    /* initialize the GPU atom data and copy shift vector */
+ +    if (bUseGPU)
+ +    {
+ +        if (bNS)
+ +        {
+ +            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ +            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +        }
+ +
+ +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ +        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +    }
+ +
+ +    /* do local pair search */
+ +    if (bNS)
+ +    {
+ +        wallcycle_start_nocount(wcycle, ewcNS);
+ +        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
+ +        nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+ +                            &top->excls,
+ +                            ic->rlist,
+ +                            nbv->min_ci_balanced,
+ +                            &nbv->grp[eintLocal].nbl_lists,
+ +                            eintLocal,
+ +                            nbv->grp[eintLocal].kernel_type,
+ +                            nrnb);
+ +        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
+ +
+ +        if (bUseGPU)
+ +        {
+ +            /* initialize local pair-list on the GPU */
+ +            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ +                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
+ +                                     eintLocal);
+ +        }
+ +        wallcycle_stop(wcycle, ewcNS);
+ +    }
+ +    else
+ +    {
+ +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ +        nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
+ +                                        nbv->grp[eintLocal].nbat);
+ +        wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +    }
+ +
+ +    if (bUseGPU)
+ +    {
+ +        wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ +        /* launch local nonbonded F on GPU */
+ +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ +                     nrnb, wcycle);
+ +        wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +    }
+ +
+ +    /* Communicate coordinates and sum dipole if necessary +
+ +       do non-local pair search */
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
+ +                        nbv->grp[eintLocal].kernel_type);
+ +
+ +        if (bDiffKernels)
+ +        {
+ +            /* With GPU+CPU non-bonded calculations we need to copy
+ +             * the local coordinates to the non-local nbat struct
+ +             * (in CPU format) as the non-local kernel call also
+ +             * calculates the local - non-local interactions.
+ +             */
+ +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
+ +                                            nbv->grp[eintNonlocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ +            wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        }
+ +
+ +        if (bNS)
+ +        {
+ +            wallcycle_start_nocount(wcycle, ewcNS);
+ +            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+ +
+ +            if (bDiffKernels)
+ +            {
+ +                nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
+ +            }
+ +
+ +            nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+ +                                &top->excls,
+ +                                ic->rlist,
+ +                                nbv->min_ci_balanced,
+ +                                &nbv->grp[eintNonlocal].nbl_lists,
+ +                                eintNonlocal,
+ +                                nbv->grp[eintNonlocal].kernel_type,
+ +                                nrnb);
+ +
+ +            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+ +
+ +            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+ +            {
+ +                /* initialize non-local pair-list on the GPU */
+ +                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ +                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ +                                         eintNonlocal);
+ +            }
+ +            wallcycle_stop(wcycle, ewcNS);
+ +        }
+ +        else
+ +        {
+ +            wallcycle_start(wcycle, ewcMOVEX);
+ +            dd_move_x(cr->dd, box, x);
+ +
+ +            /* When we don't need the total dipole we sum it in global_stat */
+ +            if (bStateChanged && NEED_MUTOT(*inputrec))
+ +            {
+ +                gmx_sumd(2*DIM, mu, cr);
+ +            }
+ +            wallcycle_stop(wcycle, ewcMOVEX);
+ +
+ +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +            wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ +            nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
+ +                                            nbv->grp[eintNonlocal].nbat);
+ +            wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        }
+ +
+ +        if (bUseGPU && !bDiffKernels)
+ +        {
+ +            wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ +            /* launch non-local nonbonded F on GPU */
+ +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ +                         nrnb, wcycle);
+ +            cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +        }
+ +    }
+ +
+ +    if (bUseGPU)
+ +    {
+ +        /* launch D2H copy-back F */
+ +        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +        if (DOMAINDECOMP(cr) && !bDiffKernels)
+ +        {
+ +            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
+ +                                      flags, eatNonlocal);
+ +        }
+ +        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
+ +                                  flags, eatLocal);
+ +        cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +    }
+ +
+ +    if (bStateChanged && NEED_MUTOT(*inputrec))
+ +    {   /* NOTE(review): with domain decomposition on non-search steps mu was
+ +         * already passed to gmx_sumd() in the dd_move_x branch above under
+ +         * the same condition; confirm it is not reduced twice.
+ +         */
+ +        if (PAR(cr))
+ +        {
+ +            gmx_sumd(2*DIM, mu, cr);
+ +        }
+ +
+ +        for (i = 0; i < 2; i++)
+ +        {
+ +            for (j = 0; j < DIM; j++)
+ +            {
+ +                fr->mu_tot[i][j] = mu[i*DIM + j];
+ +            }
+ +        }
+ +    }
+ +    if (fr->efep == efepNO)
+ +    {
+ +        copy_rvec(fr->mu_tot[0], mu_tot);
+ +    }
+ +    else
+ +    {
+ +        for (j = 0; j < DIM; j++)
+ +        {
+ +            mu_tot[j] =
+ +                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
+ +                lambda[efptCOUL]*fr->mu_tot[1][j];
+ +        }
+ +    }
+ +
+ +    /* Reset energies */
+ +    reset_enerdata(&(inputrec->opts), fr, bNS, enerd, MASTER(cr));
+ +    clear_rvecs(SHIFTS, fr->fshift);
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        if (!(cr->duty & DUTY_PME))
+ +        {
+ +            wallcycle_start(wcycle, ewcPPDURINGPME);
+ +            dd_force_flop_start(cr->dd, nrnb);
+ +        }
+ +    }
+ +
+ +    if (inputrec->bRot)
+ +    {
+ +        /* Enforced rotation has its own cycle counter that starts after the collective
+ +         * coordinates have been communicated. It is added to ddCyclF to allow
+ +         * for proper load-balancing */
+ +        wallcycle_start(wcycle, ewcROT);
+ +        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ +        wallcycle_stop(wcycle, ewcROT);
+ +    }
+ +
+ +    /* Start the force cycle counter.
+ +     * In this function the counter is stopped after do_force_lowlevel
+ +     * returns, and is paused around the nbat force reduction.
+ +     * No parallel communication should occur while this counter is running,
+ +     * since that will interfere with the dynamic load balancing.
+ +     */
+ +    wallcycle_start(wcycle, ewcFORCE);
+ +    if (bDoForces)
+ +    {
+ +        /* Reset forces for which the virial is calculated separately:
+ +         * PME/Ewald forces if necessary */
+ +        if (fr->bF_NoVirSum)
+ +        {
+ +            if (flags & GMX_FORCE_VIRIAL)
+ +            {
+ +                fr->f_novirsum = fr->f_novirsum_alloc;
+ +                if (fr->bDomDec)
+ +                {
+ +                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ +                }
+ +                else
+ +                {
+ +                    clear_rvecs(homenr, fr->f_novirsum+start);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                /* We are not calculating the pressure so we do not need
+ +                 * a separate array for forces that do not contribute
+ +                 * to the pressure.
+ +                 */
+ +                fr->f_novirsum = f;
+ +            }
+ +        }
+ +
+ +        /* Clear the short- and long-range forces */
+ +        clear_rvecs(fr->natoms_force_constr, f);
+ +        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ +        }
+ +
+ +        clear_rvec(fr->vir_diag_posres);
+ +    }
+ +
+ +    if (inputrec->ePull == epullCONSTRAINT)
+ +    {
+ +        clear_pull_forces(inputrec->pull);
+ +    }
+ +
+ +    /* We calculate the non-bonded forces, when done on the CPU, here.
+ +     * We do this before calling do_force_lowlevel, as in there bondeds
+ +     * forces are calculated before PME, which does communication.
+ +     * With this order, non-bonded and bonded force calculation imbalance
+ +     * can be balanced out by the domain decomposition load balancing.
+ +     */
+ +
+ +    if (!bUseOrEmulGPU)
+ +    {
+ +        /* Maybe we should move this into do_force_lowlevel */
+ +        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ +                     nrnb, wcycle);
+ +    }
+ +
+ +    if (!bUseOrEmulGPU || bDiffKernels)
+ +    {
+ +        int aloc;
+ +
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
+ +                         bDiffKernels ? enbvClearFYes : enbvClearFNo,
+ +                         nrnb, wcycle);
+ +        }
+ +
+ +        if (!bUseOrEmulGPU)
+ +        {
+ +            aloc = eintLocal;
+ +        }
+ +        else
+ +        {
+ +            aloc = eintNonlocal;
+ +        }
+ +
+ +        /* Add all the non-bonded force to the normal force array.
+ +         * This can be split into a local and a non-local part when overlapping
+ +         * communication with calculation with domain decomposition.
+ +         */
+ +        cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ +        nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+ +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ +        cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_start_nocount(wcycle, ewcFORCE);
+ +
+ +        /* if there are multiple fshift output buffers reduce them */
+ +        if ((flags & GMX_FORCE_VIRIAL) &&
+ +            nbv->grp[aloc].nbl_lists.nnbl > 1)
+ +        {
+ +            nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+ +                                                     fr->fshift);
+ +        }
+ +    }
+ +
+ +    /* update QMMMrec, if necessary */
+ +    if (fr->bQMMM)
+ +    {
+ +        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ +    }
+ +
+ +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ +    {
+ +        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ +                       f, enerd, lambda, fr);
+ +    }
+ +
+ +    /* Compute the bonded and non-bonded energies and optionally forces */
+ +    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +                      cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
+ +                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
+ +                      &(top->atomtypes), bBornRadii, box,
+ +                      inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
+ +                      flags, &cycles_pme);
+ +
+ +    if (bSepLRF)
+ +    {
+ +        if (do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            /* Add the long range forces to the short range forces */
+ +            for (i = 0; i < fr->natoms_force_constr; i++)
+ +            {
+ +                rvec_add(fr->f_twin[i], f[i], f[i]);
+ +            }
+ +        }
+ +    }
+ +
+ +    cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ +
+ +    if (ed)
+ +    {
+ +        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ +    }
+ +
+ +    if (bUseOrEmulGPU && !bDiffKernels)
+ +    {
+ +        /* wait for non-local forces (or calculate in emulation mode) */
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            if (bUseGPU)
+ +            {
+ +                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
+ +                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ +                                    nbv->grp[eintNonlocal].nbat,
+ +                                    flags, eatNonlocal,
+ +                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                    fr->fshift);
+ +                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ +            }
+ +            else
+ +            {
+ +                wallcycle_start_nocount(wcycle, ewcFORCE);
+ +                do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ +                             nrnb, wcycle);
+ +                cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ +            }
+ +            wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +            wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ +            /* skip the reduction if there was no non-local work to do
+ +             * NOTE(review): the test below reads the eintLocal pair list even
+ +             * though the eatNonlocal forces are reduced; confirm whether
+ +             * eintNonlocal was intended. */
+ +            if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ +            {
+ +                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
+ +                                               nbv->grp[eintNonlocal].nbat, f);
+ +            }
+ +            wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ +            cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +        }
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        /* Communicate the forces */
+ +        if (PAR(cr))
+ +        {
+ +            wallcycle_start(wcycle, ewcMOVEF);
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                dd_move_f(cr->dd, f, fr->fshift);
+ +                /* Do we need to communicate the separate force array
+ +                 * for terms that do not contribute to the single sum virial?
+ +                 * Position restraints and electric fields do not introduce
+ +                 * inter-cg forces, only full electrostatics methods do.
+ +                 * When we do not calculate the virial, fr->f_novirsum = f,
+ +                 * so we have already communicated these forces.
+ +                 */
+ +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ +                    (flags & GMX_FORCE_VIRIAL))
+ +                {
+ +                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ +                }
+ +                if (bSepLRF)
+ +                {
+ +                    /* We should not update the shift forces here,
+ +                     * since f_twin is already included in f.
+ +                     */
+ +                    dd_move_f(cr->dd, fr->f_twin, NULL);
+ +                }
+ +            }
+ +            wallcycle_stop(wcycle, ewcMOVEF);
+ +        }
+ +    }
+ +
+ +    if (bUseOrEmulGPU)
+ +    {
+ +        /* wait for local forces (or calculate in emulation mode) */
+ +        if (bUseGPU)
+ +        {
+ +            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ +            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ +                                nbv->grp[eintLocal].nbat,
+ +                                flags, eatLocal,
+ +                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ +                                fr->fshift);
+ +            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ +
+ +            /* now clear the GPU outputs while we finish the step on the CPU */
+ +
+ +            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ +            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ +            wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ +        }
+ +        else
+ +        {
+ +            wallcycle_start_nocount(wcycle, ewcFORCE);
+ +            do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ +                         DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ +                         nrnb, wcycle);
+ +            wallcycle_stop(wcycle, ewcFORCE);
+ +        }
+ +        wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ +        wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ +        if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ +        {
+ +            /* skip the reduction when nbl[0] of the local group has nsci == 0
+ +             * (presumably: no local work to do) */
+ +            nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ +                                           nbv->grp[eintLocal].nbat, f);
+ +        }
+ +        wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ +        wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        dd_force_flop_stop(cr->dd, nrnb);
+ +        if (wcycle)
+ +        {
+ +            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ +        }
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        if (IR_ELEC_FIELD(*inputrec))
+ +        {
+ +            /* Compute forces due to electric field */
+ +            calc_f_el(MASTER(cr) ? field : NULL,
+ +                      start, homenr, mdatoms->chargeA, x, fr->f_novirsum,
+ +                      inputrec->ex, inputrec->et, t);
+ +        }
+ +
+ +        /* If we have NoVirSum forces, but we do not calculate the virial,
+ +         * we sum fr->f_novirsum=f later.
+ +         */
+ +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ +        {
+ +            wallcycle_start(wcycle, ewcVSITESPREAD);
+ +            spread_vsite_f(fplog, vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ +                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +            wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +
+ +            if (bSepLRF)
+ +            {
+ +                wallcycle_start(wcycle, ewcVSITESPREAD);
+ +                spread_vsite_f(fplog, vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ +                               nrnb,
+ +                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +                wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +            }
+ +        }
+ +
+ +        if (flags & GMX_FORCE_VIRIAL)
+ +        {
+ +            /* Calculation of the virial must be done after vsites! */
+ +            calc_virial(fplog, mdatoms->start, mdatoms->homenr, x, f,
+ +                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ +        }
+ +    }
+ +
+ +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ +    {
+ +        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +                               f, vir_force, mdatoms, enerd, lambda, t);
+ +    }
+ +
+ +    /* Add the forces from enforced rotation potentials (if any) */
+ +    if (inputrec->bRot)
+ +    {
+ +        wallcycle_start(wcycle, ewcROTadd);
+ +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ +        wallcycle_stop(wcycle, ewcROTadd);
+ +    }
+ +
+ +    if (PAR(cr) && !(cr->duty & DUTY_PME))
+ +    {
+ +        /* In case of node-splitting, the PP nodes receive the long-range
+ +         * forces, virial and energy from the PME nodes here.
+ +         */
+ +        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        post_process_forces(fplog, cr, step, nrnb, wcycle,
+ +                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ +                            flags);
+ +    }
+ +
+ +    /* Sum the potential energy terms from group contributions */
+ +    sum_epot(&(inputrec->opts), &(enerd->grpp), enerd->term);
+ +}
+ +
+ +void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
+ +                        t_inputrec *inputrec,
+ +                        gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +                        gmx_localtop_t *top,
+ +                        gmx_mtop_t *mtop,
+ +                        gmx_groups_t *groups,
+ +                        matrix box, rvec x[], history_t *hist,
+ +                        rvec f[],
+ +                        tensor vir_force,
+ +                        t_mdatoms *mdatoms,
+ +                        gmx_enerdata_t *enerd, t_fcdata *fcd,
+ +                        real *lambda, t_graph *graph,
+ +                        t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
+ +                        double t, FILE *field, gmx_edsam_t ed,
+ +                        gmx_bool bBornRadii,
+ +                        int flags)
+ +{
+ +    int        cg0, cg1, i, j;
+ +    int        start, homenr;
+ +    double     mu[2*DIM];
+ +    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ +    gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
+ +    gmx_bool   bDoAdressWF;
+ +    matrix     boxs;
+ +    rvec       vzero, box_diag;
+ +    real       e, v, dvdlambda[efptNR];
+ +    t_pbc      pbc;
+ +    float      cycles_pme, cycles_force;
+ +
+ +    start  = mdatoms->start;
+ +    homenr = mdatoms->homenr;
+ +
+ +    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+ +
+ +    clear_mat(vir_force);
+ +
+ +    if (PARTDECOMP(cr))
+ +    {
+ +        pd_cg_range(cr, &cg0, &cg1);
+ +    }
+ +    else
+ +    {
+ +        cg0 = 0;
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            cg1 = cr->dd->ncg_tot;
+ +        }
+ +        else
+ +        {
+ +            cg1 = top->cgs.nr;
+ +        }
+ +        if (fr->n_tpi > 0)
+ +        {
+ +            cg1--;
+ +        }
+ +    }
+ +
+ +    bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
+ +    bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ +    /* Should we update the long-range neighborlists at this step? */
+ +    bDoLongRangeNS = fr->bTwinRange && bNS;
+ +    /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
+ +    bFillGrid      = (bNS && bStateChanged);
+ +    bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
+ +    bDoForces      = (flags & GMX_FORCE_FORCES);
+ +    bDoPotential   = (flags & GMX_FORCE_ENERGY);
+ +    bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
+ +                      (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
+ +
+ +    /* should probably move this to the forcerec since it doesn't change */
+ +    bDoAdressWF   = ((fr->adress_type != eAdressOff));
+ +
+ +    if (bStateChanged)
+ +    {
+ +        update_forcerec(fplog, fr, box);
+ +
+ +        if (NEED_MUTOT(*inputrec))
+ +        {
+ +            /* Calculate total (local) dipole moment in a temporary common array.
+ +             * This makes it possible to sum them over nodes faster.
+ +             */
+ +            calc_mu(start, homenr,
+ +                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ +                    mu, mu+DIM);
+ +        }
+ +    }
+ +
+ +    if (fr->ePBC != epbcNONE)
+ +    {
+ +        /* Compute shift vectors every step,
+ +         * because of pressure coupling or box deformation!
+ +         */
+ +        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ +        {
+ +            calc_shifts(box, fr->shift_vec);
+ +        }
+ +
+ +        if (bCalcCGCM)
+ +        {
+ +            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
+ +                                     &(top->cgs), x, fr->cg_cm);
+ +            inc_nrnb(nrnb, eNR_CGCM, homenr);
+ +            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
+ +        }
+ +        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ +        {
+ +            unshift_self(graph, box, x);
+ +        }
+ +    }
+ +    else if (bCalcCGCM)
+ +    {
+ +        calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
+ +        inc_nrnb(nrnb, eNR_CGCM, homenr);
+ +    }
+ +
+ +    if (bCalcCGCM)
+ +    {
+ +        if (PAR(cr))
+ +        {
+ +            move_cgcm(fplog, cr, fr->cg_cm);
+ +        }
+ +        if (gmx_debug_at)
+ +        {
+ +            pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
+ +        }
+ +    }
+ +
+ +#ifdef GMX_MPI
+ +    if (!(cr->duty & DUTY_PME))
+ +    {
+ +        /* Send particle coordinates to the pme nodes.
+ +         * Since this is only implemented for domain decomposition
+ +         * and domain decomposition does not use the graph,
+ +         * we do not need to worry about shifting.
+ +         */
+ +
+ +        wallcycle_start(wcycle, ewcPP_PMESENDX);
+ +
+ +        bBS = (inputrec->nwall == 2);
+ +        if (bBS)
+ +        {
+ +            copy_mat(box, boxs);
+ +            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ +        }
+ +
+ +        gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ +                       mdatoms->nChargePerturbed, lambda[efptCOUL],
+ +                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+ +
+ +        wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ +    }
+ +#endif /* GMX_MPI */
+ +
+ +    /* Communicate coordinates and sum dipole if necessary */
+ +    if (PAR(cr))
+ +    {
+ +        wallcycle_start(wcycle, ewcMOVEX);
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            dd_move_x(cr->dd, box, x);
+ +        }
+ +        else
+ +        {
+ +            move_x(fplog, cr, GMX_LEFT, GMX_RIGHT, x, nrnb);
+ +        }
+ +        wallcycle_stop(wcycle, ewcMOVEX);
+ +    }
+ +
+ +    /* update adress weight beforehand */
+ +    if (bStateChanged && bDoAdressWF)
+ +    {
+ +        /* need pbc for adress weight calculation with pbc_dx */
+ +        set_pbc(&pbc, inputrec->ePBC, box);
+ +        if (fr->adress_site == eAdressSITEcog)
+ +        {
+ +            update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
+ +                                      inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +        else if (fr->adress_site == eAdressSITEcom)
+ +        {
+ +            update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ +                                      inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +        else if (fr->adress_site == eAdressSITEatomatom)
+ +        {
+ +            update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ +                                                inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +        else
+ +        {
+ +            update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+ +                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +    }
+ +
+ +    if (NEED_MUTOT(*inputrec))
+ +    {
+ +
+ +        if (bStateChanged)
+ +        {
+ +            if (PAR(cr))
+ +            {
+ +                gmx_sumd(2*DIM, mu, cr);
+ +            }
+ +            for (i = 0; i < 2; i++)
+ +            {
+ +                for (j = 0; j < DIM; j++)
+ +                {
+ +                    fr->mu_tot[i][j] = mu[i*DIM + j];
+ +                }
+ +            }
+ +        }
+ +        if (fr->efep == efepNO)
+ +        {
+ +            copy_rvec(fr->mu_tot[0], mu_tot);
+ +        }
+ +        else
+ +        {
+ +            for (j = 0; j < DIM; j++)
+ +            {
+ +                mu_tot[j] =
+ +                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Reset energies */
+ +    reset_enerdata(&(inputrec->opts), fr, bNS, enerd, MASTER(cr));
+ +    clear_rvecs(SHIFTS, fr->fshift);
+ +
+ +    if (bNS)
+ +    {
+ +        wallcycle_start(wcycle, ewcNS);
+ +
+ +        if (graph && bStateChanged)
+ +        {
+ +            /* Calculate intramolecular shift vectors to make molecules whole */
+ +            mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        }
+ +
+ +        /* Do the actual neighbour searching and if twin range electrostatics
+ +         * also do the calculation of long range forces and energies.
+ +         */
+ +        for (i = 0; i < efptNR; i++)
+ +        {
+ +            dvdlambda[i] = 0;
+ +        }
+ +        ns(fplog, fr, x, box,
+ +           groups, &(inputrec->opts), top, mdatoms,
+ +           cr, nrnb, lambda, dvdlambda, &enerd->grpp, bFillGrid,
+ +           bDoLongRangeNS);
+ +        if (bSepDVDL)
+ +        {
+ +            fprintf(fplog, sepdvdlformat, "LR non-bonded", 0.0, dvdlambda);
+ +        }
+ +        enerd->dvdl_lin[efptVDW]  += dvdlambda[efptVDW];
+ +        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
+ +
+ +        wallcycle_stop(wcycle, ewcNS);
+ +    }
+ +
+ +    if (inputrec->implicit_solvent && bNS)
+ +    {
+ +        make_gb_nblist(cr, inputrec->gb_algorithm, inputrec->rlist,
+ +                       x, box, fr, &top->idef, graph, fr->born);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        if (!(cr->duty & DUTY_PME))
+ +        {
+ +            wallcycle_start(wcycle, ewcPPDURINGPME);
+ +            dd_force_flop_start(cr->dd, nrnb);
+ +        }
+ +    }
+ +
+ +    if (inputrec->bRot)
+ +    {
+ +        /* Enforced rotation has its own cycle counter that starts after the collective
+ +         * coordinates have been communicated. It is added to ddCyclF to allow
+ +         * for proper load-balancing */
+ +        wallcycle_start(wcycle, ewcROT);
+ +        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ +        wallcycle_stop(wcycle, ewcROT);
+ +    }
+ +
+ +    /* Start the force cycle counter.
+ +     * This counter is stopped in do_forcelow_level.
+ +     * No parallel communication should occur while this counter is running,
+ +     * since that will interfere with the dynamic load balancing.
+ +     */
+ +    wallcycle_start(wcycle, ewcFORCE);
+ +
+ +    if (bDoForces)
+ +    {
+ +        /* Reset forces for which the virial is calculated separately:
+ +         * PME/Ewald forces if necessary */
+ +        if (fr->bF_NoVirSum)
+ +        {
+ +            if (flags & GMX_FORCE_VIRIAL)
+ +            {
+ +                fr->f_novirsum = fr->f_novirsum_alloc;
+ +                if (fr->bDomDec)
+ +                {
+ +                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ +                }
+ +                else
+ +                {
+ +                    clear_rvecs(homenr, fr->f_novirsum+start);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                /* We are not calculating the pressure so we do not need
+ +                 * a separate array for forces that do not contribute
+ +                 * to the pressure.
+ +                 */
+ +                fr->f_novirsum = f;
+ +            }
+ +        }
+ +
+ +        /* Clear the short- and long-range forces */
+ +        clear_rvecs(fr->natoms_force_constr, f);
+ +        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ +        }
+ +
+ +        clear_rvec(fr->vir_diag_posres);
+ +    }
+ +    if (inputrec->ePull == epullCONSTRAINT)
+ +    {
+ +        clear_pull_forces(inputrec->pull);
+ +    }
+ +
+ +    /* update QMMMrec, if necessary */
+ +    if (fr->bQMMM)
+ +    {
+ +        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ +    }
+ +
+ +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ +    {
+ +        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ +                       f, enerd, lambda, fr);
+ +    }
+ +
+ +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+ +    {
+ +        /* Flat-bottomed position restraints always require full pbc */
+ +        if (!(bStateChanged && bDoAdressWF))
+ +        {
+ +            set_pbc(&pbc, inputrec->ePBC, box);
+ +        }
+ +        v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
+ +                     top->idef.iparams_fbposres,
+ +                     (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+ +                     inputrec->ePBC == epbcNONE ? NULL : &pbc,
+ +                     fr->rc_scaling, fr->ePBC, fr->posres_com);
+ +        enerd->term[F_FBPOSRES] += v;
+ +        inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
+ +    }
+ +
+ +    /* Compute the bonded and non-bonded energies and optionally forces */
+ +    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ +                      cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
+ +                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
+ +                      &(top->atomtypes), bBornRadii, box,
+ +                      inputrec->fepvals, lambda,
+ +                      graph, &(top->excls), fr->mu_tot,
+ +                      flags,
+ +                      &cycles_pme);
+ +
+ +    if (bSepLRF)
+ +    {
+ +        if (do_per_step(step, inputrec->nstcalclr))
+ +        {
+ +            /* Add the long range forces to the short range forces */
+ +            for (i = 0; i < fr->natoms_force_constr; i++)
+ +            {
+ +                rvec_add(fr->f_twin[i], f[i], f[i]);
+ +            }
+ +        }
+ +    }
+ +
+ +    cycles_force = wallcycle_stop(wcycle, ewcFORCE);
+ +
+ +    if (ed)
+ +    {
+ +        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        dd_force_flop_stop(cr->dd, nrnb);
+ +        if (wcycle)
+ +        {
+ +            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ +        }
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        if (IR_ELEC_FIELD(*inputrec))
+ +        {
+ +            /* Compute forces due to electric field */
+ +            calc_f_el(MASTER(cr) ? field : NULL,
+ +                      start, homenr, mdatoms->chargeA, x, fr->f_novirsum,
+ +                      inputrec->ex, inputrec->et, t);
+ +        }
+ +
+ +        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
+ +        {
+ +            /* Compute thermodynamic force in hybrid AdResS region */
+ +            adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
+ +                                inputrec->ePBC == epbcNONE ? NULL : &pbc);
+ +        }
+ +
+ +        /* Communicate the forces */
+ +        if (PAR(cr))
+ +        {
+ +            wallcycle_start(wcycle, ewcMOVEF);
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                dd_move_f(cr->dd, f, fr->fshift);
+ +                /* Do we need to communicate the separate force array
+ +                 * for terms that do not contribute to the single sum virial?
+ +                 * Position restraints and electric fields do not introduce
+ +                 * inter-cg forces, only full electrostatics methods do.
+ +                 * When we do not calculate the virial, fr->f_novirsum = f,
+ +                 * so we have already communicated these forces.
+ +                 */
+ +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ +                    (flags & GMX_FORCE_VIRIAL))
+ +                {
+ +                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ +                }
+ +                if (bSepLRF)
+ +                {
+ +                    /* We should not update the shift forces here,
+ +                     * since f_twin is already included in f.
+ +                     */
+ +                    dd_move_f(cr->dd, fr->f_twin, NULL);
+ +                }
+ +            }
+ +            else
+ +            {
+ +                pd_move_f(cr, f, nrnb);
+ +                if (bSepLRF)
+ +                {
+ +                    pd_move_f(cr, fr->f_twin, nrnb);
+ +                }
+ +            }
+ +            wallcycle_stop(wcycle, ewcMOVEF);
+ +        }
+ +
+ +        /* If we have NoVirSum forces, but we do not calculate the virial,
+ +         * we sum fr->f_novirum=f later.
+ +         */
+ +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ +        {
+ +            wallcycle_start(wcycle, ewcVSITESPREAD);
+ +            spread_vsite_f(fplog, vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ +                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +            wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +
+ +            if (bSepLRF)
+ +            {
+ +                wallcycle_start(wcycle, ewcVSITESPREAD);
+ +                spread_vsite_f(fplog, vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ +                               nrnb,
+ +                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ +                wallcycle_stop(wcycle, ewcVSITESPREAD);
+ +            }
+ +        }
+ +
+ +        if (flags & GMX_FORCE_VIRIAL)
+ +        {
+ +            /* Calculation of the virial must be done after vsites! */
+ +            calc_virial(fplog, mdatoms->start, mdatoms->homenr, x, f,
+ +                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ +        }
+ +    }
+ +
+ +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ +    {
+ +        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ +                               f, vir_force, mdatoms, enerd, lambda, t);
+ +    }
+ +
+ +    /* Add the forces from enforced rotation potentials (if any) */
+ +    if (inputrec->bRot)
+ +    {
+ +        wallcycle_start(wcycle, ewcROTadd);
+ +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ +        wallcycle_stop(wcycle, ewcROTadd);
+ +    }
+ +
+ +    if (PAR(cr) && !(cr->duty & DUTY_PME))
+ +    {
+ +        /* In case of node-splitting, the PP nodes receive the long-range
+ +         * forces, virial and energy from the PME nodes here.
+ +         */
+ +        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ +    }
+ +
+ +    if (bDoForces)
+ +    {
+ +        post_process_forces(fplog, cr, step, nrnb, wcycle,
+ +                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ +                            flags);
+ +    }
+ +
+ +    /* Sum the potential energy terms from group contributions */
+ +    sum_epot(&(inputrec->opts), &(enerd->grpp), enerd->term);
+ +}
+ +
+ +/* Public force/energy entry point.
+ + *
+ + * Clears GMX_FORCE_NONBONDED from flags when fr->bNonbonded is not set and
+ + * then forwards every argument to the driver that matches
+ + * inputrec->cutoff_scheme: do_force_cutsVERLET (which additionally gets
+ + * fr->ic) or do_force_cutsGROUP. Any other scheme value is reported
+ + * through gmx_incons().
+ + */
+ +void do_force(FILE *fplog, t_commrec *cr,
+ +              t_inputrec *inputrec,
+ +              gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +              gmx_localtop_t *top,
+ +              gmx_mtop_t *mtop,
+ +              gmx_groups_t *groups,
+ +              matrix box, rvec x[], history_t *hist,
+ +              rvec f[],
+ +              tensor vir_force,
+ +              t_mdatoms *mdatoms,
+ +              gmx_enerdata_t *enerd, t_fcdata *fcd,
+ +              real *lambda, t_graph *graph,
+ +              t_forcerec *fr,
+ +              gmx_vsite_t *vsite, rvec mu_tot,
+ +              double t, FILE *field, gmx_edsam_t ed,
+ +              gmx_bool bBornRadii,
+ +              int flags)
+ +{
+ +    /* Drop the non-bonded request when non-bondeds are switched off */
+ +    if (fr->bNonbonded == FALSE)
+ +    {
+ +        flags = flags & ~GMX_FORCE_NONBONDED;
+ +    }
+ +
+ +    if (inputrec->cutoff_scheme == ecutsVERLET)
+ +    {
+ +        do_force_cutsVERLET(fplog, cr, inputrec, step, nrnb, wcycle,
+ +                            top, mtop, groups,
+ +                            box, x, hist, f, vir_force,
+ +                            mdatoms, enerd, fcd, lambda, graph,
+ +                            fr, fr->ic, vsite, mu_tot,
+ +                            t, field, ed, bBornRadii, flags);
+ +    }
+ +    else if (inputrec->cutoff_scheme == ecutsGROUP)
+ +    {
+ +        do_force_cutsGROUP(fplog, cr, inputrec, step, nrnb, wcycle,
+ +                           top, mtop, groups,
+ +                           box, x, hist, f, vir_force,
+ +                           mdatoms, enerd, fcd, lambda, graph,
+ +                           fr, vsite, mu_tot,
+ +                           t, field, ed, bBornRadii, flags);
+ +    }
+ +    else
+ +    {
+ +        gmx_incons("Invalid cut-off scheme passed!");
+ +    }
+ +}
+ +
+ +
+ +/* Apply the constraints to the starting configuration.
+ + *
+ + * Three stages, each a call to constrain() at step ir->init_step:
+ + *  1. state->x is constrained in place;
+ + *  2. for velocity-Verlet integrators (EI_VV) state->v is constrained;
+ + *  3. for integrators that carry velocities, except eiVV, the positions
+ + *     extrapolated back by one time step (x + dt*(-v)) are constrained
+ + *     against state->x, with state->v handed to constrain() as well.
+ + *     The velocities are sign-flipped before this stage and flipped back
+ + *     afterwards.
+ + *
+ + * Progress messages go to fplog when it is non-NULL. The parameters f,
+ + * graph and shake_vir are not used in this routine.
+ + */
+ +void do_constrain_first(FILE *fplog, gmx_constr_t constr,
+ +                        t_inputrec *ir, t_mdatoms *md,
+ +                        t_state *state, rvec *f,
+ +                        t_graph *graph, t_commrec *cr, t_nrnb *nrnb,
+ +                        t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir)
+ +{
+ +    const int       first = md->start;
+ +    const int       last  = md->homenr + md->start;
+ +    const real      dt    = ir->delta_t;
+ +    gmx_large_int_t step;
+ +    real            dvdl_dum;
+ +    rvec           *x_prev;
+ +    char            stepstr[STEPSTRSIZE];
+ +    int             a, d;
+ +
+ +    /* Scratch coordinates for the t0-dt stage, released at the end */
+ +    snew(x_prev, state->natoms);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
+ +                first, md->homenr, last);
+ +    }
+ +
+ +    /* Stage 1: reset the particles onto the constraints */
+ +    step = ir->init_step;
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
+ +                gmx_step_str(step, stepstr));
+ +    }
+ +    dvdl_dum = 0;
+ +
+ +    constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ +              ir, NULL, cr, step, 0, md,
+ +              state->x, state->x, NULL,
+ +              fr->bMolPBC, state->box,
+ +              state->lambda[efptBONDED], &dvdl_dum,
+ +              NULL, NULL, nrnb, econqCoord,
+ +              ir->epc == epcMTTK, state->veta, state->veta);
+ +
+ +    /* Stage 2: velocity Verlet also wants the initial velocities
+ +     * constrained (may be useful for the half-step kinetic energy);
+ +     * veta might not be treated correctly yet.
+ +     */
+ +    if (EI_VV(ir->eI))
+ +    {
+ +        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ +                  ir, NULL, cr, step, 0, md,
+ +                  state->x, state->v, state->v,
+ +                  fr->bMolPBC, state->box,
+ +                  state->lambda[efptBONDED], &dvdl_dum,
+ +                  NULL, NULL, nrnb, econqVeloc,
+ +                  ir->epc == epcMTTK, state->veta, state->veta);
+ +    }
+ +
+ +    /* Stage 3: constrain the initial velocities at t-dt/2 */
+ +    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
+ +    {
+ +        for (a = first; a < last; a++)
+ +        {
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                /* Flip the velocity sign ... */
+ +                state->v[a][d] = -state->v[a][d];
+ +                /* ... and store the position at t-dt in x_prev */
+ +                x_prev[a][d] = state->x[a][d] + dt*state->v[a][d];
+ +            }
+ +        }
+ +
+ +        /* Shake the positions at t=-dt, using the positions at t=0
+ +         * as the reference coordinates.
+ +         */
+ +        if (fplog)
+ +        {
+ +            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
+ +                    gmx_step_str(step, stepstr));
+ +        }
+ +        dvdl_dum = 0;
+ +        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+ +                  ir, NULL, cr, step, -1, md,
+ +                  state->x, x_prev, NULL,
+ +                  fr->bMolPBC, state->box,
+ +                  state->lambda[efptBONDED], &dvdl_dum,
+ +                  state->v, NULL, nrnb, econqCoord,
+ +                  ir->epc == epcMTTK, state->veta, state->veta);
+ +
+ +        /* Restore the original velocity direction */
+ +        for (a = first; a < last; a++)
+ +        {
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                state->v[a][d] = -state->v[a][d];
+ +            }
+ +        }
+ +    }
+ +
+ +    sfree(x_prev);
+ +}
+ +
+ +/* Precompute the dispersion-correction energy and virial factors in fr.
+ + *
+ + * The six fields fr->enershiftsix, enershifttwelve, enerdiffsix,
+ + * enerdifftwelve, virdiffsix and virdifftwelve are first reset to zero.
+ + * When eDispCorr is not edispcNO they are then filled in, with index 0 of
+ + * the local eners[]/virs[] arrays holding the dispersion (r^-6) term and
+ + * index 1 the repulsion (r^-12) term:
+ + *  - for evdwSWITCH / evdwSHIFT the tabulated VdW potential is integrated
+ + *    between the switch radius and the cut-off (both rounded to table
+ + *    points) and the analytical tail is added;
+ + *  - for evdwCUT / evdwUSER only analytical terms are used (with an extra
+ + *    within-cut-off term for the eintmodPOTSHIFT modifier); a warning is
+ + *    written to fplog for user tables;
+ + *  - any other vdw-type is a fatal error.
+ + */
+ +void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
+ +{
+ +    double eners[2], virs[2], enersum, virsum, y0, f, g, h;
+ +    double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+ +    double invscale, invscale2, invscale3;
+ +    int    ri0, ri1, ri, i, offstart, offset;
+ +    real   scale, *vdwtab, tabfactor;
+ +
+ +    fr->enershiftsix    = 0;
+ +    fr->enershifttwelve = 0;
+ +    fr->enerdiffsix     = 0;
+ +    fr->enerdifftwelve  = 0;
+ +    fr->virdiffsix      = 0;
+ +    fr->virdifftwelve   = 0;
+ +
+ +    if (eDispCorr != edispcNO)
+ +    {
+ +        for (i = 0; i < 2; i++)
+ +        {
+ +            eners[i] = 0;
+ +            virs[i]  = 0;
+ +        }
+ +        if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
+ +        {
+ +            if (fr->rvdw_switch == 0)
+ +            {
+ +                gmx_fatal(FARGS,
+ +                          "With dispersion correction rvdw-switch can not be zero "
+ +                          "for vdw-type = %s", evdw_names[fr->vdwtype]);
+ +            }
+ +
+ +            /* Take the scale from the same table the data is read from,
+ +             * so the index <-> distance mapping always matches the data.
+ +             */
+ +            scale  = fr->nblists[0].table_vdw.scale;
+ +            vdwtab = fr->nblists[0].table_vdw.data;
+ +
+ +            /* Round the cut-offs to exact table values for precision */
+ +            ri0  = floor(fr->rvdw_switch*scale);
+ +            ri1  = ceil(fr->rvdw*scale);
+ +            r0   = ri0/scale;
+ +            r1   = ri1/scale;
+ +            rc3  = r0*r0*r0;
+ +            rc9  = rc3*rc3*rc3;
+ +
+ +            if (fr->vdwtype == evdwSHIFT)
+ +            {
+ +                /* Determine the constant energy shift below rvdw_switch.
+ +                 * Table has a scale factor since we have scaled it down to compensate
+ +                 * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
+ +                 */
+ +                fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
+ +                fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
+ +            }
+ +            /* Add the constant part from 0 to rvdw_switch.
+ +             * This integration from 0 to rvdw_switch overcounts the number
+ +             * of interactions by 1, as it also counts the self interaction.
+ +             * We will correct for this later.
+ +             */
+ +            eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
+ +            eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
+ +
+ +            invscale  = 1.0/(scale);
+ +            invscale2 = invscale*invscale;
+ +            invscale3 = invscale*invscale2;
+ +
+ +            /* following summation derived from cubic spline definition,
+ +               Numerical Recipes in C, second edition, p. 113-116.  Exact
+ +               for the cubic spline.  We first calculate the negative of
+ +               the energy from rvdw to rvdw_switch, assuming that g(r)=1,
+ +               and then add the more standard, abrupt cutoff correction to
+ +               that result, yielding the long-range correction for a
+ +               switched function.  We perform both the pressure and energy
+ +               loops at the same time for simplicity, as the computational
+ +               cost is low. */
+ +
+ +            for (i = 0; i < 2; i++)
+ +            {
+ +                enersum = 0.0; virsum = 0.0;
+ +                if (i == 0)
+ +                {
+ +                    offstart = 0;
+ +                    /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
+ +                     * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
+ +                     * up (to save flops in kernels), we need to correct for this.
+ +                     */
+ +                    tabfactor = 6.0;
+ +                }
+ +                else
+ +                {
+ +                    offstart  = 4;
+ +                    tabfactor = 12.0;
+ +                }
+ +                for (ri = ri0; ri < ri1; ri++)
+ +                {
+ +                    r  = ri*invscale;
+ +                    ea = invscale3;
+ +                    eb = 2.0*invscale2*r;
+ +                    ec = invscale*r*r;
+ +
+ +                    pa = invscale3;
+ +                    pb = 3.0*invscale2*r;
+ +                    pc = 3.0*invscale*r*r;
+ +                    pd = r*r*r;
+ +
+ +                    /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
+ +                    offset = 8*ri + offstart;
+ +                    /* Four consecutive spline coefficients per table point */
+ +                    y0     = vdwtab[offset];
+ +                    f      = vdwtab[offset+1];
+ +                    g      = vdwtab[offset+2];
+ +                    h      = vdwtab[offset+3];
+ +
+ +                    enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
+ +                    virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
+ +                }
+ +
+ +                enersum  *= 4.0*M_PI*tabfactor;
+ +                virsum   *= 4.0*M_PI*tabfactor;
+ +                eners[i] -= enersum;
+ +                virs[i]  -= virsum;
+ +            }
+ +
+ +            /* now add the correction for rvdw_switch to infinity */
+ +            eners[0] += -4.0*M_PI/(3.0*rc3);
+ +            eners[1] +=  4.0*M_PI/(9.0*rc9);
+ +            virs[0]  +=  8.0*M_PI/rc3;
+ +            virs[1]  += -16.0*M_PI/(3.0*rc9);
+ +        }
+ +        else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
+ +        {
+ +            if (fr->vdwtype == evdwUSER && fplog)
+ +            {
+ +                fprintf(fplog,
+ +                        "WARNING: using dispersion correction with user tables\n");
+ +            }
+ +            rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
+ +            rc9  = rc3*rc3*rc3;
+ +            /* Contribution beyond the cut-off */
+ +            eners[0] += -4.0*M_PI/(3.0*rc3);
+ +            eners[1] +=  4.0*M_PI/(9.0*rc9);
+ +            if (fr->vdw_modifier == eintmodPOTSHIFT)
+ +            {
+ +                /* Contribution within the cut-off */
+ +                eners[0] += -4.0*M_PI/(3.0*rc3);
+ +                eners[1] +=  4.0*M_PI/(3.0*rc9);
+ +            }
+ +            /* Contribution beyond the cut-off */
+ +            virs[0]  +=  8.0*M_PI/rc3;
+ +            virs[1]  += -16.0*M_PI/(3.0*rc9);
+ +        }
+ +        else
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "Dispersion correction is not implemented for vdw-type = %s",
+ +                      evdw_names[fr->vdwtype]);
+ +        }
+ +        fr->enerdiffsix    = eners[0];
+ +        fr->enerdifftwelve = eners[1];
+ +        /* The 0.5 is due to the Gromacs definition of the virial */
+ +        fr->virdiffsix     = 0.5*virs[0];
+ +        fr->virdifftwelve  = 0.5*virs[1];
+ +    }
+ +}
+ +/* Computes the long-range dispersion correction for the current box.
+ + *
+ + * The three scalar outputs (*prescorr, *enercorr, *dvdlcorr) are zeroed and
+ + * the virial and pres tensors are cleared before anything else, so with
+ + * ir->eDispCorr == edispcNO all results are zero.
+ + *
+ + * Otherwise the energy correction is assembled from the precomputed
+ + * fr->enerdiffsix/enerdifftwelve and fr->enershiftsix/enershifttwelve
+ + * terms, the atom count divided by det(box), and the average C6/C12
+ + * parameters in fr->avcsix/avctwelve. When ir->efep != efepNO the averages
+ + * of the two end states are interpolated linearly in lambda.
+ + * The C12 contribution is only added for edispcAllEner and
+ + * edispcAllEnerPres; the virial/pressure correction is only added for
+ + * edispcEnerPres and edispcAllEnerPres, and then the same scalar is added
+ + * to each diagonal element of virial and pres.
+ + *
+ + * fplog     receives the sepdvdlformat line when fr->bSepDVDL is set and
+ + *           do_per_step(step, ir->nstlog) holds; it is not checked for
+ + *           NULL in this function
+ + * natoms    atom count used for the density and the interaction count
+ + * prescorr  out: scalar pressure correction (spres)
+ + * enercorr  out: energy correction
+ + * dvdlcorr  out: dV/dlambda correction, only filled when
+ + *           fr->efep != efepNO
+ + */
+ +void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
+ +                   gmx_large_int_t step, int natoms,
+ +                   matrix box, real lambda, tensor pres, tensor virial,
+ +                   real *prescorr, real *enercorr, real *dvdlcorr)
+ +{
+ +    gmx_bool bCorrAll, bCorrPres;
+ +    real     dvdlambda, invvol, dens, ninter, avcsix, avctwelve, enerdiff, svir = 0, spres = 0;
+ +    int      m;
+ +
+ +    *prescorr = 0;
+ +    *enercorr = 0;
+ +    *dvdlcorr = 0;
+ +
+ +    clear_mat(virial);
+ +    clear_mat(pres);
+ +
+ +    if (ir->eDispCorr != edispcNO)
+ +    {
+ +        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
+ +                     ir->eDispCorr == edispcAllEnerPres);
+ +        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
+ +                     ir->eDispCorr == edispcAllEnerPres);
+ +
+ +        invvol = 1/det(box);
+ +        if (fr->n_tpi)
+ +        {
+ +            /* Only correct for the interactions with the inserted molecule:
+ +             * the density excludes the fr->n_tpi inserted atoms and the
+ +             * interaction count is the number of inserted atoms.
+ +             */
+ +            dens   = (natoms - fr->n_tpi)*invvol;
+ +            ninter = fr->n_tpi;
+ +        }
+ +        else
+ +        {
+ +            dens   = natoms*invvol;
+ +            ninter = 0.5*natoms;
+ +        }
+ +
+ +        if (ir->efep == efepNO)
+ +        {
+ +            avcsix    = fr->avcsix[0];
+ +            avctwelve = fr->avctwelve[0];
+ +        }
+ +        else
+ +        {
+ +            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
+ +            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
+ +        }
+ +
+ +        enerdiff   = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
+ +        *enercorr += avcsix*enerdiff;
+ +        dvdlambda  = 0.0;
+ +        if (ir->efep != efepNO)
+ +        {
+ +            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
+ +        }
+ +        if (bCorrAll)
+ +        {
+ +            enerdiff   = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
+ +            *enercorr += avctwelve*enerdiff;
+ +            if (fr->efep != efepNO)
+ +            {
+ +                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
+ +            }
+ +        }
+ +
+ +        if (bCorrPres)
+ +        {
+ +            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
+ +            if (ir->eDispCorr == edispcAllEnerPres)
+ +            {
+ +                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
+ +            }
+ +            /* The factor 2 is because of the Gromacs virial definition */
+ +            spres = -2.0*invvol*svir*PRESFAC;
+ +
+ +            for (m = 0; m < DIM; m++)
+ +            {
+ +                virial[m][m] += svir;
+ +                pres[m][m]   += spres;
+ +            }
+ +            *prescorr += spres;
+ +        }
+ +
+ +        /* Can't currently control when it prints, for now, just print when debugging */
+ +        if (debug)
+ +        {
+ +            if (bCorrAll)
+ +            {
+ +                fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+ +                        avcsix, avctwelve);
+ +            }
+ +            if (bCorrPres)
+ +            {
+ +                fprintf(debug,
+ +                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
+ +                        *enercorr, spres, svir);
+ +            }
+ +            else
+ +            {
+ +                fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
+ +            }
+ +        }
+ +
+ +        if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
+ +        {
+ +            fprintf(fplog, sepdvdlformat, "Dispersion correction",
+ +                    *enercorr, dvdlambda);
+ +        }
+ +        if (fr->efep != efepNO)
+ +        {
+ +            *dvdlcorr += dvdlambda;
+ +        }
+ +    }
+ +}
+ +/* Makes the molecules in x whole using an existing graph.
+ + *
+ + * fr->shift_vec is recomputed from box with calc_shifts. When graph is
+ + * non-NULL the shifts are determined with mk_mshift, applied to x with
+ + * shift_self, and determined once more (see the comment in the body).
+ + * Progress messages are written to fplog when it is non-NULL, and the
+ + * graph is dumped to the debug file when gmx_debug_at is set.
+ + */
+ +void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
+ +                  t_graph *graph, rvec x[])
+ +{
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "Removing pbc first time\n");
+ +    }
+ +    calc_shifts(box, fr->shift_vec);
+ +    if (graph)
+ +    {
+ +        mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        if (gmx_debug_at)
+ +        {
+ +            p_graph(debug, "do_pbc_first 1", graph);
+ +        }
+ +        shift_self(graph, box, x);
+ +        /* By doing an extra mk_mshift the molecules that are broken
+ +         * because they were e.g. imported from another software
+ +         * will be made whole again. Such are the healing powers
+ +         * of GROMACS.
+ +         */
+ +        mk_mshift(fplog, graph, fr->ePBC, box, x);
+ +        if (gmx_debug_at)
+ +        {
+ +            p_graph(debug, "do_pbc_first 2", graph);
+ +        }
+ +    }
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "Done rmpbc\n");
+ +    }
+ +}
+ +/* Makes the molecules in x whole, one molecule at a time, using mtop.
+ + *
+ + * A single t_graph is allocated here and reused for every molecule block.
+ + * Blocks whose molecules consist of one atom, or (only when bFirst is
+ + * FALSE) whose molecule type has a single charge group, are skipped. For
+ + * every other block a graph is built from the molecule type's interaction
+ + * lists, and mk_mshift followed by shift_self is applied to each
+ + * molecule's slice of x.
+ + *
+ + * bFirst  when TRUE, "Removing pbc first time" is also written to fplog
+ + *         (if fplog is non-NULL)
+ + */
+ +static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+ +                            gmx_mtop_t *mtop, rvec x[],
+ +                            gmx_bool bFirst)
+ +{
+ +    t_graph        *graph;
+ +    int             mb, as, mol;
+ +    gmx_molblock_t *molb;
+ +
+ +    if (bFirst && fplog)
+ +    {
+ +        fprintf(fplog, "Removing pbc first time\n");
+ +    }
+ +
+ +    snew(graph, 1);
+ +    as = 0; /* index in x of the first atom of the current molecule */
+ +    for (mb = 0; mb < mtop->nmolblock; mb++)
+ +    {
+ +        molb = &mtop->molblock[mb];
+ +        if (molb->natoms_mol == 1 ||
+ +            (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
+ +        {
+ +            /* Just one atom or charge group in the molecule, no PBC required;
+ +             * only advance the atom offset past the whole block.
+ +             */
+ +            as += molb->nmol*molb->natoms_mol;
+ +        }
+ +        else
+ +        {
+ +            /* Pass NULL iso fplog to avoid graph prints for each molecule type */
+ +            mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
+ +                           0, molb->natoms_mol, FALSE, FALSE, graph);
+ +
+ +            for (mol = 0; mol < molb->nmol; mol++)
+ +            {
+ +                mk_mshift(fplog, graph, ePBC, box, x+as);
+ +
+ +                shift_self(graph, box, x+as);
+ +                /* The molecule is whole now.
+ +                 * We don't need the second mk_mshift call as in do_pbc_first,
+ +                 * since we no longer need this graph.
+ +                 */
+ +
+ +                as += molb->natoms_mol;
+ +            }
+ +            done_graph(graph);
+ +        }
+ +    }
+ +    sfree(graph);
+ +}
+ +/* Calls low_do_pbc_mtop with bFirst = TRUE: the "Removing pbc first time"
+ + * message is logged and only single-atom molecules are skipped.
+ + */
+ +void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
+ +                       gmx_mtop_t *mtop, rvec x[])
+ +{
+ +    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, TRUE);
+ +}
+ +/* Calls low_do_pbc_mtop with bFirst = FALSE: nothing extra is logged and
+ + * molecules with a single charge group are skipped as well.
+ + */
+ +void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+ +                 gmx_mtop_t *mtop, rvec x[])
+ +{
+ +    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, FALSE);
+ +}
+ +/* Collects and prints the end-of-run accounting.
+ + *
+ + * In order: the wallcycle counters are summed with wallcycle_sum; the flop
+ + * counters in nrnb are summed over cr->mpi_comm_mysim when cr->nnodes > 1
+ + * (otherwise nrnb itself is used); with real MPI (GMX_MPI without
+ + * GMX_THREAD_MPI) runtime->proctime is replaced by its sum over all
+ + * processes; the simulation master prints the flop table; domain
+ + * decomposition statistics are printed on PP nodes; with particle
+ + * decomposition the master gathers every rank's counters and calls
+ + * pr_load; finally the simulation master prints the cycle table and the
+ + * performance summary.
+ + *
+ + * fplog       log file; print_perf is only called on it when non-NULL
+ + * bWriteStat  when TRUE the performance summary is also written to stderr
+ + *
+ + * confout and omp_nth_pp are not referenced in this function body.
+ + */
+ +void finish_run(FILE *fplog, t_commrec *cr, const char *confout,
+ +                t_inputrec *inputrec,
+ +                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
+ +                gmx_runtime_t *runtime,
+ +                wallclock_gpu_t *gputimes,
+ +                int omp_nth_pp,
+ +                gmx_bool bWriteStat)
+ +{
+ +    int     i, j;
+ +    t_nrnb *nrnb_tot = NULL;
+ +    real    delta_t;
+ +    double  nbfs, mflop; /* only set and read inside the SIMMASTER branches */
+ +
+ +    wallcycle_sum(cr, wcycle);
+ +
+ +    if (cr->nnodes > 1)
+ +    {
+ +        snew(nrnb_tot, 1);
+ +#ifdef GMX_MPI
+ +        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
+ +                      cr->mpi_comm_mysim);
+ +#endif
+ +    }
+ +    else
+ +    {
+ +        nrnb_tot = nrnb; /* alias, not a copy: must not be freed below */
+ +    }
+ +
+ +#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+ +    if (cr->nnodes > 1)
+ +    {
+ +        /* reduce nodetime over all MPI processes in the current simulation */
+ +        double sum;
+ +        MPI_Allreduce(&runtime->proctime, &sum, 1, MPI_DOUBLE, MPI_SUM,
+ +                      cr->mpi_comm_mysim);
+ +        runtime->proctime = sum;
+ +    }
+ +#endif
+ +
+ +    if (SIMMASTER(cr))
+ +    {
+ +        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
+ +    }
+ +    if (cr->nnodes > 1)
+ +    {
+ +        sfree(nrnb_tot); /* same condition as the snew above */
+ +    }
+ +
+ +    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+ +    {
+ +        print_dd_statistics(cr, inputrec, fplog);
+ +    }
+ +
+ +#ifdef GMX_MPI
+ +    if (PARTDECOMP(cr))
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            t_nrnb     *nrnb_all;
+ +            int         s;
+ +            MPI_Status  stat;
+ +
+ +            snew(nrnb_all, cr->nnodes);
+ +            nrnb_all[0] = *nrnb;
+ +            for (s = 1; s < cr->nnodes; s++)
+ +            {
+ +                MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
+ +                         cr->mpi_comm_mysim, &stat);
+ +            }
+ +            pr_load(fplog, cr, nrnb_all);
+ +            sfree(nrnb_all);
+ +        }
+ +        else
+ +        {
+ +            MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
+ +                     cr->mpi_comm_mysim);
+ +        }
+ +    }
+ +#endif
+ +
+ +    if (SIMMASTER(cr))
+ +    {
+ +        wallcycle_print(fplog, cr->nnodes, cr->npmenodes, runtime->realtime,
+ +                        wcycle, gputimes);
+ +
+ +        if (EI_DYNAMICS(inputrec->eI))
+ +        {
+ +            delta_t = inputrec->delta_t;
+ +        }
+ +        else
+ +        {
+ +            delta_t = 0; /* integrators that are not EI_DYNAMICS report a zero time step */
+ +        }
+ +
+ +        if (fplog)
+ +        {
+ +            print_perf(fplog, runtime->proctime, runtime->realtime,
+ +                       runtime->nsteps_done, delta_t, nbfs, mflop);
+ +        }
+ +        if (bWriteStat)
+ +        {
+ +            print_perf(stderr, runtime->proctime, runtime->realtime,
+ +                       runtime->nsteps_done, delta_t, nbfs, mflop);
+ +        }
+ +    }
+ +}
+ +/* Sets the initial lambda vector and, where applicable, the FEP state.
+ + *
+ + * Without free-energy perturbation and without simulated tempering, all
+ + * efptNR entries of lambda (and of lam0 when it is non-NULL) are set to
+ + * zero and the function returns; *fep_state is left untouched and nothing
+ + * is logged.
+ + *
+ + * Otherwise *fep_state is set to fep->init_fep_state and every lambda
+ + * component is set either to fep->init_lambda (when that is >= 0) or to
+ + * the fep->all_lambda entry for that component and state. With simulated
+ + * tempering, every ir->opts.ref_t entry that is > 0 is overwritten with
+ + * the temperature of the current state. The resulting vector is written
+ + * to fplog when fplog is non-NULL.
+ + *
+ + * lam0  optional (may be NULL); receives a copy of lambda
+ + */
+ +extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+ +{
+ +    /* this function works, but could probably use a logic rewrite to keep all the different
+ +       types of efep straight. */
+ +
+ +    int       i;
+ +    t_lambda *fep = ir->fepvals;
+ +
+ +    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+ +    {
+ +        for (i = 0; i < efptNR; i++)
+ +        {
+ +            lambda[i] = 0.0;
+ +            if (lam0)
+ +            {
+ +                lam0[i] = 0.0;
+ +            }
+ +        }
+ +        return;
+ +    }
+ +    else
+ +    {
+ +        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
+ +                                             if checkpoint is set -- a kludge is in for now
+ +                                             to prevent this.*/
+ +        for (i = 0; i < efptNR; i++)
+ +        {
+ +            /* overwrite lambda state with init_lambda for now for backwards compatibility */
+ +            if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
+ +            {
+ +                lambda[i] = fep->init_lambda;
+ +                if (lam0)
+ +                {
+ +                    lam0[i] = lambda[i];
+ +                }
+ +            }
+ +            else
+ +            {
+ +                lambda[i] = fep->all_lambda[i][*fep_state];
+ +                if (lam0)
+ +                {
+ +                    lam0[i] = lambda[i];
+ +                }
+ +            }
+ +        }
+ +        if (ir->bSimTemp)
+ +        {
+ +            /* need to rescale control temperatures to match current state */
+ +            for (i = 0; i < ir->opts.ngtc; i++)
+ +            {
+ +                if (ir->opts.ref_t[i] > 0)
+ +                {
+ +                    ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    /* Send to the log the information on the current lambdas */
+ +    if (fplog != NULL)
+ +    {
+ +        fprintf(fplog, "Initial vector of lambda components:[ ");
+ +        for (i = 0; i < efptNR; i++)
+ +        {
+ +            fprintf(fplog, "%10.4f ", lambda[i]);
+ +        }
+ +        fprintf(fplog, "]\n");
+ +    }
+ +    return;
+ +}
+ +
+ +/* Sets up the per-run state shared by the integrators.
+ + *
+ + * In order: *t and *t0 are set to ir->init_t; *bSimAnn is set to TRUE when
+ + * any temperature-coupling group has annealing enabled, in which case the
+ + * annealing target temperatures are updated for ir->init_t; the lambdas
+ + * are initialised with initialize_lambdas; *upd and *vcm are created when
+ + * the respective pointer is non-NULL; the Berendsen / v-rescale papers
+ + * are cited for dynamical integrators unless appending to files; nrnb is
+ + * reset; the output files and the energy bin are opened unless
+ + * nfile == -1; the AdResS papers are cited when ir->bAdress is set; and
+ + * force_vir, shake_vir and mu_tot are cleared.
+ + *
+ + * upd, vcm   optional outputs (may be NULL)
+ + * nfile      -1 skips creating *outf and *mdebin
+ + *
+ + * state is not referenced in this function body.
+ + */
+ +void init_md(FILE *fplog,
+ +             t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
+ +             double *t, double *t0,
+ +             real *lambda, int *fep_state, double *lam0,
+ +             t_nrnb *nrnb, gmx_mtop_t *mtop,
+ +             gmx_update_t *upd,
+ +             int nfile, const t_filenm fnm[],
+ +             gmx_mdoutf_t **outf, t_mdebin **mdebin,
+ +             tensor force_vir, tensor shake_vir, rvec mu_tot,
+ +             gmx_bool *bSimAnn, t_vcm **vcm, t_state *state, unsigned long Flags)
+ +{
+ +    int  i, j, n;
+ +    real tmpt, mod;
+ +
+ +    /* Initial values */
+ +    *t = *t0       = ir->init_t;
+ +
+ +    *bSimAnn = FALSE;
+ +    for (i = 0; i < ir->opts.ngtc; i++)
+ +    {
+ +        /* set bSimAnn if any group is being annealed */
+ +        if (ir->opts.annealing[i] != eannNO)
+ +        {
+ +            *bSimAnn = TRUE;
+ +        }
+ +    }
+ +    if (*bSimAnn)
+ +    {
+ +        update_annealing_target_temp(&(ir->opts), ir->init_t);
+ +    }
+ +
+ +    /* Initialize lambda variables */
+ +    initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
+ +
+ +    if (upd)
+ +    {
+ +        *upd = init_update(fplog, ir);
+ +    }
+ +
+ +
+ +    if (vcm != NULL)
+ +    {
+ +        *vcm = init_vcm(fplog, &mtop->groups, ir);
+ +    }
+ +
+ +    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
+ +    {
+ +        if (ir->etc == etcBERENDSEN)
+ +        {
+ +            please_cite(fplog, "Berendsen84a");
+ +        }
+ +        if (ir->etc == etcVRESCALE)
+ +        {
+ +            please_cite(fplog, "Bussi2007a");
+ +        }
+ +    }
+ +
+ +    init_nrnb(nrnb);
+ +
+ +    if (nfile != -1)
+ +    {
+ +        *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
+ +
+ +        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
+ +                              mtop, ir, (*outf)->fp_dhdl);
+ +    }
+ +
+ +    if (ir->bAdress)
+ +    {
+ +        please_cite(fplog, "Fritsch12");
+ +        please_cite(fplog, "Junghans10");
+ +    }
+ +    /* Initiate variables */
+ +    clear_mat(force_vir);
+ +    clear_mat(shake_vir);
+ +    clear_rvec(mu_tot);
+ +
+ +    debug_gmx();
+ +}
diff --cc src/gromacs/mdlib/update.c

index d829a167b4bc366b04ad5ddcf305b7aec10d3590,0000000000000000000000000000000000000000..27551cfd3b4d99ce6ca6d88db104a0ff0bf24968

mode 100644,000000..100644
--- 1/src/gromacs/mdlib/update.c
--- /dev/null
+++ b/src/gromacs/mdlib/update.c
@@@ -1,2083 -1,0 +1,2112 @@@
- #pragma omp parallel num_threads(ngr)
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * GROwing Monsters And Cloning Shrimps
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +
+ +#include <stdio.h>
+ +#include <math.h>
+ +
+ +#include "types/commrec.h"
+ +#include "sysstuff.h"
+ +#include "smalloc.h"
+ +#include "typedefs.h"
+ +#include "nrnb.h"
+ +#include "physics.h"
+ +#include "macros.h"
+ +#include "vec.h"
+ +#include "main.h"
+ +#include "confio.h"
+ +#include "update.h"
+ +#include "gmx_random.h"
+ +#include "futil.h"
+ +#include "mshift.h"
+ +#include "tgroup.h"
+ +#include "force.h"
+ +#include "names.h"
+ +#include "txtdump.h"
+ +#include "mdrun.h"
+ +#include "constr.h"
+ +#include "edsam.h"
+ +#include "pull.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "gmx_wallcycle.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "gmx_omp.h"
+ +
+ +/*For debugging, start at v(-dt/2) for velocity verlet -- uncomment next line */
+ +/*#define STARTFROMDT2*/
+ +/* Constants for one temperature-coupling group, filled in by init_stochd
+ + * for the SD integrators. For a group with tau_t <= 0 (no friction and
+ + * noise) gdt is 0 and the three exponentials are 1.
+ + */
+ +typedef struct {
+ +    double gdt; /* delta_t/tau_t */
+ +    double eph; /* exp(+gdt/2) */
+ +    double emh; /* exp(-gdt/2) */
+ +    double em;  /* exp(-gdt) */
+ +    double b;   /* b, c, d: combinations of gdt and the exponentials above; */
+ +    double c;   /* computed in closed form for gdt >= 0.05 and from a */
+ +    double d;   /* series expansion in gdt/2 for smaller gdt */
+ +} gmx_sd_const_t;
+ +/* Four real values kept per temperature-coupling group for SD (init_stochd
+ + * allocates one entry per group as sd->sdsig).
+ + * NOTE(review): presumably the noise amplitudes for velocities (V, Yv) and
+ + * positions (X, Yx); where they are filled in is not visible from this
+ + * part of the file -- confirm.
+ + */
+ +typedef struct {
+ +    real V;
+ +    real X;
+ +    real Yv;
+ +    real Yx;
+ +} gmx_sd_sigma_t;
+ +/* Random-number state and per-group tables for the stochastic parts of the
+ + * update; created by init_stochd. Which of the per-group arrays are
+ + * allocated depends on the integrator / thermostat selected there.
+ + */
+ +typedef struct {
+ +    /* The random state for ngaussrand threads.
+ +     * Normal thermostats need just 1 random number generator,
+ +     * but SD and BD with OpenMP parallelization need 1 for each thread.
+ +     */
+ +    int             ngaussrand;
+ +    gmx_rng_t      *gaussrand;
+ +    /* BD stuff */
+ +    real           *bd_rf;
+ +    /* SD stuff */
+ +    gmx_sd_const_t *sdc;
+ +    gmx_sd_sigma_t *sdsig;
+ +    rvec           *sd_V;
+ +    int             sd_V_nalloc;
+ +    /* andersen temperature control stuff */
+ +    gmx_bool       *randomize_group; /* per group: TRUE when tau_t > 0 and ref_t > 0 */
+ +    real           *boltzfac;        /* per randomized group: BOLTZ*ref_t */
+ +} gmx_stochd_t;
+ +/* State carried by the update code between steps: the stochastic-dynamics
+ + * data, a scratch coordinate array for the constraint algorithms, the
+ + * Andersen bookkeeping arrays and the reference used by the deform
+ + * algorithm.
+ + */
+ +typedef struct gmx_update
+ +{
+ +    gmx_stochd_t *sd;
+ +    /* xprime for constraint algorithms */
+ +    rvec         *xp;
+ +    int           xp_nalloc;
+ +
+ +    /* variable size arrays for andersen */
+ +    gmx_bool *randatom;
+ +    int      *randatom_list;
+ +    gmx_bool  randatom_list_init;
+ +
+ +    /* Variables for the deform algorithm */
+ +    gmx_large_int_t deformref_step;
+ +    matrix          deformref_box;
+ +} t_gmx_update;
+ +
+ +/* Leap-frog style update of velocities and positions for atoms
+ + * start <= n < nrend: v is updated in place and xprime receives
+ + * x + v_new*dt. x itself is not modified.
+ + *
+ + * One of three loops is selected:
+ + *  - bNH or bPR set: extended-ensemble update using nh_vxi (only read when
+ + *    bNH is set) and the matrix M;
+ + *  - otherwise, when freeze groups or NEMD are in use: scaling by
+ + *    tcstat[].lambda with per-dimension freezing and group acceleration;
+ + *  - otherwise: plain scaling by tcstat[].lambda.
+ + *
+ + * In every loop, virtual sites and shells (and frozen dimensions where
+ + * freezing is handled) get v = 0 and xprime = x.
+ + *
+ + * cFREEZE, cACC, cTC  per-atom group indices; each may be NULL, in which
+ + *                     case group 0 is used for all atoms
+ + */
+ +static void do_update_md(int start, int nrend, double dt,
+ +                         t_grp_tcstat *tcstat,
+ +                         double nh_vxi[],
+ +                         gmx_bool bNEMD, t_grp_acc *gstat, rvec accel[],
+ +                         ivec nFreeze[],
+ +                         real invmass[],
+ +                         unsigned short ptype[], unsigned short cFREEZE[],
+ +                         unsigned short cACC[], unsigned short cTC[],
+ +                         rvec x[], rvec xprime[], rvec v[],
+ +                         rvec f[], matrix M,
+ +                         gmx_bool bNH, gmx_bool bPR)
+ +{
+ +    double imass, w_dt;
+ +    int    gf = 0, ga = 0, gt = 0;
+ +    rvec   vrel;
+ +    real   vn, vv, va, vb, vnrel;
+ +    real   lg, vxi = 0, u;
+ +    int    n, d;
+ +
+ +    if (bNH || bPR)
+ +    {
+ +        /* Update with coupling to extended ensembles, used for
+ +         * Nose-Hoover and Parrinello-Rahman coupling
+ +         * Nose-Hoover uses the reversible leap-frog integrator from
+ +         * Holian et al. Phys Rev E 52(3) : 2338, 1995
+ +         */
+ +        for (n = start; n < nrend; n++)
+ +        {
+ +            imass = invmass[n];
+ +            if (cFREEZE)
+ +            {
+ +                gf   = cFREEZE[n];
+ +            }
+ +            if (cACC)
+ +            {
+ +                ga   = cACC[n];
+ +            }
+ +            if (cTC)
+ +            {
+ +                gt   = cTC[n];
+ +            }
+ +            lg   = tcstat[gt].lambda;
+ +            if (bNH)
+ +            {
+ +                vxi   = nh_vxi[gt];
+ +            }
+ +            rvec_sub(v[n], gstat[ga].u, vrel); /* velocity relative to the group mean u */
+ +
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +                {
+ +                    vnrel = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
+ +                                              - iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
+ +                    /* do not scale the mean velocities u */
+ +                    vn             = gstat[ga].u[d] + accel[ga][d]*dt + vnrel;
+ +                    v[n][d]        = vn;
+ +                    xprime[n][d]   = x[n][d]+vn*dt;
+ +                }
+ +                else
+ +                {
+ +                    v[n][d]        = 0.0;
+ +                    xprime[n][d]   = x[n][d];
+ +                }
+ +            }
+ +        }
+ +    }
+ +    else if (cFREEZE != NULL ||
+ +             nFreeze[0][XX] || nFreeze[0][YY] || nFreeze[0][ZZ] ||
+ +             bNEMD)
+ +    {
+ +        /* Update with Berendsen/v-rescale coupling and freeze or NEMD */
+ +        for (n = start; n < nrend; n++)
+ +        {
+ +            w_dt = invmass[n]*dt;
+ +            if (cFREEZE)
+ +            {
+ +                gf   = cFREEZE[n];
+ +            }
+ +            if (cACC)
+ +            {
+ +                ga   = cACC[n];
+ +            }
+ +            if (cTC)
+ +            {
+ +                gt   = cTC[n];
+ +            }
+ +            lg   = tcstat[gt].lambda;
+ +
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                vn             = v[n][d];
+ +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +                {
+ +                    vv             = lg*vn + f[n][d]*w_dt;
+ +
+ +                    /* do not scale the mean velocities u */
+ +                    u              = gstat[ga].u[d];
+ +                    va             = vv + accel[ga][d]*dt;
+ +                    vb             = va + (1.0-lg)*u;
+ +                    v[n][d]        = vb;
+ +                    xprime[n][d]   = x[n][d]+vb*dt;
+ +                }
+ +                else
+ +                {
+ +                    v[n][d]        = 0.0;
+ +                    xprime[n][d]   = x[n][d];
+ +                }
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Plain update with Berendsen/v-rescale coupling */
+ +        for (n = start; n < nrend; n++)
+ +        {
+ +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+ +            {
+ +                w_dt = invmass[n]*dt;
+ +                if (cTC)
+ +                {
+ +                    gt = cTC[n];
+ +                }
+ +                lg = tcstat[gt].lambda;
+ +
+ +                for (d = 0; d < DIM; d++)
+ +                {
+ +                    vn           = lg*v[n][d] + f[n][d]*w_dt;
+ +                    v[n][d]      = vn;
+ +                    xprime[n][d] = x[n][d] + vn*dt;
+ +                }
+ +            }
+ +            else
+ +            {
+ +                for (d = 0; d < DIM; d++)
+ +                {
+ +                    v[n][d]        = 0.0;
+ +                    xprime[n][d]   = x[n][d];
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +/* Velocity part of the velocity-Verlet update for atoms start <= n < nrend.
+ + *
+ + * Each non-frozen component of a normal particle becomes
+ + *   mv1*(mv1*v + 0.5*dt*invmass*mv2*f) + 0.5*accel*dt,
+ + * where mv1 = mv2 = 1 unless bExtended is set, in which case
+ + * g = 0.25*dt*veta*alpha, mv1 = exp(-g) and mv2 = series_sinhx(g).
+ + * Virtual sites, shells and frozen dimensions get v = 0.
+ + *
+ + * cFREEZE, cACC  per-atom group indices; may be NULL (group 0 is used)
+ + *
+ + * tcstat and gstat are not referenced in this function body.
+ + */
+ +static void do_update_vv_vel(int start, int nrend, double dt,
+ +                             t_grp_tcstat *tcstat, t_grp_acc *gstat,
+ +                             rvec accel[], ivec nFreeze[], real invmass[],
+ +                             unsigned short ptype[], unsigned short cFREEZE[],
+ +                             unsigned short cACC[], rvec v[], rvec f[],
+ +                             gmx_bool bExtended, real veta, real alpha)
+ +{
+ +    double imass, w_dt;
+ +    int    gf = 0, ga = 0;
+ +    rvec   vrel;
+ +    real   u, vn, vv, va, vb, vnrel;
+ +    int    n, d;
+ +    double g, mv1, mv2;
+ +
+ +    if (bExtended)
+ +    {
+ +        g        = 0.25*dt*veta*alpha;
+ +        mv1      = exp(-g);
+ +        mv2      = series_sinhx(g);
+ +    }
+ +    else
+ +    {
+ +        mv1      = 1.0;
+ +        mv2      = 1.0;
+ +    }
+ +    for (n = start; n < nrend; n++)
+ +    {
+ +        w_dt = invmass[n]*dt;
+ +        if (cFREEZE)
+ +        {
+ +            gf   = cFREEZE[n];
+ +        }
+ +        if (cACC)
+ +        {
+ +            ga   = cACC[n];
+ +        }
+ +
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +            {
+ +                v[n][d]             = mv1*(mv1*v[n][d] + 0.5*(w_dt*mv2*f[n][d]))+0.5*accel[ga][d]*dt;
+ +            }
+ +            else
+ +            {
+ +                v[n][d]        = 0.0;
+ +            }
+ +        }
+ +    }
+ +} /* do_update_vv_vel */
+ +/* Position part of the velocity-Verlet update for atoms start <= n < nrend.
+ + *
+ + * Each non-frozen component of a normal particle becomes
+ + *   xprime = mr1*(mr1*x + mr2*dt*v),
+ + * where mr1 = mr2 = 1 unless bExtended is set, in which case
+ + * g = 0.5*dt*veta, mr1 = exp(g) and mr2 = series_sinhx(g).
+ + * Virtual sites, shells and frozen dimensions get xprime = x.
+ + * Neither x nor v is modified.
+ + *
+ + * cFREEZE  per-atom freeze-group indices; may be NULL (group 0 is used)
+ + *
+ + * tcstat, gstat, accel, invmass, f and alpha are not referenced in this
+ + * function body.
+ + */
+ +static void do_update_vv_pos(int start, int nrend, double dt,
+ +                             t_grp_tcstat *tcstat, t_grp_acc *gstat,
+ +                             rvec accel[], ivec nFreeze[], real invmass[],
+ +                             unsigned short ptype[], unsigned short cFREEZE[],
+ +                             rvec x[], rvec xprime[], rvec v[],
+ +                             rvec f[], gmx_bool bExtended, real veta, real alpha)
+ +{
+ +    double imass, w_dt;
+ +    int    gf = 0;
+ +    int    n, d;
+ +    double g, mr1, mr2;
+ +
+ +    /* Would it make more sense if Parrinello-Rahman was put here? */
+ +    if (bExtended)
+ +    {
+ +        g        = 0.5*dt*veta;
+ +        mr1      = exp(g);
+ +        mr2      = series_sinhx(g);
+ +    }
+ +    else
+ +    {
+ +        mr1      = 1.0;
+ +        mr2      = 1.0;
+ +    }
+ +
+ +    for (n = start; n < nrend; n++)
+ +    {
+ +
+ +        if (cFREEZE)
+ +        {
+ +            gf   = cFREEZE[n];
+ +        }
+ +
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +            {
+ +                xprime[n][d]   = mr1*(mr1*x[n][d]+mr2*dt*v[n][d]);
+ +            }
+ +            else
+ +            {
+ +                xprime[n][d]   = x[n][d];
+ +            }
+ +        }
+ +    }
+ +} /* do_update_vv_pos */
+ +/* Leap-frog style update with a cosine-shaped acceleration profile, for
+ + * atoms start <= n < nrend: v is updated in place and xprime receives
+ + * x + v_new*dt.
+ + *
+ + * The profile depends on the z coordinate through
+ + * cos(2*pi*x[n][ZZ]/box[ZZ][ZZ]) and only enters the XX component: the
+ + * profile velocity cosz*vcos is excluded from the thermostat scaling, and
+ + * dt*cosz*cos_accel is added to the new velocity.
+ + *
+ + * With bNH or bPR set the extended-ensemble form is used (nh_vxi is only
+ + * read when bNH is set); otherwise velocities are scaled by
+ + * tcstat[].lambda.
+ + *
+ + * Virtual sites and shells get xprime = x in both branches. Their velocity
+ + * is set to zero only in the second (classic) branch; the extended-ensemble
+ + * branch leaves it unchanged.
+ + * NOTE(review): confirm that this difference between the branches is
+ + * intended.
+ + *
+ + * cTC  per-atom temperature-coupling group; may be NULL (group 0 is used)
+ + */
+ +static void do_update_visc(int start, int nrend, double dt,
+ +                           t_grp_tcstat *tcstat,
+ +                           double nh_vxi[],
+ +                           real invmass[],
+ +                           unsigned short ptype[], unsigned short cTC[],
+ +                           rvec x[], rvec xprime[], rvec v[],
+ +                           rvec f[], matrix M, matrix box, real
+ +                           cos_accel, real vcos,
+ +                           gmx_bool bNH, gmx_bool bPR)
+ +{
+ +    double imass, w_dt;
+ +    int    gt = 0;
+ +    real   vn, vc;
+ +    real   lg, vxi = 0, vv;
+ +    real   fac, cosz;
+ +    rvec   vrel;
+ +    int    n, d;
+ +
+ +    fac = 2*M_PI/(box[ZZ][ZZ]);
+ +
+ +    if (bNH || bPR)
+ +    {
+ +        /* Update with coupling to extended ensembles, used for
+ +         * Nose-Hoover and Parrinello-Rahman coupling
+ +         */
+ +        for (n = start; n < nrend; n++)
+ +        {
+ +            imass = invmass[n];
+ +            if (cTC)
+ +            {
+ +                gt   = cTC[n];
+ +            }
+ +            lg   = tcstat[gt].lambda;
+ +            cosz = cos(fac*x[n][ZZ]);
+ +
+ +            copy_rvec(v[n], vrel);
+ +
+ +            vc            = cosz*vcos;
+ +            vrel[XX]     -= vc; /* velocity relative to the cosine profile */
+ +            if (bNH)
+ +            {
+ +                vxi        = nh_vxi[gt];
+ +            }
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                vn             = v[n][d];
+ +
+ +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+ +                {
+ +                    vn  = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
+ +                                            - iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
+ +                    if (d == XX)
+ +                    {
+ +                        vn += vc + dt*cosz*cos_accel;
+ +                    }
+ +                    v[n][d]        = vn;
+ +                    xprime[n][d]   = x[n][d]+vn*dt;
+ +                }
+ +                else
+ +                {
+ +                    xprime[n][d]   = x[n][d];
+ +                }
+ +            }
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Classic version of update, used with berendsen coupling */
+ +        for (n = start; n < nrend; n++)
+ +        {
+ +            w_dt = invmass[n]*dt;
+ +            if (cTC)
+ +            {
+ +                gt   = cTC[n];
+ +            }
+ +            lg   = tcstat[gt].lambda;
+ +            cosz = cos(fac*x[n][ZZ]);
+ +
+ +            for (d = 0; d < DIM; d++)
+ +            {
+ +                vn             = v[n][d];
+ +
+ +                if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+ +                {
+ +                    if (d == XX)
+ +                    {
+ +                        vc           = cosz*vcos;
+ +                        /* Do not scale the cosine velocity profile */
+ +                        vv           = vc + lg*(vn - vc + f[n][d]*w_dt);
+ +                        /* Add the cosine acceleration profile */
+ +                        vv          += dt*cosz*cos_accel;
+ +                    }
+ +                    else
+ +                    {
+ +                        vv           = lg*(vn + f[n][d]*w_dt);
+ +                    }
+ +                    v[n][d]        = vv;
+ +                    xprime[n][d]   = x[n][d]+vv*dt;
+ +                }
+ +                else
+ +                {
+ +                    v[n][d]        = 0.0;
+ +                    xprime[n][d]   = x[n][d];
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Allocates and initializes sd->gaussrand[i] for i=1, i<sd->ngaussrand,
+ + * Using seeds generated from sd->gaussrand[0].
+ + *
+ + * The seeds are drawn serially from generator 0 before the parallel
+ + * region, so seed[th] does not depend on the order in which threads run;
+ + * seed[0] is never written or read. Each thread with th > 0 then creates
+ + * its own generator inside the OpenMP region. The function stops with
+ + * gmx_incons when sd->ngaussrand differs from
+ + * gmx_omp_nthreads_get(emntUpdate), since gaussrand is indexed by the
+ + * thread number.
+ + */
+ +static void init_multiple_gaussrand(gmx_stochd_t *sd)
+ +{
+ +    int           ngr, i;
+ +    unsigned int *seed;
+ +
+ +    ngr = sd->ngaussrand;
+ +    snew(seed, ngr);
+ +
+ +    for (i = 1; i < ngr; i++)
+ +    {
+ +        seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
+ +    }
+ +
-             /* Initialize on each thread to have thread-local memory alloced */
++    if (ngr != gmx_omp_nthreads_get(emntUpdate))
++    {
++        gmx_incons("The number of Gaussian number generators should be equal to gmx_omp_nthreads_get(emntUpdate)");
++    }
++
++#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+ +    {
+ +        int th;
+ +
+ +        th = gmx_omp_get_thread_num();
+ +        if (th > 0)
+ +        {
-                           int ngtc, real tau_t[], real ref_t[],
++            /* Initialize on each thread to get memory allocated thread-local */
+ +            sd->gaussrand[th] = gmx_rng_init(seed[th]);
+ +        }
+ +    }
+ +
+ +    sfree(seed);
+ +}
+ +/* Allocates and fills a gmx_stochd_t for the integrator and thermostat
+ + * selected in ir, and returns it.
+ + *
+ + * Random generators: nthreads generators for BD and SD, one otherwise.
+ + * Generator 0 is seeded with ir->ld_seed; any further generators are set
+ + * up by init_multiple_gaussrand.
+ + *
+ + * Per temperature-coupling group data (ngtc = ir->opts.ngtc entries):
+ + *  - BD: bd_rf is allocated (not filled in here);
+ + *  - SD: sdsig is allocated and the sdc constants are computed from
+ + *    ir->delta_t and tau_t; a group with tau_t <= 0 gets no friction and
+ + *    noise;
+ + *  - Andersen thermostats: randomize_group and boltzfac are filled in.
+ + *
+ + * fplog is not referenced in this function body.
+ + */
+ +static gmx_stochd_t *init_stochd(FILE *fplog, t_inputrec *ir, int nthreads)
+ +{
+ +    gmx_stochd_t   *sd;
+ +    gmx_sd_const_t *sdc;
+ +    int             ngtc, n, th;
+ +    real            y;
+ +
+ +    snew(sd, 1);
+ +
+ +    /* Initiate random number generator for langevin type dynamics,
+ +     * for BD, SD or velocity rescaling temperature coupling.
+ +     */
+ +    if (ir->eI == eiBD || EI_SD(ir->eI))
+ +    {
+ +        sd->ngaussrand = nthreads;
+ +    }
+ +    else
+ +    {
+ +        sd->ngaussrand = 1;
+ +    }
+ +    snew(sd->gaussrand, sd->ngaussrand);
+ +
+ +    /* Initialize the first random generator */
+ +    sd->gaussrand[0] = gmx_rng_init(ir->ld_seed);
+ +
+ +    if (sd->ngaussrand > 1)
+ +    {
+ +        /* Initialize the rest of the random number generators,
+ +         * using the first one to generate seeds.
+ +         */
+ +        init_multiple_gaussrand(sd);
+ +    }
+ +
+ +    ngtc = ir->opts.ngtc;
+ +
+ +    if (ir->eI == eiBD)
+ +    {
+ +        snew(sd->bd_rf, ngtc);
+ +    }
+ +    else if (EI_SD(ir->eI))
+ +    {
+ +        snew(sd->sdc, ngtc);
+ +        snew(sd->sdsig, ngtc);
+ +
+ +        sdc = sd->sdc;
+ +        for (n = 0; n < ngtc; n++)
+ +        {
+ +            if (ir->opts.tau_t[n] > 0)
+ +            {
+ +                sdc[n].gdt = ir->delta_t/ir->opts.tau_t[n];
+ +                sdc[n].eph = exp(sdc[n].gdt/2);
+ +                sdc[n].emh = exp(-sdc[n].gdt/2);
+ +                sdc[n].em  = exp(-sdc[n].gdt);
+ +            }
+ +            else
+ +            {
+ +                /* No friction and noise on this group */
+ +                sdc[n].gdt = 0;
+ +                sdc[n].eph = 1;
+ +                sdc[n].emh = 1;
+ +                sdc[n].em  = 1;
+ +            }
+ +            if (sdc[n].gdt >= 0.05)
+ +            {
+ +                sdc[n].b = sdc[n].gdt*(sdc[n].eph*sdc[n].eph - 1)
+ +                    - 4*(sdc[n].eph - 1)*(sdc[n].eph - 1);
+ +                sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
+ +                sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
+ +            }
+ +            else
+ +            {
+ +                y = sdc[n].gdt/2;
+ +                /* Seventh order expansions for small y */
+ +                sdc[n].b = y*y*y*y*(1/3.0+y*(1/3.0+y*(17/90.0+y*7/9.0)));
+ +                sdc[n].c = y*y*y*(2/3.0+y*(-1/2.0+y*(7/30.0+y*(-1/12.0+y*31/1260.0))));
+ +                sdc[n].d = y*y*(-1+y*y*(-1/12.0-y*y/360.0));
+ +            }
+ +            if (debug)
+ +            {
+ +                fprintf(debug, "SD const tc-grp %d: b %g  c %g  d %g\n",
+ +                        n, sdc[n].b, sdc[n].c, sdc[n].d);
+ +            }
+ +        }
+ +    }
+ +    else if (ETC_ANDERSEN(ir->etc))
+ +    {
+ +        int        ngtc; /* shadows the outer ngtc; set to the same opts->ngtc below */
+ +        t_grpopts *opts;
+ +        real       reft;
+ +
+ +        opts = &ir->opts;
+ +        ngtc = opts->ngtc;
+ +
+ +        snew(sd->randomize_group, ngtc);
+ +        snew(sd->boltzfac, ngtc);
+ +
+ +        /* for now, assume that all groups, if randomized, are randomized at the same rate, i.e. tau_t is the same. */
+ +        /* since constraint groups don't necessarily match up with temperature groups! This is checked in readir.c */
+ +
+ +        for (n = 0; n < ngtc; n++)
+ +        {
+ +            reft = max(0.0, opts->ref_t[n]);
+ +            if ((opts->tau_t[n] > 0) && (reft > 0))  /* tau_t or ref_t = 0 means that no randomization is done */
+ +            {
+ +                sd->randomize_group[n] = TRUE;
+ +                sd->boltzfac[n]        = BOLTZ*opts->ref_t[n];
+ +            }
+ +            else
+ +            {
+ +                sd->randomize_group[n] = FALSE;
+ +            }
+ +        }
+ +    }
+ +    return sd;
+ +}
+ +
+ +void get_stochd_state(gmx_update_t upd, t_state *state)
+ +{
+ +    /* Only the state of the first generator is stored, even when there
+ +     * are several; a single stored state avoids repeating random numbers.
+ +     */
+ +    gmx_stochd_t *sd = upd->sd;
+ +
+ +    gmx_rng_get_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi);
+ +}
+ +
+ +void set_stochd_state(gmx_update_t upd, t_state *state)
+ +{
+ +    gmx_stochd_t *sd = upd->sd;
+ +    int           gen;
+ +
+ +    /* The first generator is restored from the stored state */
+ +    gmx_rng_set_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi[0]);
+ +
+ +    if (sd->ngaussrand <= 1)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* We only get here with SD or BD with OpenMP.
+ +     * The remaining generators are destroyed and created again with
+ +     * seeds drawn from the first one. This does not recover their
+ +     * previous state, but it avoids repetition, which matters most.
+ +     * Exactly restoring the states for all MPI+OpenMP setups is difficult
+ +     * and, as the integrator is random to start with, gains little.
+ +     */
+ +    for (gen = 1; gen < sd->ngaussrand; gen++)
+ +    {
+ +        gmx_rng_destroy(sd->gaussrand[gen]);
+ +    }
+ +
+ +    init_multiple_gaussrand(sd);
+ +}
+ +
+ +gmx_update_t init_update(FILE *fplog, t_inputrec *ir)
+ +{
+ +    t_gmx_update *upd;
+ +    gmx_bool      bNeedStochd;
+ +
+ +    snew(upd, 1);
+ +
+ +    /* Stochastic data is set up for BD, SD, v-rescale and Andersen coupling */
+ +    bNeedStochd = (ir->eI == eiBD || EI_SD(ir->eI) ||
+ +                   ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc));
+ +    if (bNeedStochd)
+ +    {
+ +        upd->sd = init_stochd(fplog, ir, gmx_omp_nthreads_get(emntUpdate));
+ +    }
+ +
+ +    /* No work buffers or randomization lists exist yet */
+ +    upd->xp                 = NULL;
+ +    upd->xp_nalloc          = 0;
+ +    upd->randatom           = NULL;
+ +    upd->randatom_list      = NULL;
+ +    /* the data structure has not been cleared at this point */
+ +    upd->randatom_list_init = FALSE;
+ +
+ +    return upd;
+ +}
+ +
+ +static void do_update_sd1(gmx_stochd_t *sd,
+ +                          gmx_rng_t gaussrand,
+ +                          int start, int nrend, double dt,
+ +                          rvec accel[], ivec nFreeze[],
+ +                          real invmass[], unsigned short ptype[],
+ +                          unsigned short cFREEZE[], unsigned short cACC[],
+ +                          unsigned short cTC[],
+ +                          rvec x[], rvec xprime[], rvec v[], rvec f[],
+ +                          rvec sd_X[],
+ +                          int ngtc, real tau_t[], real ref_t[])
+ +{   /* SD1 update of atoms start up to (not including) nrend.
+ +     * Each velocity component is scaled by em, gets the force and
+ +     * acceleration term plus a Gaussian random contribution drawn from
+ +     * gaussrand, and xprime is set to x + v*dt.
+ +     * Particles of type eptVSite or eptShell and frozen dimensions get
+ +     * zero velocity and xprime = x.
+ +     * sd->sdsig[].V is overwritten for all ngtc groups on every call.
+ +     * sd_X is not used in this function.
+ +     */
+ +    gmx_sd_const_t *sdc;
+ +    gmx_sd_sigma_t *sig;
+ +    real            kT;
+ +    int             gf = 0, ga = 0, gt = 0;
+ +    real            ism, sd_V;
+ +    int             n, d;
+ +
+ +    sdc = sd->sdc;
+ +    sig = sd->sdsig;
+ +
+ +    for (n = 0; n < ngtc; n++)
+ +    {
+ +        kT = BOLTZ*ref_t[n];
+ +        /* The mass is accounted for later, since this differs per atom */
+ +        sig[n].V  = sqrt(kT*(1 - sdc[n].em*sdc[n].em));
+ +    }
+ +
+ +    for (n = start; n < nrend; n++)
+ +    {
+ +        ism = sqrt(invmass[n]); /* per-atom mass factor for the random term */
+ +        if (cFREEZE) /* group index arrays may be NULL; the index then stays 0 */
+ +        {
+ +            gf  = cFREEZE[n];
+ +        }
+ +        if (cACC)
+ +        {
+ +            ga  = cACC[n];
+ +        }
+ +        if (cTC)
+ +        {
+ +            gt  = cTC[n];
+ +        }
+ +
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +            {
+ +                sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+ +
+ +                v[n][d] = v[n][d]*sdc[gt].em
+ +                    + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
+ +                    + sd_V;
+ +
+ +                xprime[n][d] = x[n][d] + v[n][d]*dt;
+ +            }
+ +            else
+ +            {
+ +                v[n][d]      = 0.0;
+ +                xprime[n][d] = x[n][d];
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +static void check_sd2_work_data_allocation(gmx_stochd_t *sd, int nrend)
+ +{
+ +    /* Nothing to do while the work array already covers nrend entries */
+ +    if (nrend <= sd->sd_V_nalloc)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Grow the random velocity work array, with over-allocation */
+ +    sd->sd_V_nalloc = over_alloc_dd(nrend);
+ +    srenew(sd->sd_V, sd->sd_V_nalloc);
+ +}
+ +
++static void do_update_sd2_Tconsts(gmx_stochd_t *sd,
++                                  int ngtc,
++                                  const real tau_t[],
++                                  const real ref_t[])
++{
++    /* Kept apart from the per-atom SD2 update, because this part
++     * is single threaded.
++     * Fills the four sigma values of every temperature-coupling group.
++     */
++    int grp;
++
++    for (grp = 0; grp < ngtc; grp++)
++    {
++        const gmx_sd_const_t *cst = &sd->sdc[grp];
++        gmx_sd_sigma_t       *sgm = &sd->sdsig[grp];
++        real                  kT  = BOLTZ*ref_t[grp];
++
++        /* The mass is accounted for later, since this differs per atom */
++        sgm->V  = sqrt(kT*(1-cst->em));
++        sgm->X  = sqrt(kT*sqr(tau_t[grp])*cst->c);
++        sgm->Yv = sqrt(kT*cst->b/cst->c);
++        sgm->Yx = sqrt(kT*sqr(tau_t[grp])*cst->b/(1-cst->em));
++    }
++}
++
+ +static void do_update_sd2(gmx_stochd_t *sd,
+ +                          gmx_rng_t gaussrand,
+ +                          gmx_bool bInitStep,
+ +                          int start, int nrend,
+ +                          rvec accel[], ivec nFreeze[],
+ +                          real invmass[], unsigned short ptype[],
+ +                          unsigned short cFREEZE[], unsigned short cACC[],
+ +                          unsigned short cTC[],
+ +                          rvec x[], rvec xprime[], rvec v[], rvec f[],
+ +                          rvec sd_X[],
-     if (bFirstHalf)
-     {
-         for (n = 0; n < ngtc; n++)
-         {
-             kT = BOLTZ*ref_t[n];
-             /* The mass is encounted for later, since this differs per atom */
-             sig[n].V  = sqrt(kT*(1-sdc[n].em));
-             sig[n].X  = sqrt(kT*sqr(tau_t[n])*sdc[n].c);
-             sig[n].Yv = sqrt(kT*sdc[n].b/sdc[n].c);
-             sig[n].Yx = sqrt(kT*sqr(tau_t[n])*sdc[n].b/(1-sdc[n].em));
-         }
-     }
- 
++                          const real tau_t[],
+ +                          gmx_bool bFirstHalf)
+ +{   /* One half of the SD2 update for atoms start up to (not including) nrend.
+ +     * With bFirstHalf set, v and xprime are updated and the random part of
+ +     * the velocity is stored in sd->sd_V; frozen dimensions and particles
+ +     * of type eptVSite or eptShell get zero velocity and xprime = x.
+ +     * Otherwise v is recomputed from xprime - x and a random displacement
+ +     * is added to xprime; frozen and vsite/shell entries are left untouched.
+ +     * The values in sd->sdsig are only read here; presumably they are
+ +     * filled by do_update_sd2_Tconsts before this is called (TODO confirm).
+ +     */
+ +    gmx_sd_const_t *sdc;
+ +    gmx_sd_sigma_t *sig;
+ +    /* The random part of the velocity update, generated in the first
+ +     * half of the update, needs to be remembered for the second half.
+ +     */
+ +    rvec  *sd_V;
+ +    real   kT; /* not used in this function */
+ +    int    gf = 0, ga = 0, gt = 0;
+ +    real   vn = 0, Vmh, Xmh;
+ +    real   ism;
+ +    int    n, d;
+ +
+ +    sdc  = sd->sdc;
+ +    sig  = sd->sdsig;
+ +    sd_V = sd->sd_V;
+ +
-                          int ngtc, real tau_t[], real ref_t[],
+ +    for (n = start; n < nrend; n++)
+ +    {
+ +        ism = sqrt(invmass[n]); /* per-atom mass factor for the random terms */
+ +        if (cFREEZE) /* group index arrays may be NULL; the index then stays 0 */
+ +        {
+ +            gf  = cFREEZE[n];
+ +        }
+ +        if (cACC)
+ +        {
+ +            ga  = cACC[n];
+ +        }
+ +        if (cTC)
+ +        {
+ +            gt  = cTC[n];
+ +        }
+ +
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            if (bFirstHalf)
+ +            {
+ +                vn             = v[n][d];
+ +            }
+ +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +            {
+ +                if (bFirstHalf)
+ +                {
+ +                    if (bInitStep) /* sd_X is generated here only on the initial step */
+ +                    {
+ +                        sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+ +                    }
+ +                    Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c)
+ +                        + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand);
+ +                    sd_V[n][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+ +
+ +                    v[n][d] = vn*sdc[gt].em
+ +                        + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
+ +                        + sd_V[n][d] - sdc[gt].em*Vmh;
+ +
+ +                    xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
+ +                }
+ +                else
+ +                {
+ +
+ +                    /* Correct the velocities for the constraints.
+ +                     * This operation introduces some inaccuracy,
+ +                     * since the velocity is determined from differences in coordinates.
+ +                     */
+ +                    v[n][d] =
+ +                        (xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh));
+ +
+ +                    Xmh = sd_V[n][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
+ +                        + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand);
+ +                    sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+ +
+ +                    xprime[n][d] += sd_X[n][d] - Xmh;
+ +
+ +                }
+ +            }
+ +            else
+ +            {
+ +                if (bFirstHalf)
+ +                {
+ +                    v[n][d]        = 0.0;
+ +                    xprime[n][d]   = x[n][d];
+ +                }
+ +            }
+ +        }
+ +    }
+ +}
+ +
++static void do_update_bd_Tconsts(double dt, real friction_coefficient,
++                                 int ngtc, const real ref_t[],
++                                 real *rf)
++{
++    /* Kept apart from the per-atom BD update, because this part
++     * is single threaded.
++     * Fills rf for every temperature-coupling group; the friction
++     * coefficient and time step only enter when the coefficient is non-zero.
++     */
++    int grp;
++
++    for (grp = 0; grp < ngtc; grp++)
++    {
++        if (friction_coefficient != 0)
++        {
++            rf[grp] = sqrt(2.0*BOLTZ*ref_t[grp]/(friction_coefficient*dt));
++        }
++        else
++        {
++            rf[grp] = sqrt(2.0*BOLTZ*ref_t[grp]);
++        }
++    }
++}
++
+ +static void do_update_bd(int start, int nrend, double dt,
+ +                         ivec nFreeze[],
+ +                         real invmass[], unsigned short ptype[],
+ +                         unsigned short cFREEZE[], unsigned short cTC[],
+ +                         rvec x[], rvec xprime[], rvec v[],
+ +                         rvec f[], real friction_coefficient,
-         for (n = 0; n < ngtc; n++)
-         {
-             rf[n] = sqrt(2.0*BOLTZ*ref_t[n]/(friction_coefficient*dt));
-         }
-     }
-     else
-     {
-         for (n = 0; n < ngtc; n++)
-         {
-             rf[n] = sqrt(2.0*BOLTZ*ref_t[n]);
-         }
+ +                         real *rf, gmx_rng_t gaussrand)
+ +{   /* Brownian dynamics update of atoms start up to (not including) nrend.
+ +     * For each free dimension a velocity vn is built from the force and a
+ +     * Gaussian random term scaled by rf of the atom's temperature-coupling
+ +     * group; v is set to vn and xprime to x + vn*dt.
+ +     * Particles of type eptVSite or eptShell and frozen dimensions get
+ +     * zero velocity and xprime = x.
+ +     * rf is only read here; presumably it is filled by
+ +     * do_update_bd_Tconsts before this is called (TODO confirm).
+ +     */
+ +    /* note -- these appear to be full step velocities . . .  */
+ +    int    gf = 0, gt = 0;
+ +    real   vn;
+ +    real   invfr = 0;
+ +    int    n, d;
+ +
+ +    if (friction_coefficient != 0)
+ +    {
+ +        invfr = 1.0/friction_coefficient;
-                           inputrec->opts.ngtc, inputrec->opts.tau_t,
-                           inputrec->opts.ref_t, FALSE);
+ +    }
++
+ +    for (n = start; (n < nrend); n++)
+ +    {
+ +        if (cFREEZE) /* group index arrays may be NULL; the index then stays 0 */
+ +        {
+ +            gf = cFREEZE[n];
+ +        }
+ +        if (cTC)
+ +        {
+ +            gt = cTC[n];
+ +        }
+ +        for (d = 0; (d < DIM); d++)
+ +        {
+ +            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ +            {
+ +                if (friction_coefficient != 0) /* one friction coefficient for all atoms */
+ +                {
+ +                    vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand);
+ +                }
+ +                else
+ +                {
+ +                    /* NOTE: invmass = 2/(mass*friction_constant*dt) */
+ +                    vn = 0.5*invmass[n]*f[n][d]*dt
+ +                        + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand);
+ +                }
+ +
+ +                v[n][d]      = vn;
+ +                xprime[n][d] = x[n][d]+vn*dt;
+ +            }
+ +            else
+ +            {
+ +                v[n][d]      = 0.0;
+ +                xprime[n][d] = x[n][d];
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +static void dump_it_all(FILE *fp, const char *title,
+ +                        int natoms, rvec x[], rvec xp[], rvec v[], rvec f[])
+ +{
+ +    /* Does nothing unless DEBUG is defined at compile time */
+ +#ifdef DEBUG
+ +    if (fp == NULL)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Title line followed by the four vector arrays */
+ +    fprintf(fp, "%s\n", title);
+ +    pr_rvecs(fp, 0, "x", x, natoms);
+ +    pr_rvecs(fp, 0, "xp", xp, natoms);
+ +    pr_rvecs(fp, 0, "v", v, natoms);
+ +    pr_rvecs(fp, 0, "f", f, natoms);
+ +#endif
+ +}
+ +
+ +static void calc_ke_part_normal(rvec v[], t_grpopts *opts, t_mdatoms *md,
+ +                                gmx_ekindata_t *ekind, t_nrnb *nrnb, gmx_bool bEkinAveVel,
+ +                                gmx_bool bSaveEkinOld)
+ +{   /* Compute the kinetic energy tensor per temperature-coupling group for
+ +     * the home atoms and the mass-perturbation term dekindl.
+ +     * The atom loop is split over OpenMP threads, each summing into its own
+ +     * ekind->ekin_work / ekind->dekindl_work entry; the per-thread results
+ +     * are then added serially into tcstat[].ekinf (bEkinAveVel) or
+ +     * tcstat[].ekinh (otherwise) and into ekind->dekindl.
+ +     */
+ +    int           g;
+ +    t_grp_tcstat *tcstat  = ekind->tcstat;
+ +    t_grp_acc    *grpstat = ekind->grpstat;
+ +    int           nthread, thread;
+ +
+ +    /* three main: VV with AveVel, vv with AveEkin, leap with AveEkin.  Leap with AveVel is also
+ +       an option, but not supported now.  Additionally, if we are doing iterations.
+ +       bEkinAveVel: If TRUE, we sum into ekin, if FALSE, into ekinh.
+ +       bSaveEkinOld: If TRUE (in the case of iteration = bIterate is TRUE), we don't copy over the ekinh_old.
+ +       If FALSE, we overwrite it.
+ +     */
+ +
+ +    /* group velocities are calculated in update_ekindata and
+ +     * accumulated in acumulate_groups.
+ +     * Now the partial global and groups ekin.
+ +     */
+ +    for (g = 0; (g < opts->ngtc); g++)
+ +    {
+ +
+ +        if (!bSaveEkinOld)
+ +        {
+ +            copy_mat(tcstat[g].ekinh, tcstat[g].ekinh_old);
+ +        }
+ +        if (bEkinAveVel)
+ +        {
+ +            clear_mat(tcstat[g].ekinf);
+ +        }
+ +        else
+ +        {
+ +            clear_mat(tcstat[g].ekinh);
+ +        }
+ +        if (bEkinAveVel)
+ +        {
+ +            tcstat[g].ekinscalef_nhc = 1.0; /* need to clear this -- logic is complicated! */
+ +        }
+ +    }
+ +    ekind->dekindl_old = ekind->dekindl;
+ +
+ +    nthread = gmx_omp_nthreads_get(emntUpdate);
+ +
+ +#pragma omp parallel for num_threads(nthread) schedule(static)
+ +    for (thread = 0; thread < nthread; thread++)
+ +    {
+ +        int     start_t, end_t, n;
+ +        int     ga, gt;
+ +        rvec    v_corrt;
+ +        real    hm;
+ +        int     d, m;
+ +        matrix *ekin_sum;
+ +        real   *dekindl_sum;
+ +
+ +        start_t = md->start + ((thread+0)*md->homenr)/nthread; /* contiguous slice of the home atoms */
+ +        end_t   = md->start + ((thread+1)*md->homenr)/nthread;
+ +
+ +        ekin_sum    = ekind->ekin_work[thread]; /* accumulators owned by this thread index */
+ +        dekindl_sum = ekind->dekindl_work[thread];
+ +
+ +        for (gt = 0; gt < opts->ngtc; gt++)
+ +        {
+ +            clear_mat(ekin_sum[gt]);
+ +        }
+ +        *dekindl_sum = 0.0;
+ +
+ +        ga = 0;
+ +        gt = 0;
+ +        for (n = start_t; n < end_t; n++)
+ +        {
+ +            if (md->cACC) /* group index arrays may be NULL; the index then stays 0 */
+ +            {
+ +                ga = md->cACC[n];
+ +            }
+ +            if (md->cTC)
+ +            {
+ +                gt = md->cTC[n];
+ +            }
+ +            hm   = 0.5*md->massT[n];
+ +
+ +            for (d = 0; (d < DIM); d++)
+ +            {
+ +                v_corrt[d]  = v[n][d]  - grpstat[ga].u[d]; /* velocity relative to grpstat[ga].u */
+ +            }
+ +            for (d = 0; (d < DIM); d++)
+ +            {
+ +                for (m = 0; (m < DIM); m++)
+ +                {
+ +                    /* if we're computing a full step velocity, v_corrt[d] has v(t).  Otherwise, v(t+dt/2) */
+ +                    ekin_sum[gt][m][d] += hm*v_corrt[m]*v_corrt[d];
+ +                }
+ +            }
+ +            if (md->nMassPerturbed && md->bPerturbed[n])
+ +            {
+ +                *dekindl_sum +=
+ +                    0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt, v_corrt);
+ +            }
+ +        }
+ +    }
+ +
+ +    ekind->dekindl = 0; /* the per-thread partial results are summed serially below */
+ +    for (thread = 0; thread < nthread; thread++)
+ +    {
+ +        for (g = 0; g < opts->ngtc; g++)
+ +        {
+ +            if (bEkinAveVel)
+ +            {
+ +                m_add(tcstat[g].ekinf, ekind->ekin_work[thread][g],
+ +                      tcstat[g].ekinf);
+ +            }
+ +            else
+ +            {
+ +                m_add(tcstat[g].ekinh, ekind->ekin_work[thread][g],
+ +                      tcstat[g].ekinh);
+ +            }
+ +        }
+ +
+ +        ekind->dekindl += *ekind->dekindl_work[thread];
+ +    }
+ +
+ +    inc_nrnb(nrnb, eNR_EKIN, md->homenr);
+ +}
+ +
+ +static void calc_ke_part_visc(matrix box, rvec x[], rvec v[],
+ +                              t_grpopts *opts, t_mdatoms *md,
+ +                              gmx_ekindata_t *ekind,
+ +                              t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
+ +{   /* Kinetic energy per temperature-coupling group for the home atoms,
+ +     * with a cosine velocity profile along z (amplitude cosacc->vcos)
+ +     * subtracted from the x velocity before summing.
+ +     * Also accumulates cosacc->mvcos and the mass-perturbation term
+ +     * ekind->dekindl. This loop is serial.
+ +     * NOTE(review): bSaveEkinOld is not used here; ekinh is always saved to
+ +     * ekinh_old and cleared, and ekinf is not cleared even when
+ +     * bEkinAveVel sums into it -- confirm this is intended.
+ +     */
+ +    int           start = md->start, homenr = md->homenr;
+ +    int           g, d, n, m, gt = 0;
+ +    rvec          v_corrt;
+ +    real          hm;
+ +    t_grp_tcstat *tcstat = ekind->tcstat;
+ +    t_cos_acc    *cosacc = &(ekind->cosacc);
+ +    real          dekindl;
+ +    real          fac, cosz;
+ +    double        mvcos;
+ +
+ +    for (g = 0; g < opts->ngtc; g++)
+ +    {
+ +        copy_mat(ekind->tcstat[g].ekinh, ekind->tcstat[g].ekinh_old);
+ +        clear_mat(ekind->tcstat[g].ekinh);
+ +    }
+ +    ekind->dekindl_old = ekind->dekindl;
+ +
+ +    fac     = 2*M_PI/box[ZZ][ZZ]; /* one cosine period over the box length in z */
+ +    mvcos   = 0;
+ +    dekindl = 0;
+ +    for (n = start; n < start+homenr; n++)
+ +    {
+ +        if (md->cTC) /* may be NULL; the group index then stays 0 */
+ +        {
+ +            gt = md->cTC[n];
+ +        }
+ +        hm   = 0.5*md->massT[n];
+ +
+ +        /* Note that the times of x and v differ by half a step */
+ +        /* MRS -- would have to be changed for VV */
+ +        cosz         = cos(fac*x[n][ZZ]);
+ +        /* Calculate the amplitude of the new velocity profile */
+ +        mvcos       += 2*cosz*md->massT[n]*v[n][XX];
+ +
+ +        copy_rvec(v[n], v_corrt);
+ +        /* Subtract the profile for the kinetic energy */
+ +        v_corrt[XX] -= cosz*cosacc->vcos;
+ +        for (d = 0; (d < DIM); d++)
+ +        {
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                /* if we're computing a full step velocity, v_corrt[d] has v(t).  Otherwise, v(t+dt/2) */
+ +                if (bEkinAveVel)
+ +                {
+ +                    tcstat[gt].ekinf[m][d] += hm*v_corrt[m]*v_corrt[d];
+ +                }
+ +                else
+ +                {
+ +                    tcstat[gt].ekinh[m][d] += hm*v_corrt[m]*v_corrt[d];
+ +                }
+ +            }
+ +        }
+ +        if (md->nPerturbed && md->bPerturbed[n]) /* calc_ke_part_normal tests md->nMassPerturbed instead */
+ +        {
+ +            dekindl += 0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt, v_corrt);
+ +        }
+ +    }
+ +    ekind->dekindl = dekindl;
+ +    cosacc->mvcos  = mvcos;
+ +
+ +    inc_nrnb(nrnb, eNR_EKIN, homenr);
+ +}
+ +
+ +void calc_ke_part(t_state *state, t_grpopts *opts, t_mdatoms *md,
+ +                  gmx_ekindata_t *ekind, t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
+ +{
+ +    /* A non-zero cosine acceleration selects the variant that subtracts
+ +     * the velocity profile; otherwise the normal variant is used.
+ +     */
+ +    if (ekind->cosacc.cos_accel != 0)
+ +    {
+ +        calc_ke_part_visc(state->box, state->x, state->v, opts, md, ekind, nrnb, bEkinAveVel, bSaveEkinOld);
+ +        return;
+ +    }
+ +
+ +    calc_ke_part_normal(state->v, opts, md, ekind, nrnb, bEkinAveVel, bSaveEkinOld);
+ +}
+ +
+ +extern void init_ekinstate(ekinstate_t *ekinstate, const t_inputrec *ir)
+ +{
+ +    /* Scalar members first */
+ +    ekinstate->ekin_n  = ir->opts.ngtc;
+ +    ekinstate->dekindl = 0;
+ +    ekinstate->mvcos   = 0;
+ +
+ +    /* One entry per temperature-coupling group in every array */
+ +    snew(ekinstate->ekinh, ekinstate->ekin_n);
+ +    snew(ekinstate->ekinf, ekinstate->ekin_n);
+ +    snew(ekinstate->ekinh_old, ekinstate->ekin_n);
+ +    snew(ekinstate->ekinscalef_nhc, ekinstate->ekin_n);
+ +    snew(ekinstate->ekinscaleh_nhc, ekinstate->ekin_n);
+ +    snew(ekinstate->vscale_nhc, ekinstate->ekin_n);
+ +}
+ +
+ +void update_ekinstate(ekinstate_t *ekinstate, gmx_ekindata_t *ekind)
+ +{
+ +    t_grp_tcstat *tc;
+ +    int           grp;
+ +
+ +    /* Copy the per-group kinetic energy data into the state copy */
+ +    for (grp = 0; grp < ekinstate->ekin_n; grp++)
+ +    {
+ +        tc = &ekind->tcstat[grp];
+ +
+ +        copy_mat(tc->ekinh, ekinstate->ekinh[grp]);
+ +        copy_mat(tc->ekinf, ekinstate->ekinf[grp]);
+ +        copy_mat(tc->ekinh_old, ekinstate->ekinh_old[grp]);
+ +        ekinstate->ekinscalef_nhc[grp] = tc->ekinscalef_nhc;
+ +        ekinstate->ekinscaleh_nhc[grp] = tc->ekinscaleh_nhc;
+ +        ekinstate->vscale_nhc[grp]     = tc->vscale_nhc;
+ +    }
+ +
+ +    /* Then the totals */
+ +    copy_mat(ekind->ekin, ekinstate->ekin_total);
+ +    ekinstate->dekindl = ekind->dekindl;
+ +    ekinstate->mvcos   = ekind->cosacc.mvcos;
+ +}
+ +
+ +void restore_ekinstate_from_state(t_commrec *cr,
+ +                                  gmx_ekindata_t *ekind, ekinstate_t *ekinstate)
+ +{
+ +    t_grp_tcstat *tc;
+ +    int           grp, n;
+ +
+ +    /* The master copies the stored values back into ekind */
+ +    if (MASTER(cr))
+ +    {
+ +        n = ekinstate->ekin_n;
+ +        for (grp = 0; grp < n; grp++)
+ +        {
+ +            tc = &ekind->tcstat[grp];
+ +
+ +            copy_mat(ekinstate->ekinh[grp], tc->ekinh);
+ +            copy_mat(ekinstate->ekinf[grp], tc->ekinf);
+ +            copy_mat(ekinstate->ekinh_old[grp], tc->ekinh_old);
+ +            tc->ekinscalef_nhc = ekinstate->ekinscalef_nhc[grp];
+ +            tc->ekinscaleh_nhc = ekinstate->ekinscaleh_nhc[grp];
+ +            tc->vscale_nhc     = ekinstate->vscale_nhc[grp];
+ +        }
+ +
+ +        copy_mat(ekinstate->ekin_total, ekind->ekin);
+ +
+ +        ekind->dekindl      = ekinstate->dekindl;
+ +        ekind->cosacc.mvcos = ekinstate->mvcos;
+ +    }
+ +
+ +    if (!PAR(cr))
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* In parallel runs the group count goes out first, then the data */
+ +    gmx_bcast(sizeof(n), &n, cr);
+ +    for (grp = 0; grp < n; grp++)
+ +    {
+ +        tc = &ekind->tcstat[grp];
+ +
+ +        gmx_bcast(DIM*DIM*sizeof(tc->ekinh[0][0]), tc->ekinh[0], cr);
+ +        gmx_bcast(DIM*DIM*sizeof(tc->ekinf[0][0]), tc->ekinf[0], cr);
+ +        gmx_bcast(DIM*DIM*sizeof(tc->ekinh_old[0][0]), tc->ekinh_old[0], cr);
+ +
+ +        gmx_bcast(sizeof(tc->ekinscalef_nhc), &(tc->ekinscalef_nhc), cr);
+ +        gmx_bcast(sizeof(tc->ekinscaleh_nhc), &(tc->ekinscaleh_nhc), cr);
+ +        gmx_bcast(sizeof(tc->vscale_nhc), &(tc->vscale_nhc), cr);
+ +    }
+ +    gmx_bcast(DIM*DIM*sizeof(ekind->ekin[0][0]), ekind->ekin[0], cr);
+ +
+ +    gmx_bcast(sizeof(ekind->dekindl), &ekind->dekindl, cr);
+ +    gmx_bcast(sizeof(ekind->cosacc.mvcos), &ekind->cosacc.mvcos, cr);
+ +}
+ +
+ +void set_deform_reference_box(gmx_update_t upd, gmx_large_int_t step, matrix box)
+ +{
+ +    /* Store the box and the step it belongs to as the deform reference */
+ +    copy_mat(box, upd->deformref_box);
+ +    upd->deformref_step = step;
+ +}
+ +
+ +static void deform(gmx_update_t upd,
+ +                   int start, int homenr, rvec x[], matrix box, matrix *scale_tot,
+ +                   const t_inputrec *ir, gmx_large_int_t step)
+ +{
+ +    /* Set the box to the deformed box for step+1 and scale the coordinates
+ +     * of atoms start up to (not including) start+homenr accordingly.
+ +     * Box elements with a non-zero ir->deform rate are computed from the
+ +     * reference box and the time elapsed since the reference step.
+ +     * scale_tot may be NULL; when given, the scaling is accumulated in it.
+ +     */
+ +    matrix bnew, invbox, mu;
+ +    real   elapsed_time;
+ +    int    i, j;
+ +
+ +    elapsed_time = (step + 1 - upd->deformref_step)*ir->delta_t;
+ +    copy_mat(box, bnew);
+ +    for (i = 0; i < DIM; i++)
+ +    {
+ +        for (j = 0; j < DIM; j++)
+ +        {
+ +            if (ir->deform[i][j] != 0)
+ +            {
+ +                bnew[i][j] =
+ +                    upd->deformref_box[i][j] + elapsed_time*ir->deform[i][j];
+ +            }
+ +        }
+ +    }
+ +    /* We correct the off-diagonal elements,
+ +     * which can grow indefinitely during shearing,
+ +     * so the shifts do not get messed up.
+ +     */
+ +    for (i = 1; i < DIM; i++)
+ +    {
+ +        for (j = i-1; j >= 0; j--)
+ +        {
+ +            while (bnew[i][j] - box[i][j] > 0.5*bnew[j][j])
+ +            {
+ +                rvec_dec(bnew[i], bnew[j]);
+ +            }
+ +            while (bnew[i][j] - box[i][j] < -0.5*bnew[j][j])
+ +            {
+ +                rvec_inc(bnew[i], bnew[j]);
+ +            }
+ +        }
+ +    }
+ +    /* mu is the new box times the inverse of the old box */
+ +    m_inv_ur0(box, invbox);
+ +    copy_mat(bnew, box);
+ +    mmul_ur0(box, invbox, mu);
+ +
+ +    for (i = start; i < start+homenr; i++)
+ +    {
+ +        x[i][XX] = mu[XX][XX]*x[i][XX]+mu[YY][XX]*x[i][YY]+mu[ZZ][XX]*x[i][ZZ];
+ +        x[i][YY] = mu[YY][YY]*x[i][YY]+mu[ZZ][YY]*x[i][ZZ];
+ +        x[i][ZZ] = mu[ZZ][ZZ]*x[i][ZZ];
+ +    }
+ +    /* Test the pointer itself: *scale_tot is an array, so testing it
+ +     * would dereference a NULL pointer and is never false otherwise.
+ +     */
+ +    if (scale_tot != NULL)
+ +    {
+ +        /* The transposes of the scaling matrices are stored,
+ +         * so we need to do matrix multiplication in the inverse order.
+ +         */
+ +        mmul_ur0(*scale_tot, mu, *scale_tot);
+ +    }
+ +}
+ +
+ +static void combine_forces(int nstcalclr,
+ +                           gmx_constr_t constr,
+ +                           t_inputrec *ir, t_mdatoms *md, t_idef *idef,
+ +                           t_commrec *cr,
+ +                           gmx_large_int_t step,
+ +                           t_state *state, gmx_bool bMolPBC,
+ +                           int start, int nrend,
+ +                           rvec f[], rvec f_lr[],
+ +                           t_nrnb *nrnb)
+ +{
+ +    int      atom, dim, n_extra;
+ +    gmx_bool bSkipConstr;
+ +
+ +    /* On entry f holds the short-range plus the long-range forces;
+ +     * the long-range part is also stored separately in f_lr.
+ +     */
+ +
+ +    bSkipConstr = (constr == NULL ||
+ +                   (ir->eConstrAlg == econtSHAKE && ir->epc == epcNO));
+ +    if (!bSkipConstr)
+ +    {
+ +        /* The LR forces have to be constrained separately:
+ +         * the SR and LR forces get different pre-factors in the update
+ +         * algorithm, so the constraint force for the coordinate
+ +         * constraining can not be determined.
+ +         * Only the additional LR part of the force is constrained.
+ +         */
+ +        /* MRS -- need to make sure this works with trotter integration -- the constraint calls may not be right.*/
+ +        constrain(NULL, FALSE, FALSE, constr, idef, ir, NULL, cr, step, 0, md,
+ +                  state->x, f_lr, f_lr, bMolPBC, state->box, state->lambda[efptBONDED], NULL,
+ +                  NULL, NULL, nrnb, econqForce, ir->epc == epcMTTK, state->veta, state->veta);
+ +    }
+ +
+ +    /* f_lr becomes the total force plus (nstcalclr-1) times the LR force */
+ +    n_extra = nstcalclr - 1;
+ +    for (atom = start; atom < nrend; atom++)
+ +    {
+ +        for (dim = 0; dim < DIM; dim++)
+ +        {
+ +            f_lr[atom][dim] = f[atom][dim] + n_extra*f_lr[atom][dim];
+ +        }
+ +    }
+ +}
+ +
+ +void update_tcouple(FILE             *fplog,
+ +                    gmx_large_int_t   step,
+ +                    t_inputrec       *inputrec,
+ +                    t_state          *state,
+ +                    gmx_ekindata_t   *ekind,
+ +                    gmx_wallcycle_t   wcycle,
+ +                    gmx_update_t      upd,
+ +                    t_extmass        *MassQ,
+ +                    t_mdatoms        *md)
+ +
+ +{
+ +    /* Apply temperature coupling on steps where it is due; on all other
+ +     * steps the scaling factor lambda of every group is set to 1.
+ +     * With velocity Verlet the home-atom velocities are rescaled in place.
+ +     * fplog and wcycle are not used in this function.
+ +     */
+ +    gmx_bool   bTCouple = FALSE;
+ +    real       dttc;
+ +    int        i, offset;
+ +
+ +    /* if using vv with trotter decomposition methods, we do this elsewhere in the code */
+ +    if (inputrec->etc != etcNO &&
+ +        !(IR_NVT_TROTTER(inputrec) || IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec)))
+ +    {
+ +        /* We should only couple after a step where energies were determined (for leapfrog versions)
+ +           or the step energies are determined, for velocity verlet versions */
+ +
+ +        if (EI_VV(inputrec->eI))
+ +        {
+ +            offset = 0;
+ +        }
+ +        else
+ +        {
+ +            offset = 1;
+ +        }
+ +        bTCouple = (inputrec->nsttcouple == 1 ||
+ +                    do_per_step(step+inputrec->nsttcouple-offset,
+ +                                inputrec->nsttcouple));
+ +    }
+ +
+ +    if (bTCouple)
+ +    {
+ +        /* The coupling time step spans nsttcouple MD steps */
+ +        dttc = inputrec->nsttcouple*inputrec->delta_t;
+ +
+ +        switch (inputrec->etc)
+ +        {
+ +            case etcNO:
+ +                break;
+ +            case etcBERENDSEN:
+ +                berendsen_tcoupl(inputrec, ekind, dttc);
+ +                break;
+ +            case etcNOSEHOOVER:
+ +                nosehoover_tcoupl(&(inputrec->opts), ekind, dttc,
+ +                                  state->nosehoover_xi, state->nosehoover_vxi, MassQ);
+ +                break;
+ +            case etcVRESCALE:
+ +                vrescale_tcoupl(inputrec, ekind, dttc,
+ +                                state->therm_integral, upd->sd->gaussrand[0]);
+ +                break;
+ +            default:
+ +                /* Other coupling types are not handled in this function */
+ +                break;
+ +        }
+ +        /* rescale in place here */
+ +        if (EI_VV(inputrec->eI))
+ +        {
+ +            rescale_velocities(ekind, md, md->start, md->start+md->homenr, state->v);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* Set the T scaling lambda to 1 to have no scaling */
+ +        for (i = 0; (i < inputrec->opts.ngtc); i++)
+ +        {
+ +            ekind->tcstat[i].lambda = 1.0;
+ +        }
+ +    }
+ +}
+ +
+ +void update_pcouple(FILE             *fplog,
+ +                    gmx_large_int_t   step,
+ +                    t_inputrec       *inputrec,
+ +                    t_state          *state,
+ +                    matrix            pcoupl_mu,
+ +                    matrix            M,
+ +                    gmx_wallcycle_t   wcycle,
+ +                    gmx_update_t      upd,
+ +                    gmx_bool          bInitStep)
+ +{
+ +    gmx_bool   bDoCouple = FALSE;
+ +    real       dt_couple;
+ +    int        d;
+ +
+ +    /* With Trotter pressure coupling this is done in coupling.c, so the flag stays false. */
+ +    if (inputrec->epc != epcNO && (!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))))
+ +    {
+ +        /* Only couple after a step where energies were determined */
+ +        bDoCouple = (inputrec->nstpcouple == 1 ||
+ +                     do_per_step(step+inputrec->nstpcouple-1,
+ +                                 inputrec->nstpcouple));
+ +    }
+ +
+ +    /* Start from the unit scaling matrix and a zero M */
+ +    clear_mat(pcoupl_mu);
+ +    for (d = 0; d < DIM; d++)
+ +    {
+ +        pcoupl_mu[d][d] = 1.0;
+ +    }
+ +
+ +    clear_mat(M);
+ +
+ +    if (!bDoCouple)
+ +    {
+ +        return;
+ +    }
+ +
+ +    dt_couple = inputrec->nstpcouple*inputrec->delta_t;
+ +
+ +    /* We can always pcoupl, even if we did not sum the energies
+ +     * the previous step, since state->pres_prev is only updated
+ +     * when the energies have been summed.
+ +     */
+ +    if (inputrec->epc == epcBERENDSEN)
+ +    {
+ +        if (!bInitStep)
+ +        {
+ +            berendsen_pcoupl(fplog, step, inputrec, dt_couple, state->pres_prev, state->box,
+ +                             pcoupl_mu);
+ +        }
+ +    }
+ +    else if (inputrec->epc == epcPARRINELLORAHMAN)
+ +    {
+ +        parrinellorahman_pcoupl(fplog, step, inputrec, dt_couple, state->pres_prev,
+ +                                state->box, state->box_rel, state->boxv,
+ +                                M, pcoupl_mu, bInitStep);
+ +    }
+ +}
+ +/* Return the upd->xp buffer, enlarging it first when state->nalloc exceeds
+ + * the size recorded in upd->xp_nalloc.  The buffer is never shrunk here.
+ + * srenew is defined outside this view; presumably it reallocates upd->xp to
+ + * upd->xp_nalloc elements -- confirm against its definition.
+ + */
+ +static rvec *get_xprime(const t_state *state, gmx_update_t upd)
+ +{
+ +    if (state->nalloc > upd->xp_nalloc)
+ +    {
+ +        upd->xp_nalloc = state->nalloc; /* record the new size before resizing */
+ +        srenew(upd->xp, upd->xp_nalloc);
+ +    }
+ +
+ +    return upd->xp;
+ +}
+ +/* Apply constraints after an update step, run the second part of the SD2
+ + * integrator, and refresh state->x from the updated coordinates.
+ + *
+ + * Constraining is done only when constr is non-NULL, and never in the first
+ + * half for non-velocity-Verlet integrators.  When it is done, vir_part is
+ + * cleared first and, if bCalcVir is set, the constraint virial vir_con is
+ + * added to it (scaled by upd->sd->sdc[0].eph for eiSD2).
+ + * In the first half of velocity Verlet constrain() is called on state->v
+ + * with econqVeloc; otherwise it is called on xprime with econqCoord.
+ + * Unless bFirstHalf is set, state->x is finally refreshed from upd->xp:
+ + * through unshift_x() when a non-empty graph is given, by plain copy of the
+ + * home atoms otherwise.
+ + *
+ + * dt_1 is computed but not read again, and vir is not referenced, in this
+ + * function.
+ + */
+ +void update_constraints(FILE             *fplog,
+ +                        gmx_large_int_t   step,
+ +                        real             *dvdlambda, /* the contribution to be added to the bonded interactions */
+ +                        t_inputrec       *inputrec,  /* input record and box stuff    */
+ +                        gmx_ekindata_t   *ekind,
+ +                        t_mdatoms        *md,
+ +                        t_state          *state,
+ +                        gmx_bool          bMolPBC,
+ +                        t_graph          *graph,
+ +                        rvec              force[],   /* forces on home particles */
+ +                        t_idef           *idef,
+ +                        tensor            vir_part,
+ +                        tensor            vir,       /* tensors for virial and ekin, needed for computing */
+ +                        t_commrec        *cr,
+ +                        t_nrnb           *nrnb,
+ +                        gmx_wallcycle_t   wcycle,
+ +                        gmx_update_t      upd,
+ +                        gmx_constr_t      constr,
+ +                        gmx_bool          bInitStep,
+ +                        gmx_bool          bFirstHalf,
+ +                        gmx_bool          bCalcVir,
+ +                        real              vetanew)
+ +{
+ +    gmx_bool             bExtended, bLastStep, bLog = FALSE, bEner = FALSE, bDoConstr = FALSE;
+ +    double               dt;
+ +    real                 dt_1;
+ +    int                  start, homenr, nrend, i, n, m, g, d;
+ +    tensor               vir_con;
+ +    rvec                *vbuf, *xprime = NULL;
+ +    int                  nth, th;
+ +
+ +    if (constr) /* constrain only when a constraint object was supplied ... */
+ +    {
+ +        bDoConstr = TRUE;
+ +    }
+ +    if (bFirstHalf && !EI_VV(inputrec->eI)) /* ... and, for non-VV integrators, not in the first half */
+ +    {
+ +        bDoConstr = FALSE;
+ +    }
+ +
+ +    /* for now, SD update is here -- though it really seems like it
+ +       should be reformulated as a velocity verlet method, since it has two parts */
+ +
+ +    start  = md->start;
+ +    homenr = md->homenr;
+ +    nrend  = start+homenr;
+ +
+ +    dt   = inputrec->delta_t;
+ +    dt_1 = 1.0/dt;
+ +
+ +    /*
+ +     *  Steps (7C, 8C)
+ +     *  APPLY CONSTRAINTS:
+ +     *  BLOCK SHAKE
+ +
+ +     * When doing PR pressure coupling we have to constrain the
+ +     * bonds in each iteration. If we are only using Nose-Hoover tcoupling
+ +     * it is enough to do this once though, since the relative velocities
+ +     * after this will be normal to the bond vector
+ +     */
+ +
+ +    if (bDoConstr)
+ +    {
+ +        /* clear out constraints before applying */
+ +        clear_mat(vir_part);
+ +
+ +        xprime = get_xprime(state, upd);
+ +
+ +        bLastStep = (step == inputrec->init_step+inputrec->nsteps);
+ +        bLog      = (do_per_step(step, inputrec->nstlog) || bLastStep || (step < 0));
+ +        bEner     = (do_per_step(step, inputrec->nstenergy) || bLastStep);
+ +        /* Constrain the coordinates xprime */
+ +        wallcycle_start(wcycle, ewcCONSTR);
+ +        if (EI_VV(inputrec->eI) && bFirstHalf)
+ +        {
+ +            /* First half of velocity Verlet: state->v is passed in both
+ +             * velocity slots, with econqVeloc and the new veta value.
+ +             */
+ +            constrain(NULL, bLog, bEner, constr, idef,
+ +                      inputrec, ekind, cr, step, 1, md,
+ +                      state->x, state->v, state->v,
+ +                      bMolPBC, state->box,
+ +                      state->lambda[efptBONDED], dvdlambda,
+ +                      NULL, bCalcVir ? &vir_con : NULL, nrnb, econqVeloc,
+ +                      inputrec->epc == epcMTTK, state->veta, vetanew);
+ +        }
+ +        else
+ +        {
+ +            /* All other cases: xprime is constrained with econqCoord and
+ +             * state->v is handed to constrain() as well.
+ +             */
+ +            constrain(NULL, bLog, bEner, constr, idef,
+ +                      inputrec, ekind, cr, step, 1, md,
+ +                      state->x, xprime, NULL,
+ +                      bMolPBC, state->box,
+ +                      state->lambda[efptBONDED], dvdlambda,
+ +                      state->v, bCalcVir ? &vir_con : NULL, nrnb, econqCoord,
+ +                      inputrec->epc == epcMTTK, state->veta, state->veta);
+ +        }
+ +        wallcycle_stop(wcycle, ewcCONSTR);
+ +
+ +        where();
+ +
+ +        dump_it_all(fplog, "After Shake",
+ +                    state->natoms, state->x, xprime, state->v, force);
+ +
+ +        if (bCalcVir)
+ +        {
+ +            if (inputrec->eI == eiSD2)
+ +            {
+ +                /* A correction factor eph is needed for the SD constraint force */
+ +                /* Here we can, unfortunately, not have proper corrections
+ +                 * for different friction constants, so we use the first one.
+ +                 */
+ +                for (i = 0; i < DIM; i++)
+ +                {
+ +                    for (m = 0; m < DIM; m++)
+ +                    {
+ +                        vir_part[i][m] += upd->sd->sdc[0].eph*vir_con[i][m];
+ +                    }
+ +                }
+ +            }
+ +            else
+ +            {
+ +                m_add(vir_part, vir_con, vir_part);
+ +            }
+ +            if (debug)
+ +            {
+ +                pr_rvecs(debug, 0, "constraint virial", vir_part, DIM);
+ +            }
+ +        }
+ +    }
+ +
+ +    where();
+ +    if ((inputrec->eI == eiSD2) && !(bFirstHalf))
+ +    {
+ +        xprime = get_xprime(state, upd);
+ +
+ +        nth = gmx_omp_nthreads_get(emntUpdate);
+ +
+ +#pragma omp parallel for num_threads(nth) schedule(static)
+ +        for (th = 0; th < nth; th++)
+ +        {
+ +            int start_th, end_th; /* slice of the home atoms handled by thread th */
+ +
+ +            start_th = start + ((nrend-start)* th   )/nth;
+ +            end_th   = start + ((nrend-start)*(th+1))/nth;
+ +
+ +            /* The second part of the SD integration */
+ +            do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+ +                          FALSE, start_th, end_th,
+ +                          inputrec->opts.acc, inputrec->opts.nFreeze,
+ +                          md->invmass, md->ptype,
+ +                          md->cFREEZE, md->cACC, md->cTC,
+ +                          state->x, xprime, state->v, force, state->sd_X,
-     if (EI_RANDOM(inputrec->eI))
++                          inputrec->opts.tau_t,
++                          FALSE);
+ +        }
+ +        inc_nrnb(nrnb, eNR_UPDATE, homenr);
+ +
+ +        if (bDoConstr)
+ +        {
+ +            /* Constrain the coordinates xprime */
+ +            wallcycle_start(wcycle, ewcCONSTR);
+ +            constrain(NULL, bLog, bEner, constr, idef,
+ +                      inputrec, NULL, cr, step, 1, md,
+ +                      state->x, xprime, NULL,
+ +                      bMolPBC, state->box,
+ +                      state->lambda[efptBONDED], dvdlambda,
+ +                      NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
+ +            wallcycle_stop(wcycle, ewcCONSTR);
+ +        }
+ +    }
+ +
+ +    /* We must always unshift after updating coordinates; if we did not shake
+ +       x was shifted in do_force.
+ +       NOTE(review): upd->xp is read directly below; when neither block above
+ +       called get_xprime() this relies on the buffer having been sized
+ +       earlier -- presumably by update_coords(); confirm. */
+ +
+ +    if (!(bFirstHalf)) /* in the first half of vv, no shift. */
+ +    {
+ +        if (graph && (graph->nnodes > 0))
+ +        {
+ +            unshift_x(graph, state->box, state->x, upd->xp);
+ +            if (TRICLINIC(state->box))
+ +            {
+ +                inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
+ +            }
+ +            else
+ +            {
+ +                inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
+ +            }
+ +        }
+ +        else
+ +        {
+ +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
+ +            for (i = start; i < nrend; i++)
+ +            {
+ +                copy_rvec(upd->xp[i], state->x[i]); /* no graph: plain copy of the home atoms */
+ +            }
+ +        }
+ +
+ +        dump_it_all(fplog, "After unshift",
+ +                    state->natoms, state->x, upd->xp, state->v, force);
+ +    }
+ +/* ############# END the update of velocities and positions ######### */
+ +}
+ +/* Update the simulation box (and, for some schemes, the coordinates)
+ + * according to the pressure-coupling type inputrec->epc, then accumulate
+ + * pcoupl_mu into *scale_tot and apply box deformation when DEFORM() holds.
+ + *
+ + *   epcBERENDSEN        - delegated to berendsen_pscale()
+ + *   epcPARRINELLORAHMAN - box += dt*boxv on the lower triangle, followed by
+ + *                         preserve_box_shape() and tmvmul_ur0(pcoupl_mu, ...)
+ + *                         on every home coordinate
+ + *   epcMTTK, isotropic  - msmul() of the box by exp(veta*dt), then
+ + *                         boxv set from msmul(box, veta)
+ + *
+ + * The accumulation into *scale_tot is skipped when scale_tot is NULL or when
+ + * either IR_NPT_TROTTER or IR_NPH_TROTTER holds.
+ + * bExtended is computed but not read again in this function.
+ + */
+ +void update_box(FILE             *fplog,
+ +                gmx_large_int_t   step,
+ +                t_inputrec       *inputrec,  /* input record and box stuff    */
+ +                t_mdatoms        *md,
+ +                t_state          *state,
+ +                t_graph          *graph,
+ +                rvec              force[],   /* forces on home particles */
+ +                matrix           *scale_tot,
+ +                matrix            pcoupl_mu,
+ +                t_nrnb           *nrnb,
+ +                gmx_wallcycle_t   wcycle,
+ +                gmx_update_t      upd,
+ +                gmx_bool          bInitStep,
+ +                gmx_bool          bFirstHalf)
+ +{
+ +    gmx_bool             bExtended, bLastStep, bLog = FALSE, bEner = FALSE;
+ +    double               dt;
+ +    real                 dt_1;
+ +    int                  start, homenr, nrend, i, n, m, g;
+ +    tensor               vir_con;
+ +
+ +    start  = md->start;
+ +    homenr = md->homenr;
+ +    nrend  = start+homenr;
+ +
+ +    bExtended =
+ +        (inputrec->etc == etcNOSEHOOVER) ||
+ +        (inputrec->epc == epcPARRINELLORAHMAN) ||
+ +        (inputrec->epc == epcMTTK);
+ +
+ +    dt = inputrec->delta_t;
+ +
+ +    where();
+ +
+ +    /* now update boxes */
+ +    switch (inputrec->epc)
+ +    {
+ +        case (epcNO):
+ +            break;
+ +        case (epcBERENDSEN):
+ +            berendsen_pscale(inputrec, pcoupl_mu, state->box, state->box_rel,
+ +                             start, homenr, state->x, md->cFREEZE, nrnb);
+ +            break;
+ +        case (epcPARRINELLORAHMAN):
+ +            /* The box velocities were updated in do_pr_pcoupl in the update
+ +             * iteration, but we don't change the box vectors until we get here
+ +             * since we need to be able to shift/unshift above.
+ +             */
+ +            for (i = 0; i < DIM; i++)
+ +            {
+ +                for (m = 0; m <= i; m++) /* lower triangle only: m <= i */
+ +                {
+ +                    state->box[i][m] += dt*state->boxv[i][m];
+ +                }
+ +            }
+ +            preserve_box_shape(inputrec, state->box_rel, state->box);
+ +
+ +            /* Scale the coordinates */
+ +            for (n = start; (n < start+homenr); n++)
+ +            {
+ +                tmvmul_ur0(pcoupl_mu, state->x[n], state->x[n]);
+ +            }
+ +            break;
+ +        case (epcMTTK):
+ +            switch (inputrec->epct)
+ +            {
+ +                case (epctISOTROPIC):
+ +                    /* DIM * eta = ln V.  so DIM*eta_new = DIM*eta_old + DIM*dt*veta =>
+ +                       ln V_new = ln V_old + 3*dt*veta => V_new = V_old*exp(3*dt*veta) =>
+ +                       Side length scales as exp(veta*dt) */
+ +
+ +                    msmul(state->box, exp(state->veta*dt), state->box);
+ +
+ +                    /* Relate veta to boxv.  veta = d(eta)/dT = (1/DIM)*1/V dV/dT.
+ +                       If we assume isotropic scaling, and box length scaling
+ +                       factor L, then V = L^DIM (det(M)).  So dV/dt = DIM
+ +                       L^(DIM-1) dL/dt det(M), and veta = (1/L) dL/dt.  The
+ +                       determinant of B is L^DIM det(M), and the determinant
+ +                       of dB/dt is (dL/dT)^DIM det (M).  veta will be
+ +                       (det(dB/dT)/det(B))^(1/3).  Then since M =
+ +                       B_new*(vol_new)^(1/3), dB/dT_new = (veta_new)*B(new). */
+ +
+ +                    msmul(state->box, state->veta, state->boxv);
+ +                    break;
+ +                default: /* other MTTK coupling types leave the box untouched here */
+ +                    break;
+ +            }
+ +            break;
+ +        default:
+ +            break;
+ +    }
+ +
+ +    if ((!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))) && scale_tot)
+ +    {
+ +        /* The transposes of the scaling matrices are stored,
+ +         * therefore we need to reverse the order in the multiplication.
+ +         */
+ +        mmul_ur0(*scale_tot, pcoupl_mu, *scale_tot);
+ +    }
+ +
+ +    if (DEFORM(*inputrec))
+ +    {
+ +        deform(upd, start, homenr, state->x, state->box, scale_tot, inputrec, step);
+ +    }
+ +    where();
+ +    dump_it_all(fplog, "After update",
+ +                state->natoms, state->x, upd->xp, state->v, force);
+ +}
+ +/* Run one integrator update over the home atoms [start, nrend).
+ + *
+ + * The kernel is selected by inputrec->eI: eiMD (with or without cosine
+ + * acceleration), eiSD1, the first part of eiSD2, eiBD, or eiVV/eiVVAK.
+ + * For eiVV/eiVVAK, UpdatePart chooses between the velocity kernel
+ + * (etrtVELOCITY1/etrtVELOCITY2) and the position kernel (etrtPOSITION);
+ + * any other UpdatePart value matches no case of the inner switch.
+ + * Requesting a velocity part with a non-VV integrator is reported through
+ + * gmx_incons().
+ + *
+ + * The atom range is cut into nth slices, one per OpenMP thread.  The
+ + * kernels are handed state->x, xprime (the buffer returned by
+ + * get_xprime()) and state->v; presumably the new positions end up in
+ + * xprime -- the kernels are outside this view, confirm there.
+ + *
+ + * When bDoLR is set with nstcalclr > 1 for a non-VV integrator,
+ + * combine_forces() is called and f_lr is used as the force array instead
+ + * of f.  dt_1 is computed but not read again in this function.
+ + */
+ +void update_coords(FILE             *fplog,
+ +                   gmx_large_int_t   step,
+ +                   t_inputrec       *inputrec,  /* input record and box stuff */
+ +                   t_mdatoms        *md,
+ +                   t_state          *state,
+ +                   gmx_bool          bMolPBC,
+ +                   rvec             *f,    /* forces on home particles */
+ +                   gmx_bool          bDoLR,
+ +                   rvec             *f_lr,
+ +                   t_fcdata         *fcd,
+ +                   gmx_ekindata_t   *ekind,
+ +                   matrix            M,
+ +                   gmx_wallcycle_t   wcycle,
+ +                   gmx_update_t      upd,
+ +                   gmx_bool          bInitStep,
+ +                   int               UpdatePart,
+ +                   t_commrec        *cr, /* these shouldn't be here -- need to think about it */
+ +                   t_nrnb           *nrnb,
+ +                   gmx_constr_t      constr,
+ +                   t_idef           *idef)
+ +{
+ +    gmx_bool          bNH, bPR, bLastStep, bLog = FALSE, bEner = FALSE;
+ +    double            dt, alpha;
+ +    real             *imass, *imassin;
+ +    rvec             *force;
+ +    real              dt_1;
+ +    int               start, homenr, nrend, i, j, d, n, m, g;
+ +    int               blen0, blen1, iatom, jatom, nshake, nsettle, nconstr, nexpand;
+ +    int              *icom = NULL;
+ +    tensor            vir_con;
+ +    rvec             *vcom, *xcom, *vall, *xall, *xin, *vin, *forcein, *fall, *xpall, *xprimein, *xprime;
+ +    int               nth, th;
+ +
+ +    /* Running the velocity half does nothing except for velocity verlet */
+ +    if ((UpdatePart == etrtVELOCITY1 || UpdatePart == etrtVELOCITY2) &&
+ +        !EI_VV(inputrec->eI))
+ +    {
+ +        gmx_incons("update_coords called for velocity without VV integrator");
+ +    }
+ +
+ +    start  = md->start;
+ +    homenr = md->homenr;
+ +    nrend  = start+homenr;
+ +
+ +    xprime = get_xprime(state, upd); /* grows upd->xp to state->nalloc if needed */
+ +
+ +    dt   = inputrec->delta_t;
+ +    dt_1 = 1.0/dt;
+ +
+ +    /* We need to update the NMR restraint history when time averaging is used */
+ +    if (state->flags & (1<<estDISRE_RM3TAV))
+ +    {
+ +        update_disres_history(fcd, &state->hist);
+ +    }
+ +    if (state->flags & (1<<estORIRE_DTAV))
+ +    {
+ +        update_orires_history(fcd, &state->hist);
+ +    }
+ +
+ +
+ +    bNH = inputrec->etc == etcNOSEHOOVER;
+ +    bPR = ((inputrec->epc == epcPARRINELLORAHMAN) || (inputrec->epc == epcMTTK));
+ +
+ +    if (bDoLR && inputrec->nstcalclr > 1 && !EI_VV(inputrec->eI))  /* get this working with VV? */
+ +    {
+ +        /* Store the total force + nstcalclr-1 times the LR force
+ +         * in forces_lr, so it can be used in a normal update algorithm
+ +         * to produce twin time stepping.
+ +         */
+ +        /* is this correct in the new construction? MRS */
+ +        combine_forces(inputrec->nstcalclr, constr, inputrec, md, idef, cr,
+ +                       step, state, bMolPBC,
+ +                       start, nrend, f, f_lr, nrnb);
+ +        force = f_lr;
+ +    }
+ +    else
+ +    {
+ +        force = f;
+ +    }
+ +
+ +    /* ############# START The update of velocities and positions ######### */
+ +    where();
+ +    dump_it_all(fplog, "Before update",
+ +                state->natoms, state->x, xprime, state->v, force);
+ +
-         /* We still need to take care of generating random seeds properly
-          * when multi-threading.
-          */
-         nth = 1;
++    if (inputrec->eI == eiSD2)
+ +    {
-     else
++        check_sd2_work_data_allocation(upd->sd, nrend);
++
++        do_update_sd2_Tconsts(upd->sd,
++                              inputrec->opts.ngtc,
++                              inputrec->opts.tau_t,
++                              inputrec->opts.ref_t);
+ +    }
-         nth = gmx_omp_nthreads_get(emntUpdate);
++    if (inputrec->eI == eiBD)
+ +    {
-     if (inputrec->eI == eiSD2)
-     {
-         check_sd2_work_data_allocation(upd->sd, nrend);
-     }
++        do_update_bd_Tconsts(dt, inputrec->bd_fric,
++                             inputrec->opts.ngtc, inputrec->opts.ref_t,
++                             upd->sd->bd_rf);
+ +    }
+ +
-                               inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
++    nth = gmx_omp_nthreads_get(emntUpdate);
+ +
+ +#pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
+ +    for (th = 0; th < nth; th++)
+ +    {
+ +        int start_th, end_th; /* slice of the home atoms handled by thread th */
+ +
+ +        start_th = start + ((nrend-start)* th   )/nth;
+ +        end_th   = start + ((nrend-start)*(th+1))/nth;
+ +
+ +        switch (inputrec->eI)
+ +        {
+ +            case (eiMD):
+ +                if (ekind->cosacc.cos_accel == 0) /* no cosine acceleration: regular MD kernel */
+ +                {
+ +                    do_update_md(start_th, end_th, dt,
+ +                                 ekind->tcstat, state->nosehoover_vxi,
+ +                                 ekind->bNEMD, ekind->grpstat, inputrec->opts.acc,
+ +                                 inputrec->opts.nFreeze,
+ +                                 md->invmass, md->ptype,
+ +                                 md->cFREEZE, md->cACC, md->cTC,
+ +                                 state->x, xprime, state->v, force, M,
+ +                                 bNH, bPR);
+ +                }
+ +                else
+ +                {
+ +                    do_update_visc(start_th, end_th, dt,
+ +                                   ekind->tcstat, state->nosehoover_vxi,
+ +                                   md->invmass, md->ptype,
+ +                                   md->cTC, state->x, xprime, state->v, force, M,
+ +                                   state->box,
+ +                                   ekind->cosacc.cos_accel,
+ +                                   ekind->cosacc.vcos,
+ +                                   bNH, bPR);
+ +                }
+ +                break;
+ +            case (eiSD1):
+ +                /* Each thread passes its own generator, upd->sd->gaussrand[th] */
+ +                do_update_sd1(upd->sd, upd->sd->gaussrand[th],
+ +                              start_th, end_th, dt,
+ +                              inputrec->opts.acc, inputrec->opts.nFreeze,
+ +                              md->invmass, md->ptype,
+ +                              md->cFREEZE, md->cACC, md->cTC,
+ +                              state->x, xprime, state->v, force, state->sd_X,
+ +                              inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t);
+ +                break;
+ +            case (eiSD2):
+ +                /* The SD update is done in 2 parts, because an extra constraint step
+ +                 * is needed
+ +                 */
+ +                do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+ +                              bInitStep, start_th, end_th,
+ +                              inputrec->opts.acc, inputrec->opts.nFreeze,
+ +                              md->invmass, md->ptype,
+ +                              md->cFREEZE, md->cACC, md->cTC,
+ +                              state->x, xprime, state->v, force, state->sd_X,
-                              inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
++                              inputrec->opts.tau_t,
+ +                              TRUE);
+ +                break;
+ +            case (eiBD):
+ +                do_update_bd(start_th, end_th, dt,
+ +                             inputrec->opts.nFreeze, md->invmass, md->ptype,
+ +                             md->cFREEZE, md->cTC,
+ +                             state->x, xprime, state->v, force,
+ +                             inputrec->bd_fric,
+ +                             upd->sd->bd_rf, upd->sd->gaussrand[th]);
+ +                break;
+ +            case (eiVV):
+ +            case (eiVVAK):
+ +                alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0. */
+ +                switch (UpdatePart) /* no default: other UpdatePart values do nothing here */
+ +                {
+ +                    case etrtVELOCITY1:
+ +                    case etrtVELOCITY2:
+ +                        do_update_vv_vel(start_th, end_th, dt,
+ +                                         ekind->tcstat, ekind->grpstat,
+ +                                         inputrec->opts.acc, inputrec->opts.nFreeze,
+ +                                         md->invmass, md->ptype,
+ +                                         md->cFREEZE, md->cACC,
+ +                                         state->v, force,
+ +                                         (bNH || bPR), state->veta, alpha);
+ +                        break;
+ +                    case etrtPOSITION:
+ +                        do_update_vv_pos(start_th, end_th, dt,
+ +                                         ekind->tcstat, ekind->grpstat,
+ +                                         inputrec->opts.acc, inputrec->opts.nFreeze,
+ +                                         md->invmass, md->ptype, md->cFREEZE,
+ +                                         state->x, xprime, state->v, force,
+ +                                         (bNH || bPR), state->veta, alpha);
+ +                        break;
+ +                }
+ +                break;
+ +            default:
+ +                gmx_fatal(FARGS, "Don't know how to update coordinates");
+ +                break;
+ +        }
+ +    }
+ +
+ +}
+ +
+ +/* Print a centre-of-mass correction to the kinetic energy tensor to log.
+ + *
+ + * Sums the mass tm and the momentum mv of atoms [start, end), divides vcm
+ + * by tmass in place (so the caller's vcm is modified), builds the
+ + * correction tensor dekin and prints it next to ekin.  ekin is only passed
+ + * to pr_rvecs() and trace(); it is not assigned to in this function.
+ + */
+ +void correct_ekin(FILE *log, int start, int end, rvec v[], rvec vcm, real mass[],
+ +                  real tmass, tensor ekin)
+ +{
+ +    /*
+ +     * This is a debugging routine. It should not be called for production code
+ +     *
+ +     * The kinetic energy should be calculated according to:
+ +     *   Ekin = 1/2 m (v-vcm)^2
+ +     * However the correction is not always applied, since vcm may not be
+ +     * known in time and we compute
+ +     *   Ekin' = 1/2 m v^2 instead
+ +     * This can be corrected afterwards by computing
+ +     *   Ekin = Ekin' + 1/2 m ( -2 v vcm + vcm^2)
+ +     * or in shorthand:
+ +     *   Ekin = Ekin' - m v vcm + 1/2 m vcm^2
+ +     */
+ +    int    i, j, k;
+ +    real   m, tm;
+ +    rvec   hvcm, mv;
+ +    tensor dekin;
+ +
+ +    /* Local particles */
+ +    clear_rvec(mv);
+ +
+ +    /* Processor dependent part. */
+ +    tm = 0;
+ +    for (i = start; (i < end); i++)
+ +    {
+ +        m      = mass[i];
+ +        tm    += m;
+ +        for (j = 0; (j < DIM); j++)
+ +        {
+ +            mv[j] += m*v[i][j]; /* accumulate the momentum of the local atoms */
+ +        }
+ +    }
+ +    /* Shortcut */
+ +    svmul(1/tmass, vcm, vcm); /* vcm is rescaled in place by 1/tmass */
+ +    svmul(0.5, vcm, hvcm); /* hvcm = vcm/2 */
+ +    clear_mat(dekin);
+ +    for (j = 0; (j < DIM); j++)
+ +    {
+ +        for (k = 0; (k < DIM); k++)
+ +        {
+ +            dekin[j][k] += vcm[k]*(tm*hvcm[j]-mv[j]); /* the "- m v vcm + 1/2 m vcm^2" term, per component */
+ +        }
+ +    }
+ +    pr_rvecs(log, 0, "dekin", dekin, DIM);
+ +    pr_rvecs(log, 0, " ekin", ekin, DIM);
+ +    fprintf(log, "dekin = %g, ekin = %g  vcm = (%8.4f %8.4f %8.4f)\n",
+ +            trace(dekin), trace(ekin), vcm[XX], vcm[YY], vcm[ZZ]);
+ +    fprintf(log, "mv = (%8.4f %8.4f %8.4f)\n",
+ +            mv[XX], mv[YY], mv[ZZ]);
+ +}
+ +/* Call andersen_tcoupl() to randomize velocities when it is due at this step.
+ + *
+ + * rate is ir->delta_t divided by ir->opts.tau_t[0].  The coupling routine is
+ + * called on every invocation when ir->etc == etcANDERSEN, and otherwise only
+ + * when do_per_step(step, (int)(1.0/rate)) holds.  idef is forwarded only for
+ + * etcANDERSEN (NULL otherwise); the constraint block data is forwarded only
+ + * when constr is non-NULL.
+ + *
+ + * Returns TRUE when andersen_tcoupl() was called, FALSE otherwise.
+ + */
+ +extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_large_int_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr)
+ +{
+ +
+ +    int  i;
+ +    real rate = (ir->delta_t)/ir->opts.tau_t[0]; /* only the first tau_t entry is used */
+ +    /* proceed with andersen if 1) it's fixed probability per
+ +       particle andersen or 2) it's massive andersen and it's tau_t/dt */
+ +    if ((ir->etc == etcANDERSEN) || do_per_step(step, (int)(1.0/rate)))
+ +    {
+ +        srenew(upd->randatom, state->nalloc);
+ +        srenew(upd->randatom_list, state->nalloc);
+ +        if (upd->randatom_list_init == FALSE)
+ +        {
+ +            for (i = 0; i < state->nalloc; i++)
+ +            {
+ +                upd->randatom[i]      = FALSE;
+ +                upd->randatom_list[i] = 0;
+ +            }
+ +            upd->randatom_list_init = TRUE; /* NOTE(review): one-time init; if state->nalloc grows later, entries added by srenew are not reset here -- confirm this is intended */
+ +        }
+ +        andersen_tcoupl(ir, md, state, upd->sd->gaussrand[0], rate,
+ +                        (ir->etc == etcANDERSEN) ? idef : NULL,
+ +                        constr ? get_nblocks(constr) : 0,
+ +                        constr ? get_sblock(constr) : NULL,
+ +                        upd->randatom, upd->randatom_list,
+ +                        upd->sd->randomize_group, upd->sd->boltzfac);
+ +        return TRUE;
+ +    }
+ +    return FALSE;
+ +}
diff --cc src/programs/mdrun/md.c

index c75b17e76cca036936ed938ec2823d7e9ea69897,0000000000000000000000000000000000000000..019f41ed01472f4589ca2091564baf6f18b26c57

mode 100644,000000..100644
--- 1/src/programs/mdrun/md.c
--- /dev/null
+++ b/src/programs/mdrun/md.c
@@@ -1,2236 -1,0 +1,2237 @@@
-     /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "sysstuff.h"
+ +#include "vec.h"
+ +#include "statutil.h"
+ +#include "vcm.h"
+ +#include "mdebin.h"
+ +#include "nrnb.h"
+ +#include "calcmu.h"
+ +#include "index.h"
+ +#include "vsite.h"
+ +#include "update.h"
+ +#include "ns.h"
+ +#include "trnio.h"
+ +#include "xtcio.h"
+ +#include "mdrun.h"
+ +#include "md_support.h"
+ +#include "md_logging.h"
+ +#include "confio.h"
+ +#include "network.h"
+ +#include "pull.h"
+ +#include "xvgr.h"
+ +#include "physics.h"
+ +#include "names.h"
+ +#include "xmdrun.h"
+ +#include "ionize.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "pme.h"
+ +#include "mdatoms.h"
+ +#include "repl_ex.h"
+ +#include "qmmm.h"
+ +#include "domdec.h"
+ +#include "domdec_network.h"
+ +#include "partdec.h"
+ +#include "topsort.h"
+ +#include "coulomb.h"
+ +#include "constr.h"
+ +#include "shellfc.h"
+ +#include "compute_io.h"
+ +#include "mvdata.h"
+ +#include "checkpoint.h"
+ +#include "mtop_util.h"
+ +#include "sighandler.h"
+ +#include "txtdump.h"
+ +#include "string2.h"
+ +#include "pme_loadbal.h"
+ +#include "bondf.h"
+ +#include "membed.h"
+ +#include "types/nlistheuristics.h"
+ +#include "types/iteratedconstraints.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +
+ +#include "gromacs/utility/gmxmpi.h"
+ +
+ +#ifdef GMX_FAHCORE
+ +#include "corewrap.h"
+ +#endif
+ +
+ +/* Restart the performance accounting of a running simulation.
+ + *
+ + * Logs a notice for the given step, then calls nbnxn_cuda_reset_timings()
+ + * when cu_nbv is non-NULL, stops and resets the wall-cycle counters, calls
+ + * reset_dd_statistics_counters() when DOMAINDECOMP(cr) holds, and
+ + * re-initializes nrnb through init_nrnb(). The steps accumulated in
+ + * *step_rel are added to ir->init_step and subtracted from ir->nsteps,
+ + * after which *step_rel is set to zero and the run timers are started
+ + * again.
+ + */
+ +static void reset_all_counters(FILE *fplog, t_commrec *cr,
+ +                               gmx_large_int_t step,
+ +                               gmx_large_int_t *step_rel, t_inputrec *ir,
+ +                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+ +                               gmx_runtime_t *runtime,
+ +                               nbnxn_cuda_ptr_t cu_nbv)
+ +{
+ +    char        step_buf[STEPSTRSIZE];
+ +    const char *step_text;
+ +
+ +    /* Tell the user which step the counters are being restarted at */
+ +    step_text = gmx_step_str(step, step_buf);
+ +    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+ +                  step_text);
+ +
+ +    if (cu_nbv != NULL)
+ +    {
+ +        nbnxn_cuda_reset_timings(cu_nbv);
+ +    }
+ +
+ +    /* The run counter is stopped before everything is wiped and is
+ +     * started again once the bookkeeping below is done.
+ +     */
+ +    wallcycle_stop(wcycle, ewcRUN);
+ +    wallcycle_reset_all(wcycle);
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        reset_dd_statistics_counters(cr->dd);
+ +    }
+ +    init_nrnb(nrnb);
+ +
+ +    /* Fold the steps done so far into the start step, so that the
+ +     * remaining run is counted from zero again.
+ +     */
+ +    ir->init_step += *step_rel;
+ +    ir->nsteps    -= *step_rel;
+ +    *step_rel      = 0;
+ +
+ +    wallcycle_start(wcycle, ewcRUN);
+ +    runtime_start(runtime);
+ +    print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
+ +}
+ +
+ +double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+ +             const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
+ +             int nstglobalcomm,
+ +             gmx_vsite_t *vsite, gmx_constr_t constr,
+ +             int stepout, t_inputrec *ir,
+ +             gmx_mtop_t *top_global,
+ +             t_fcdata *fcd,
+ +             t_state *state_global,
+ +             t_mdatoms *mdatoms,
+ +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ +             gmx_edsam_t ed, t_forcerec *fr,
+ +             int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
+ +             real cpt_period, real max_hours,
+ +             const char *deviceOptions,
+ +             unsigned long Flags,
+ +             gmx_runtime_t *runtime)
+ +{
+ +    gmx_mdoutf_t   *outf;
+ +    gmx_large_int_t step, step_rel;
+ +    double          run_time;
+ +    double          t, t0, lam0[efptNR];
+ +    gmx_bool        bGStatEveryStep, bGStat, bCalcVir, bCalcEner;
+ +    gmx_bool        bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE,
+ +                    bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep,
+ +                    bBornRadii, bStartingFromCpt;
+ +    gmx_bool          bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+ +    gmx_bool          do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
+ +                      bForceUpdate = FALSE, bCPT;
+ +    int               mdof_flags;
+ +    gmx_bool          bMasterState;
+ +    int               force_flags, cglo_flags;
+ +    tensor            force_vir, shake_vir, total_vir, tmp_vir, pres;
+ +    int               i, m;
+ +    t_trxstatus      *status;
+ +    rvec              mu_tot;
+ +    t_vcm            *vcm;
+ +    t_state          *bufstate = NULL;
+ +    matrix           *scale_tot, pcoupl_mu, M, ebox;
+ +    gmx_nlheur_t      nlh;
+ +    t_trxframe        rerun_fr;
+ +    gmx_repl_ex_t     repl_ex = NULL;
+ +    int               nchkpt  = 1;
+ +    gmx_localtop_t   *top;
+ +    t_mdebin         *mdebin = NULL;
+ +    df_history_t      df_history;
+ +    t_state          *state    = NULL;
+ +    rvec             *f_global = NULL;
+ +    int               n_xtc    = -1;
+ +    rvec             *x_xtc    = NULL;
+ +    gmx_enerdata_t   *enerd;
+ +    rvec             *f = NULL;
+ +    gmx_global_stat_t gstat;
+ +    gmx_update_t      upd   = NULL;
+ +    t_graph          *graph = NULL;
+ +    globsig_t         gs;
+ +    gmx_rng_t         mcrng = NULL;
+ +    gmx_bool          bFFscan;
+ +    gmx_groups_t     *groups;
+ +    gmx_ekindata_t   *ekind, *ekind_save;
+ +    gmx_shellfc_t     shellfc;
+ +    int               count, nconverged = 0;
+ +    real              timestep = 0;
+ +    double            tcount   = 0;
+ +    gmx_bool          bIonize  = FALSE;
+ +    gmx_bool          bTCR     = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged;
+ +    gmx_bool          bAppend;
+ +    gmx_bool          bResetCountersHalfMaxH = FALSE;
+ +    gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
+ +    gmx_bool          bUpdateDoLR;
+ +    real              mu_aver = 0, dvdl_constr;
+ +    int               a0, a1, gnx = 0, ii;
+ +    atom_id          *grpindex = NULL;
+ +    char             *grpname;
+ +    t_coupl_rec      *tcr     = NULL;
+ +    rvec             *xcopy   = NULL, *vcopy = NULL, *cbuf = NULL;
+ +    matrix            boxcopy = {{0}}, lastbox;
+ +    tensor            tmpvir;
+ +    real              fom, oldfom, veta_save, pcurr, scalevir, tracevir;
+ +    real              vetanew = 0;
+ +    int               lamnew  = 0;
+ +    /* for FEP */
+ +    int               nstfep;
+ +    real              rate;
+ +    double            cycles;
+ +    real              saved_conserved_quantity = 0;
+ +    real              last_ekin                = 0;
+ +    int               iter_i;
+ +    t_extmass         MassQ;
+ +    int             **trotter_seq;
+ +    char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+ +    int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
+ +    gmx_iterate_t     iterate;
+ +    gmx_large_int_t   multisim_nsteps = -1;                        /* number of steps to do  before first multisim
+ +                                                                      simulation stops. If equal to zero, don't
+ +                                                                      communicate any more between multisims.*/
+ +    /* PME load balancing data for GPU kernels */
+ +    pme_load_balancing_t pme_loadbal = NULL;
+ +    double               cycles_pmes;
+ +    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+ +
+ +#ifdef GMX_FAHCORE
+ +    /* Temporary addition for FAHCORE checkpointing */
+ +    int chkpt_ret;
+ +#endif
+ +
+ +    /* Check for special mdrun options */
+ +    bRerunMD = (Flags & MD_RERUN);
+ +    bIonize  = (Flags & MD_IONIZE);
+ +    bFFscan  = (Flags & MD_FFSCAN);
+ +    bAppend  = (Flags & MD_APPENDFILES);
+ +    if (Flags & MD_RESETCOUNTERSHALFWAY)
+ +    {
+ +        if (ir->nsteps > 0)
+ +        {
+ +            /* Signal to reset the counters half the simulation steps. */
+ +            wcycle_set_reset_counters(wcycle, ir->nsteps/2);
+ +        }
+ +        /* Signal to reset the counters halfway the simulation time. */
+ +        bResetCountersHalfMaxH = (max_hours > 0);
+ +    }
+ +
+ +    /* md-vv uses averaged full step velocities for T-control
+ +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+ +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+ +    bVV = EI_VV(ir->eI);
+ +    if (bVV) /* to store the initial velocities while computing virial */
+ +    {
+ +        snew(cbuf, top_global->natoms);
+ +    }
+ +    /* all the iterative cases - only if there are constraints */
+ +    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+ +    gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
+ +                                          false in this step.  The correct value, true or false,
+ +                                          is set at each step, as it depends on the frequency of temperature
+ +                                          and pressure control.*/
+ +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+ +
+ +    if (bRerunMD)
+ +    {
+ +        /* Since we don't know if the frames read are related in any way,
+ +         * rebuild the neighborlist at every step.
+ +         */
+ +        ir->nstlist       = 1;
+ +        ir->nstcalcenergy = 1;
+ +        nstglobalcomm     = 1;
+ +    }
+ +
+ +    check_ir_old_tpx_versions(cr, fplog, ir, top_global);
+ +
+ +    nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+ +    bGStatEveryStep = (nstglobalcomm == 1);
+ +
+ +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+ +    {
+ +        fprintf(fplog,
+ +                "To reduce the energy communication with nstlist = -1\n"
+ +                "the neighbor list validity should not be checked at every step,\n"
+ +                "this means that exact integration is not guaranteed.\n"
+ +                "The neighbor list validity is checked after:\n"
+ +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
+ +                "In most cases this will result in exact integration.\n"
+ +                "This reduces the energy communication by a factor of 2 to 3.\n"
+ +                "If you want less energy communication, set nstlist > 3.\n\n");
+ +    }
+ +
+ +    if (bRerunMD || bFFscan)
+ +    {
+ +        ir->nstxtcout = 0;
+ +    }
+ +    groups = &top_global->groups;
+ +
+ +    /* Initial values */
+ +    init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+ +            &(state_global->fep_state), lam0,
+ +            nrnb, top_global, &upd,
+ +            nfile, fnm, &outf, &mdebin,
+ +            force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
+ +
+ +    clear_mat(total_vir);
+ +    clear_mat(pres);
+ +    /* Energy terms and groups */
+ +    snew(enerd, 1);
+ +    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+ +                  enerd);
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        f = NULL;
+ +    }
+ +    else
+ +    {
+ +        snew(f, top_global->natoms);
+ +    }
+ +
+ +    /* lambda Monte carlo random number generator  */
+ +    if (ir->bExpanded)
+ +    {
+ +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+ +    }
+ +    /* copy the state into df_history */
+ +    copy_df_history(&df_history, &state_global->dfhist);
+ +
+ +    /* Kinetic energy data */
+ +    snew(ekind, 1);
+ +    init_ekindata(fplog, top_global, &(ir->opts), ekind);
+ +    /* needed for iteration of constraints */
+ +    snew(ekind_save, 1);
+ +    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
+ +    /* Copy the cos acceleration to the groups struct */
+ +    ekind->cosacc.cos_accel = ir->cos_accel;
+ +
+ +    gstat = global_stat_init(ir);
+ +    debug_gmx();
+ +
+ +    /* Check for polarizable models and flexible constraints */
+ +    shellfc = init_shell_flexcon(fplog,
+ +                                 top_global, n_flexible_constraints(constr),
+ +                                 (ir->bContinuation ||
+ +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
+ +                                 NULL : state_global->x);
+ +
+ +    if (DEFORM(*ir))
+ +    {
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+ +#endif
+ +        set_deform_reference_box(upd,
+ +                                 deform_init_init_step_tpx,
+ +                                 deform_init_box_tpx);
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+ +#endif
+ +    }
+ +
+ +    {
+ +        double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+ +        if ((io > 2000) && MASTER(cr))
+ +        {
+ +            fprintf(stderr,
+ +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+ +                    io);
+ +        }
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        top = dd_init_local_top(top_global);
+ +
+ +        snew(state, 1);
+ +        dd_init_local_state(cr->dd, state_global, state);
+ +
+ +        if (DDMASTER(cr->dd) && ir->nstfout)
+ +        {
+ +            snew(f_global, state_global->natoms);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        if (PAR(cr))
+ +        {
+ +            /* Initialize the particle decomposition and split the topology */
+ +            top = split_system(fplog, top_global, ir, cr);
+ +
+ +            pd_cg_range(cr, &fr->cg0, &fr->hcg);
+ +            pd_at_range(cr, &a0, &a1);
+ +        }
+ +        else
+ +        {
+ +            top = gmx_mtop_generate_local_top(top_global, ir);
+ +
+ +            a0 = 0;
+ +            a1 = top_global->natoms;
+ +        }
+ +
+ +        forcerec_set_excl_load(fr, top, cr);
+ +
+ +        state    = partdec_init_local_state(cr, state_global);
+ +        f_global = f;
+ +
+ +        atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
+ +
+ +        if (vsite)
+ +        {
+ +            set_vsite_top(vsite, top, mdatoms, cr);
+ +        }
+ +
+ +        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+ +        {
+ +            graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+ +        }
+ +
+ +        if (shellfc)
+ +        {
+ +            make_local_shells(cr, mdatoms, shellfc);
+ +        }
+ +
+ +        init_bonded_thread_force_reduction(fr, &top->idef);
+ +
+ +        if (ir->pull && PAR(cr))
+ +        {
+ +            dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
+ +        }
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        /* Distribute the charge groups over the nodes from the master node */
+ +        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+ +                            state_global, top_global, ir,
+ +                            state, &f, mdatoms, top, fr,
+ +                            vsite, shellfc, constr,
+ +                            nrnb, wcycle, FALSE);
+ +
+ +    }
+ +
+ +    update_mdatoms(mdatoms, state->lambda[efptMASS]);
+ +
+ +    if (opt2bSet("-cpi", nfile, fnm))
+ +    {
+ +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
+ +    }
+ +    else
+ +    {
+ +        bStateFromCP = FALSE;
+ +    }
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        if (bStateFromCP)
+ +        {
+ +            /* Update mdebin with energy history if appending to output files */
+ +            if (Flags & MD_APPENDFILES)
+ +            {
+ +                restore_energyhistory_from_state(mdebin, &state_global->enerhist);
+ +            }
+ +            else
+ +            {
+ +                /* We might have read an energy history from checkpoint,
+ +                 * free the allocated memory and reset the counts.
+ +                 */
+ +                done_energyhistory(&state_global->enerhist);
+ +                init_energyhistory(&state_global->enerhist);
+ +            }
+ +        }
+ +        /* Set the initial energy history in state by updating once */
+ +        update_energyhistory(&state_global->enerhist, mdebin);
+ +    }
+ +
+ +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+ +    {
+ +        /* Set the random state if we read a checkpoint file */
+ +        set_stochd_state(upd, state);
+ +    }
+ +
+ +    if (state->flags & (1<<estMC_RNG))
+ +    {
+ +        set_mc_state(mcrng, state);
+ +    }
+ +
+ +    /* Initialize constraints */
+ +    if (constr)
+ +    {
+ +        if (!DOMAINDECOMP(cr))
+ +        {
+ +            set_constraints(constr, top, ir, mdatoms, cr);
+ +        }
+ +    }
+ +
+ +    /* Check whether we have to GCT stuff */
+ +    bTCR = ftp2bSet(efGCT, nfile, fnm);
+ +    if (bTCR)
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "Will do General Coupling Theory!\n");
+ +        }
+ +        gnx = top_global->mols.nr;
+ +        snew(grpindex, gnx);
+ +        for (i = 0; (i < gnx); i++)
+ +        {
+ +            grpindex[i] = i;
+ +        }
+ +    }
+ +
+ +    if (repl_ex_nst > 0)
+ +    {
+ +        /* We need to be sure replica exchange can only occur
+ +         * when the energies are current */
+ +        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
+ +                        "repl_ex_nst", &repl_ex_nst);
+ +        /* This check needs to happen before inter-simulation
+ +         * signals are initialized, too */
+ +    }
+ +    if (repl_ex_nst > 0 && MASTER(cr))
+ +    {
+ +        repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+ +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed);
+ +    }
+ +
+ +    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
+ +     * With perturbed charges with soft-core we should not change the cut-off.
+ +     */
+ +    if ((Flags & MD_TUNEPME) &&
+ +        EEL_PME(fr->eeltype) &&
+ +        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+ +        !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
+ +        !bRerunMD)
+ +    {
+ +        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
+ +        cycles_pmes = 0;
+ +        if (cr->duty & DUTY_PME)
+ +        {
+ +            /* Start tuning right away, as we can't measure the load */
+ +            bPMETuneRunning = TRUE;
+ +        }
+ +        else
+ +        {
+ +            /* Separate PME nodes, we can measure the PP/PME load balance */
+ +            bPMETuneTry = TRUE;
+ +        }
+ +    }
+ +
+ +    if (!ir->bContinuation && !bRerunMD)
+ +    {
+ +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+ +        {
+ +            /* Set the velocities of frozen particles to zero */
+ +            for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
+ +            {
+ +                for (m = 0; m < DIM; m++)
+ +                {
+ +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+ +                    {
+ +                        state->v[i][m] = 0;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        if (constr)
+ +        {
+ +            /* Constrain the initial coordinates and velocities */
+ +            do_constrain_first(fplog, constr, ir, mdatoms, state, f,
+ +                               graph, cr, nrnb, fr, top, shake_vir);
+ +        }
+ +        if (vsite)
+ +        {
+ +            /* Construct the virtual sites for the initial configuration */
+ +            construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
+ +                             top->idef.iparams, top->idef.il,
+ +                             fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ +        }
+ +    }
+ +
+ +    debug_gmx();
+ +
-     if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
++    /* set free energy calculation frequency as the
++       greatest common divisor of nstdhdl, nstexpanded, and repl_ex_nst */
+ +    nstfep = ir->fepvals->nstdhdl;
-         nstfep = ir->expandedvals->nstexpanded;
++    if (ir->bExpanded)
+ +    {
-     if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
++        nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl,nstfep);
+ +    }
-         nstfep = repl_ex_nst;
++    if (repl_ex_nst > 0)
+ +    {
++        nstfep = gmx_greatest_common_divisor(repl_ex_nst,nstfep);
+ +    }
+ +
+ +    /* I'm assuming we need global communication the first time! MRS */
+ +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+ +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+ +                  | (bVV ? CGLO_PRESSURE : 0)
+ +                  | (bVV ? CGLO_CONSTRAINT : 0)
+ +                  | (bRerunMD ? CGLO_RERUNMD : 0)
+ +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+ +
+ +    bSumEkinhOld = FALSE;
+ +    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                    NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ +                    constr, NULL, FALSE, state->box,
+ +                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
+ +    if (ir->eI == eiVVAK)
+ +    {
+ +        /* a second call to get the half step temperature initialized as well */
+ +        /* we do the same call as above, but turn the pressure off -- internally to
+ +           compute_globals, this is recognized as a velocity verlet half-step
+ +           kinetic energy calculation.  This minimized excess variables, but
+ +           perhaps loses some logic?*/
+ +
+ +        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                        NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ +                        constr, NULL, FALSE, state->box,
+ +                        top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ +                        cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+ +    }
+ +
+ +    /* Calculate the initial half step temperature, and save the ekinh_old */
+ +    if (!(Flags & MD_STARTFROMCPT))
+ +    {
+ +        for (i = 0; (i < ir->opts.ngtc); i++)
+ +        {
+ +            copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+ +        }
+ +    }
+ +    if (ir->eI != eiVV)
+ +    {
+ +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+ +                                     and there is no previous step */
+ +    }
+ +
+ +    /* if using an iterative algorithm, we need to create a working directory for the state. */
+ +    if (bIterativeCase)
+ +    {
+ +        bufstate = init_bufstate(state);
+ +    }
+ +    if (bFFscan)
+ +    {
+ +        snew(xcopy, state->natoms);
+ +        snew(vcopy, state->natoms);
+ +        copy_rvecn(state->x, xcopy, 0, state->natoms);
+ +        copy_rvecn(state->v, vcopy, 0, state->natoms);
+ +        copy_mat(state->box, boxcopy);
+ +    }
+ +
+ +    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+ +       temperature control */
+ +    trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+ +        {
+ +            fprintf(fplog,
+ +                    "RMS relative constraint deviation after constraining: %.2e\n",
+ +                    constr_rmsd(constr, FALSE));
+ +        }
+ +        if (EI_STATE_VELOCITY(ir->eI))
+ +        {
+ +            fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+ +        }
+ +        if (bRerunMD)
+ +        {
+ +            fprintf(stderr, "starting md rerun '%s', reading coordinates from"
+ +                    " input trajectory '%s'\n\n",
+ +                    *(top_global->name), opt2fn("-rerun", nfile, fnm));
+ +            if (bVerbose)
+ +            {
+ +                fprintf(stderr, "Calculated time to finish depends on nsteps from "
+ +                        "run input file,\nwhich may not correspond to the time "
+ +                        "needed to process input trajectory.\n\n");
+ +            }
+ +        }
+ +        else
+ +        {
+ +            char tbuf[20];
+ +            fprintf(stderr, "starting mdrun '%s'\n",
+ +                    *(top_global->name));
+ +            if (ir->nsteps >= 0)
+ +            {
+ +                sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+ +            }
+ +            else
+ +            {
+ +                sprintf(tbuf, "%s", "infinite");
+ +            }
+ +            if (ir->init_step > 0)
+ +            {
+ +                fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+ +                        gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+ +                        gmx_step_str(ir->init_step, sbuf2),
+ +                        ir->init_step*ir->delta_t);
+ +            }
+ +            else
+ +            {
+ +                fprintf(stderr, "%s steps, %s ps.\n",
+ +                        gmx_step_str(ir->nsteps, sbuf), tbuf);
+ +            }
+ +        }
+ +        fprintf(fplog, "\n");
+ +    }
+ +
+ +    /* Set and write start time */
+ +    runtime_start(runtime);
+ +    print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
+ +    wallcycle_start(wcycle, ewcRUN);
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog, "\n");
+ +    }
+ +
+ +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+ +#ifdef GMX_FAHCORE
+ +    chkpt_ret = fcCheckPointParallel( cr->nodeid,
+ +                                      NULL, 0);
+ +    if (chkpt_ret == 0)
+ +    {
+ +        gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+ +    }
+ +#endif
+ +
+ +    debug_gmx();
+ +    /***********************************************************
+ +     *
+ +     *             Loop over MD steps
+ +     *
+ +     ************************************************************/
+ +
+ +    /* if rerunMD then read coordinates and velocities from input trajectory */
+ +    if (bRerunMD)
+ +    {
+ +        if (getenv("GMX_FORCE_UPDATE"))
+ +        {
+ +            bForceUpdate = TRUE;
+ +        }
+ +
+ +        rerun_fr.natoms = 0;
+ +        if (MASTER(cr))
+ +        {
+ +            bNotLastFrame = read_first_frame(oenv, &status,
+ +                                             opt2fn("-rerun", nfile, fnm),
+ +                                             &rerun_fr, TRX_NEED_X | TRX_READ_V);
+ +            if (rerun_fr.natoms != top_global->natoms)
+ +            {
+ +                gmx_fatal(FARGS,
+ +                          "Number of atoms in trajectory (%d) does not match the "
+ +                          "run input file (%d)\n",
+ +                          rerun_fr.natoms, top_global->natoms);
+ +            }
+ +            if (ir->ePBC != epbcNONE)
+ +            {
+ +                if (!rerun_fr.bBox)
+ +                {
+ +                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
+ +                }
+ +                if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong))
+ +                {
+ +                    gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
+ +                }
+ +            }
+ +        }
+ +
+ +        if (PAR(cr))
+ +        {
+ +            rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+ +        }
+ +
+ +        if (ir->ePBC != epbcNONE)
+ +        {
+ +            /* Set the shift vectors.
+ +             * Necessary here when have a static box different from the tpr box.
+ +             */
+ +            calc_shifts(rerun_fr.box, fr->shift_vec);
+ +        }
+ +    }
+ +
+ +    /* loop over MD steps or if rerunMD to end of input trajectory */
+ +    bFirstStep = TRUE;
+ +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ +    bStateFromTPX    = !bStateFromCP;
+ +    bInitStep        = bFirstStep && (bStateFromTPX || bVV);
+ +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+ +    bLastStep        = FALSE;
+ +    bSumEkinhOld     = FALSE;
+ +    bExchanged       = FALSE;
+ +
+ +    init_global_signals(&gs, cr, ir, repl_ex_nst);
+ +
+ +    step     = ir->init_step;
+ +    step_rel = 0;
+ +
+ +    if (ir->nstlist == -1)
+ +    {
+ +        init_nlistheuristics(&nlh, bGStatEveryStep, step);
+ +    }
+ +
+ +    if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+ +    {
+ +        /* check how many steps are left in other sims */
+ +        multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+ +    }
+ +
+ +
+ +    /* and stop now if we should */
+ +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+ +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+ +    while (!bLastStep || (bRerunMD && bNotLastFrame))
+ +    {
+ +
+ +        wallcycle_start(wcycle, ewcSTEP);
+ +
+ +        if (bRerunMD)
+ +        {
+ +            if (rerun_fr.bStep)
+ +            {
+ +                step     = rerun_fr.step;
+ +                step_rel = step - ir->init_step;
+ +            }
+ +            if (rerun_fr.bTime)
+ +            {
+ +                t = rerun_fr.time;
+ +            }
+ +            else
+ +            {
+ +                t = step;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            bLastStep = (step_rel == ir->nsteps);
+ +            t         = t0 + step*ir->delta_t;
+ +        }
+ +
+ +        if (ir->efep != efepNO || ir->bSimTemp)
+ +        {
+ +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
+ +               requiring different logic. */
+ +
+ +            set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+ +            bDoDHDL      = do_per_step(step, ir->fepvals->nstdhdl);
+ +            bDoFEP       = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+ +            bDoExpanded  = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+ +        }
+ +
+ +        if (bSimAnn)
+ +        {
+ +            update_annealing_target_temp(&(ir->opts), t);
+ +        }
+ +
+ +        if (bRerunMD)
+ +        {
+ +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+ +            {
+ +                for (i = 0; i < state_global->natoms; i++)
+ +                {
+ +                    copy_rvec(rerun_fr.x[i], state_global->x[i]);
+ +                }
+ +                if (rerun_fr.bV)
+ +                {
+ +                    for (i = 0; i < state_global->natoms; i++)
+ +                    {
+ +                        copy_rvec(rerun_fr.v[i], state_global->v[i]);
+ +                    }
+ +                }
+ +                else
+ +                {
+ +                    for (i = 0; i < state_global->natoms; i++)
+ +                    {
+ +                        clear_rvec(state_global->v[i]);
+ +                    }
+ +                    if (bRerunWarnNoV)
+ +                    {
+ +                        fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+ +                                "         Ekin, temperature and pressure are incorrect,\n"
+ +                                "         the virial will be incorrect when constraints are present.\n"
+ +                                "\n");
+ +                        bRerunWarnNoV = FALSE;
+ +                    }
+ +                }
+ +            }
+ +            copy_mat(rerun_fr.box, state_global->box);
+ +            copy_mat(state_global->box, state->box);
+ +
+ +            if (vsite && (Flags & MD_RERUN_VSITE))
+ +            {
+ +                if (DOMAINDECOMP(cr))
+ +                {
+ +                    gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+ +                }
+ +                if (graph)
+ +                {
+ +                    /* Following is necessary because the graph may get out of sync
+ +                     * with the coordinates if we only have every N'th coordinate set
+ +                     */
+ +                    mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+ +                    shift_self(graph, state->box, state->x);
+ +                }
+ +                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ +                                 top->idef.iparams, top->idef.il,
+ +                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ +                if (graph)
+ +                {
+ +                    unshift_self(graph, state->box, state->x);
+ +                }
+ +            }
+ +        }
+ +
+ +        /* Stop Center of Mass motion */
+ +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+ +
+ +        /* Copy back starting coordinates in case we're doing a forcefield scan */
+ +        if (bFFscan)
+ +        {
+ +            for (ii = 0; (ii < state->natoms); ii++)
+ +            {
+ +                copy_rvec(xcopy[ii], state->x[ii]);
+ +                copy_rvec(vcopy[ii], state->v[ii]);
+ +            }
+ +            copy_mat(boxcopy, state->box);
+ +        }
+ +
+ +        if (bRerunMD)
+ +        {
+ +            /* for rerun MD always do Neighbour Searching */
+ +            bNS      = (bFirstStep || ir->nstlist != 0);
+ +            bNStList = bNS;
+ +        }
+ +        else
+ +        {
+ +            /* Determine whether or not to do Neighbour Searching and LR */
+ +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+ +
+ +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
+ +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
+ +
+ +            if (bNS && ir->nstlist == -1)
+ +            {
+ +                set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step);
+ +            }
+ +        }
+ +
+ +        /* check whether we should stop because another simulation has
+ +           stopped. */
+ +        if (MULTISIM(cr))
+ +        {
+ +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&
+ +                 (multisim_nsteps != ir->nsteps) )
+ +            {
+ +                if (bNS)
+ +                {
+ +                    if (MASTER(cr))
+ +                    {
+ +                        fprintf(stderr,
+ +                                "Stopping simulation %d because another one has finished\n",
+ +                                cr->ms->sim);
+ +                    }
+ +                    bLastStep         = TRUE;
+ +                    gs.sig[eglsCHKPT] = 1;
+ +                }
+ +            }
+ +        }
+ +
+ +        /* < 0 means stop at next step, > 0 means stop at next NS step */
+ +        if ( (gs.set[eglsSTOPCOND] < 0) ||
+ +             ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) )
+ +        {
+ +            bLastStep = TRUE;
+ +        }
+ +
+ +        /* Determine whether or not to update the Born radii if doing GB */
+ +        bBornRadii = bFirstStep;
+ +        if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
+ +        {
+ +            bBornRadii = TRUE;
+ +        }
+ +
+ +        do_log     = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep;
+ +        do_verbose = bVerbose &&
+ +            (step % stepout == 0 || bFirstStep || bLastStep);
+ +
+ +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+ +        {
+ +            if (bRerunMD)
+ +            {
+ +                bMasterState = TRUE;
+ +            }
+ +            else
+ +            {
+ +                bMasterState = FALSE;
+ +                /* Correct the new box if it is too skewed */
+ +                if (DYNAMIC_BOX(*ir))
+ +                {
+ +                    if (correct_box(fplog, step, state->box, graph))
+ +                    {
+ +                        bMasterState = TRUE;
+ +                    }
+ +                }
+ +                if (DOMAINDECOMP(cr) && bMasterState)
+ +                {
+ +                    dd_collect_state(cr->dd, state, state_global);
+ +                }
+ +            }
+ +
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                /* Repartition the domain decomposition */
+ +                wallcycle_start(wcycle, ewcDOMDEC);
+ +                dd_partition_system(fplog, step, cr,
+ +                                    bMasterState, nstglobalcomm,
+ +                                    state_global, top_global, ir,
+ +                                    state, &f, mdatoms, top, fr,
+ +                                    vsite, shellfc, constr,
+ +                                    nrnb, wcycle,
+ +                                    do_verbose && !bPMETuneRunning);
+ +                wallcycle_stop(wcycle, ewcDOMDEC);
+ +                /* If using an iterative integrator, reallocate space to match the decomposition */
+ +            }
+ +        }
+ +
+ +        if (MASTER(cr) && do_log && !bFFscan)
+ +        {
+ +            print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */
+ +        }
+ +
+ +        if (ir->efep != efepNO)
+ +        {
+ +            update_mdatoms(mdatoms, state->lambda[efptMASS]);
+ +        }
+ +
+ +        if ((bRerunMD && rerun_fr.bV) || bExchanged)
+ +        {
+ +
+ +            /* We need the kinetic energy at minus the half step for determining
+ +             * the full step kinetic energy and possibly for T-coupling.*/
+ +            /* This may not be quite working correctly yet . . . . */
+ +            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ +                            constr, NULL, FALSE, state->box,
+ +                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ +        }
+ +        clear_mat(force_vir);
+ +
+ +        /* Ionize the atoms if necessary */
+ +        if (bIonize)
+ +        {
+ +            ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
+ +                   mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
+ +        }
+ +
+ +        /* Update force field in ffscan program */
+ +        if (bFFscan)
+ +        {
+ +            if (update_forcefield(fplog,
+ +                                  nfile, fnm, fr,
+ +                                  mdatoms->nr, state->x, state->box))
+ +            {
+ +                gmx_finalize_par();
+ +
+ +                exit(0);
+ +            }
+ +        }
+ +
+ +        /* We write a checkpoint at this MD step when:
+ +         * either at an NS step when we signalled through gs,
+ +         * or at the last step (but not when we do not want confout),
+ +         * but never at the first step or with rerun.
+ +         */
+ +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+ +                 (bLastStep && (Flags & MD_CONFOUT))) &&
+ +                step > ir->init_step && !bRerunMD);
+ +        if (bCPT)
+ +        {
+ +            gs.set[eglsCHKPT] = 0;
+ +        }
+ +
+ +        /* Determine the energy and pressure:
+ +         * at nstcalcenergy steps and at energy output steps (set below).
+ +         */
+ +        if (EI_VV(ir->eI) && (!bInitStep))
+ +        {
+ +            /* for vv, the first half of the integration actually corresponds
+ +               to the previous step.  bCalcEner is only required to be evaluated on the 'next' step,
+ +               but the virial needs to be calculated on both the current step and the 'next' step. Future
+ +               reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+ +
+ +            bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
+ +            bCalcVir  = bCalcEner ||
+ +                (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+ +        }
+ +        else
+ +        {
+ +            bCalcEner = do_per_step(step, ir->nstcalcenergy);
+ +            bCalcVir  = bCalcEner ||
+ +                (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+ +        }
+ +
+ +        /* Do we need global communication ? */
+ +        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ +                  do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
+ +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+ +
+ +        do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+ +
+ +        if (do_ene || do_log)
+ +        {
+ +            bCalcVir  = TRUE;
+ +            bCalcEner = TRUE;
+ +            bGStat    = TRUE;
+ +        }
+ +
+ +        /* these CGLO_ options remain the same throughout the iteration */
+ +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+ +                      (bGStat ? CGLO_GSTAT : 0)
+ +                      );
+ +
+ +        force_flags = (GMX_FORCE_STATECHANGED |
+ +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+ +                       GMX_FORCE_ALLFORCES |
+ +                       GMX_FORCE_SEPLRF |
+ +                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ +                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+ +                       );
+ +
+ +        if (fr->bTwinRange)
+ +        {
+ +            if (do_per_step(step, ir->nstcalclr))
+ +            {
+ +                force_flags |= GMX_FORCE_DO_LR;
+ +            }
+ +        }
+ +
+ +        if (shellfc)
+ +        {
+ +            /* Now is the time to relax the shells */
+ +            count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
+ +                                        ir, bNS, force_flags,
+ +                                        bStopCM, top, top_global,
+ +                                        constr, enerd, fcd,
+ +                                        state, f, force_vir, mdatoms,
+ +                                        nrnb, wcycle, graph, groups,
+ +                                        shellfc, fr, bBornRadii, t, mu_tot,
+ +                                        state->natoms, &bConverged, vsite,
+ +                                        outf->fp_field);
+ +            tcount += count;
+ +
+ +            if (bConverged)
+ +            {
+ +                nconverged++;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            /* The coordinates (x) are shifted (to get whole molecules)
+ +             * in do_force.
+ +             * This is parallelized as well, and does communication too.
+ +             * Check comments in sim_util.c
+ +             */
+ +            do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
+ +                     state->box, state->x, &state->hist,
+ +                     f, force_vir, mdatoms, enerd, fcd,
+ +                     state->lambda, graph,
+ +                     fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
+ +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
+ +        }
+ +
+ +        if (bTCR)
+ +        {
+ +            mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA,
+ +                                   mu_tot, &top_global->mols, mdatoms, gnx, grpindex);
+ +        }
+ +
+ +        if (bTCR && bFirstStep)
+ +        {
+ +            tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef));
+ +            fprintf(fplog, "Done init_coupling\n");
+ +            fflush(fplog);
+ +        }
+ +
+ +        if (bVV && !bStartingFromCpt && !bRerunMD)
+ +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+ +        {
+ +            if (ir->eI == eiVV && bInitStep)
+ +            {
+ +                /* if using velocity verlet with full time step Ekin,
+ +                 * take the first half step only to compute the
+ +                 * virial for the first step. From there,
+ +                 * revert back to the initial coordinates
+ +                 * so that the input is actually the initial step.
+ +                 */
+ +                copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
+ +            }
+ +            else
+ +            {
+ +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+ +                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+ +            }
+ +
+ +            /* If we are using twin-range interactions where the long-range component
+ +             * is only evaluated every nstcalclr>1 steps, we should do a special update
+ +             * step to combine the long-range forces on these steps.
+ +             * For nstcalclr=1 this is not done, since the forces would have been added
+ +             * directly to the short-range forces already.
+ +             */
+ +            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+ +
+ +            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
+ +                          f, bUpdateDoLR, fr->f_twin, fcd,
+ +                          ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
+ +                          cr, nrnb, constr, &top->idef);
+ +
+ +            if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
+ +            {
+ +                gmx_iterate_init(&iterate, TRUE);
+ +            }
+ +            /* for iterations, we save these vectors, as we will be self-consistently iterating
+ +               the calculations */
+ +
+ +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
+ +
+ +            /* save the state */
+ +            if (iterate.bIterationActive)
+ +            {
+ +                copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+ +            }
+ +
+ +            bFirstIterate = TRUE;
+ +            while (bFirstIterate || iterate.bIterationActive)
+ +            {
+ +                if (iterate.bIterationActive)
+ +                {
+ +                    copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+ +                    if (bFirstIterate && bTrotter)
+ +                    {
+ +                        /* The first time through, we need a decent first estimate
+ +                           of veta(t+dt) to compute the constraints.  Do
+ +                           this by computing the box volume part of the
+ +                           trotter integration at this time. Nothing else
+ +                           should be changed by this routine here.  If
+ +                           !(first time), we start with the previous value
+ +                           of veta.  */
+ +
+ +                        veta_save = state->veta;
+ +                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
+ +                        vetanew     = state->veta;
+ +                        state->veta = veta_save;
+ +                    }
+ +                }
+ +
+ +                bOK = TRUE;
+ +                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
+ +                {
+ +                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ +                                       state, fr->bMolPBC, graph, f,
+ +                                       &top->idef, shake_vir, NULL,
+ +                                       cr, nrnb, wcycle, upd, constr,
+ +                                       bInitStep, TRUE, bCalcVir, vetanew);
+ +
+ +                    if (!bOK && !bFFscan)
+ +                    {
+ +                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ +                    }
+ +
+ +                }
+ +                else if (graph)
+ +                {
+ +                    /* Need to unshift here if a do_force has been
+ +                       called in the previous step */
+ +                    unshift_self(graph, state->box, state->x);
+ +                }
+ +
+ +                /* if VV, compute the pressure and constraints */
+ +                /* For VV2, we strictly only need this if using pressure
+ +                 * control, but we really would like to have accurate pressures
+ +                 * printed out.
+ +                 * Think about ways around this in the future?
+ +                 * For now, keep this choice in comments.
+ +                 */
+ +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+ +                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+ +                bPres = TRUE;
+ +                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+ +                if (bCalcEner && ir->eI == eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
+ +                {
+ +                    bSumEkinhOld = TRUE;
+ +                }
+ +                /* for vv, the first half of the integration actually corresponds to the previous step.
+ +                   So we need information from the last step in the first half of the integration */
+ +                if (bGStat || do_per_step(step-1, nstglobalcomm))
+ +                {
+ +                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ +                                    constr, NULL, FALSE, state->box,
+ +                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ +                                    cglo_flags
+ +                                    | CGLO_ENERGY
+ +                                    | (bTemp ? CGLO_TEMPERATURE : 0)
+ +                                    | (bPres ? CGLO_PRESSURE : 0)
+ +                                    | (bPres ? CGLO_CONSTRAINT : 0)
+ +                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
+ +                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ +                                    | CGLO_SCALEEKIN
+ +                                    );
+ +                    /* explanation of above:
+ +                       a) We compute Ekin at the full time step
+ +                       if 1) we are using the AveVel Ekin, and it's not the
+ +                       initial step, or 2) if we are using AveEkin, but need the full
+ +                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+ +                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+ +                       EkinAveVel because it's needed for the pressure */
+ +                }
+ +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+ +                if (!bInitStep)
+ +                {
+ +                    if (bTrotter)
+ +                    {
+ +                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
+ +                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+ +                    }
+ +                    else
+ +                    {
+ +                        if (bExchanged)
+ +                        {
+ +
+ +                            /* We need the kinetic energy at minus the half step for determining
+ +                             * the full step kinetic energy and possibly for T-coupling.*/
+ +                            /* This may not be quite working correctly yet . . . . */
+ +                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ +                                            constr, NULL, FALSE, state->box,
+ +                                            top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ +                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ +                        }
+ +                    }
+ +                }
+ +
+ +                if (iterate.bIterationActive &&
+ +                    done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ +                                   state->veta, &vetanew))
+ +                {
+ +                    break;
+ +                }
+ +                bFirstIterate = FALSE;
+ +            }
+ +
+ +            if (bTrotter && !bInitStep)
+ +            {
+ +                copy_mat(shake_vir, state->svir_prev);
+ +                copy_mat(force_vir, state->fvir_prev);
+ +                if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
+ +                {
+ +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+ +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
+ +                    enerd->term[F_EKIN] = trace(ekind->ekin);
+ +                }
+ +            }
+ +            /* if it's the initial step, we performed this first step just to get the constraint virial */
+ +            if (bInitStep && ir->eI == eiVV)
+ +            {
+ +                copy_rvecn(cbuf, state->v, 0, state->natoms);
+ +            }
+ +        }
+ +
+ +        /* MRS -- now done iterating -- compute the conserved quantity */
+ +        if (bVV)
+ +        {
+ +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+ +            if (ir->eI == eiVV)
+ +            {
+ +                last_ekin = enerd->term[F_EKIN];
+ +            }
+ +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+ +            {
+ +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ +            }
+ +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+ +            if (!bRerunMD)
+ +            {
+ +                sum_dhdl(enerd, state->lambda, ir->fepvals);
+ +            }
+ +        }
+ +
+ +        /* ########  END FIRST UPDATE STEP  ############## */
+ +        /* ########  If doing VV, we now have v(dt) ###### */
+ +        if (bDoExpanded)
+ +        {
+ +            /* perform extended ensemble sampling in lambda - we don't
+ +               actually move to the new state before outputting
+ +               statistics, but if performing simulated tempering, we
+ +               do update the velocities and the tau_t. */
+ +
+ +            lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+ +        }
+ +        /* ################## START TRAJECTORY OUTPUT ################# */
+ +
+ +        /* Now we have the energies and forces corresponding to the
+ +         * coordinates at time t. We must output all of this before
+ +         * the update.
+ +         * for RerunMD t is read from input trajectory
+ +         */
+ +        mdof_flags = 0;
+ +        if (do_per_step(step, ir->nstxout))
+ +        {
+ +            mdof_flags |= MDOF_X;
+ +        }
+ +        if (do_per_step(step, ir->nstvout))
+ +        {
+ +            mdof_flags |= MDOF_V;
+ +        }
+ +        if (do_per_step(step, ir->nstfout))
+ +        {
+ +            mdof_flags |= MDOF_F;
+ +        }
+ +        if (do_per_step(step, ir->nstxtcout))
+ +        {
+ +            mdof_flags |= MDOF_XTC;
+ +        }
+ +        if (bCPT)
+ +        {
+ +            mdof_flags |= MDOF_CPT;
+ +        }
+ +        ;
+ +
+ +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+ +        if (bLastStep)
+ +        {
+ +            /* Enforce writing positions and velocities at end of run */
+ +            mdof_flags |= (MDOF_X | MDOF_V);
+ +        }
+ +#endif
+ +#ifdef GMX_FAHCORE
+ +        if (MASTER(cr))
+ +        {
+ +            fcReportProgress( ir->nsteps, step );
+ +        }
+ +
+ +        /* sync bCPT and fc record-keeping */
+ +        if (bCPT && MASTER(cr))
+ +        {
+ +            fcRequestCheckPoint();
+ +        }
+ +#endif
+ +
+ +        if (mdof_flags != 0)
+ +        {
+ +            wallcycle_start(wcycle, ewcTRAJ);
+ +            if (bCPT)
+ +            {
+ +                if (state->flags & (1<<estLD_RNG))
+ +                {
+ +                    get_stochd_state(upd, state);
+ +                }
+ +                if (state->flags  & (1<<estMC_RNG))
+ +                {
+ +                    get_mc_state(mcrng, state);
+ +                }
+ +                if (MASTER(cr))
+ +                {
+ +                    if (bSumEkinhOld)
+ +                    {
+ +                        state_global->ekinstate.bUpToDate = FALSE;
+ +                    }
+ +                    else
+ +                    {
+ +                        update_ekinstate(&state_global->ekinstate, ekind);
+ +                        state_global->ekinstate.bUpToDate = TRUE;
+ +                    }
+ +                    update_energyhistory(&state_global->enerhist, mdebin);
+ +                    if (ir->efep != efepNO || ir->bSimTemp)
+ +                    {
+ +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+ +                                                                       structured so this isn't necessary.
+ +                                                                       Note this reassignment is only necessary
+ +                                                                       for single threads.*/
+ +                        copy_df_history(&state_global->dfhist, &df_history);
+ +                    }
+ +                }
+ +            }
+ +            write_traj(fplog, cr, outf, mdof_flags, top_global,
+ +                       step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+ +            if (bCPT)
+ +            {
+ +                nchkpt++;
+ +                bCPT = FALSE;
+ +            }
+ +            debug_gmx();
+ +            if (bLastStep && step_rel == ir->nsteps &&
+ +                (Flags & MD_CONFOUT) && MASTER(cr) &&
+ +                !bRerunMD && !bFFscan)
+ +            {
+ +                /* x and v have been collected in write_traj,
+ +                 * because a checkpoint file will always be written
+ +                 * at the last step.
+ +                 */
+ +                fprintf(stderr, "\nWriting final coordinates.\n");
+ +                if (fr->bMolPBC)
+ +                {
+ +                    /* Make molecules whole only for confout writing */
+ +                    do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+ +                }
+ +                write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+ +                                    *top_global->name, top_global,
+ +                                    state_global->x, state_global->v,
+ +                                    ir->ePBC, state->box);
+ +                debug_gmx();
+ +            }
+ +            wallcycle_stop(wcycle, ewcTRAJ);
+ +        }
+ +
+ +        /* kludge -- virial is lost with restart for NPT control. Must restart */
+ +        if (bStartingFromCpt && bVV)
+ +        {
+ +            copy_mat(state->svir_prev, shake_vir);
+ +            copy_mat(state->fvir_prev, force_vir);
+ +        }
+ +        /*  ################## END TRAJECTORY OUTPUT ################ */
+ +
+ +        /* Determine the wallclock run time up till now */
+ +        run_time = gmx_gettime() - (double)runtime->real;
+ +
+ +        /* Check whether everything is still all right */
+ +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
+ +#ifdef GMX_THREAD_MPI
+ +            && MASTER(cr)
+ +#endif
+ +            )
+ +        {
+ +            /* This just makes gs.sig compatible with the hack
+ +               of sending signals around by MPI_Reduce together with
+ +               other floats */
+ +            if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+ +            {
+ +                gs.sig[eglsSTOPCOND] = 1;
+ +            }
+ +            if (gmx_get_stop_condition() == gmx_stop_cond_next)
+ +            {
+ +                gs.sig[eglsSTOPCOND] = -1;
+ +            }
+ +            /* < 0 means stop at next step, > 0 means stop at next NS step */
+ +            if (fplog)
+ +            {
+ +                fprintf(fplog,
+ +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ +                        gmx_get_signal_name(),
+ +                        gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ +                fflush(fplog);
+ +            }
+ +            fprintf(stderr,
+ +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ +                    gmx_get_signal_name(),
+ +                    gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ +            fflush(stderr);
+ +            handled_stop_condition = (int)gmx_get_stop_condition();
+ +        }
+ +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+ +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+ +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+ +        {
+ +            /* Signal to terminate the run */
+ +            gs.sig[eglsSTOPCOND] = 1;
+ +            if (fplog)
+ +            {
+ +                fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ +            }
+ +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ +        }
+ +
+ +        if (bResetCountersHalfMaxH && MASTER(cr) &&
+ +            run_time > max_hours*60.0*60.0*0.495)
+ +        {
+ +            gs.sig[eglsRESETCOUNTERS] = 1;
+ +        }
+ +
+ +        if (ir->nstlist == -1 && !bRerunMD)
+ +        {
+ +            /* When bGStatEveryStep=FALSE, global_stat is only called
+ +             * when we check the atom displacements, not at NS steps.
+ +             * This means that also the bonded interaction count check is not
+ +             * performed immediately after NS. Therefore a few MD steps could
+ +             * be performed with missing interactions.
+ +             * But wrong energies are never written to file,
+ +             * since energies are only written after global_stat
+ +             * has been called.
+ +             */
+ +            if (step >= nlh.step_nscheck)
+ +            {
+ +                nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
+ +                                                     nlh.scale_tot, state->x);
+ +            }
+ +            else
+ +            {
+ +                /* This is not necessarily true,
+ +                 * but step_nscheck is determined quite conservatively.
+ +                 */
+ +                nlh.nabnsb = 0;
+ +            }
+ +        }
+ +
+ +        /* In parallel we only have to check for checkpointing in steps
+ +         * where we do global communication,
+ +         *  otherwise the other nodes don't know.
+ +         */
+ +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+ +                           cpt_period >= 0 &&
+ +                           (cpt_period == 0 ||
+ +                            run_time >= nchkpt*cpt_period*60.0)) &&
+ +            gs.set[eglsCHKPT] == 0)
+ +        {
+ +            gs.sig[eglsCHKPT] = 1;
+ +        }
+ +
+ +        /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+ +        if (EI_VV(ir->eI))
+ +        {
+ +            if (!bInitStep)
+ +            {
+ +                update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ +            }
+ +            if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+ +            {
+ +                gmx_bool bIfRandomize;
+ +                bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+ +                /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ +                if (constr && bIfRandomize)
+ +                {
+ +                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ +                                       state, fr->bMolPBC, graph, f,
+ +                                       &top->idef, tmp_vir, NULL,
+ +                                       cr, nrnb, wcycle, upd, constr,
+ +                                       bInitStep, TRUE, bCalcVir, vetanew);
+ +                }
+ +            }
+ +        }
+ +
+ +        if (bIterativeCase && do_per_step(step, ir->nstpcouple))
+ +        {
+ +            gmx_iterate_init(&iterate, TRUE);
+ +            /* for iterations, we save these vectors, as we will be redoing the calculations */
+ +            copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+ +        }
+ +
+ +        bFirstIterate = TRUE;
+ +        while (bFirstIterate || iterate.bIterationActive)
+ +        {
+ +            /* We now restore these vectors to redo the calculation with improved extended variables */
+ +            if (iterate.bIterationActive)
+ +            {
+ +                copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+ +            }
+ +
+ +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+ +               so scroll down for that logic */
+ +
+ +            /* #########   START SECOND UPDATE STEP ################# */
+ +            /* Box is changed in update() when we do pressure coupling,
+ +             * but we should still use the old box for energy corrections and when
+ +             * writing it to the energy file, so it matches the trajectory files for
+ +             * the same timestep above. Make a copy in a separate array.
+ +             */
+ +            copy_mat(state->box, lastbox);
+ +
+ +            bOK = TRUE;
+ +            dvdl_constr = 0;
+ +
+ +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+ +            {
+ +                wallcycle_start(wcycle, ewcUPDATE);
+ +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ +                if (bTrotter)
+ +                {
+ +                    if (iterate.bIterationActive)
+ +                    {
+ +                        if (bFirstIterate)
+ +                        {
+ +                            scalevir = 1;
+ +                        }
+ +                        else
+ +                        {
+ +                            /* we use a new value of scalevir to converge the iterations faster */
+ +                            scalevir = tracevir/trace(shake_vir);
+ +                        }
+ +                        msmul(shake_vir, scalevir, shake_vir);
+ +                        m_add(force_vir, shake_vir, total_vir);
+ +                        clear_mat(shake_vir);
+ +                    }
+ +                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+ +                    /* We can only do Berendsen coupling after we have summed
+ +                     * the kinetic energy or virial. Since this happens
+ +                     * in global_state after update, we should only do it at
+ +                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ +                     */
+ +                }
+ +                else
+ +                {
+ +                    update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ +                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
+ +                                   upd, bInitStep);
+ +                }
+ +
+ +                if (bVV)
+ +                {
+ +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+ +
+ +                    /* velocity half-step update */
+ +                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ +                                  bUpdateDoLR, fr->f_twin, fcd,
+ +                                  ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
+ +                                  cr, nrnb, constr, &top->idef);
+ +                }
+ +
+ +                /* Above, initialize just copies ekinh into ekin,
+ +                 * it doesn't copy position (for VV),
+ +                 * and entire integrator for MD.
+ +                 */
+ +
+ +                if (ir->eI == eiVVAK)
+ +                {
+ +                    copy_rvecn(state->x, cbuf, 0, state->natoms);
+ +                }
+ +                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+ +
+ +                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ +                              bUpdateDoLR, fr->f_twin, fcd,
+ +                              ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ +                wallcycle_stop(wcycle, ewcUPDATE);
+ +
+ +                update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
+ +                                   fr->bMolPBC, graph, f,
+ +                                   &top->idef, shake_vir, force_vir,
+ +                                   cr, nrnb, wcycle, upd, constr,
+ +                                   bInitStep, FALSE, bCalcVir, state->veta);
+ +
+ +                if (ir->eI == eiVVAK)
+ +                {
+ +                    /* erase F_EKIN and F_TEMP here? */
+ +                    /* just compute the kinetic energy at the half step to perform a trotter step */
+ +                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ +                                    constr, NULL, FALSE, lastbox,
+ +                                    top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ +                                    cglo_flags | CGLO_TEMPERATURE
+ +                                    );
+ +                    wallcycle_start(wcycle, ewcUPDATE);
+ +                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+ +                    /* now we know the scaling, we can compute the positions again */
+ +                    copy_rvecn(cbuf, state->x, 0, state->natoms);
+ +
+ +                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+ +
+ +                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ +                                  bUpdateDoLR, fr->f_twin, fcd,
+ +                                  ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ +                    wallcycle_stop(wcycle, ewcUPDATE);
+ +
+ +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+ +                    /* are the small terms in the shake_vir here due
+ +                     * to numerical errors, or are they important
+ +                     * physically? I'm thinking they are just errors, but not completely sure.
+ +                     * For now, will call without actually constraining, constr=NULL*/
+ +                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ +                                       state, fr->bMolPBC, graph, f,
+ +                                       &top->idef, tmp_vir, force_vir,
+ +                                       cr, nrnb, wcycle, upd, NULL,
+ +                                       bInitStep, FALSE, bCalcVir,
+ +                                       state->veta);
+ +                }
+ +                if (!bOK && !bFFscan)
+ +                {
+ +                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ +                }
+ +
+ +                if (fr->bSepDVDL && fplog && do_log)
+ +                {
+ +                    fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
+ +                }
+ +                if (bVV)
+ +                {
+ +                    /* this factor of 2 correction is necessary
+ +                       because half of the constraint force is removed
+ +                       in the vv step, so we have to double it.  See
+ +                       the Redmine issue #1255.  It is not yet clear
+ +                       if the factor of 2 is exact, or just a very
+ +                       good approximation, and this will be
+ +                       investigated.  The next step is to see if this
+ +                       can be done adding a dhdl contribution from the
+ +                       rattle step, but this is somewhat more
+ +                       complicated with the current code. Will be
+ +                       investigated, hopefully for 4.6.3. However,
+ +                       this current solution is much better than
+ +                       having it completely wrong.
+ +                    */
+ +                    enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+ +                }
+ +                else
+ +                {
+ +                    enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+ +                }
+ +            }
+ +            else if (graph)
+ +            {
+ +                /* Need to unshift here */
+ +                unshift_self(graph, state->box, state->x);
+ +            }
+ +
+ +            if (vsite != NULL)
+ +            {
+ +                wallcycle_start(wcycle, ewcVSITECONSTR);
+ +                if (graph != NULL)
+ +                {
+ +                    shift_self(graph, state->box, state->x);
+ +                }
+ +                construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ +                                 top->idef.iparams, top->idef.il,
+ +                                 fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ +
+ +                if (graph != NULL)
+ +                {
+ +                    unshift_self(graph, state->box, state->x);
+ +                }
+ +                wallcycle_stop(wcycle, ewcVSITECONSTR);
+ +            }
+ +
+ +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
+ +            /* With Leap-Frog we can skip compute_globals at
+ +             * non-communication steps, but we need to calculate
+ +             * the kinetic energy one step before communication.
+ +             */
+ +            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+ +            {
+ +                if (ir->nstlist == -1 && bFirstIterate)
+ +                {
+ +                    gs.sig[eglsNABNSB] = nlh.nabnsb;
+ +                }
+ +                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ +                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ +                                constr,
+ +                                bFirstIterate ? &gs : NULL,
+ +                                (step_rel % gs.nstms == 0) &&
+ +                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+ +                                lastbox,
+ +                                top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ +                                cglo_flags
+ +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+ +                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ +                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+ +                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
+ +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ +                                | CGLO_CONSTRAINT
+ +                                );
+ +                if (ir->nstlist == -1 && bFirstIterate)
+ +                {
+ +                    nlh.nabnsb         = gs.set[eglsNABNSB];
+ +                    gs.set[eglsNABNSB] = 0;
+ +                }
+ +            }
+ +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+ +            /* #############  END CALC EKIN AND PRESSURE ################# */
+ +
+ +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ +               the virial that should probably be addressed eventually. state->veta has better properties,
+ +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+ +
+ +            if (iterate.bIterationActive &&
+ +                done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ +                               trace(shake_vir), &tracevir))
+ +            {
+ +                break;
+ +            }
+ +            bFirstIterate = FALSE;
+ +        }
+ +
+ +        if (!bVV || bRerunMD)
+ +        {
+ +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+ +            sum_dhdl(enerd, state->lambda, ir->fepvals);
+ +        }
+ +        update_box(fplog, step, ir, mdatoms, state, graph, f,
+ +                   ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+ +
+ +        /* ################# END UPDATE STEP 2 ################# */
+ +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+ +
+ +        /* The coordinates (x) were unshifted in update */
+ +        if (bFFscan && (shellfc == NULL || bConverged))
+ +        {
+ +            if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+ +                                 f, NULL, xcopy,
+ +                                 &(top_global->mols), mdatoms->massT, pres))
+ +            {
+ +                gmx_finalize_par();
+ +
+ +                fprintf(stderr, "\n");
+ +                exit(0);
+ +            }
+ +        }
+ +        if (!bGStat)
+ +        {
+ +            /* We will not sum ekinh_old,
+ +             * so signal that we still have to do it.
+ +             */
+ +            bSumEkinhOld = TRUE;
+ +        }
+ +
+ +        if (bTCR)
+ +        {
+ +            /* Only do GCT when the relaxation of shells (minimization) has converged,
+ +             * otherwise we might be coupling to bogus energies.
+ +             * In parallel we must always do this, because the other sims might
+ +             * update the FF.
+ +             */
+ +
+ +            /* Since this is called with the new coordinates state->x, I assume
+ +             * we want the new box state->box too. / EL 20040121
+ +             */
+ +            do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
+ +                        ir, MASTER(cr),
+ +                        mdatoms, &(top->idef), mu_aver,
+ +                        top_global->mols.nr, cr,
+ +                        state->box, total_vir, pres,
+ +                        mu_tot, state->x, f, bConverged);
+ +            debug_gmx();
+ +        }
+ +
+ +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+ +
+ +        /* use the directly determined last velocity, not actually the averaged half steps */
+ +        if (bTrotter && ir->eI == eiVV)
+ +        {
+ +            enerd->term[F_EKIN] = last_ekin;
+ +        }
+ +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+ +
+ +        if (bVV)
+ +        {
+ +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+ +        }
+ +        else
+ +        {
+ +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+ +        }
+ +        /* Check for excessively large energies */
+ +        if (bIonize)
+ +        {
+ +#ifdef GMX_DOUBLE
+ +            real etot_max = 1e200;
+ +#else
+ +            real etot_max = 1e30;
+ +#endif
+ +            if (fabs(enerd->term[F_ETOT]) > etot_max)
+ +            {
+ +                fprintf(stderr, "Energy too large (%g), giving up\n",
+ +                        enerd->term[F_ETOT]);
+ +            }
+ +        }
+ +        /* #########  END PREPARING EDR OUTPUT  ###########  */
+ +
+ +        /* Time for performance */
+ +        if (((step % stepout) == 0) || bLastStep)
+ +        {
+ +            runtime_upd_proc(runtime);
+ +        }
+ +
+ +        /* Output stuff */
+ +        if (MASTER(cr))
+ +        {
+ +            gmx_bool do_dr, do_or;
+ +
+ +            if (fplog && do_log && bDoExpanded)
+ +            {
+ +                /* only needed if doing expanded ensemble */
+ +                PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+ +                                          &df_history, state->fep_state, ir->nstlog, step);
+ +            }
+ +            if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+ +            {
+ +                if (bCalcEner)
+ +                {
+ +                    upd_mdebin(mdebin, bDoDHDL, TRUE,
+ +                               t, mdatoms->tmass, enerd, state,
+ +                               ir->fepvals, ir->expandedvals, lastbox,
+ +                               shake_vir, force_vir, total_vir, pres,
+ +                               ekind, mu_tot, constr);
+ +                }
+ +                else
+ +                {
+ +                    upd_mdebin_step(mdebin);
+ +                }
+ +
+ +                do_dr  = do_per_step(step, ir->nstdisreout);
+ +                do_or  = do_per_step(step, ir->nstorireout);
+ +
+ +                print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
+ +                           step, t,
+ +                           eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
+ +            }
+ +            if (ir->ePull != epullNO)
+ +            {
+ +                pull_print_output(ir->pull, step, t);
+ +            }
+ +
+ +            if (do_per_step(step, ir->nstlog))
+ +            {
+ +                if (fflush(fplog) != 0)
+ +                {
+ +                    gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+ +                }
+ +            }
+ +        }
+ +        if (bDoExpanded)
+ +        {
+ +            /* Have to do this part after outputting the logfile and the edr file */
+ +            state->fep_state = lamnew;
+ +            for (i = 0; i < efptNR; i++)
+ +            {
+ +                state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+ +            }
+ +        }
+ +        /* Remaining runtime */
+ +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+ +        {
+ +            if (shellfc)
+ +            {
+ +                fprintf(stderr, "\n");
+ +            }
+ +            print_time(stderr, runtime, step, ir, cr);
+ +        }
+ +
+ +        /* Replica exchange */
+ +        bExchanged = FALSE;
+ +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+ +            do_per_step(step, repl_ex_nst))
+ +        {
+ +            bExchanged = replica_exchange(fplog, cr, repl_ex,
+ +                                          state_global, enerd,
+ +                                          state, step, t);
+ +
+ +            if (bExchanged && DOMAINDECOMP(cr))
+ +            {
+ +                dd_partition_system(fplog, step, cr, TRUE, 1,
+ +                                    state_global, top_global, ir,
+ +                                    state, &f, mdatoms, top, fr,
+ +                                    vsite, shellfc, constr,
+ +                                    nrnb, wcycle, FALSE);
+ +            }
+ +        }
+ +
+ +        bFirstStep       = FALSE;
+ +        bInitStep        = FALSE;
+ +        bStartingFromCpt = FALSE;
+ +
+ +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+ +        /* With all integrators, except VV, we need to retain the pressure
+ +         * at the current step for coupling at the next step.
+ +         */
+ +        if ((state->flags & (1<<estPRES_PREV)) &&
+ +            (bGStatEveryStep ||
+ +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+ +        {
+ +            /* Store the pressure in t_state for pressure coupling
+ +             * at the next MD step.
+ +             */
+ +            copy_mat(pres, state->pres_prev);
+ +        }
+ +
+ +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+ +
+ +        if ( (membed != NULL) && (!bLastStep) )
+ +        {
+ +            rescale_membed(step_rel, membed, state_global->x);
+ +        }
+ +
+ +        if (bRerunMD)
+ +        {
+ +            if (MASTER(cr))
+ +            {
+ +                /* read next frame from input trajectory */
+ +                bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+ +            }
+ +
+ +            if (PAR(cr))
+ +            {
+ +                rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+ +            }
+ +        }
+ +
+ +        if (!bRerunMD || !rerun_fr.bStep)
+ +        {
+ +            /* increase the MD step number */
+ +            step++;
+ +            step_rel++;
+ +        }
+ +
+ +        cycles = wallcycle_stop(wcycle, ewcSTEP);
+ +        if (DOMAINDECOMP(cr) && wcycle)
+ +        {
+ +            dd_cycles_add(cr->dd, cycles, ddCyclStep);
+ +        }
+ +
+ +        if (bPMETuneRunning || bPMETuneTry)
+ +        {
+ +            /* PME grid + cut-off optimization with GPUs or PME nodes */
+ +
+ +            /* Count the total cycles over the last steps */
+ +            cycles_pmes += cycles;
+ +
+ +            /* We can only switch cut-off at NS steps */
+ +            if (step % ir->nstlist == 0)
+ +            {
+ +                /* PME grid + cut-off optimization with GPUs or PME nodes */
+ +                if (bPMETuneTry)
+ +                {
+ +                    if (DDMASTER(cr->dd))
+ +                    {
+ +                        /* PME node load is too high, start tuning */
+ +                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+ +                    }
+ +                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+ +
+ +                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
+ +                    {
+ +                        bPMETuneTry     = FALSE;
+ +                    }
+ +                }
+ +                if (bPMETuneRunning)
+ +                {
+ +                    /* init_step might not be a multiple of nstlist,
+ +                     * but the first cycle is always skipped anyhow.
+ +                     */
+ +                    bPMETuneRunning =
+ +                        pme_load_balance(pme_loadbal, cr,
+ +                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
+ +                                         fplog,
+ +                                         ir, state, cycles_pmes,
+ +                                         fr->ic, fr->nbv, &fr->pmedata,
+ +                                         step);
+ +
+ +                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+ +                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
+ +                    fr->rlist      = fr->ic->rlist;
+ +                    fr->rlistlong  = fr->ic->rlistlong;
+ +                    fr->rcoulomb   = fr->ic->rcoulomb;
+ +                    fr->rvdw       = fr->ic->rvdw;
+ +                }
+ +                cycles_pmes = 0;
+ +            }
+ +        }
+ +
+ +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+ +            gs.set[eglsRESETCOUNTERS] != 0)
+ +        {
+ +            /* Reset all the counters related to performance over the run */
+ +            reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+ +                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+ +            wcycle_set_reset_counters(wcycle, -1);
+ +            if (!(cr->duty & DUTY_PME))
+ +            {
+ +                /* Tell our PME node to reset its counters */
+ +                gmx_pme_send_resetcounters(cr, step);
+ +            }
+ +            /* Correct max_hours for the elapsed time */
+ +            max_hours                -= run_time/(60.0*60.0);
+ +            bResetCountersHalfMaxH    = FALSE;
+ +            gs.set[eglsRESETCOUNTERS] = 0;
+ +        }
+ +
+ +    }
+ +    /* End of main MD loop */
+ +    debug_gmx();
+ +
+ +    /* Stop the time */
+ +    runtime_end(runtime);
+ +
+ +    if (bRerunMD && MASTER(cr))
+ +    {
+ +        close_trj(status);
+ +    }
+ +
+ +    if (!(cr->duty & DUTY_PME))
+ +    {
+ +        /* Tell the PME only node to finish */
+ +        gmx_pme_send_finish(cr);
+ +    }
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        if (ir->nstcalcenergy > 0 && !bRerunMD)
+ +        {
+ +            print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
+ +                       eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
+ +        }
+ +    }
+ +
+ +    done_mdoutf(outf);
+ +
+ +    debug_gmx();
+ +
+ +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+ +    {
+ +        fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+ +        fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
+ +    }
+ +
+ +    if (pme_loadbal != NULL)
+ +    {
+ +        pme_loadbal_done(pme_loadbal, cr, fplog,
+ +                         fr->nbv != NULL && fr->nbv->bUseGPU);
+ +    }
+ +
+ +    if (shellfc && fplog)
+ +    {
+ +        fprintf(fplog, "Fraction of iterations that converged:           %.2f %%\n",
+ +                (nconverged*100.0)/step_rel);
+ +        fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
+ +                tcount/step_rel);
+ +    }
+ +
+ +    if (repl_ex_nst > 0 && MASTER(cr))
+ +    {
+ +        print_replica_exchange_statistics(fplog, repl_ex);
+ +    }
+ +
+ +    runtime->nsteps_done = step_rel;
+ +
+ +    return 0;
+ +}
diff --cc src/programs/mdrun/runner.c

index 7db1f502da785eff25d5983a3b24f6ad20d7ec0d,0000000000000000000000000000000000000000..298289df1f9f92479e22d3112c097dd673a9c17d

mode 100644,000000..100644
--- 1/src/programs/mdrun/runner.c
--- /dev/null
+++ b/src/programs/mdrun/runner.c
@@@ -1,1690 -1,0 +1,1661 @@@
- static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + *
+ + *                This source code is part of
+ + *
+ + *                 G   R   O   M   A   C   S
+ + *
+ + *          GROningen MAchine for Chemical Simulations
+ + *
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + *
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + *
+ + * For more info, check our website at http://www.gromacs.org
+ + *
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +#include <signal.h>
+ +#include <stdlib.h>
+ +#ifdef HAVE_UNISTD_H
+ +#include <unistd.h>
+ +#endif
+ +#include <string.h>
+ +#include <assert.h>
+ +
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "sysstuff.h"
+ +#include "statutil.h"
+ +#include "force.h"
+ +#include "mdrun.h"
+ +#include "md_logging.h"
+ +#include "md_support.h"
+ +#include "network.h"
+ +#include "pull.h"
+ +#include "pull_rotation.h"
+ +#include "names.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "pme.h"
+ +#include "mdatoms.h"
+ +#include "repl_ex.h"
+ +#include "qmmm.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "coulomb.h"
+ +#include "constr.h"
+ +#include "mvdata.h"
+ +#include "checkpoint.h"
+ +#include "mtop_util.h"
+ +#include "sighandler.h"
+ +#include "tpxio.h"
+ +#include "txtdump.h"
+ +#include "gmx_detect_hardware.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "pull_rotation.h"
+ +#include "calc_verletbuf.h"
+ +#include "../mdlib/nbnxn_search.h"
+ +#include "../mdlib/nbnxn_consts.h"
+ +#include "gmx_fatal_collective.h"
+ +#include "membed.h"
+ +#include "macros.h"
+ +#include "gmx_omp.h"
+ +#include "gmx_thread_affinity.h"
+ +
+ +#include "gromacs/utility/gmxmpi.h"
+ +
+ +#ifdef GMX_FAHCORE
+ +#include "corewrap.h"
+ +#endif
+ +
+ +#include "gpu_utils.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +
+ +/* Table entry wrapping a pointer to an integrator function. */
+ +typedef struct {
+ +    gmx_integrator_t *func;
+ +} gmx_intp_t;
+ +
+ +/* Table of integrator entry points with one slot per integrator type (eiNR
+ + * entries). The array should match the eI array in include/types/enums.h */
+ +const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+ +
+ +/* File-scope state; the names suggest the initial step and the initial box
+ + * taken from the tpx file for box deformation -- TODO confirm against the
+ + * code that reads and writes them. With thread-MPI a statically initialized
+ + * mutex is declared alongside, presumably to serialize access to these
+ + * values between threads (verify at the point of use).
+ + */
+ +gmx_large_int_t     deform_init_init_step_tpx;
+ +matrix              deform_init_box_tpx;
+ +#ifdef GMX_THREAD_MPI
+ +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+ +#endif
+ +
+ +
+ +#ifdef GMX_THREAD_MPI
+ +/* Bundle of all mdrunner() arguments. mdrunner_start_threads() fills one
+ + * instance and hands it, as a single void pointer, to the thread start
+ + * function mdrunner_start_fn(), which unpacks it again to call mdrunner().
+ + * Apart from ret, every field mirrors the mdrunner() parameter of the same
+ + * name.
+ + */
+ +struct mdrunner_arglist
+ +{
+ +    gmx_hw_opt_t   *hw_opt;
+ +    FILE           *fplog;
+ +    t_commrec      *cr;
+ +    int             nfile;
+ +    const t_filenm *fnm;
+ +    output_env_t    oenv;
+ +    gmx_bool        bVerbose;
+ +    gmx_bool        bCompact;
+ +    int             nstglobalcomm;
+ +    ivec            ddxyz;
+ +    int             dd_node_order;
+ +    real            rdd;
+ +    real            rconstr;
+ +    const char     *dddlb_opt;
+ +    real            dlb_scale;
+ +    const char     *ddcsx;
+ +    const char     *ddcsy;
+ +    const char     *ddcsz;
+ +    const char     *nbpu_opt;
+ +    gmx_large_int_t nsteps_cmdline;
+ +    int             nstepout;
+ +    int             resetstep;
+ +    int             nmultisim;
+ +    int             repl_ex_nst;
+ +    int             repl_ex_nex;
+ +    int             repl_ex_seed;
+ +    real            pforce;
+ +    real            cpt_period;
+ +    real            max_hours;
+ +    const char     *deviceOptions;
+ +    unsigned long   Flags;
+ +    int             ret; /* return value of mdrunner(), stored by the thread start function */
+ +};
+ +
+ +
+ +/* Thread entry point for thread-MPI runs.
+ + *
+ + * arg points to the struct mdrunner_arglist filled in by
+ + * mdrunner_start_threads(). The argument list is unpacked, a commrec for
+ + * this thread is created and mdrunner() is called; its return value is
+ + * written back into the ret field of the shared argument list.
+ + */
+ +static void mdrunner_start_fn(void *arg)
+ +{
+ +    struct mdrunner_arglist *shared = (struct mdrunner_arglist*)arg;
+ +    /* Work on a private copy of the argument list so that the values are
+ +     * thread-local. Pointed-to items are not copied, but those are all
+ +     * const. */
+ +    struct mdrunner_arglist  args      = *shared;
+ +    /* Each thread gets its own duplicate of the file name table ... */
+ +    t_filenm                *fnm_local = dup_tfn(args.nfile, args.fnm);
+ +    /* ... and its own commrec */
+ +    t_commrec               *cr_local  = init_par_threads(args.cr);
+ +    /* Only the master passes the log file on to mdrunner() */
+ +    FILE                    *log_local = MASTER(cr_local) ? args.fplog : NULL;
+ +
+ +    shared->ret = mdrunner(args.hw_opt, log_local, cr_local,
+ +                           args.nfile, fnm_local, args.oenv,
+ +                           args.bVerbose, args.bCompact, args.nstglobalcomm,
+ +                           args.ddxyz, args.dd_node_order, args.rdd,
+ +                           args.rconstr, args.dddlb_opt, args.dlb_scale,
+ +                           args.ddcsx, args.ddcsy, args.ddcsz,
+ +                           args.nbpu_opt,
+ +                           args.nsteps_cmdline, args.nstepout, args.resetstep,
+ +                           args.nmultisim, args.repl_ex_nst, args.repl_ex_nex,
+ +                           args.repl_ex_seed, args.pforce,
+ +                           args.cpt_period, args.max_hours,
+ +                           args.deviceOptions, args.Flags);
+ +}
+ +
+ +/* Start the requested number of thread-MPI threads (the count includes the
+ + * main thread) for a thread-parallel run; every spawned thread ends up in
+ + * mdrunner() through mdrunner_start_fn().
+ + *
+ + * All parameters have the same meaning as for mdrunner().
+ + *
+ + * Returns cr unchanged when fewer than two thread-MPI threads are requested,
+ + * NULL when the threads could not be started, and otherwise a new commrec
+ + * describing the threaded setup.
+ + */
+ +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+ +                                         FILE *fplog, t_commrec *cr, int nfile,
+ +                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ +                                         gmx_bool bCompact, int nstglobalcomm,
+ +                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ +                                         const char *dddlb_opt, real dlb_scale,
+ +                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ +                                         const char *nbpu_opt,
+ +                                         gmx_large_int_t nsteps_cmdline,
+ +                                         int nstepout, int resetstep,
+ +                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+ +                                         real pforce, real cpt_period, real max_hours,
+ +                                         const char *deviceOptions, unsigned long Flags)
+ +{
+ +    struct mdrunner_arglist *args;
+ +    int                      rc;
+ +
+ +    /* Nothing to spawn unless at least two thread-MPI threads are wanted */
+ +    if (hw_opt->nthreads_tmpi < 2)
+ +    {
+ +        return cr;
+ +    }
+ +
+ +    /* The argument list and the duplicated file name table are never freed:
+ +     * a few small, one-time, almost unavoidable memory leaks. */
+ +    snew(args, 1);
+ +    args->fnm            = dup_tfn(nfile, fnm);
+ +    args->nfile          = nfile;
+ +
+ +    /* Pack everything else that mdrunner() needs, to be passed as a void
+ +     * pointer to the thread start function */
+ +    args->hw_opt         = hw_opt;
+ +    args->fplog          = fplog;
+ +    args->cr             = cr;
+ +    args->oenv           = oenv;
+ +    args->bVerbose       = bVerbose;
+ +    args->bCompact       = bCompact;
+ +    args->nstglobalcomm  = nstglobalcomm;
+ +
+ +    /* Domain decomposition settings */
+ +    args->ddxyz[XX]      = ddxyz[XX];
+ +    args->ddxyz[YY]      = ddxyz[YY];
+ +    args->ddxyz[ZZ]      = ddxyz[ZZ];
+ +    args->dd_node_order  = dd_node_order;
+ +    args->rdd            = rdd;
+ +    args->rconstr        = rconstr;
+ +    args->dddlb_opt      = dddlb_opt;
+ +    args->dlb_scale      = dlb_scale;
+ +    args->ddcsx          = ddcsx;
+ +    args->ddcsy          = ddcsy;
+ +    args->ddcsz          = ddcsz;
+ +
+ +    /* Remaining run options */
+ +    args->nbpu_opt       = nbpu_opt;
+ +    args->nsteps_cmdline = nsteps_cmdline;
+ +    args->nstepout       = nstepout;
+ +    args->resetstep      = resetstep;
+ +    args->nmultisim      = nmultisim;
+ +    args->repl_ex_nst    = repl_ex_nst;
+ +    args->repl_ex_nex    = repl_ex_nex;
+ +    args->repl_ex_seed   = repl_ex_seed;
+ +    args->pforce         = pforce;
+ +    args->cpt_period     = cpt_period;
+ +    args->max_hours      = max_hours;
+ +    args->deviceOptions  = deviceOptions;
+ +    args->Flags          = Flags;
+ +
+ +    /* Spawn the new threads, which start in mdrunner_start_fn(), while the
+ +     * main thread returns here; thread affinity is set later */
+ +    rc = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+ +                      mdrunner_start_fn, (void*)(args) );
+ +    if (rc != TMPI_SUCCESS)
+ +    {
+ +        return NULL;
+ +    }
+ +
+ +    /* Hand back a new commrec that reflects the new situation */
+ +    return init_par_threads(cr);
+ +}
+ +
+ +
+ +/* Choose how many thread-MPI threads to start when the user did not fix
+ + * that number; the rest of the parallelism is left to OpenMP.
+ + *
+ + * hwinfo        detected hardware, queried for CPU vendor, family and model
+ + * hw_opt        user thread options; only nthreads_omp is read here
+ + * nthreads_tot  total number of threads (thread-MPI times OpenMP) to use
+ + * ngpu          number of GPUs that will be used, 0 for none
+ + *
+ + * Returns the number of thread-MPI threads to start.
+ + */
+ +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+ +                                        const gmx_hw_opt_t  *hw_opt,
+ +                                        int                  nthreads_tot,
+ +                                        int                  ngpu)
+ +{
+ +    int nthreads_tmpi;
+ +
+ +    /* There are no separate PME nodes here, as we ensured in
+ +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+ +     * and a conditional ensures we would not have ended up here.
+ +     * Note that separate PME nodes might be switched on later.
+ +     */
+ +    if (ngpu > 0)
+ +    {
+ +        /* One thread-MPI thread per GPU, but never more than the total */
+ +        nthreads_tmpi = ngpu;
+ +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+ +        {
+ +            nthreads_tmpi = nthreads_tot;
+ +        }
+ +    }
+ +    else if (hw_opt->nthreads_omp > 0)
+ +    {
+ +        /* Here we could oversubscribe, when we do, we issue a warning later */
+ +        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
+ +    }
+ +    else
+ +    {
+ +        /* TODO choose nthreads_omp based on hardware topology
+ +           when we have a hardware topology detection library */
+ +        /* In general, when running up to 4 threads, OpenMP should be faster.
+ +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+ +         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+ +         * even on two CPUs it's usually faster (but with many OpenMP threads
+ +         * it could be faster not to use HT, currently we always use HT).
+ +         * On Nehalem/Westmere we want to avoid running 16 threads over
+ +         * two CPUs with HT, so we need a limit<16; thus we use 12.
+ +         * A reasonable limit for Intel Sandy and Ivy bridge,
+ +         * not knowing the topology, is 16 threads.
+ +         */
+ +        const int nthreads_omp_always_faster             =  4;
+ +        const int nthreads_omp_always_faster_Nehalem     = 12;
+ +        const int nthreads_omp_always_faster_SandyBridge = 16;
+ +        const int first_model_Nehalem                    = 0x1A;
+ +        const int first_model_SandyBridge                = 0x2A;
+ +        gmx_bool  bIntel_Family6;
+ +        int       cpu_model;
+ +
+ +        bIntel_Family6 =
+ +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+ +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+ +        cpu_model = gmx_cpuid_model(hwinfo->cpuid_info);
+ +
+ +        /* The CPU generation is identified by comparing the model number
+ +         * with the first model of that generation; the thread-count limits
+ +         * only apply to nthreads_tot.
+ +         */
+ +        if (nthreads_tot <= nthreads_omp_always_faster ||
+ +            (bIntel_Family6 &&
+ +             ((cpu_model >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+ +              (cpu_model >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+ +        {
+ +            /* Use pure OpenMP parallelization */
+ +            nthreads_tmpi = 1;
+ +        }
+ +        else
+ +        {
+ +            /* Don't use OpenMP parallelization */
+ +            nthreads_tmpi = nthreads_tot;
+ +        }
+ +    }
+ +
+ +    return nthreads_tmpi;
+ +}
+ +
+ +
+ +/* Get the number of threads to use for thread-MPI based on how many
+ + * were requested, which algorithms we're using,
+ + * and how many particles there are.
+ + * At this point we have already called check_and_update_hw_opt.
+ + * Thus all options should be internally consistent and consistent
+ + * with the hardware, except that ntmpi could be larger than #GPU.
+ + */
- static void prepare_verlet_scheme(FILE             *fplog,
-                                   gmx_hw_info_t    *hwinfo,
-                                   t_commrec        *cr,
-                                   const char       *nbpu_opt,
-                                   t_inputrec       *ir,
-                                   const gmx_mtop_t *mtop,
-                                   matrix            box,
-                                   gmx_bool         *bUseGPU)
+ +/* hwinfo    detected hardware (available hardware threads, GPUs, CPU id)
+ + * hw_opt    user thread options; a positive nthreads_tmpi is returned as is
+ + * inputrec  run input, read for the cut-off scheme, integrator and
+ + *           electrostatics type
+ + * mtop      topology, read for the number of atoms
+ + * cr, fplog used to print a warning when falling back to a single thread
+ + *
+ + * Returns the number of thread-MPI threads to start.
+ + */
++static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+ +                            gmx_hw_opt_t *hw_opt,
+ +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
+ +                            const t_commrec *cr,
+ +                            FILE *fplog)
+ +{
+ +    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+ +    int      min_atoms_per_mpi_thread;
+ +    gmx_bool bCanUseGPU;
+ +
+ +    if (hw_opt->nthreads_tmpi > 0)
+ +    {
+ +        /* Trivial, return right away */
+ +        return hw_opt->nthreads_tmpi;
+ +    }
+ +
+ +    nthreads_hw = hwinfo->nthreads_hw_avail;
+ +
+ +    /* How many total (#tMPI*#OpenMP) threads can we start? */
+ +    if (hw_opt->nthreads_tot > 0)
+ +    {
+ +        nthreads_tot_max = hw_opt->nthreads_tot;
+ +    }
+ +    else
+ +    {
+ +        nthreads_tot_max = nthreads_hw;
+ +    }
+ +
+ +    /* GPUs are only counted with the Verlet cut-off scheme */
+ +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+ +    if (bCanUseGPU)
+ +    {
+ +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ +    }
+ +    else
+ +    {
+ +        ngpu = 0;
+ +    }
+ +
+ +    nthreads_tmpi =
+ +        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+ +
+ +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+ +    {
+ +        /* Steps are divided over the nodes iso splitting the atoms */
+ +        min_atoms_per_mpi_thread = 0;
+ +    }
+ +    else
+ +    {
+ +        if (bCanUseGPU)
+ +        {
+ +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+ +        }
+ +        else
+ +        {
+ +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+ +        }
+ +    }
+ +
+ +    /* Check if an algorithm does not support parallel simulation.  */
+ +    if (nthreads_tmpi != 1 &&
+ +        ( inputrec->eI == eiLBFGS ||
+ +          inputrec->coulombtype == eelEWALD ) )
+ +    {
+ +        nthreads_tmpi = 1;
+ +
+ +        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+ +        /* NOTE(review): a positive hw_opt->nthreads_tmpi already returned at
+ +         * the top of this function, so this check cannot trigger here; it is
+ +         * kept as a safeguard in case that early return ever changes.
+ +         */
+ +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+ +        {
+ +            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+ +        }
+ +    }
+ +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+ +    {
+ +        /* the thread number was chosen automatically, but there are too many
+ +           threads (too few atoms per thread) */
+ +        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
+ +
+ +        /* Avoid partial use of Hyper-Threading */
+ +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+ +        {
+ +            nthreads_new = nthreads_hw/2;
+ +        }
+ +
+ +        /* Avoid large prime numbers in the thread count */
+ +        if (nthreads_new >= 6)
+ +        {
+ +            /* Use only 6,8,10 with additional factors of 2 */
+ +            int fac;
+ +
+ +            fac = 2;
+ +            while (3*fac*2 <= nthreads_new)
+ +            {
+ +                fac *= 2;
+ +            }
+ +
+ +            /* Round down to a multiple of fac */
+ +            nthreads_new = (nthreads_new/fac)*fac;
+ +        }
+ +        else
+ +        {
+ +            /* Avoid 5 */
+ +            if (nthreads_new == 5)
+ +            {
+ +                nthreads_new = 4;
+ +            }
+ +        }
+ +
+ +        nthreads_tmpi = nthreads_new;
+ +
+ +        fprintf(stderr, "\n");
+ +        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+ +        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
+ +        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+ +    }
+ +
+ +    return nthreads_tmpi;
+ +}
+ +#endif /* GMX_THREAD_MPI */
+ +
+ +
+ +/* Name of the environment variable for setting nstlist */
+ +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
+ +/* Try to increase nstlist when using a GPU with nstlist less than this */
+ +static const int    NSTLIST_GPU_ENOUGH      = 20;
+ +/* Keep increasing nstlist as long as the non-bonded cost increases by less
+ + * than this factor */
+ +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
+ +/* Don't increase nstlist beyond a non-bonded cost increase of this factor */
+ +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
+ +
+ +/* Try to increase nstlist when running on a GPU.
+ + *
+ + * fp    log file, may be NULL
+ + * cr    communication record, used for master-only output and to check
+ + *       the domain decomposition
+ + * ir    input record; nstlist, rlist and rlistlong may be updated
+ + * mtop  topology, used for the buffer size estimate
+ + * box   simulation box
+ + *
+ + * When the environment variable GMX_NSTLIST is set its value is used for
+ + * nstlist; otherwise the values in nstl[] are tried in order for as long
+ + * as the resulting pair-list buffer stays acceptable. When the final choice
+ + * does not fit the box or the domain decomposition, the original nstlist
+ + * is restored and a warning is issued.
+ + */
+ +static void increase_nstlist(FILE *fp, t_commrec *cr,
+ +                             t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
+ +{
+ +    char                  *env;
+ +    int                    nstlist_orig, nstlist_prev;
+ +    verletbuf_list_setup_t ls;
+ +    real                   rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev;
+ +    int                    i;
+ +    t_state                state_tmp;
+ +    gmx_bool               bBox, bDD, bCont;
+ +    const char            *nstl_fmt = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+ +    const char            *env_fmt  = "Getting nstlist from environment variable GMX_NSTLIST=%s\n";
+ +    const char            *vbd_err  = "Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
+ +    const char            *box_err  = "Can not increase nstlist for GPU run because the box is too small";
+ +    const char            *dd_err   = "Can not increase nstlist for GPU run because of domain decomposition limitations";
+ +    const char            *err_mesg;
+ +    char                   buf[STRLEN];
+ +
+ +    /* Alternative nstlist values to try, in increasing order */
+ +    const int nstl[] = { 20, 25, 40, 50 };
+ +#define NNSTL  (sizeof(nstl)/sizeof(nstl[0]))
+ +
+ +    env = getenv(NSTLIST_ENVVAR);
+ +    if (env == NULL)
+ +    {
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, nstl_fmt, ir->nstlist);
+ +        }
+ +    }
+ +
+ +    if (ir->verletbuf_drift == 0)
+ +    {
+ +        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+ +    }
+ +
+ +    if (ir->verletbuf_drift < 0)
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "%s\n", vbd_err);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "%s\n", vbd_err);
+ +        }
+ +
+ +        return;
+ +    }
+ +
+ +    nstlist_orig = ir->nstlist;
+ +    if (env != NULL)
+ +    {
+ +        /* Print directly instead of formatting into buf first: the length of
+ +         * the environment value is not bounded, the buffer is.
+ +         */
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, env_fmt, env);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, env_fmt, env);
+ +        }
+ +        sscanf(env, "%d", &ir->nstlist);
+ +    }
+ +
+ +    verletbuf_get_list_setup(TRUE, &ls);
+ +
+ +    /* Allow rlist to make the list double the size of the cut-off sphere */
+ +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
+ +    rlist_ok  = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
+ +    rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
+ +                rlist_inc, rlist_max);
+ +    }
+ +
+ +    i            = 0;
+ +    nstlist_prev = nstlist_orig;
+ +    rlist_prev   = ir->rlist;
+ +    do
+ +    {
+ +        if (env == NULL)
+ +        {
+ +            ir->nstlist = nstl[i];
+ +        }
+ +
+ +        /* Set the pair-list buffer size in ir */
+ +        calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
+ +                                NULL, &rlist_new);
+ +
+ +        /* Does rlist fit in the box? */
+ +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+ +        bDD  = TRUE;
+ +        if (bBox && DOMAINDECOMP(cr))
+ +        {
+ +            /* Check if rlist fits in the domain decomposition */
+ +            if (inputrec2nboundeddim(ir) < DIM)
+ +            {
+ +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+ +            }
+ +            copy_mat(box, state_tmp.box);
+ +            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+ +        }
+ +
+ +        bCont = FALSE;
+ +
+ +        if (env == NULL)
+ +        {
+ +            if (bBox && bDD && rlist_new <= rlist_max)
+ +            {
+ +                /* Increase nstlist */
+ +                nstlist_prev = ir->nstlist;
+ +                rlist_prev   = rlist_new;
+ +                bCont        = (i + 1 < (int)NNSTL && rlist_new < rlist_ok);
+ +            }
+ +            else
+ +            {
+ +                /* Stick with the previous nstlist */
+ +                ir->nstlist = nstlist_prev;
+ +                rlist_new   = rlist_prev;
+ +                bBox        = TRUE;
+ +                bDD         = TRUE;
+ +            }
+ +        }
+ +
+ +        i++;
+ +    }
+ +    while (bCont);
+ +
+ +    if (!bBox || !bDD)
+ +    {
+ +        /* Report the same cause on the terminal and in the log file */
+ +        err_mesg = !bBox ? box_err : dd_err;
+ +        gmx_warning(err_mesg);
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "\n%s\n", err_mesg);
+ +        }
+ +        ir->nstlist = nstlist_orig;
+ +    }
+ +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+ +    {
+ +        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+ +                nstlist_orig, ir->nstlist,
+ +                ir->rlist, rlist_new);
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "%s\n\n", buf);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp, "%s\n\n", buf);
+ +        }
+ +        ir->rlist     = rlist_new;
+ +        ir->rlistlong = rlist_new;
+ +    }
+ +}
+ +
-         /* Detect hardware, gather information. With tMPI only thread 0 does it
-          * and after threads are started broadcasts hwinfo around. */
-         snew(hwinfo, 1);
-         gmx_detect_hardware(fplog, hwinfo, cr,
-                             bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+ +/* Prepare a run that uses the Verlet cut-off scheme.
+ + *
+ + * Sets *bUseGPU, recomputes the pair-list buffer (rlist and rlistlong in ir)
+ + * for the current run setup when verletbuf_drift is positive, and calls
+ + * increase_nstlist() when a larger nstlist might improve performance or
+ + * when the nstlist environment variable is set.
+ + */
++static void prepare_verlet_scheme(FILE                *fplog,
++                                  const gmx_hw_info_t *hwinfo,
++                                  t_commrec           *cr,
++                                  const char          *nbpu_opt,
++                                  t_inputrec          *ir,
++                                  const gmx_mtop_t    *mtop,
++                                  matrix               box,
++                                  gmx_bool            *bUseGPU)
+ +{
+ +    gmx_bool bEmulateGPU;
+ +    gmx_bool bTuneNstlist;
+ +
+ +    /* GPU usage is only checked on the MPI master process at this stage,
+ +     * because the number of GPUs that will be used is not known yet.
+ +     * All processes are checked for a GPU later.
+ +     */
+ +    bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
+ +    *bUseGPU    = hwinfo->bCanUseGPU || bEmulateGPU;
+ +
+ +    if (ir->verletbuf_drift > 0)
+ +    {
+ +        /* Recompute the Verlet buffer size for the current run setup */
+ +        verletbuf_list_setup_t list_setup;
+ +        real                   rlist_updated;
+ +
+ +        /* CPU acceleration is assumed to be on here. As currently
+ +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+ +         * and 4x2 gives a larger buffer than 4x4, this is ok.
+ +         */
+ +        verletbuf_get_list_setup(*bUseGPU, &list_setup);
+ +
+ +        calc_verlet_buffer_size(mtop, det(box), ir,
+ +                                ir->verletbuf_drift, &list_setup,
+ +                                NULL, &rlist_updated);
+ +
+ +        if (rlist_updated != ir->rlist)
+ +        {
+ +            if (fplog != NULL)
+ +            {
+ +                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+ +                        ir->rlist, rlist_updated,
+ +                        list_setup.cluster_size_i, list_setup.cluster_size_j);
+ +            }
+ +            ir->rlist     = rlist_updated;
+ +            ir->rlistlong = rlist_updated;
+ +        }
+ +    }
+ +
+ +    /* With GPU or emulation we should check nstlist for performance;
+ +     * the environment variable always triggers the check.
+ +     */
+ +    bTuneNstlist = (getenv(NSTLIST_ENVVAR) != NULL);
+ +    if (!bTuneNstlist)
+ +    {
+ +        bTuneNstlist = (EI_DYNAMICS(ir->eI) &&
+ +                        *bUseGPU &&
+ +                        ir->nstlist < NSTLIST_GPU_ENOUGH);
+ +    }
+ +
+ +    if (bTuneNstlist)
+ +    {
+ +        /* Choose a better nstlist */
+ +        increase_nstlist(fplog, cr, ir, mtop, box);
+ +    }
+ +}
+ +
+ +/* Convert the run input of a group cut-off scheme run to the Verlet
+ + * cut-off scheme.
+ + *
+ + * fplog    log file for the conversion messages
+ + * ir       input record, modified in place (cut-off scheme, interaction
+ + *          types, verletbuf_drift, rlist)
+ + * mtop     topology; its charge groups are removed
+ + * box_vol  box volume, used for the pair-list buffer estimate
+ + *
+ + * Calls gmx_fatal for setups that are not converted.
+ + */
+ +static void convert_to_verlet_scheme(FILE *fplog,
+ +                                     t_inputrec *ir,
+ +                                     gmx_mtop_t *mtop, real box_vol)
+ +{
+ +    md_print_warn(NULL, fplog, "%s\n",
+ +                  "Converting input file with group cut-off scheme to the Verlet cut-off scheme");
+ +
+ +    ir->cutoff_scheme   = ecutsVERLET;
+ +    ir->verletbuf_drift = 0.005;
+ +
+ +    if (ir->rcoulomb != ir->rvdw)
+ +    {
+ +        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+ +    }
+ +
+ +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+ +    {
+ +        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
+ +    }
+ +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
+ +    {
+ +        md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
+ +
+ +        if (EVDW_SWITCHED(ir->vdwtype))
+ +        {
+ +            ir->vdwtype = evdwCUT;
+ +        }
+ +
+ +        if (EEL_SWITCHED(ir->coulombtype) && EEL_FULL(ir->coulombtype))
+ +        {
+ +            /* With full electrostatic only PME can be switched */
+ +            ir->coulombtype = eelPME;
+ +        }
+ +        else if (EEL_SWITCHED(ir->coulombtype))
+ +        {
+ +            md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
+ +            ir->coulombtype = eelRF;
+ +            ir->epsilon_rf  = 0.0;
+ +        }
+ +
+ +        /* The target energy drift is set to a small number.
+ +         * Note that this is only for testing. For production the user
+ +         * should think about this and set the mdp options.
+ +         */
+ +        ir->verletbuf_drift = 1e-4;
+ +    }
+ +
+ +    if (inputrec2nboundeddim(ir) != 3)
+ +    {
+ +        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+ +    }
+ +
+ +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+ +    {
+ +        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+ +    }
+ +
+ +    if (!EI_DYNAMICS(ir->eI) || (EI_MD(ir->eI) && ir->etc == etcNO))
+ +    {
+ +        /* No drift-based buffer in this case: use a fixed 5% buffer */
+ +        ir->verletbuf_drift = -1;
+ +        ir->rlist           = 1.05*max(ir->rvdw, ir->rcoulomb);
+ +    }
+ +    else
+ +    {
+ +        /* Determine rlist from the target drift */
+ +        verletbuf_list_setup_t list_setup;
+ +
+ +        verletbuf_get_list_setup(FALSE, &list_setup);
+ +        calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift,
+ +                                &list_setup, NULL, &ir->rlist);
+ +    }
+ +
+ +    gmx_mtop_remove_chargegroups(mtop);
+ +}
+ +
+ +/* Check the user-supplied thread options for consistency and fill in the
+ + * values that follow from the ones that were given.
+ + *
+ + * hw_opt         thread options; nthreads_tmpi, nthreads_omp and
+ + *                nthreads_omp_pme may be updated
+ + * cutoff_scheme  cut-off scheme of the run; with the group scheme the
+ + *                number of OpenMP threads is forced to 1
+ + * bIsSimMaster   passed on to gmx_omp_nthreads_read_env()
+ + *
+ + * Calls gmx_fatal on inconsistent or unsupported combinations.
+ + */
+ +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
+ +                                    int           cutoff_scheme,
+ +                                    gmx_bool      bIsSimMaster)
+ +{
+ +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+ +
+ +#ifndef GMX_THREAD_MPI
+ +    if (hw_opt->nthreads_tot > 0)
+ +    {
+ +        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ +    }
+ +    if (hw_opt->nthreads_tmpi > 0)
+ +    {
+ +        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ +    }
+ +#endif
+ +
+ +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+ +    {
+ +        /* We have the same number of OpenMP threads for PP and PME processes,
+ +         * thus we can perform several consistency checks.
+ +         */
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_omp > 0 &&
+ +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+ +        {
+ +            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+ +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+ +        {
+ +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+ +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+ +        }
+ +
+ +        if (hw_opt->nthreads_omp > 0 &&
+ +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+ +        {
+ +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+ +                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+ +        }
+ +
+ +        /* Derive the OpenMP thread count when only the total and the
+ +         * thread-MPI counts were given */
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_omp <= 0)
+ +        {
+ +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ +        }
+ +    }
+ +
+ +#ifndef GMX_OPENMP
+ +    if (hw_opt->nthreads_omp > 1)
+ +    {
+ +        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+ +    }
+ +#endif
+ +
+ +    if (cutoff_scheme == ecutsGROUP)
+ +    {
+ +        /* We only have OpenMP support for PME only nodes */
+ +        if (hw_opt->nthreads_omp > 1)
+ +        {
+ +            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+ +                      ecutscheme_names[cutoff_scheme],
+ +                      ecutscheme_names[ecutsVERLET]);
+ +        }
+ +        hw_opt->nthreads_omp = 1;
+ +    }
+ +
+ +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+ +    {
+ +        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+ +    }
+ +
+ +    if (hw_opt->nthreads_tot == 1)
+ +    {
+ +        hw_opt->nthreads_tmpi = 1;
+ +
+ +        if (hw_opt->nthreads_omp > 1)
+ +        {
+ +            /* Report the requested OpenMP thread count; nthreads_tmpi was
+ +             * just set to 1 above and is not what the message refers to */
+ +            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+ +                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
+ +        }
+ +        hw_opt->nthreads_omp = 1;
+ +    }
+ +
+ +    /* Without a separate setting, PME uses the same OpenMP thread count */
+ +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+ +    {
+ +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ +                hw_opt->nthreads_tot,
+ +                hw_opt->nthreads_tmpi,
+ +                hw_opt->nthreads_omp,
+ +                hw_opt->nthreads_omp_pme,
+ +                hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
+ +
+ +    }
+ +}
+ +
+ +
+ +/* Override the number of steps in inputrec with the value passed on the
+ + * command line (if any) and report the override through md_print_warn.
+ + *
+ + * fplog           log file handed on to md_print_warn
+ + * nsteps_cmdline  step count from the command line; the default -2 (or
+ + *                 anything smaller) leaves ir untouched, -1 asks for a
+ + *                 run without a step limit
+ + * ir              input record whose nsteps field is overwritten; must
+ + *                 not be NULL
+ + * cr              communication record handed on to md_print_warn; must
+ + *                 not be NULL
+ + */
+ +static void override_nsteps_cmdline(FILE            *fplog,
+ +                                    gmx_large_int_t  nsteps_cmdline,
+ +                                    t_inputrec      *ir,
+ +                                    const t_commrec *cr)
+ +{
+ +    char sbuf[STEPSTRSIZE];
+ +
+ +    assert(ir);
+ +    assert(cr);
+ +
+ +    /* override with anything else than the default -2 */
+ +    if (nsteps_cmdline > -2)
+ +    {
+ +        char stmp[STRLEN];
+ +
+ +        ir->nsteps = nsteps_cmdline;
+ +        /* A run length in ps only makes sense for a finite number of steps:
+ +         * with -1 the product below would be reported as a negative time,
+ +         * so in that case only the step count is printed.
+ +         */
+ +        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
+ +        {
+ +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
+ +                    gmx_step_str(nsteps_cmdline, sbuf),
+ +                    nsteps_cmdline*ir->delta_t);
+ +        }
+ +        else
+ +        {
+ +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+ +                    gmx_step_str(nsteps_cmdline, sbuf));
+ +        }
+ +
+ +        md_print_warn(cr, fplog, "%s\n", stmp);
+ +    }
+ +}
+ +
+ +/* Data structure set by SIMMASTER which needs to be passed to all nodes
+ + * before the other nodes have read the tpx file and called gmx_detect_hardware.
+ + * mdrunner starts from {-1, FALSE}, fills the fields in on the simulation
+ + * master once the run input has been read and, in builds without
+ + * thread-MPI, sends the struct to the other ranks with gmx_bcast_sim.
+ + */
+ +typedef struct {
+ +    int      cutoff_scheme; /* The cutoff scheme from t_inputrec (inputrec->cutoff_scheme) */
+ +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation; set through prepare_verlet_scheme */
+ +} master_inf_t;
+ +/* Set up and run one simulation with the options collected by mdrun.
+ + *
+ + * Outline of what the body below does, in order:
+ + *  - detect the hardware (gmx_detect_hardware) and, on the simulation
+ + *    master, read the run input (read_tpx_state);
+ + *  - check the thread-count options (check_and_update_hw_opt) and, in
+ + *    thread-MPI builds, let the master start the other threads
+ + *    (mdrunner_start_threads) and continue with the cr returned by it;
+ + *  - distribute the input with init_parallel, optionally load a
+ + *    checkpoint, and set up domain decomposition, OpenMP thread counts,
+ + *    the force record and PME;
+ + *  - on ranks with DUTY_PP call the integrator selected by inputrec->eI,
+ + *    on PME-only ranks call gmx_pmeonly;
+ + *  - finish_run, release the GPU and the hardware info, close the log.
+ + *
+ + * Arguments whose handling is visible in this function:
+ + *  hw_opt          thread/GPU options; nthreads_tmpi and nthreads_omp may
+ + *                  be updated here
+ + *  fplog           log file; reset to NULL when MD_APPENDFILES is set and
+ + *                  opened again further down
+ + *  cr              communication record; replaced when threads are started
+ + *  nbpu_opt        a value starting with "gpu" forces GPU use, a value
+ + *                  starting with "auto" tries to use a GPU
+ + *  nsteps_cmdline  handed to override_nsteps_cmdline
+ + *  Flags           MD_* bit flags tested throughout the function
+ + * The remaining arguments are handed on to the set-up routines and to the
+ + * integrator.
+ + *
+ + * Returns the value of gmx_get_stop_condition() cast to int.
+ + */
+ +int mdrunner(gmx_hw_opt_t *hw_opt,
+ +             FILE *fplog, t_commrec *cr, int nfile,
+ +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ +             gmx_bool bCompact, int nstglobalcomm,
+ +             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ +             const char *dddlb_opt, real dlb_scale,
+ +             const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ +             const char *nbpu_opt,
+ +             gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
+ +             int nmultisim, int repl_ex_nst, int repl_ex_nex,
+ +             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+ +             const char *deviceOptions, unsigned long Flags)
+ +{
+ +    gmx_bool        bForceUseGPU, bTryUseGPU;
+ +    double          nodetime = 0, realtime;
+ +    t_inputrec     *inputrec;
+ +    t_state        *state = NULL;
+ +    matrix          box;
+ +    gmx_ddbox_t     ddbox = {0};
+ +    int             npme_major, npme_minor;
+ +    real            tmpr1, tmpr2;
+ +    t_nrnb         *nrnb;
+ +    gmx_mtop_t     *mtop       = NULL;
+ +    t_mdatoms      *mdatoms    = NULL;
+ +    t_forcerec     *fr         = NULL;
+ +    t_fcdata       *fcd        = NULL;
+ +    real            ewaldcoeff = 0;
+ +    gmx_pme_t      *pmedata    = NULL;
+ +    gmx_vsite_t    *vsite      = NULL;
+ +    gmx_constr_t    constr;
+ +    int             i, m, nChargePerturbed = -1, status, nalloc;
+ +    char           *gro;
+ +    gmx_wallcycle_t wcycle;
+ +    gmx_bool        bReadRNG, bReadEkin;
+ +    int             list;
+ +    gmx_runtime_t   runtime;
+ +    int             rc;
+ +    gmx_large_int_t reset_counters;
+ +    gmx_edsam_t     ed           = NULL;
+ +    t_commrec      *cr_old       = cr;
+ +    int             nthreads_pme = 1;
+ +    int             nthreads_pp  = 1;
+ +    gmx_membed_t    membed       = NULL;
+ +    gmx_hw_info_t  *hwinfo       = NULL;
+ +    master_inf_t    minf         = {-1, FALSE}; /* cutoff_scheme -1 and bUseGPU FALSE until the master fills them in */
+ +
+ +    /* CAUTION: threads may be started later on in this function, so
+ +       cr doesn't reflect the final parallel state right now */
+ +    snew(inputrec, 1);
+ +    snew(mtop, 1);
+ +
+ +    if (Flags & MD_APPENDFILES)
+ +    {
+ +        fplog = NULL; /* the log is opened again by gmx_log_open further down */
+ +    }
+ +
+ +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0); /* prefix match: any value starting with "gpu" */
+ +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+ +
++    /* Detect hardware, gather information. This is an operation that is
++     * global for this process (MPI rank). */
++    hwinfo = gmx_detect_hardware(fplog, cr,
++                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
++
++
+ +    snew(state, 1);
+ +    if (SIMMASTER(cr))
+ +    {
+ +        /* Read (nearly) all data required for the simulation */
+ +        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
+ +
+ +        if (inputrec->cutoff_scheme != ecutsVERLET &&
+ +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+ +        {
+ +            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
+ +        }
+ +
- #if defined GMX_THREAD_MPI
-     /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
-      * to the other threads  -- slightly uncool, but works fine, just need to
-      * make sure that the data doesn't get freed twice. */
-     if (cr->nnodes > 1)
-     {
-         if (!SIMMASTER(cr))
-         {
-             snew(hwinfo, 1);
-         }
-         gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
-     }
- #else
-     if (PAR(cr) && !SIMMASTER(cr))
-     {
-         /* now we have inputrec on all nodes, can run the detection */
-         /* TODO: perhaps it's better to propagate within a node instead? */
-         snew(hwinfo, 1);
-         gmx_detect_hardware(fplog, hwinfo, cr,
-                             bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
-     }
- 
-     /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
-     gmx_check_thread_affinity_set(fplog, cr,
-                                   hw_opt, hwinfo->nthreads_hw_avail, FALSE);
- #endif
- 
+ +
+ +        minf.cutoff_scheme = inputrec->cutoff_scheme;
+ +        minf.bUseGPU       = FALSE;
+ +
+ +        if (inputrec->cutoff_scheme == ecutsVERLET)
+ +        {
+ +            prepare_verlet_scheme(fplog, hwinfo, cr, nbpu_opt,
+ +                                  inputrec, mtop, state->box,
+ +                                  &minf.bUseGPU);
+ +        }
+ +        else if (hwinfo->bCanUseGPU)
+ +        {
+ +            md_print_warn(cr, fplog,
+ +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+ +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+ +                          "      (for quick performance testing you can use the -testverlet option)\n");
+ +
+ +            if (bForceUseGPU)
+ +            {
+ +                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+ +            }
+ +        }
+ +    }
+ +#ifndef GMX_THREAD_MPI
+ +    if (PAR(cr))
+ +    {
+ +        gmx_bcast_sim(sizeof(minf), &minf, cr);
+ +    }
+ +#endif
+ +    if (minf.bUseGPU && cr->npmenodes == -1)
+ +    {
+ +        /* Don't automatically use PME-only nodes with GPUs */
+ +        cr->npmenodes = 0;
+ +    }
+ +
+ +    /* Check for externally set OpenMP affinity and turn off internal
+ +     * pinning if any is found. We need to do this check early to tell
+ +     * thread-MPI whether it should do pinning when spawning threads.
+ +     * TODO: the above no longer holds, we should move these checks down
+ +     */
+ +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    /* With thread-MPI inputrec is only set here on the master thread */
+ +    if (SIMMASTER(cr))
+ +#endif
+ +    {
+ +        check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr));
+ +
+ +#ifdef GMX_THREAD_MPI
+ +        /* Early check for externally set process affinity. Can't do over all
+ +         * MPI processes because hwinfo is not available everywhere, but with
+ +         * thread-MPI it's needed as pinning might get turned off which needs
+ +         * to be known before starting thread-MPI. */
+ +        gmx_check_thread_affinity_set(fplog,
+ +                                      NULL,
+ +                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+ +#endif
+ +
+ +#ifdef GMX_THREAD_MPI
+ +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+ +        {
+ +            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
+ +        }
+ +#endif
+ +
+ +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+ +            cr->npmenodes <= 0)
+ +        {
+ +            gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
+ +        }
+ +    }
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    if (SIMMASTER(cr))
+ +    {
+ +        /* NOW the threads will be started: */
+ +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+ +                                                 hw_opt,
+ +                                                 inputrec, mtop,
+ +                                                 cr, fplog);
+ +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+ +        {
+ +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 1)
+ +        {
+ +            /* now start the threads. */
+ +            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+ +                                        oenv, bVerbose, bCompact, nstglobalcomm,
+ +                                        ddxyz, dd_node_order, rdd, rconstr,
+ +                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+ +                                        nbpu_opt,
+ +                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
+ +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+ +                                        cpt_period, max_hours, deviceOptions,
+ +                                        Flags);
+ +            /* the main thread continues here with a new cr. We don't deallocate
+ +               the old cr because other threads may still be reading it. */
+ +            if (cr == NULL)
+ +            {
+ +                gmx_comm("Failed to spawn threads");
+ +            }
+ +        }
+ +    }
+ +#endif
+ +    /* END OF CAUTION: cr is now reliable */
+ +
+ +    /* g_membed initialisation *
+ +     * Because we change the mtop, init_membed is called before the init_parallel *
+ +     * (in case we ever want to make it run in parallel) */
+ +    if (opt2bSet("-membed", nfile, fnm))
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr, "Initializing membed");
+ +        }
+ +        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* now broadcast everything to the non-master nodes/threads: */
+ +        init_parallel(fplog, cr, inputrec, mtop);
+ +
+ +        /* This check needs to happen after get_nthreads_mpi() */
+ +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
+ +        {
+ +            gmx_fatal_collective(FARGS, cr, NULL,
+ +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
+ +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
+ +        }
+ +    }
+ +    if (fplog != NULL)
+ +    {
+ +        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+ +    }
+ +
-     gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
+ +    /* now make sure the state is initialized and propagated */
+ +    set_state_entries(state, inputrec, cr->nnodes);
+ +
+ +    /* A parallel command line option consistency check that we can
+ +       only do after any threads have started. */
+ +    if (!PAR(cr) &&
+ +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+ +    {
+ +        gmx_fatal(FARGS,
+ +                  "The -dd or -npme option request a parallel simulation, "
+ +#ifndef GMX_MPI
+ +                  "but %s was compiled without threads or MPI enabled"
+ +#else
+ +#ifdef GMX_THREAD_MPI
+ +                  "but the number of threads (option -nt) is 1"
+ +#else
+ +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
+ +#endif
+ +#endif
+ +                  , ShortProgram()
+ +                  );
+ +    }
+ +
+ +    if ((Flags & MD_RERUN) &&
+ +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ +    {
+ +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+ +    }
+ +
+ +    if (can_use_allvsall(inputrec, mtop, TRUE, cr, fplog) && PAR(cr))
+ +    {
+ +        /* Simple neighbour searching and (also?) all-vs-all loops
+ +         * do not work with domain decomposition. */
+ +        Flags |= MD_PARTDEC;
+ +    }
+ +
+ +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
+ +    {
+ +        if (cr->npmenodes > 0)
+ +        {
+ +            if (!EEL_PME(inputrec->coulombtype))
+ +            {
+ +                gmx_fatal_collective(FARGS, cr, NULL,
+ +                                     "PME nodes are requested, but the system does not use PME electrostatics");
+ +            }
+ +            if (Flags & MD_PARTDEC)
+ +            {
+ +                gmx_fatal_collective(FARGS, cr, NULL,
+ +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
+ +            }
+ +        }
+ +
+ +        cr->npmenodes = 0; /* no separate PME nodes without PME or with particle decomposition */
+ +    }
+ +
+ +#ifdef GMX_FAHCORE
+ +    fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+ +#endif
+ +
+ +    /* NMR restraints must be initialized before load_checkpoint,
+ +     * since with time averaging the history is added to t_state.
+ +     * For proper consistency check we therefore need to extend
+ +     * t_state here.
+ +     * So the PME-only nodes (if present) will also initialize
+ +     * the distance restraints.
+ +     */
+ +    snew(fcd, 1);
+ +
+ +    /* This needs to be called before read_checkpoint to extend the state */
+ +    init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0);
+ +
+ +    if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ +    {
+ +        if (PAR(cr) && !(Flags & MD_PARTDEC))
+ +        {
+ +            gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
+ +        }
+ +        /* Orientation restraints */
+ +        if (MASTER(cr))
+ +        {
+ +            init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
+ +                        state);
+ +        }
+ +    }
+ +
+ +    if (DEFORM(*inputrec))
+ +    {
+ +        /* Store the deform reference box before reading the checkpoint */
+ +        if (SIMMASTER(cr))
+ +        {
+ +            copy_mat(state->box, box);
+ +        }
+ +        if (PAR(cr))
+ +        {
+ +            gmx_bcast(sizeof(box), box, cr);
+ +        }
+ +        /* Because we do not have the update struct available yet
+ +         * in which the reference values should be stored,
+ +         * we store them temporarily in static variables.
+ +         * This should be thread safe, since they are only written once
+ +         * and with identical values.
+ +         */
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+ +#endif
+ +        deform_init_init_step_tpx = inputrec->init_step;
+ +        copy_mat(box, deform_init_box_tpx);
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+ +#endif
+ +    }
+ +
+ +    if (opt2bSet("-cpi", nfile, fnm))
+ +    {
+ +        /* Check if checkpoint file exists before doing continuation.
+ +         * This way we can use identical input options for the first and subsequent runs...
+ +         */
+ +        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+ +        {
+ +            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+ +                            cr, Flags & MD_PARTDEC, ddxyz,
+ +                            inputrec, state, &bReadRNG, &bReadEkin,
+ +                            (Flags & MD_APPENDFILES),
+ +                            (Flags & MD_APPENDFILESSET));
+ +
+ +            if (bReadRNG)
+ +            {
+ +                Flags |= MD_READ_RNG;
+ +            }
+ +            if (bReadEkin)
+ +            {
+ +                Flags |= MD_READ_EKIN;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+ +#ifdef GMX_THREAD_MPI
+ +        /* With thread MPI only the master node/thread exists in mdrun.c,
+ +         * therefore non-master nodes need to open the "seppot" log file here.
+ +         */
+ +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
+ +#endif
+ +        )
+ +    {
+ +        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
+ +                     Flags, &fplog);
+ +    }
+ +
+ +    /* override nsteps with value from cmdline */
+ +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+ +
+ +    if (SIMMASTER(cr))
+ +    {
+ +        copy_mat(state->box, box);
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        gmx_bcast(sizeof(box), box, cr);
+ +    }
+ +
+ +    /* Essential dynamics */
+ +    if (opt2bSet("-ei", nfile, fnm))
+ +    {
+ +        /* Open input and output files, allocate space for ED data structure */
+ +        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+ +    }
+ +
+ +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
+ +                     EI_TPI(inputrec->eI) ||
+ +                     inputrec->eI == eiNM))
+ +    {
+ +        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+ +                                           dddlb_opt, dlb_scale,
+ +                                           ddcsx, ddcsy, ddcsz,
+ +                                           mtop, inputrec,
+ +                                           box, state->x,
+ +                                           &ddbox, &npme_major, &npme_minor);
+ +
+ +        make_dd_communicators(fplog, cr, dd_node_order);
+ +
+ +        /* Set overallocation to avoid frequent reallocation of arrays */
+ +        set_over_alloc_dd(TRUE);
+ +    }
+ +    else
+ +    {
+ +        /* PME, if used, is done on all nodes with 1D decomposition */
+ +        cr->npmenodes = 0;
+ +        cr->duty      = (DUTY_PP | DUTY_PME);
+ +        npme_major    = 1;
+ +        npme_minor    = 1;
+ +        if (!EI_TPI(inputrec->eI))
+ +        {
+ +            npme_major = cr->nnodes;
+ +        }
+ +
+ +        if (inputrec->ePBC == epbcSCREW)
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "pbc=%s is only implemented with domain decomposition",
+ +                      epbc_names[inputrec->ePBC]);
+ +        }
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* After possible communicator splitting in make_dd_communicators.
+ +         * we can set up the intra/inter node communication.
+ +         */
+ +        gmx_setup_nodecomm(fplog, cr);
+ +    }
+ +
+ +    /* Initialize per-physical-node MPI process/thread ID and counters. */
+ +    gmx_init_intranode_counters(cr);
+ +
+ +#ifdef GMX_MPI
+ +    md_print_info(cr, fplog, "Using %d MPI %s\n",
+ +                  cr->nnodes,
+ +#ifdef GMX_THREAD_MPI
+ +                  cr->nnodes == 1 ? "thread" : "threads"
+ +#else
+ +                  cr->nnodes == 1 ? "process" : "processes"
+ +#endif
+ +                  );
+ +    fflush(stderr);
+ +#endif
+ +
+ +    gmx_omp_nthreads_init(fplog, cr,
+ +                          hwinfo->nthreads_hw_avail,
+ +                          hw_opt->nthreads_omp,
+ +                          hw_opt->nthreads_omp_pme,
+ +                          (cr->duty & DUTY_PP) == 0,
+ +                          inputrec->cutoff_scheme == ecutsVERLET);
+ +
- #ifdef GMX_THREAD_MPI
-     if (PAR(cr) && SIMMASTER(cr))
- #endif
-     {
-         gmx_hardware_info_free(hwinfo);
-     }
++    /* check consistency and decide on the number of gpus to use. */
++    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
++                                     minf.bUseGPU);
+ +
+ +    /* getting number of PP/PME threads
+ +       PME: env variable should be read only on one node to make sure it is
+ +       identical everywhere;
+ +     */
+ +    /* TODO nthreads_pp is only used for pinning threads.
+ +     * This is a temporary solution until we have a hw topology library.
+ +     */
+ +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+ +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+ +
+ +    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* Master synchronizes its value of reset_counters with all nodes
+ +         * including PME only nodes */
+ +        reset_counters = wcycle_get_reset_counters(wcycle);
+ +        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+ +        wcycle_set_reset_counters(wcycle, reset_counters);
+ +    }
+ +
+ +    snew(nrnb, 1);
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        /* For domain decomposition we allocate dynamically
+ +         * in dd_partition_system.
+ +         */
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            bcast_state_setup(cr, state);
+ +        }
+ +        else
+ +        {
+ +            if (PAR(cr))
+ +            {
+ +                bcast_state(cr, state, TRUE);
+ +            }
+ +        }
+ +
+ +        /* Initiate forcerecord */
+ +        fr         = mk_forcerec();
+ +        fr->hwinfo = hwinfo; /* pointer only, not a copy: hwinfo is released at the end of this function */
+ +        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box, FALSE,
+ +                      opt2fn("-table", nfile, fnm),
+ +                      opt2fn("-tabletf", nfile, fnm),
+ +                      opt2fn("-tablep", nfile, fnm),
+ +                      opt2fn("-tableb", nfile, fnm),
+ +                      nbpu_opt,
+ +                      FALSE, pforce);
+ +
+ +        /* version for PCA_NOT_READ_NODE (see md.c) */
+ +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+ +           "nofile","nofile","nofile","nofile",FALSE,pforce);
+ +         */
+ +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+ +
+ +        /* Initialize QM-MM */
+ +        if (fr->bQMMM)
+ +        {
+ +            init_QMMMrec(cr, box, mtop, inputrec, fr);
+ +        }
+ +
+ +        /* Initialize the mdatoms structure.
+ +         * mdatoms is not filled with atom data,
+ +         * as this can not be done now with domain decomposition.
+ +         */
+ +        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+ +
+ +        if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET)
+ +        {
+ +            gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now.");
+ +        }
+ +
+ +        /* Initialize the virtual site communication */
+ +        vsite = init_vsite(mtop, cr, FALSE);
+ +
+ +        calc_shifts(box, fr->shift_vec);
+ +
+ +        /* With periodic molecules the charge groups should be whole at start up
+ +         * and the virtual sites should not be far from their proper positions.
+ +         */
+ +        if (!inputrec->bContinuation && MASTER(cr) &&
+ +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ +        {
+ +            /* Make molecules whole at start of run */
+ +            if (fr->ePBC != epbcNONE)
+ +            {
+ +                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+ +            }
+ +            if (vsite)
+ +            {
+ +                /* Correct initial vsite positions are required
+ +                 * for the initial distribution in the domain decomposition
+ +                 * and for the initial shell prediction.
+ +                 */
+ +                construct_vsites_mtop(fplog, vsite, mtop, state->x);
+ +            }
+ +        }
+ +
+ +        if (EEL_PME(fr->eeltype))
+ +        {
+ +            ewaldcoeff = fr->ewaldcoeff;
+ +            pmedata    = &fr->pmedata;
+ +        }
+ +        else
+ +        {
+ +            pmedata = NULL;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* This is a PME only node */
+ +
+ +        /* We don't need the state */
+ +        done_state(state);
+ +
+ +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
+ +        snew(pmedata, 1);
+ +    }
+ +
+ +    if (hw_opt->thread_affinity != threadaffOFF)
+ +    {
+ +        /* Before setting affinity, check whether the affinity has changed
+ +         * - which indicates that probably the OpenMP library has changed it
+ +         * since we first checked).
+ +         */
+ +        gmx_check_thread_affinity_set(fplog, cr,
+ +                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+ +
+ +        /* Set the CPU affinity */
+ +        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+ +    }
+ +
+ +    /* Initiate PME if necessary,
+ +     * either on all nodes or on dedicated PME nodes only. */
+ +    if (EEL_PME(inputrec->coulombtype))
+ +    {
+ +        if (mdatoms)
+ +        {
+ +            nChargePerturbed = mdatoms->nChargePerturbed;
+ +        }
+ +        if (cr->npmenodes > 0)
+ +        {
+ +            /* The PME only nodes need to know nChargePerturbed */
+ +            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+ +        }
+ +
+ +        if (cr->duty & DUTY_PME)
+ +        {
+ +            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+ +                                  mtop ? mtop->natoms : 0, nChargePerturbed,
+ +                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
+ +            if (status != 0)
+ +            {
+ +                gmx_fatal(FARGS, "Error %d initializing PME", status);
+ +            }
+ +        }
+ +    }
+ +
+ +
+ +    if (integrator[inputrec->eI].func == do_md)
+ +    {
+ +        /* Turn on signal handling on all nodes */
+ +        /*
+ +         * (A user signal from the PME nodes (if any)
+ +         * is communicated to the PP nodes.
+ +         */
+ +        signal_handler_install();
+ +    }
+ +
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        if (inputrec->ePull != epullNO)
+ +        {
+ +            /* Initialize pull code */
+ +            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
+ +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+ +        }
+ +
+ +        if (inputrec->bRot)
+ +        {
+ +            /* Initialize enforced rotation code */
+ +            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+ +                     bVerbose, Flags);
+ +        }
+ +
+ +        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+ +
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            dd_init_bondeds(fplog, cr->dd, mtop, vsite, constr, inputrec,
+ +                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+ +
+ +            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, fr, &ddbox);
+ +
+ +            setup_dd_grid(fplog, cr->dd);
+ +        }
+ +
+ +        /* Now do whatever the user wants us to do (how flexible...) */
+ +        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+ +                                      oenv, bVerbose, bCompact,
+ +                                      nstglobalcomm,
+ +                                      vsite, constr,
+ +                                      nstepout, inputrec, mtop,
+ +                                      fcd, state,
+ +                                      mdatoms, nrnb, wcycle, ed, fr,
+ +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
+ +                                      membed,
+ +                                      cpt_period, max_hours,
+ +                                      deviceOptions,
+ +                                      Flags,
+ +                                      &runtime);
+ +
+ +        if (inputrec->ePull != epullNO)
+ +        {
+ +            finish_pull(fplog, inputrec->pull);
+ +        }
+ +
+ +        if (inputrec->bRot)
+ +        {
+ +            finish_rot(inputrec->rot);
+ +        }
+ +
+ +    }
+ +    else
+ +    {
+ +        /* do PME only */
+ +        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, ewaldcoeff, FALSE, inputrec);
+ +    }
+ +
+ +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
+ +    {
+ +        /* Some timing stats */
+ +        if (SIMMASTER(cr))
+ +        {
+ +            if (runtime.proc == 0)
+ +            {
+ +                runtime.proc = runtime.real;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            runtime.real = 0;
+ +        }
+ +    }
+ +
+ +    wallcycle_stop(wcycle, ewcRUN);
+ +
+ +    /* Finish up, write some stuff
+ +     * if rerunMD, don't write last frame again
+ +     */
+ +    finish_run(fplog, cr, ftp2fn(efSTO, nfile, fnm),
+ +               inputrec, nrnb, wcycle, &runtime,
+ +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+ +               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+ +               nthreads_pp,
+ +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+ +
+ +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
+ +    {
+ +        char gpu_err_str[STRLEN];
+ +
+ +        /* free GPU memory and uninitialize GPU (by destroying the context) */
+ +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+ +
+ +        if (!free_gpu(gpu_err_str))
+ +        {
+ +            gmx_warning("On node %d failed to free GPU #%d: %s",
+ +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ +        }
+ +    }
+ +
+ +    if (opt2bSet("-membed", nfile, fnm))
+ +    {
+ +        sfree(membed);
+ +    }
+ +
++    gmx_hardware_info_free(hwinfo);
+ +
+ +    /* Does what it says */
+ +    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", &runtime);
+ +
+ +    /* Close logfile already here if we were appending to it */
+ +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+ +    {
+ +        gmx_log_close(fplog);
+ +    }
+ +
+ +    rc = (int)gmx_get_stop_condition(); /* the stop condition is what mdrunner returns */
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    /* we need to join all threads. The sub-threads join when they
+ +       exit this function, but the master thread needs to be told to
+ +       wait for that. */
+ +    if (PAR(cr) && MASTER(cr))
+ +    {
+ +        tMPI_Finalize();
+ +    }
+ +#endif
+ +
+ +    return rc;
+ +}
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 19 Jul 2013 07:41:13 +0000 (09:41 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 24 Jul 2013 06:03:12 +0000 (08:03 +0200)
		1	2
CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
admin/mkhtml	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxManageGPU.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxManageNvccConfig.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
src/config.h.cmakein	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxana/gmx_genion.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxana/gmx_tune_pme.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxlib/gmx_detect_hardware.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxlib/gmx_thread_affinity.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxlib/string2.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/gmx_detect_hardware.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/maths.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/thread_mpi/atomic.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/legacyheaders/thread_mpi/atomic/xlc_ppc.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/legacyheaders/types/enums.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/types/forcerec.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/legacyheaders/types/hw_info.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/libgromacs.pc.cmakein	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/forcerec.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_atomdata.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/nbnxn_search.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/qm_orca.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/sim_util.c	patch \|	diff1 \|	\|	blob \| history
src/gromacs/mdlib/update.c	patch \|	diff1 \|	\|	blob \| history
src/programs/mdrun/md.c	patch \|	diff1 \|	\|	blob \| history
src/programs/mdrun/runner.c	patch \|	diff1 \|	\|	blob \| history