# Collect compiler information through the project helper get_compiler_info()
# (defined elsewhere), but only when a C++ compiler has been loaded.
if (CMAKE_CXX_COMPILER_LOADED)
get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
endif ()
+ # For GPU builds additionally query the CUDA compiler through the project
+ # helper get_cuda_compiler_info() (defined elsewhere).
+ if(GMX_GPU)
+ get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
+ endif()
+
########################################################################
-# Specify install locations and which subdirectories to process #
+# Specify install locations
########################################################################
-if (GMX_USE_RELATIVE_INSTALL_PATH)
- set(GMX_INSTALL_PREFIX "" CACHE STRING "Prefix gets appended to CMAKE_INSTALL_PREFIX. For cpack it sets the root folder of the archive.")
- mark_as_advanced(GMX_INSTALL_PREFIX)
-else()
- set(GMX_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/")
-endif()
-
# Default GMXLIB to "lib" unless it has already been defined
# (presumably the library install subdirectory - confirm against its users).
if ( NOT DEFINED GMXLIB )
set(GMXLIB lib)
endif()
cd $dir
+ setenv GMX_MAXBACKUP -1
foreach program ( $PROGRAMS )
- if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) ) then
- if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "luck" ) && ( $program != "demux.pl" ) ) then
++ if ( ( -x $GMXBINDIR/$program ) && ( $program != "my_dssp" ) && ( $program != "GMXRC" ) && ( $program != "completion.csh" ) && ( $program != "completion.zsh" ) && ( $program != "average" ) && ( $program != "completion.bash" ) && ( $program != "demux.pl" ) ) then
echo -n "$program "
cd $HTMLOL
$GMXBINDIR/$program -quiet -man html >& /dev/null
--- /dev/null
- static int greatest_common_divisor(int p, int q)
- {
- int tmp;
- while (q != 0)
- {
- tmp = q;
- q = p % q;
- p = tmp;
- }
- return p;
- }
-
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Green Red Orange Magenta Azure Cyan Skyblue
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <ctype.h>
+#include "string2.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "confio.h"
+#include "statutil.h"
+#include "pbc.h"
+#include "force.h"
+#include "gmx_fatal.h"
+#include "futil.h"
+#include "maths.h"
+#include "macros.h"
+#include "vec.h"
+#include "tpxio.h"
+#include "mdrun.h"
+#include "main.h"
+#include "random.h"
+#include "index.h"
+#include "mtop_util.h"
+#include "gmx_ana.h"
+
- int gcd = greatest_common_divisor(n_q, p_q);
+/* Replace one randomly chosen, still available solvent molecule by an ion.
+ *
+ * nsa      number of atoms per solvent molecule
+ * nwater   pointer to the number of solvent molecules (only read here)
+ * bSet     per-molecule flag, TRUE when a molecule may no longer be picked
+ * repl     per-molecule replacement marker; the chosen entry is set to sign
+ * index    atom indices of the solvent group, nsa entries per molecule
+ * x, pbc   coordinates and periodicity data used for the rmin exclusion
+ * sign     value stored in repl[] for the chosen molecule
+ * q        charge given to the first atom of the chosen molecule
+ * ionname  name that is only used in the progress message
+ * atoms    atom data whose charges are modified
+ * rmin     when positive, every molecule whose first atom lies closer than
+ *          rmin to the new ion is marked as unavailable
+ * seed     state handed to rando()
+ *
+ * Calls gmx_fatal() when no free molecule was hit within 1000 draws per
+ * solvent molecule.
+ */
+static void insert_ion(int nsa, int *nwater,
+                       gmx_bool bSet[], int repl[], atom_id index[],
+                       rvec x[], t_pbc *pbc,
+                       int sign, int q, const char *ionname,
+                       t_atoms *atoms,
+                       real rmin, int *seed)
+{
+    const int       nmol = *nwater;
+    int             pick, m, a;
+    real            cutoff2;
+    rvec            dx;
+    gmx_large_int_t tries_left;
+
+    /* Limit the number of random draws to 1000 per solvent molecule */
+    tries_left = (gmx_large_int_t)nmol * 1000;
+
+    for (;; )
+    {
+        pick = nmol*rando(seed);
+        tries_left--;
+        if (!bSet[pick] || tries_left <= 0)
+        {
+            break;
+        }
+    }
+    if (bSet[pick])
+    {
+        gmx_fatal(FARGS, "No more replaceable solvent!");
+    }
+
+    fprintf(stderr, "Replacing solvent molecule %d (atom %d) with %s\n",
+            pick, index[nsa*pick], ionname);
+
+    /* Book-keeping: the molecule is taken and remembers the ion sign */
+    bSet[pick] = TRUE;
+    repl[pick] = sign;
+
+    /* The first atom carries the ion charge, all others become neutral */
+    atoms->atom[index[nsa*pick]].q = q;
+    for (a = 1; a < nsa; a++)
+    {
+        atoms->atom[index[nsa*pick+a]].q = 0;
+    }
+
+    /* Without a positive exclusion radius there is nothing left to do */
+    if (!(rmin > 0))
+    {
+        return;
+    }
+
+    /* Block every free molecule that sits within rmin of the new ion */
+    cutoff2 = rmin*rmin;
+    for (m = 0; m < nmol; m++)
+    {
+        if (bSet[m])
+        {
+            continue;
+        }
+        pbc_dx(pbc, x[index[nsa*pick]], x[index[nsa*m]], dx);
+        if (iprod(dx, dx) < cutoff2)
+        {
+            bSet[m] = TRUE;
+        }
+    }
+}
+
+
+/* Return a newly allocated copy of mname with trailing digits and '+'/'-'
+ * signs removed. The first two characters are never stripped, so "CA2+"
+ * becomes "CA" while "K+" is returned unchanged. The caller owns the
+ * returned string.
+ */
+static char *aname(const char *mname)
+{
+    char *name = strdup(mname);
+    int   last;
+
+    for (last = strlen(name)-1; last > 1; last--)
+    {
+        if (!(isdigit(name[last]) || (name[last] == '+') || (name[last] == '-')))
+        {
+            break;
+        }
+        name[last] = '\0';
+    }
+
+    return name;
+}
+
+/* Reorder the solvent group so that the untouched solvent molecules come
+ * first, followed by the positive and then the negative ions, and shift all
+ * atoms behind the solvent group forward to close the gap.
+ *
+ * nsa      number of atoms per solvent molecule
+ * nw       number of solvent molecules in the group
+ * repl     per-molecule marker: 0 = solvent, >0 = positive, <0 = negative ion
+ * index    atom indices of the solvent group, nsa entries per molecule
+ * atoms    atom data; names, residue indices and atoms->nr are updated
+ * x        coordinates, rewritten in the new order
+ * p_name   molecule name of the positive ion
+ * n_name   molecule name of the negative ion
+ *
+ * The name strings allocated here stay referenced by atoms and are therefore
+ * deliberately not freed. The scratch coordinate array is released on every
+ * path (it used to leak when no ion had been placed).
+ */
+void sort_ions(int nsa, int nw, int repl[], atom_id index[],
+               t_atoms *atoms, rvec x[],
+               const char *p_name, const char *n_name)
+{
+    int    i, j, k, r, np, nn, starta, startr, npi, nni;
+    rvec  *xt;
+    char **pptr = NULL, **nptr = NULL, **paptr = NULL, **naptr = NULL;
+
+    snew(xt, atoms->nr);
+
+    /* Put all the solvent in front and count the added ions */
+    np = 0;
+    nn = 0;
+    j  = index[0];
+    for (i = 0; i < nw; i++)
+    {
+        r = repl[i];
+        if (r == 0)
+        {
+            for (k = 0; k < nsa; k++)
+            {
+                copy_rvec(x[index[nsa*i+k]], xt[j++]);
+            }
+        }
+        else if (r > 0)
+        {
+            np++;
+        }
+        else if (r < 0)
+        {
+            nn++;
+        }
+    }
+
+    if (np+nn > 0)
+    {
+        /* Put the positive and negative ions at the end */
+        starta = index[nsa*(nw - np - nn)];
+        startr = atoms->atom[starta].resind;
+
+        if (np)
+        {
+            snew(pptr, 1);
+            pptr[0] = strdup(p_name);
+            snew(paptr, 1);
+            paptr[0] = aname(p_name);
+        }
+        if (nn)
+        {
+            snew(nptr, 1);
+            nptr[0] = strdup(n_name);
+            snew(naptr, 1);
+            naptr[0] = aname(n_name);
+        }
+        npi = 0;
+        nni = 0;
+        for (i = 0; i < nw; i++)
+        {
+            r = repl[i];
+            if (r > 0)
+            {
+                j = starta+npi;
+                k = startr+npi;
+                copy_rvec(x[index[nsa*i]], xt[j]);
+                atoms->atomname[j]     = paptr;
+                atoms->atom[j].resind  = k;
+                atoms->resinfo[k].name = pptr;
+                npi++;
+            }
+            else if (r < 0)
+            {
+                /* Negative ions are stored behind all positive ions */
+                j = starta+np+nni;
+                k = startr+np+nni;
+                copy_rvec(x[index[nsa*i]], xt[j]);
+                atoms->atomname[j]     = naptr;
+                atoms->atom[j].resind  = k;
+                atoms->resinfo[k].name = nptr;
+                nni++;
+            }
+        }
+        /* Every ion freed nsa-1 atom slots; move the trailing atoms forward */
+        for (i = index[nsa*nw-1]+1; i < atoms->nr; i++)
+        {
+            j                  = i-(nsa-1)*(np+nn);
+            atoms->atomname[j] = atoms->atomname[i];
+            atoms->atom[j]     = atoms->atom[i];
+            copy_rvec(x[i], xt[j]);
+        }
+        atoms->nr -= (nsa-1)*(np+nn);
+
+        /* Copy the new positions back */
+        for (i = index[0]; i < atoms->nr; i++)
+        {
+            copy_rvec(xt[i], x[i]);
+        }
+    }
+
+    /* Release the scratch coordinates whether or not ions were placed */
+    sfree(xt);
+}
+
+/* Rewrite the [ molecules ] section of a topology file after ions replaced
+ * solvent molecules.
+ *
+ * topinout  topology file that is read and then replaced
+ * p_num     number of positive ions that were inserted
+ * n_num     number of negative ions that were inserted
+ * p_name    molecule name written for the positive ions
+ * n_name    molecule name written for the negative ions
+ * grpname   molecule name (compared case-insensitively) of the solvent entry
+ *
+ * Everything outside [ molecules ] is copied verbatim. Within that section
+ * the last entry matching grpname is reduced by p_num+n_num and followed by
+ * the ion entries. The result is written to a temporary file that is renamed
+ * onto topinout.
+ *
+ * Calls gmx_fatal() when no matching entry exists, when that entry holds
+ * fewer molecules than were replaced, or when the final rename fails.
+ */
+static void update_topol(const char *topinout, int p_num, int n_num,
+                         const char *p_name, const char *n_name, char *grpname)
+{
+#define TEMP_FILENM "temp.top"
+    FILE    *fpin, *fpout;
+    char     buf[STRLEN], buf2[STRLEN], *temp, **mol_line = NULL;
+    int      i, nmol_line, sol_line, nsol_last;
+    size_t   len;
+    gmx_bool bMolecules;
+
+    printf("\nProcessing topology\n");
+    fpin  = ffopen(topinout, "r");
+    fpout = ffopen(TEMP_FILENM, "w");
+
+    bMolecules = FALSE;
+    nmol_line  = 0;
+    sol_line   = -1;
+    nsol_last  = -1;
+    while (fgets(buf, STRLEN, fpin))
+    {
+        /* Work on a trimmed copy; buf itself is written out unmodified */
+        strcpy(buf2, buf);
+        if ((temp = strchr(buf2, '\n')) != NULL)
+        {
+            temp[0] = '\0';
+        }
+        ltrim(buf2);
+        if (buf2[0] == '[')
+        {
+            /* Directive line: find out whether it opens [ molecules ] */
+            buf2[0] = ' ';
+            rtrim(buf2);
+            len = strlen(buf2);
+            /* Guard against an empty remainder (e.g. a lone '[') before
+             * looking at the last character */
+            if (len > 0 && buf2[len-1] == ']')
+            {
+                buf2[len-1] = '\0';
+                ltrim(buf2);
+                rtrim(buf2);
+                bMolecules = (gmx_strcasecmp(buf2, "molecules") == 0);
+            }
+            fprintf(fpout, "%s", buf);
+        }
+        else if (!bMolecules)
+        {
+            fprintf(fpout, "%s", buf);
+        }
+        else
+        {
+            /* Check if this is a line with solvent molecules */
+            sscanf(buf, "%s", buf2);
+            if (gmx_strcasecmp(buf2, grpname) == 0)
+            {
+                sol_line = nmol_line;
+                sscanf(buf, "%*s %d", &nsol_last);
+            }
+            /* Store this molecules section line */
+            srenew(mol_line, nmol_line+1);
+            mol_line[nmol_line] = strdup(buf);
+            nmol_line++;
+        }
+    }
+    ffclose(fpin);
+
+    if (sol_line == -1)
+    {
+        ffclose(fpout);
+        gmx_fatal(FARGS, "No line with moleculetype '%s' found the [ molecules ] section of file '%s'", grpname, topinout);
+    }
+    if (nsol_last < p_num+n_num)
+    {
+        ffclose(fpout);
+        gmx_fatal(FARGS, "The last entry for moleculetype '%s' in the [ molecules ] section of file '%s' has less solvent molecules (%d) than were replaced (%d)", grpname, topinout, nsol_last, p_num+n_num);
+    }
+
+    /* Print all the molecule entries */
+    for (i = 0; i < nmol_line; i++)
+    {
+        if (i != sol_line)
+        {
+            fprintf(fpout, "%s", mol_line[i]);
+        }
+        else
+        {
+            printf("Replacing %d solute molecules in topology file (%s) "
+                   " by %d %s and %d %s ions.\n",
+                   p_num+n_num, topinout, p_num, p_name, n_num, n_name);
+            nsol_last -= p_num + n_num;
+            if (nsol_last > 0)
+            {
+                fprintf(fpout, "%-10s %d\n", grpname, nsol_last);
+            }
+            if (p_num > 0)
+            {
+                fprintf(fpout, "%-15s %d\n", p_name, p_num);
+            }
+            if (n_num > 0)
+            {
+                fprintf(fpout, "%-15s %d\n", n_name, n_num);
+            }
+        }
+    }
+    ffclose(fpout);
+
+    /* The stored section lines are no longer needed */
+    for (i = 0; i < nmol_line; i++)
+    {
+        sfree(mol_line[i]);
+    }
+    sfree(mol_line);
+
+    /* use ffopen to generate backup of topinout */
+    fpout = ffopen(topinout, "w");
+    ffclose(fpout);
+    /* topinout is empty at this point, so a failed rename must not go
+     * unnoticed */
+    if (rename(TEMP_FILENM, topinout) != 0)
+    {
+        gmx_fatal(FARGS, "Could not rename '%s' to '%s'", TEMP_FILENM, topinout);
+    }
+#undef TEMP_FILENM
+}
+
+/* Entry point of genion: replace randomly chosen solvent molecules of a
+ * user-selected, continuous solvent group by monoatomic ions, optionally
+ * update the topology (-p) and write the resulting structure (-o).
+ *
+ * The number of ions comes from -np/-nn, from a salt concentration (-conc),
+ * and/or from the request to neutralize the system (-neutral).
+ *
+ * Returns 0 on success; exits with status 0 when there is nothing to add.
+ * Invalid input is reported through gmx_fatal().
+ */
+int gmx_genion(int argc, char *argv[])
+{
+    const char        *desc[] = {
+        "[TT]genion[tt] randomly replaces solvent molecules with monoatomic ions.",
+        "The group of solvent molecules should be continuous and all molecules",
+        "should have the same number of atoms.",
+        "The user should add the ion molecules to the topology file or use",
+        "the [TT]-p[tt] option to automatically modify the topology.[PAR]",
+        "The ion molecule type, residue and atom names in all force fields",
+        "are the capitalized element names without sign. This molecule name",
+        "should be given with [TT]-pname[tt] or [TT]-nname[tt], and the",
+        "[TT][molecules][tt] section of your topology updated accordingly,",
+        "either by hand or with [TT]-p[tt]. Do not use an atom name instead!",
+        "[PAR]Ions which can have multiple charge states get the multiplicity",
+        "added, without sign, for the uncommon states only.[PAR]",
+        "For larger ions, e.g. sulfate we recommended using [TT]genbox[tt]."
+    };
+    const char        *bugs[] = {
+        "If you specify a salt concentration existing ions are not taken into "
+        "account. In effect you therefore specify the amount of salt to be added.",
+    };
+    static int         p_num    = 0, n_num = 0, p_q = 1, n_q = -1;
+    static const char *p_name   = "NA", *n_name = "CL";
+    static real        rmin     = 0.6, conc = 0;
+    static int         seed     = 1993;
+    static gmx_bool    bNeutral = FALSE;
+    static t_pargs     pa[]     = {
+        { "-np", FALSE, etINT, {&p_num}, "Number of positive ions" },
+        { "-pname", FALSE, etSTR, {&p_name}, "Name of the positive ion" },
+        { "-pq", FALSE, etINT, {&p_q}, "Charge of the positive ion" },
+        { "-nn", FALSE, etINT, {&n_num}, "Number of negative ions" },
+        { "-nname", FALSE, etSTR, {&n_name}, "Name of the negative ion" },
+        { "-nq", FALSE, etINT, {&n_q}, "Charge of the negative ion" },
+        { "-rmin", FALSE, etREAL, {&rmin}, "Minimum distance between ions" },
+        { "-seed", FALSE, etINT, {&seed}, "Seed for random number generator" },
+        { "-conc", FALSE, etREAL, {&conc},
+          "Specify salt concentration (mol/liter). This will add sufficient ions to reach up to the specified concentration as computed from the volume of the cell in the input [TT].tpr[tt] file. Overrides the [TT]-np[tt] and [TT]-nn[tt] options." },
+        { "-neutral", FALSE, etBOOL, {&bNeutral}, "This option will add enough ions to neutralize the system. These ions are added on top of those specified with [TT]-np[tt]/[TT]-nn[tt] or [TT]-conc[tt]. "}
+    };
+    t_topology         top;
+    rvec              *x, *v;
+    real               vol, qtot;
+    matrix             box;
+    t_atoms            atoms;
+    t_pbc              pbc;
+    int               *repl, ePBC;
+    atom_id           *index;
+    char              *grpname, title[STRLEN];
+    gmx_bool          *bSet;
+    int                i, nw, nwa, nsa, nsalt, iqtot;
+    output_env_t       oenv;
+    t_filenm           fnm[] = {
+        { efTPX, NULL, NULL, ffREAD },
+        { efNDX, NULL, NULL, ffOPTRD },
+        { efSTO, "-o", NULL, ffWRITE },
+        { efTOP, "-p", "topol", ffOPTRW }
+    };
+#define NFILE asize(fnm)
+
+    parse_common_args(&argc, argv, PCA_BE_NICE, NFILE, fnm, asize(pa), pa,
+                      asize(desc), desc, asize(bugs), bugs, &oenv);
+
+    /* Check input for something sensible */
+    if ((p_num < 0) || (n_num < 0))
+    {
+        gmx_fatal(FARGS, "Negative number of ions to add?");
+    }
+
+    if (conc > 0 && (p_num > 0 || n_num > 0))
+    {
+        fprintf(stderr, "WARNING: -conc specified, overriding -nn and -np.\n");
+    }
+
+    /* Read atom positions and charges */
+    read_tps_conf(ftp2fn(efTPX, NFILE, fnm), title, &top, &ePBC, &x, &v, box, FALSE);
+    atoms = top.atoms;
+
+    /* Compute total charge */
+    qtot = 0;
+    for (i = 0; (i < atoms.nr); i++)
+    {
+        qtot += atoms.atom[i].q;
+    }
+    iqtot = gmx_nint(qtot);
+
+
+    if (conc > 0)
+    {
+        /* Compute number of ions to be added */
+        vol   = det(box);
+        nsalt = gmx_nint(conc*vol*AVOGADRO/1e24);
+        p_num = abs(nsalt*n_q);
+        n_num = abs(nsalt*p_q);
+    }
+    if (bNeutral)
+    {
+        int qdelta = p_num*p_q + n_num*n_q + iqtot;
+
+        /* Check if the system is neutralizable
+         * is (qdelta == p_q*p_num + n_q*n_num) solvable for p_num and n_num? */
+        int gcd = gmx_greatest_common_divisor(n_q, p_q);
+
+        /* A zero divisor (both charges zero) would make the modulo below
+         * undefined; such a system can only be "neutralized" when it
+         * already is neutral. */
+        if ((gcd == 0) ? (qdelta != 0) : ((qdelta % gcd) != 0))
+        {
+            gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
+                      " -pq %d.\n", n_q, p_q);
+        }
+
+        while (qdelta != 0)
+        {
+            /* Adding positive ions only raises the charge when p_q > 0;
+             * otherwise the loop below would never terminate. */
+            if (qdelta < 0 && p_q <= 0)
+            {
+                gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
+                          " -pq %d.\n", n_q, p_q);
+            }
+            while (qdelta < 0)
+            {
+                p_num++;
+                qdelta += p_q;
+            }
+            /* Likewise negative ions only lower the charge when n_q < 0 */
+            if (qdelta > 0 && n_q >= 0)
+            {
+                gmx_fatal(FARGS, "Can't neutralize this system using -nq %d and"
+                          " -pq %d.\n", n_q, p_q);
+            }
+            while (qdelta > 0)
+            {
+                n_num++;
+                qdelta += n_q;
+            }
+        }
+    }
+
+    if ((p_num == 0) && (n_num == 0))
+    {
+        fprintf(stderr, "No ions to add.\n");
+        exit(0);
+    }
+    else
+    {
+        printf("Will try to add %d %s ions and %d %s ions.\n",
+               p_num, p_name, n_num, n_name);
+        printf("Select a continuous group of solvent molecules\n");
+        get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &nwa, &index, &grpname);
+        for (i = 1; i < nwa; i++)
+        {
+            if (index[i] != index[i-1]+1)
+            {
+                gmx_fatal(FARGS, "The solvent group %s is not continuous: "
+                          "index[%d]=%d, index[%d]=%d",
+                          grpname, i, index[i-1]+1, i+1, index[i]+1);
+            }
+        }
+        /* The atoms per molecule follow from the length of the first run of
+         * atoms that share one residue index */
+        nsa = 1;
+        while ((nsa < nwa) &&
+               (atoms.atom[index[nsa]].resind ==
+                atoms.atom[index[nsa-1]].resind))
+        {
+            nsa++;
+        }
+        if (nwa % nsa)
+        {
+            gmx_fatal(FARGS, "Your solvent group size (%d) is not a multiple of %d",
+                      nwa, nsa);
+        }
+        nw = nwa/nsa;
+        fprintf(stderr, "Number of (%d-atomic) solvent molecules: %d\n", nsa, nw);
+        if (p_num+n_num > nw)
+        {
+            gmx_fatal(FARGS, "Not enough solvent for adding ions");
+        }
+    }
+
+    if (opt2bSet("-p", NFILE, fnm))
+    {
+        update_topol(opt2fn("-p", NFILE, fnm), p_num, n_num, p_name, n_name, grpname);
+    }
+
+    snew(bSet, nw);
+    snew(repl, nw);
+
+    snew(v, atoms.nr);
+    snew(atoms.pdbinfo, atoms.nr);
+
+    set_pbc(&pbc, ePBC, box);
+
+    /* Now loop over the ions that have to be placed */
+    while (p_num-- > 0)
+    {
+        insert_ion(nsa, &nw, bSet, repl, index, x, &pbc,
+                   1, p_q, p_name, &atoms, rmin, &seed);
+    }
+    while (n_num-- > 0)
+    {
+        insert_ion(nsa, &nw, bSet, repl, index, x, &pbc,
+                   -1, n_q, n_name, &atoms, rmin, &seed);
+    }
+    fprintf(stderr, "\n");
+
+    if (nw)
+    {
+        sort_ions(nsa, nw, repl, index, &atoms, x, p_name, n_name);
+    }
+
+    sfree(atoms.pdbinfo);
+    atoms.pdbinfo = NULL;
+    write_sto_conf(ftp2fn(efSTO, NFILE, fnm), *top.name, &atoms, x, NULL, ePBC,
+                   box);
+
+    return 0;
+}
--- /dev/null
- sprintf(bbuf, " -np %d ", nnodes);
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+#include <time.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+
+
+#include "statutil.h"
+#include "typedefs.h"
+#include "smalloc.h"
+#include "vec.h"
+#include "copyrite.h"
+#include "statutil.h"
+#include "tpxio.h"
+#include "string2.h"
+#include "readinp.h"
+#include "calcgrid.h"
+#include "checkpoint.h"
+#include "macros.h"
+#include "gmx_ana.h"
+#include "names.h"
+#include "perf_est.h"
+
+
+
+/* Enum for situations that can occur during log file parsing, the
+ * corresponding string entries can be found in do_the_tests() in
+ * const char* ParseLog[]. These are the return values of parse_logfile(). */
+enum {
+ eParselogOK, /* performance data was found and the counter reset check passed */
+ eParselogNotFound, /* the log file does not exist */
+ eParselogNoPerfData, /* no performance data found and no reason could be determined */
+ eParselogTerm, /* the log reports that a signal stopped the run */
+ eParselogResetProblem, /* performance data found, but the counter reset step was not confirmed */
+ eParselogNoDDGrid, /* log line "There is no domain decomposition for ..." was found */
+ eParselogTPXVersion, /* log line starting with "reading tpx file" was found */
+ eParselogNotParallel, /* log says -dd or -npme requested a parallel simulation */
+ eParselogFatal, /* the stderr file contains a "Fatal error:" line */
+ eParselogNr /* last entry; presumably the number of outcomes - confirm in do_the_tests() */
+};
+
+/* Benchmark results for one combination of input file and PME node setting.
+ * The pointer members are indexed by the repeat number of the test. */
+typedef struct
+{
+ int nPMEnodes; /* number of PME-only nodes used in this test */
+ int nx, ny, nz; /* DD grid */
+ int guessPME; /* if nPMEnodes == -1, this is the guessed number of PME nodes */
+ double *Gcycles; /* This can contain more than one value if doing multiple tests */
+ double Gcycles_Av; /* Mean of Gcycles[] over the repeats, set in analyze_data() */
+ float *ns_per_day; /* Value parsed from the "Performance:" log line, one per repeat */
+ float ns_per_day_Av; /* Mean of ns_per_day[]; negative if any repeat has no value */
+ float *PME_f_load; /* PME mesh/force load average*/
+ float PME_f_load_Av; /* Average average ;) ... */
+ char *mdrun_cmd_line; /* Mdrun command line used for this test */
+} t_perf;
+
+/* Settings of the input files that are compared; analyze_data() treats
+ * entry 0 of each array as the original setting. */
+typedef struct
+{
+ int nr_inputfiles; /* The number of tpr and mdp input files */
+ gmx_large_int_t orig_sim_steps; /* Number of steps to be done in the real simulation */
+ gmx_large_int_t orig_init_step; /* Init step for the real simulation */
+ real *rcoulomb; /* The coulomb radii [0...nr_inputfiles] */
+ real *rvdw; /* The vdW radii */
+ real *rlist; /* Neighbourlist cutoff radius */
+ real *rlistlong; /* presumably the long-range neighbourlist cutoff - TODO confirm */
+ int *nkx, *nky, *nkz; /* Fourier grid size in x,y,z, as reported by analyze_data() */
+ real *fsx, *fsy, *fsz; /* Fourierspacing in x,y,z dimension */
+} t_inputinfo;
+
+
+/* Write an empty line followed by a dashed separator line to fp */
+static void sep_line(FILE *fp)
+{
+    static const char separator[] =
+        "\n------------------------------------------------------------\n";
+
+    fputs(separator, fp);
+}
+
+
+/* Wrapper for system calls.
+ *
+ * Runs command through system(3) and returns its result. On platforms built
+ * with GMX_NO_SYSTEM the call is refused through gmx_fatal() instead.
+ */
+static int gmx_system_call(char *command)
+{
+#ifdef GMX_NO_SYSTEM
+    gmx_fatal(FARGS, "No calls to system(3) supported on this platform. Attempted to call:\n'%s'\n", command);
+    /* Only reached if gmx_fatal() were to return; keeps every path of this
+     * non-void function returning a value. */
+    return -1;
+#else
+    return ( system(command) );
+#endif
+}
+
+
+/* Return non-zero when the first strlen(substring) characters of string
+ * are equal to substring */
+static gmx_bool str_starts(const char *string, const char *substring)
+{
+    const size_t prefix_len = strlen(substring);
+
+    return (0 == strncmp(string, substring, prefix_len));
+}
+
+
+/* Reset the per-repeat results of run number test_nr to zero */
+static void cleandata(t_perf *perfdata, int test_nr)
+{
+    const int run = test_nr;
+
+    perfdata->PME_f_load[run] = 0.0;
+    perfdata->ns_per_day[run] = 0.0;
+    perfdata->Gcycles[run]    = 0.0;
+}
+
+
+/* Return TRUE when a and b differ by less than 1.0e-7 in absolute value,
+ * FALSE otherwise */
+static gmx_bool is_equal(real a, real b)
+{
+    real tolerance = 1.0e-7;
+    real delta     = a - b;
+
+    /* Take the absolute value of the difference */
+    if (delta < 0.0)
+    {
+        delta = -delta;
+    }
+
+    return (delta < tolerance) ? TRUE : FALSE;
+}
+
+
+/* Copy the contents of the file fn_out to stdout, framed by two empty lines
+ * before and after. If the file cannot be opened a warning is written to
+ * stderr and nothing is copied.
+ */
+static void finalize(const char *fn_out)
+{
+    char  buf[STRLEN];
+    FILE *fp;
+
+
+    fp = fopen(fn_out, "r");
+    if (fp == NULL)
+    {
+        /* fgets()/fclose() on a NULL stream is undefined behaviour */
+        fprintf(stderr, "WARNING: Could not open %s for reading.\n", fn_out);
+        return;
+    }
+    fprintf(stdout, "\n\n");
+
+    while (fgets(buf, STRLEN-1, fp) != NULL)
+    {
+        fprintf(stdout, "%s", buf);
+    }
+    fclose(fp);
+    fprintf(stdout, "\n\n");
+}
+
+/* Parsing stages of parse_logfile(): the log file strings are expected in
+ * this order, and each stage is entered once the string of the previous
+ * stage has been found. */
+enum {
+ eFoundNothing, eFoundDDStr, eFoundAccountingStr, eFoundCycleStr
+};
+
+/* Extract the performance data of one benchmark run from an mdrun log file.
+ *
+ * logfile    log file written by mdrun
+ * errfile    file with the stderr output of the run, searched for
+ *            "Fatal error:" when the log contains no performance data
+ * perfdata   results; entry test_nr of the per-repeat arrays, the DD grid and
+ *            guessPME are filled in
+ * test_nr    index of the repeat that is being parsed
+ * presteps   if > 0, the log must report a counter reset at step
+ *            presteps+cpt_steps
+ * cpt_steps  step offset added to presteps for the reset check
+ * nnodes     number of nodes; for a single node the DD grid line is not
+ *            searched for
+ *
+ * Returns one of the eParselog... values. Calls gmx_fatal() when the number
+ * of PME nodes in the log differs from perfdata->nPMEnodes.
+ */
+static int parse_logfile(const char *logfile, const char *errfile,
+                         t_perf *perfdata, int test_nr, int presteps, gmx_large_int_t cpt_steps,
+                         int nnodes)
+{
+    FILE           *fp;
+    char            line[STRLEN], dumstring[STRLEN], dumstring2[STRLEN];
+    const char      matchstrdd[]  = "Domain decomposition grid";
+    const char      matchstrcr[]  = "resetting all time and cycle counters";
+    const char      matchstrbal[] = "Average PME mesh/force load:";
+    const char      matchstring[] = "R E A L C Y C L E A N D T I M E A C C O U N T I N G";
+    const char      errSIG[]      = "signal, stopping at the next";
+    int             iFound;
+    int             procs;
+    float           dum1, dum2, dum3, dum4;
+    int             ndum;
+    int             npme;
+    gmx_large_int_t resetsteps     = -1;
+    gmx_bool        bFoundResetStr = FALSE;
+    gmx_bool        bResetChecked  = FALSE;
+
+
+    if (!gmx_fexist(logfile))
+    {
+        fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
+        cleandata(perfdata, test_nr);
+        return eParselogNotFound;
+    }
+
+    fp = fopen(logfile, "r");
+    if (fp == NULL)
+    {
+        /* The file exists but cannot be read; handle it like a missing log
+         * instead of passing a NULL stream to fgets() */
+        fprintf(stderr, "WARNING: Could not find logfile %s.\n", logfile);
+        cleandata(perfdata, test_nr);
+        return eParselogNotFound;
+    }
+    perfdata->PME_f_load[test_nr] = -1.0;
+    perfdata->guessPME            = -1;
+
+    iFound = eFoundNothing;
+    if (1 == nnodes)
+    {
+        iFound = eFoundDDStr; /* Skip some case statements */
+    }
+
+    while (fgets(line, STRLEN, fp) != NULL)
+    {
+        /* Remove leading spaces */
+        ltrim(line);
+
+        /* Check for TERM and INT signals from user: */
+        if (strstr(line, errSIG) != NULL)
+        {
+            fclose(fp);
+            cleandata(perfdata, test_nr);
+            return eParselogTerm;
+        }
+
+        /* Check whether cycle resetting worked */
+        if (presteps > 0 && !bFoundResetStr)
+        {
+            if (strstr(line, matchstrcr) != NULL)
+            {
+                sprintf(dumstring, "step %s", gmx_large_int_pfmt);
+                sscanf(line, dumstring, &resetsteps);
+                bFoundResetStr = TRUE;
+                if (resetsteps == presteps+cpt_steps)
+                {
+                    bResetChecked = TRUE;
+                }
+                else
+                {
+                    sprintf(dumstring, gmx_large_int_pfmt, resetsteps);
+                    sprintf(dumstring2, gmx_large_int_pfmt, presteps+cpt_steps);
+                    fprintf(stderr, "WARNING: Time step counters were reset at step %s,\n"
+                            " though they were supposed to be reset at step %s!\n",
+                            dumstring, dumstring2);
+                }
+            }
+        }
+
+        /* Look for strings that appear in a certain order in the log file: */
+        switch (iFound)
+        {
+            case eFoundNothing:
+                /* Look for domain decomp grid and separate PME nodes: */
+                if (str_starts(line, matchstrdd))
+                {
+                    /* Start from a defined value in case the line carries no
+                     * PME node count and sscanf() leaves npme untouched */
+                    npme = -1;
+                    sscanf(line, "Domain decomposition grid %d x %d x %d, separate PME nodes %d",
+                           &(perfdata->nx), &(perfdata->ny), &(perfdata->nz), &npme);
+                    if (perfdata->nPMEnodes == -1)
+                    {
+                        perfdata->guessPME = npme;
+                    }
+                    else if (perfdata->nPMEnodes != npme)
+                    {
+                        gmx_fatal(FARGS, "PME nodes from command line and output file are not identical");
+                    }
+                    iFound = eFoundDDStr;
+                }
+                /* Catch a few errors that might have occurred: */
+                else if (str_starts(line, "There is no domain decomposition for"))
+                {
+                    fclose(fp);
+                    return eParselogNoDDGrid;
+                }
+                else if (str_starts(line, "reading tpx file"))
+                {
+                    fclose(fp);
+                    return eParselogTPXVersion;
+                }
+                else if (str_starts(line, "The -dd or -npme option request a parallel simulation"))
+                {
+                    fclose(fp);
+                    return eParselogNotParallel;
+                }
+                break;
+            case eFoundDDStr:
+                /* Look for PME mesh/force balance (not necessarily present, though) */
+                if (str_starts(line, matchstrbal))
+                {
+                    sscanf(&line[strlen(matchstrbal)], "%f", &(perfdata->PME_f_load[test_nr]));
+                }
+                /* Look for matchstring */
+                if (str_starts(line, matchstring))
+                {
+                    iFound = eFoundAccountingStr;
+                }
+                break;
+            case eFoundAccountingStr:
+                /* Already found matchstring - look for cycle data */
+                if (str_starts(line, "Total "))
+                {
+                    sscanf(line, "Total %d %lf", &procs, &(perfdata->Gcycles[test_nr]));
+                    iFound = eFoundCycleStr;
+                }
+                break;
+            case eFoundCycleStr:
+                /* Already found cycle data - look for remaining performance info and return */
+                if (str_starts(line, "Performance:"))
+                {
+                    ndum = sscanf(line, "%s %f %f %f %f", dumstring, &dum1, &dum2, &dum3, &dum4);
+                    /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
+                    perfdata->ns_per_day[test_nr] = (ndum == 5) ? dum3 : dum1;
+                    fclose(fp);
+                    if (bResetChecked || presteps == 0)
+                    {
+                        return eParselogOK;
+                    }
+                    else
+                    {
+                        return eParselogResetProblem;
+                    }
+                }
+                break;
+        }
+    } /* while */
+
+    /* Close the log file */
+    fclose(fp);
+
+    /* Check why there is no performance data in the log file.
+     * Did a fatal error occur? */
+    fp = gmx_fexist(errfile) ? fopen(errfile, "r") : NULL;
+    if (fp != NULL)
+    {
+        while (fgets(line, STRLEN, fp) != NULL)
+        {
+            if (str_starts(line, "Fatal error:") )
+            {
+                /* The line following "Fatal error:" holds the message */
+                if (fgets(line, STRLEN, fp) != NULL)
+                {
+                    fprintf(stderr, "\nWARNING: An error occured during this benchmark:\n"
+                            "%s\n", line);
+                }
+                fclose(fp);
+                cleandata(perfdata, test_nr);
+                return eParselogFatal;
+            }
+        }
+        fclose(fp);
+    }
+    else
+    {
+        fprintf(stderr, "WARNING: Could not find stderr file %s.\n", errfile);
+    }
+
+    /* Giving up ... we could not find out why there is no performance data in
+     * the log file. */
+    fprintf(stdout, "No performance data in log file.\n");
+    cleandata(perfdata, test_nr);
+
+    return eParselogNoPerfData;
+}
+
+/* Average the benchmark results, write a summary to fp and pick the winner.
+ *
+ * For every input file k and test i the per-repeat values in perfdata[k][i]
+ * are averaged. A setting counts as successful when both the averaged
+ * Gcycles and the averaged ns/day are positive; among those the one with the
+ * lowest averaged Gcycles wins. Calls gmx_fatal() (mentioning fn) when no
+ * setting was successful.
+ *
+ * Returns TRUE when the Coulomb radius, the vdW radius and the Fourier grid
+ * of the winner equal those of input file 0, FALSE otherwise.
+ */
+static gmx_bool analyze_data(
+ FILE *fp,
+ const char *fn,
+ t_perf **perfdata,
+ int nnodes,
+ int ntprs,
+ int ntests,
+ int nrepeats,
+ t_inputinfo *info,
+ int *index_tpr, /* OUT: Nr of mdp file with best settings */
+ int *npme_optimal) /* OUT: Optimal number of PME nodes */
+{
+ int i, j, k;
+ int line = 0, line_win = -1;
+ int k_win = -1, i_win = -1, winPME;
+ double s = 0.0; /* standard deviation */
+ t_perf *pd;
+ char strbuf[STRLEN];
+ char str_PME_f_load[13];
+ gmx_bool bCanUseOrigTPR;
+ gmx_bool bRefinedCoul, bRefinedVdW, bRefinedGrid;
+
+
+ if (nrepeats > 1)
+ {
+ sep_line(fp);
+ fprintf(fp, "Summary of successful runs:\n");
+ fprintf(fp, "Line tpr PME nodes Gcycles Av. Std.dev. ns/day PME/f");
+ if (nnodes > 1)
+ {
+ fprintf(fp, " DD grid");
+ }
+ fprintf(fp, "\n");
+ }
+
+
+ for (k = 0; k < ntprs; k++)
+ {
+ for (i = 0; i < ntests; i++)
+ {
+ /* Select the right dataset: */
+ pd = &(perfdata[k][i]);
+
+ pd->Gcycles_Av = 0.0;
+ pd->PME_f_load_Av = 0.0;
+ pd->ns_per_day_Av = 0.0;
+
+ if (pd->nPMEnodes == -1) /* -1: number of PME nodes was not set explicitly, show the guess */
+ {
+ sprintf(strbuf, "(%3d)", pd->guessPME);
+ }
+ else
+ {
+ sprintf(strbuf, " ");
+ }
+
+ /* Get the average run time of a setting */
+ for (j = 0; j < nrepeats; j++)
+ {
+ pd->Gcycles_Av += pd->Gcycles[j];
+ pd->PME_f_load_Av += pd->PME_f_load[j];
+ }
+ pd->Gcycles_Av /= nrepeats;
+ pd->PME_f_load_Av /= nrepeats;
+
+ for (j = 0; j < nrepeats; j++)
+ {
+ if (pd->ns_per_day[j] > 0.0)
+ {
+ pd->ns_per_day_Av += pd->ns_per_day[j];
+ }
+ else
+ {
+ /* Somehow the performance number was not acquired for this run,
+ * therefore set the average to some negative value (it becomes
+ * -1 after the division by nrepeats below): */
+ pd->ns_per_day_Av = -1.0f*nrepeats;
+ break;
+ }
+ }
+ pd->ns_per_day_Av /= nrepeats;
+
+ /* Nicer output: print a dash when no positive PME/force load is available */
+ if (pd->PME_f_load_Av > 0.0)
+ {
+ sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load_Av);
+ }
+ else
+ {
+ sprintf(str_PME_f_load, "%s", " - ");
+ }
+
+
+ /* We assume we had a successful run if both averages are positive */
+ if (pd->Gcycles_Av > 0.0 && pd->ns_per_day_Av > 0.0)
+ {
+ /* Output statistics if repeats were done */
+ if (nrepeats > 1)
+ {
+ /* Calculate the standard deviation (sample form, nrepeats-1 in the denominator) */
+ s = 0.0;
+ for (j = 0; j < nrepeats; j++)
+ {
+ s += pow( pd->Gcycles[j] - pd->Gcycles_Av, 2 );
+ }
+ s /= (nrepeats - 1);
+ s = sqrt(s);
+
+ fprintf(fp, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s",
+ line, k, pd->nPMEnodes, strbuf, pd->Gcycles_Av, s,
+ pd->ns_per_day_Av, str_PME_f_load);
+ if (nnodes > 1)
+ {
+ fprintf(fp, " %3d %3d %3d", pd->nx, pd->ny, pd->nz);
+ }
+ fprintf(fp, "\n");
+ }
+ /* Store the indices of the best run found so far (lowest average Gcycles): */
+ if ( (k_win == -1) || (pd->Gcycles_Av < perfdata[k_win][i_win].Gcycles_Av) )
+ {
+ k_win = k;
+ i_win = i;
+ line_win = line;
+ }
+ line++; /* only successful settings are counted as summary lines */
+ }
+ }
+ }
+
+ if (k_win == -1)
+ {
+ gmx_fatal(FARGS, "None of the runs was successful! Check %s for problems.", fn);
+ }
+
+ sep_line(fp);
+
+ winPME = perfdata[k_win][i_win].nPMEnodes;
+
+ if (1 == ntests)
+ {
+ /* We stuck to a fixed number of PME-only nodes */
+ sprintf(strbuf, "settings No. %d", k_win);
+ }
+ else
+ {
+ /* We have optimized the number of PME-only nodes */
+ if (winPME == -1)
+ {
+ sprintf(strbuf, "%s", "the automatic number of PME nodes");
+ }
+ else
+ {
+ sprintf(strbuf, "%d PME nodes", winPME);
+ }
+ }
+ fprintf(fp, "Best performance was achieved with %s", strbuf);
+ if ((nrepeats > 1) && (ntests > 1))
+ {
+ fprintf(fp, " (see line %d)", line_win);
+ }
+ fprintf(fp, "\n");
+
+ /* Only mention settings if they were modified with respect to input file 0: */
+ bRefinedCoul = !is_equal(info->rcoulomb[k_win], info->rcoulomb[0]);
+ bRefinedVdW = !is_equal(info->rvdw[k_win], info->rvdw[0] );
+ bRefinedGrid = !(info->nkx[k_win] == info->nkx[0] &&
+ info->nky[k_win] == info->nky[0] &&
+ info->nkz[k_win] == info->nkz[0]);
+
+ if (bRefinedCoul || bRefinedVdW || bRefinedGrid)
+ {
+ fprintf(fp, "Optimized PME settings:\n");
+ bCanUseOrigTPR = FALSE;
+ }
+ else
+ {
+ bCanUseOrigTPR = TRUE;
+ }
+
+ if (bRefinedCoul)
+ {
+ fprintf(fp, " New Coulomb radius: %f nm (was %f nm)\n", info->rcoulomb[k_win], info->rcoulomb[0]);
+ }
+
+ if (bRefinedVdW)
+ {
+ fprintf(fp, " New Van der Waals radius: %f nm (was %f nm)\n", info->rvdw[k_win], info->rvdw[0]);
+ }
+
+ if (bRefinedGrid)
+ {
+ fprintf(fp, " New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info->nkx[k_win], info->nky[k_win], info->nkz[k_win],
+ info->nkx[0], info->nky[0], info->nkz[0]);
+ }
+
+ if (bCanUseOrigTPR && ntprs > 1)
+ {
+ fprintf(fp, "and original PME settings.\n");
+ }
+
+ fflush(fp);
+
+ /* Return the index of the mdp file that showed the highest performance
+ * and the optimal number of PME nodes */
+ *index_tpr = k_win;
+ *npme_optimal = winPME;
+
+ return bCanUseOrigTPR;
+}
+
+
+/* Get the commands we need to set up the runs from environment variables
+ * and, if benchmarks will be run, verify that mdrun can be started.
+ *
+ * bThreads    TRUE for a threaded mdrun: no mpirun command is used and the
+ *             test fails if mdrun reports an MPI library
+ * cmd_mpirun  OUT: copy of $MPIRUN, "mpirun" by default, "" with bThreads
+ * cmd_np      IN: command line part that sets the number of processes; it
+ *             is pasted between the launcher and the mdrun arguments
+ * cmd_mdrun   OUT: copy of $MDRUN, "mdrun" by default
+ * repeats     if <= 0 no simulations will be run and the test is skipped
+ *
+ * The test runs "mdrun -version", captures the output in benchtest.log and
+ * scans it for the program and MPI library lines. Any failure is reported
+ * through gmx_fatal().
+ */
+static void get_program_paths(gmx_bool bThreads, char *cmd_mpirun[], char cmd_np[],
+                              char *cmd_mdrun[], int repeats)
+{
+    char      *command = NULL;
+    char      *cp;
+    char       line[STRLEN];
+    FILE      *fp;
+    const char def_mpirun[]   = "mpirun";
+    const char def_mdrun[]    = "mdrun";
+    const char filename[]     = "benchtest.log";
+
+    /* This string should always be identical to the one in copyrite.c,
+     * gmx_print_version_info() in the defined(GMX_MPI) section */
+    const char match_mpi[]    = "MPI library: MPI";
+    const char match_mdrun[]  = "Program: ";
+    const char empty_mpirun[] = "";
+    gmx_bool   bMdrun         = FALSE;
+    gmx_bool   bMPI           = FALSE;
+
+
+    /* Get the commands we need to set up the runs from environment variables */
+    if (!bThreads)
+    {
+        if ( (cp = getenv("MPIRUN")) != NULL)
+        {
+            *cmd_mpirun = strdup(cp);
+        }
+        else
+        {
+            *cmd_mpirun = strdup(def_mpirun);
+        }
+    }
+    else
+    {
+        *cmd_mpirun = strdup(empty_mpirun);
+    }
+
+    if ( (cp = getenv("MDRUN" )) != NULL)
+    {
+        *cmd_mdrun = strdup(cp);
+    }
+    else
+    {
+        *cmd_mdrun = strdup(def_mdrun);
+    }
+
+
+    /* If no simulations have to be performed, we are done here */
+    if (repeats <= 0)
+    {
+        return;
+    }
+
+    /* Run a small test to see whether mpirun + mdrun work */
+    fprintf(stdout, "Making sure that mdrun can be executed. ");
+    if (bThreads)
+    {
+        snew(command, strlen(*cmd_mdrun) + strlen(cmd_np) + strlen(filename) + 50);
+        sprintf(command, "%s%s-version -maxh 0.001 1> %s 2>&1", *cmd_mdrun, cmd_np, filename);
+    }
+    else
+    {
+        snew(command, strlen(*cmd_mpirun) + strlen(cmd_np) + strlen(*cmd_mdrun) + strlen(filename) + 50);
+        sprintf(command, "%s%s%s -version -maxh 0.001 1> %s 2>&1", *cmd_mpirun, cmd_np, *cmd_mdrun, filename);
+    }
+    fprintf(stdout, "Trying '%s' ... ", command);
+    make_backup(filename);
+    gmx_system_call(command);
+
+    /* Check if we find the characteristic string in the output: */
+    if (!gmx_fexist(filename))
+    {
+        gmx_fatal(FARGS, "Output from test run could not be found.");
+    }
+
+    fp = fopen(filename, "r");
+    if (fp == NULL)
+    {
+        /* The file exists but cannot be read; do not hand NULL to fgets() */
+        gmx_fatal(FARGS, "Output from test run could not be found.");
+    }
+    /* We need to scan the whole output file, since sometimes the queuing system
+     * also writes stuff to stdout/err */
+    while (fgets(line, STRLEN, fp) != NULL)
+    {
+        if (str_starts(line, match_mdrun) )
+        {
+            bMdrun = TRUE;
+        }
+        if (str_starts(line, match_mpi) )
+        {
+            bMPI = TRUE;
+        }
+    }
+    fclose(fp);
+
+    if (bThreads)
+    {
+        if (bMPI)
+        {
+            gmx_fatal(FARGS, "Need a threaded version of mdrun. This one\n"
+                      "(%s)\n"
+                      "seems to have been compiled with MPI instead.",
+                      *cmd_mdrun);
+        }
+    }
+    else
+    {
+        if (bMdrun && !bMPI)
+        {
+            gmx_fatal(FARGS, "Need an MPI-enabled version of mdrun. This one\n"
+                      "(%s)\n"
+                      "seems to have been compiled without MPI support.",
+                      *cmd_mdrun);
+        }
+    }
+
+    if (!bMdrun)
+    {
+        gmx_fatal(FARGS, "Cannot execute mdrun. Please check %s for problems!",
+                  filename);
+    }
+
+    fprintf(stdout, "passed.\n");
+
+    /* Clean up ... */
+    remove(filename);
+    sfree(command);
+}
+
+
+/* Build the mdrun command line for the production run, write it to the
+ * log file fp and, if bLaunch is set, execute it.
+ *
+ * With bThreads the command reads "<mdrun><np>-npme ...", otherwise
+ * "<mpirun><np><mdrun> -npme ...". The pieces are concatenated verbatim, so
+ * cmd_mpirun and cmd_np must bring their own separating blanks. */
+static void launch_simulation(
+        gmx_bool    bLaunch,        /* Should the simulation be launched? */
+        FILE       *fp,             /* General log file */
+        gmx_bool    bThreads,       /* whether to use threads */
+        char       *cmd_mpirun,     /* Command for mpirun */
+        char       *cmd_np,         /* Switch for -np or -ntmpi or empty */
+        char       *cmd_mdrun,      /* Command for mdrun */
+        char       *args_for_mdrun, /* Arguments for mdrun */
+        const char *simulation_tpr, /* This tpr will be simulated */
+        int         nPMEnodes)      /* Number of PME nodes to use */
+{
+    char *command;
+
+
+    /* Make enough space for the system call command,
+     * (100 extra chars for -npme ... etc. options should suffice): */
+    snew(command, strlen(cmd_mpirun)+strlen(cmd_mdrun)+strlen(cmd_np)+strlen(args_for_mdrun)+strlen(simulation_tpr)+100);
+
+    /* Note that the -passall options requires args_for_mdrun to be at the end
+     * of the command line string */
+    if (bThreads)
+    {
+        sprintf(command, "%s%s-npme %d -s %s %s",
+                cmd_mdrun, cmd_np, nPMEnodes, simulation_tpr, args_for_mdrun);
+    }
+    else
+    {
+        sprintf(command, "%s%s%s -npme %d -s %s %s",
+                cmd_mpirun, cmd_np, cmd_mdrun, nPMEnodes, simulation_tpr, args_for_mdrun);
+    }
+
+    fprintf(fp, "%s this command line to launch the simulation:\n\n%s", bLaunch ? "Using" : "Please use", command);
+    sep_line(fp);
+    fflush(fp);
+
+    /* Now the real thing! */
+    if (bLaunch)
+    {
+        fprintf(stdout, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command);
+        sep_line(stdout);
+        fflush(stdout);
+        gmx_system_call(command);
+    }
+
+    /* The command string is not needed any more once it was logged/executed */
+    sfree(command);
+}
+
+
+/* Read the best-performing benchmark tpr file, put back the number of steps
+ * and the initial step of the original simulation and write the result to
+ * fn_sim_tpr, the tpr file that will be launched. */
+static void modify_PMEsettings(
+        gmx_large_int_t simsteps,    /* Set this value as number of time steps */
+        gmx_large_int_t init_step,   /* Set this value as init_step */
+        const char     *fn_best_tpr, /* tpr file with the best performance */
+        const char     *fn_sim_tpr)  /* name of tpr file to be launched */
+{
+    t_inputrec *ir;
+    t_state     state;
+    gmx_mtop_t  mtop;
+
+    snew(ir, 1);
+    read_tpx_state(fn_best_tpr, ir, &state, NULL, &mtop);
+
+    /* Reset nsteps and init_step to the value of the input .tpr file */
+    ir->nsteps    = simsteps;
+    ir->init_step = init_step;
+
+    /* Write the tpr file which will be launched. The file name is printed
+     * through "%s" and never becomes part of a format string, so neither its
+     * length nor a '%' character in it can cause trouble. */
+    fprintf(stdout, "Writing optimized simulation file %s with nsteps=", fn_sim_tpr);
+    fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
+    fprintf(stdout, ".\n");
+    fflush(stdout);
+    write_tpx_state(fn_sim_tpr, ir, &state, &mtop);
+
+    sfree(ir);
+}
+
+
+/* TRUE for the PME electrostatics types that use a switched potential */
+#define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
+
+/* Make additional TPR files with more computational load for the
+ * direct space processors.
+ *
+ * Entry 0 is the input tpr with only nsteps and init_step changed. Entries
+ * j > 0 additionally get a Coulomb radius between rmin and rmax, a PME grid
+ * whose spacing is scaled by the same factor, and adjusted rlist, rvdw and
+ * rlistlong. The settings of every entry are stored in info and tabulated
+ * in fp; the original nsteps and init_step are saved in info as well.
+ *
+ * NOTE(review): each fn_bench_tprs[j] must point to a caller-provided buffer
+ * that can hold the input name plus the "_benchNN.tpr" suffix; the size is
+ * not visible here. */
+static void make_benchmark_tprs(
+        const char     *fn_sim_tpr,      /* READ : User-provided tpr file */
+        char           *fn_bench_tprs[], /* WRITE: Names of benchmark tpr files */
+        gmx_large_int_t benchsteps,      /* Number of time steps for benchmark runs */
+        gmx_large_int_t statesteps,      /* Step counter in checkpoint file */
+        real            rmin,            /* Minimal Coulomb radius */
+        real            rmax,            /* Maximal Coulomb radius */
+        real            bScaleRvdw,      /* Scale rvdw along with rcoulomb */
+        int            *ntprs,           /* No. of TPRs to write, each with a different
+                                            rcoulomb and fourierspacing */
+        t_inputinfo    *info,            /* Contains information about mdp file options */
+        FILE           *fp)              /* Write the output here */
+{
+    int          i, j, d;
+    t_inputrec  *ir;
+    t_state      state;
+    gmx_mtop_t   mtop;
+    real         nlist_buffer;   /* Thickness of the buffer regions for PME-switch potentials */
+    char         buf[200];
+    rvec         box_size;
+    gmx_bool     bNote = FALSE;
+    real         add;            /* Add this to rcoul for the next test */
+    real         fac = 1.0;      /* Scaling factor for Coulomb radius */
+    real         fourierspacing; /* Basic fourierspacing from tpr */
+    size_t       len_base;       /* Length of the input name without extension */
+
+
+    sprintf(buf, "Making benchmark tpr file%s with %s time step%s",
+            *ntprs > 1 ? "s" : "", gmx_large_int_pfmt, benchsteps > 1 ? "s" : "");
+    fprintf(stdout, buf, benchsteps);
+    if (statesteps > 0)
+    {
+        sprintf(buf, " (adding %s steps from checkpoint file)", gmx_large_int_pfmt);
+        fprintf(stdout, buf, statesteps);
+        benchsteps += statesteps;
+    }
+    fprintf(stdout, ".\n");
+
+
+    snew(ir, 1);
+    read_tpx_state(fn_sim_tpr, ir, &state, NULL, &mtop);
+
+    /* Check if some kind of PME was chosen */
+    if (EEL_PME(ir->coulombtype) == FALSE)
+    {
+        gmx_fatal(FARGS, "Can only do optimizations for simulations with %s electrostatics.",
+                  EELTYPE(eelPME));
+    }
+
+    /* Check if rcoulomb == rlist, which is necessary for plain PME. */
+    if ( (ir->cutoff_scheme != ecutsVERLET) &&
+         (eelPME == ir->coulombtype) && !(ir->rcoulomb == ir->rlist))
+    {
+        gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to rlist (%f).",
+                  EELTYPE(eelPME), ir->rcoulomb, ir->rlist);
+    }
+    /* For other PME types, rcoulomb is allowed to be smaller than rlist */
+    else if (ir->rcoulomb > ir->rlist)
+    {
+        gmx_fatal(FARGS, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
+                  EELTYPE(ir->coulombtype), ir->rcoulomb, ir->rlist);
+    }
+
+    if (bScaleRvdw && ir->rvdw != ir->rcoulomb)
+    {
+        fprintf(stdout, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
+        bScaleRvdw = FALSE;
+    }
+
+    /* Reduce the number of steps for the benchmarks */
+    info->orig_sim_steps = ir->nsteps;
+    ir->nsteps           = benchsteps;
+    /* We must not use init_step from the input tpr file for the benchmarks */
+    info->orig_init_step = ir->init_step;
+    ir->init_step        = 0;
+
+    /* For PME-switch potentials, keep the radial distance of the buffer region */
+    nlist_buffer   = ir->rlist - ir->rcoulomb;
+
+    /* Determine length of triclinic box vectors */
+    for (d = 0; d < DIM; d++)
+    {
+        box_size[d] = 0;
+        for (i = 0; i < DIM; i++)
+        {
+            box_size[d] += state.box[d][i]*state.box[d][i];
+        }
+        box_size[d] = sqrt(box_size[d]);
+    }
+
+    if (ir->fourier_spacing > 0)
+    {
+        info->fsx[0] = ir->fourier_spacing;
+        info->fsy[0] = ir->fourier_spacing;
+        info->fsz[0] = ir->fourier_spacing;
+    }
+    else
+    {
+        /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
+        info->fsx[0] = box_size[XX]/ir->nkx;
+        info->fsy[0] = box_size[YY]/ir->nky;
+        info->fsz[0] = box_size[ZZ]/ir->nkz;
+    }
+
+    /* Choose the reference spacing that gets scaled for the benchmark grids:
+     * the value stored in the tpr if there is one, else the reconstruction */
+    if (ir->fourier_spacing > 0)
+    {
+        /* Use the spacing from the tpr */
+        fourierspacing = ir->fourier_spacing;
+    }
+    else
+    {
+        /* Use the maximum observed spacing */
+        fourierspacing = max(max(info->fsx[0], info->fsy[0]), info->fsz[0]);
+    }
+
+    fprintf(stdout, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n", fourierspacing);
+
+    /* For performance comparisons the number of particles is useful to have */
+    fprintf(fp, " Number of particles : %d\n", mtop.natoms);
+
+    /* Print information about settings of which some are potentially modified: */
+    fprintf(fp, " Coulomb type : %s\n", EELTYPE(ir->coulombtype));
+    fprintf(fp, " Grid spacing x y z : %f %f %f\n",
+            box_size[XX]/ir->nkx, box_size[YY]/ir->nky, box_size[ZZ]/ir->nkz);
+    fprintf(fp, " Van der Waals type : %s\n", EVDWTYPE(ir->vdwtype));
+    if (EVDW_SWITCHED(ir->vdwtype))
+    {
+        fprintf(fp, " rvdw_switch : %f nm\n", ir->rvdw_switch);
+    }
+    if (EPME_SWITCHED(ir->coulombtype))
+    {
+        fprintf(fp, " rlist : %f nm\n", ir->rlist);
+    }
+    if (ir->rlistlong != max_cutoff(ir->rvdw, ir->rcoulomb))
+    {
+        fprintf(fp, " rlistlong : %f nm\n", ir->rlistlong);
+    }
+
+    /* Print a descriptive line about the tpr settings tested. The optional
+     * columns here must match the optional columns of the table rows below */
+    fprintf(fp, "\nWill try these real/reciprocal workload settings:\n");
+    fprintf(fp, " No. scaling rcoulomb");
+    fprintf(fp, " nkx nky nkz");
+    fprintf(fp, " spacing");
+    if (evdwCUT == ir->vdwtype)
+    {
+        fprintf(fp, " rvdw");
+    }
+    if (EPME_SWITCHED(ir->coulombtype))
+    {
+        fprintf(fp, " rlist");
+    }
+    if (ir->rlistlong != max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb)) )
+    {
+        fprintf(fp, " rlistlong");
+    }
+    fprintf(fp, " tpr file\n");
+
+    /* Loop to create the requested number of tpr input files */
+    for (j = 0; j < *ntprs; j++)
+    {
+        /* The first .tpr is the provided one, just need to modify nsteps,
+         * so skip the following block */
+        if (j != 0)
+        {
+            /* Determine which Coulomb radii rc to use in the benchmarks */
+            add = (rmax-rmin)/(*ntprs-1);
+            if (is_equal(rmin, info->rcoulomb[0]))
+            {
+                ir->rcoulomb = rmin + j*add;
+            }
+            else if (is_equal(rmax, info->rcoulomb[0]))
+            {
+                ir->rcoulomb = rmin + (j-1)*add;
+            }
+            else
+            {
+                /* rmin != rcoul != rmax, ergo test between rmin and rmax */
+                add          = (rmax-rmin)/(*ntprs-2);
+                ir->rcoulomb = rmin + (j-1)*add;
+            }
+
+            /* Determine the scaling factor fac */
+            fac = ir->rcoulomb/info->rcoulomb[0];
+
+            /* Scale the Fourier grid spacing */
+            ir->nkx = ir->nky = ir->nkz = 0;
+            calc_grid(NULL, state.box, fourierspacing*fac, &ir->nkx, &ir->nky, &ir->nkz);
+
+            /* Adjust other radii since various conditions need to be fulfilled */
+            if (eelPME == ir->coulombtype)
+            {
+                /* plain PME, rcoulomb must be equal to rlist */
+                ir->rlist = ir->rcoulomb;
+            }
+            else
+            {
+                /* rlist must be >= rcoulomb, we keep the size of the buffer region */
+                ir->rlist = ir->rcoulomb + nlist_buffer;
+            }
+
+            if (bScaleRvdw && evdwCUT == ir->vdwtype)
+            {
+                /* For vdw cutoff, rvdw >= rlist */
+                ir->rvdw = max(info->rvdw[0], ir->rlist);
+            }
+
+            ir->rlistlong = max_cutoff(ir->rlist, max_cutoff(ir->rvdw, ir->rcoulomb));
+
+        } /* end of "if (j != 0)" */
+
+        /* for j==0: Save the original settings
+         * for j >0: Save modified radii and Fourier grids */
+        info->rcoulomb[j]  = ir->rcoulomb;
+        info->rvdw[j]      = ir->rvdw;
+        info->nkx[j]       = ir->nkx;
+        info->nky[j]       = ir->nky;
+        info->nkz[j]       = ir->nkz;
+        info->rlist[j]     = ir->rlist;
+        info->rlistlong[j] = ir->rlistlong;
+        info->fsx[j]       = fac*fourierspacing;
+        info->fsy[j]       = fac*fourierspacing;
+        info->fsz[j]       = fac*fourierspacing;
+
+        /* Write the benchmark tpr file. The name is the input name with its
+         * last four characters (".tpr") replaced by "_benchNN.tpr". strncpy
+         * does not terminate the copy when it stops early, so we terminate it
+         * ourselves before appending; names shorter than the extension are
+         * kept whole instead of letting the length wrap around. */
+        len_base = strlen(fn_sim_tpr);
+        if (len_base >= strlen(".tpr"))
+        {
+            len_base -= strlen(".tpr");
+        }
+        strncpy(fn_bench_tprs[j], fn_sim_tpr, len_base);
+        fn_bench_tprs[j][len_base] = '\0';
+        sprintf(buf, "_bench%.2d.tpr", j);
+        strcat(fn_bench_tprs[j], buf);
+        fprintf(stdout, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs[j]);
+        fprintf(stdout, gmx_large_int_pfmt, ir->nsteps);
+        if (j > 0)
+        {
+            fprintf(stdout, ", scaling factor %f\n", fac);
+        }
+        else
+        {
+            fprintf(stdout, ", unmodified settings\n");
+        }
+
+        write_tpx_state(fn_bench_tprs[j], ir, &state, &mtop);
+
+        /* Write information about modified tpr settings to log file */
+        fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
+        fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
+        fprintf(fp, " %9f ", info->fsx[j]);
+        if (evdwCUT == ir->vdwtype)
+        {
+            fprintf(fp, "%10f", ir->rvdw);
+        }
+        if (EPME_SWITCHED(ir->coulombtype))
+        {
+            fprintf(fp, "%10f", ir->rlist);
+        }
+        /* Decided on the original (j==0) settings so that all rows have
+         * the same columns as the header */
+        if (info->rlistlong[0] != max_cutoff(info->rlist[0], max_cutoff(info->rvdw[0], info->rcoulomb[0])) )
+        {
+            fprintf(fp, "%10f", ir->rlistlong);
+        }
+        fprintf(fp, " %-14s\n", fn_bench_tprs[j]);
+
+        /* Make it clear to the user that some additional settings were modified */
+        if (!is_equal(ir->rvdw, info->rvdw[0])
+            || !is_equal(ir->rlistlong, info->rlistlong[0]) )
+        {
+            bNote = TRUE;
+        }
+    }
+    if (bNote)
+    {
+        fprintf(fp, "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
+                "other input settings were also changed (see table above).\n"
+                "Please check if the modified settings are appropriate.\n");
+    }
+    fflush(stdout);
+    fflush(fp);
+    sfree(ir);
+}
+
+
+/* Tidy up after one benchmark run.
+ *
+ * The benchmark log (-bg) is renamed, and the stderr capture (-err) is either
+ * renamed (bKeepStderr) or deleted; renamed files get the tag
+ * "_no<k>_np<nnodes>_npme<nPMEnodes>", followed by "_<nr>" for nr > 0, so
+ * that one can later see which parameters were used. The -p file is left
+ * alone, and all other -b* files that are set or non-optional are deleted. */
+static void cleanup(const t_filenm *fnm, int nfile, int k, int nnodes,
+                    int nPMEnodes, int nr, gmx_bool bKeepStderr)
+{
+    char        tag[STRLEN];
+    char        target[STRLEN];
+    const char *optname;
+    const char *path;
+    gmx_bool    bIsLog, bIsErr;
+    int         ifile;
+
+
+    fprintf(stdout, "Cleaning up, deleting benchmark temp files ...\n");
+
+    /* The tag is the same for every file kept in this call */
+    if (nr > 0)
+    {
+        sprintf(tag, "_no%d_np%d_npme%d_%d", k, nnodes, nPMEnodes, nr);
+    }
+    else
+    {
+        sprintf(tag, "_no%d_np%d_npme%d", k, nnodes, nPMEnodes);
+    }
+
+    for (ifile = 0; ifile < nfile; ifile++)
+    {
+        optname = fnm[ifile].opt;
+
+        if (0 == strcmp(optname, "-p"))
+        {
+            /* keep this file untouched */
+            continue;
+        }
+
+        bIsLog = (0 == strcmp(optname, "-bg"));
+        bIsErr = (0 == strcmp(optname, "-err"));
+
+        if (bIsLog || bIsErr)
+        {
+            path = opt2fn(optname, nfile, fnm);
+            if (!gmx_fexist(path))
+            {
+                continue;
+            }
+
+            if (bIsErr && !bKeepStderr)
+            {
+                /* stderr output is only worth keeping if there were problems */
+                fprintf(stdout, "Deleting %s\n", path);
+                remove(path);
+                continue;
+            }
+
+            sprintf(target, "%s%s", path, tag);
+            if (bIsLog)
+            {
+                fprintf(stdout, "renaming log file to %s\n", target);
+            }
+            else
+            {
+                fprintf(stdout, "Saving stderr output in %s\n", target);
+            }
+            make_backup(target);
+            rename(path, target);
+            continue;
+        }
+
+        /* Delete the files which are created for each benchmark run: (options -b*) */
+        if ( (0 == strncmp(optname, "-b", 2))
+             && (opt2bSet(optname, nfile, fnm) || !is_optional(&fnm[ifile])) )
+        {
+            path = opt2fn(optname, nfile, fnm);
+            if (gmx_fexist(path))
+            {
+                fprintf(stdout, "Deleting %s\n", path);
+                remove(path);
+            }
+        }
+    }
+}
+
+
+/* Returns the largest common factor of n1 and n2.
+ *
+ * Both arguments are expected to be positive. If either of them is zero or
+ * negative there is no positive common factor to report and 0 is returned. */
+static int largest_common_factor(int n1, int n2)
+{
+    int remainder;
+
+    if (n1 <= 0 || n2 <= 0)
+    {
+        return 0;
+    }
+
+    /* Euclid's algorithm: gcd(a, b) == gcd(b, a mod b) until b vanishes */
+    while (n2 != 0)
+    {
+        remainder = n1 % n2;
+        n1        = n2;
+        n2        = remainder;
+    }
+
+    return n1;
+}
+
+/* Strategies for choosing the -npme values that get benchmarked */
+enum {
+    eNpmeAuto, eNpmeAll, eNpmeReduced, eNpmeSubset, eNpmeNr
+};
+
+/* Create a list of numbers of PME nodes to test.
+ *
+ * Candidates run from maxPMEnodes downwards; a candidate is kept when the
+ * largest common factor of the PP and PME node counts reaches a minimum that
+ * depends on the selected strategy. The list always ends with the two fixed
+ * entries 0 (no separate PME nodes) and -1 (automatic choice), so
+ * *nentries is at least 2. The list is allocated here. */
+static void make_npme_list(
+        const char *npmevalues_opt, /* Make a complete list with all
+                                     * possibilities or a short list that keeps only
+                                     * reasonable numbers of PME nodes */
+        int        *nentries,       /* Number of entries we put in the nPMEnodes list */
+        int        *nPMEnodes[],    /* Each entry contains the value for -npme */
+        int         nnodes,         /* Total number of nodes to do the tests on */
+        int         minPMEnodes,    /* Minimum number of PME nodes */
+        int         maxPMEnodes)    /* Maximum number of PME nodes */
+{
+    int i, npme, npp;
+    int min_factor = 1;   /* We request that npp and npme have this minimal
+                           * largest common factor (depends on npp) */
+    int nlistmax;         /* Max. list size */
+    int nlist;            /* Actual number of entries in list */
+    int eNPME = 0;
+
+
+    /* Do we need to check all possible values for -npme or is a reduced list enough? */
+    if (0 == strcmp(npmevalues_opt, "all") )
+    {
+        eNPME = eNpmeAll;
+    }
+    else if (0 == strcmp(npmevalues_opt, "subset") )
+    {
+        eNPME = eNpmeSubset;
+    }
+    else /* "auto" or "range" */
+    {
+        if (nnodes <= 64)
+        {
+            eNPME = eNpmeAll;
+        }
+        else if (nnodes < 128)
+        {
+            eNPME = eNpmeReduced;
+        }
+        else
+        {
+            eNPME = eNpmeSubset;
+        }
+    }
+
+    /* Calculate how many entries we could possibly have (in case of -npme all).
+     * This includes the two fixed entries appended after the loop. */
+    if (nnodes > 2)
+    {
+        nlistmax = maxPMEnodes - minPMEnodes + 3;
+        if (0 == minPMEnodes)
+        {
+            nlistmax--;
+        }
+    }
+    else
+    {
+        nlistmax = 2;
+    }
+    /* The entries 0 and -1 are written unconditionally, so there must be
+     * room for them even if there is no candidate at all */
+    if (nlistmax < 2)
+    {
+        nlistmax = 2;
+    }
+
+    /* Now make the actual list which is at most of size nlistmax */
+    snew(*nPMEnodes, nlistmax);
+    nlist = 0; /* start counting again, now the real entries in the list */
+    for (i = 0; i < nlistmax - 2; i++)
+    {
+        npme = maxPMEnodes - i;
+        npp  = nnodes-npme;
+        switch (eNPME)
+        {
+            case eNpmeAll:
+                min_factor = 1;
+                break;
+            case eNpmeReduced:
+                min_factor = 2;
+                break;
+            case eNpmeSubset:
+                /* For 2d PME we want a common largest factor of at least the cube
+                 * root of the number of PP nodes */
+                min_factor = (int) pow(npp, 1.0/3.0);
+                break;
+            default:
+                gmx_fatal(FARGS, "Unknown option for eNPME in make_npme_list");
+                break;
+        }
+        if (largest_common_factor(npp, npme) >= min_factor)
+        {
+            (*nPMEnodes)[nlist] = npme;
+            nlist++;
+        }
+    }
+    /* We always test 0 PME nodes and the automatic number */
+    *nentries             = nlist + 2;
+    (*nPMEnodes)[nlist  ] =  0;
+    (*nPMEnodes)[nlist+1] = -1;
+
+    fprintf(stderr, "Will try the following %d different values for -npme:\n", *nentries);
+    for (i = 0; i < *nentries-1; i++)
+    {
+        fprintf(stderr, "%d, ", (*nPMEnodes)[i]);
+    }
+    fprintf(stderr, "and %d (auto).\n", (*nPMEnodes)[*nentries-1]);
+}
+
+
+/* Allocate memory to store the performance data.
+ *
+ * For each of the ntprs input files an array of 'datasets' t_perf entries is
+ * allocated, and in each entry the Gcycles, ns_per_day and PME_f_load arrays
+ * get room for 'repeats' values. Every array is allocated exactly once. */
+static void init_perfdata(t_perf *perfdata[], int ntprs, int datasets, int repeats)
+{
+    int i, k;
+
+
+    for (k = 0; k < ntprs; k++)
+    {
+        snew(perfdata[k], datasets);
+        for (i = 0; i < datasets; i++)
+        {
+            snew(perfdata[k][i].Gcycles, repeats);
+            snew(perfdata[k][i].ns_per_day, repeats);
+            snew(perfdata[k][i].PME_f_load, repeats);
+        }
+    }
+}
+
+
+/* Run the benchmark command line once with "-h -quiet" appended, to catch
+ * arguments that mdrun does not accept before any benchmark is started.
+ * If that call returns non-zero, an explanation is written to stderr and
+ * to fp, and the program exits with the same return value.
+ * 'length' is used to size the temporary strings. */
+static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp)
+{
+    char *probe_cmd;
+    char *explanation;
+    int   status;
+
+
+    snew(probe_cmd, length + 15);
+
+    fprintf(stdout, "Making shure the benchmarks can be executed ...\n");
+    sprintf(probe_cmd, "%s-h -quiet", mdrun_cmd_line);
+    status = gmx_system_call(probe_cmd);
+
+    if (0 == status)
+    {
+        /* All fine, the benchmarks can go ahead */
+        sfree(probe_cmd);
+        return;
+    }
+
+    /* To prevent confusion, do not again issue a gmx_fatal here since we already
+     * get the error message from mdrun itself */
+    snew(explanation, length + 500);
+    sprintf(explanation, "Cannot run the benchmark simulations! Please check the error message of\n"
+            "mdrun for the source of the problem. Did you provide a command line\n"
+            "argument that neither g_tune_pme nor mdrun understands? Offending command:\n"
+            "\n%s\n\n", probe_cmd);
+
+    fprintf(stderr, "%s", explanation);
+    sep_line(fp);
+    fprintf(fp, "%s", explanation);
+
+    exit(status);
+}
+
+
+/* Run all benchmarks: for every tpr file and every number of PME nodes the
+ * mdrun command line is executed up to 'repeats' times. After each run the
+ * log file is parsed, one result line is written to fp, and temporary files
+ * are cleaned up. The results end up in perfdata[tpr][npme entry].
+ *
+ * If repeats is 0, no benchmark is run: fp is closed and the program exits.
+ * The command line of each setting stays stored in its perfdata entry. */
+static void do_the_tests(
+        FILE           *fp,             /* General g_tune_pme output file */
+        char          **tpr_names,      /* Filenames of the input files to test */
+        int             maxPMEnodes,    /* Max fraction of nodes to use for PME */
+        int             minPMEnodes,    /* Min fraction of nodes to use for PME */
+        int             npme_fixed,     /* If >= -1, test fixed number of PME
+                                         * nodes only */
+        const char     *npmevalues_opt, /* Which -npme values should be tested */
+        t_perf        **perfdata,       /* Here the performance data is stored */
+        int            *pmeentries,     /* Entries in the nPMEnodes list */
+        int             repeats,        /* Repeat each test this often */
+        int             nnodes,         /* Total number of nodes = nPP + nPME */
+        int             nr_tprs,        /* Total number of tpr files to test */
+        gmx_bool        bThreads,       /* Threads or MPI? */
+        char           *cmd_mpirun,     /* mpirun command string */
+        char           *cmd_np,         /* "-np", "-n", whatever mpirun needs */
+        char           *cmd_mdrun,      /* mdrun command string */
+        char           *cmd_args_bench, /* arguments for mdrun in a string */
+        const t_filenm *fnm,            /* List of filenames from command line */
+        int             nfile,          /* Number of files specified on the cmdl. */
+        int             presteps,       /* DLB equilibration steps, is checked */
+        gmx_large_int_t cpt_steps)      /* Time step counter in the checkpoint */
+{
+    int         i, nr, k, ret, count = 0, totaltests;
+    int        *nPMEnodes = NULL;
+    t_perf     *pd        = NULL;
+    int         cmdline_length;
+    char       *command, *cmd_stub;
+    char        buf[STRLEN];
+    const char *fn_err;
+    gmx_bool    bResetProblem = FALSE;
+    gmx_bool    bFirst        = TRUE;
+
+
+    /* This string array corresponds to the eParselog enum type at the start
+     * of this file */
+    const char* ParseLog[] = {
+        "OK.",
+        "Logfile not found!",
+        "No timings, logfile truncated?",
+        "Run was terminated.",
+        "Counters were not reset properly.",
+        "No DD grid found for these settings.",
+        "TPX version conflict!",
+        "mdrun was not started in parallel!",
+        "An error occured."
+    };
+    char        str_PME_f_load[13];
+
+
+    /* Allocate space for the mdrun command line. 100 extra characters should
+       be more than enough for the -npme etcetera arguments */
+    cmdline_length =  strlen(cmd_mpirun)
+        + strlen(cmd_np)
+        + strlen(cmd_mdrun)
+        + strlen(cmd_args_bench)
+        + strlen(tpr_names[0]) + 100;
+    snew(cmd_stub, cmdline_length);
+
+    /* Construct the part of the command line that stays the same for all tests: */
+    if (bThreads)
+    {
+        sprintf(cmd_stub, "%s%s", cmd_mdrun, cmd_np);
+    }
+    else
+    {
+        sprintf(cmd_stub, "%s%s%s ", cmd_mpirun, cmd_np, cmd_mdrun);
+    }
+
+    /* Create a list of numbers of PME nodes to test */
+    if (npme_fixed < -1)
+    {
+        make_npme_list(npmevalues_opt, pmeentries, &nPMEnodes,
+                       nnodes, minPMEnodes, maxPMEnodes);
+    }
+    else
+    {
+        *pmeentries  = 1;
+        snew(nPMEnodes, 1);
+        nPMEnodes[0] = npme_fixed;
+        fprintf(stderr, "Will use a fixed number of %d PME-only nodes.\n", nPMEnodes[0]);
+    }
+
+    if (0 == repeats)
+    {
+        fprintf(fp, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
+        ffclose(fp);
+        finalize(opt2fn("-p", nfile, fnm));
+        exit(0);
+    }
+
+    /* The system call string is the mdrun command line plus the redirections
+     * to /dev/null and to the stderr file, so the length of that file name
+     * has to be part of the buffer size */
+    fn_err = opt2fn("-err", nfile, fnm);
+    snew(command, cmdline_length + strlen(fn_err) + 20);
+
+    /* Allocate one dataset for each tpr input file: */
+    init_perfdata(perfdata, nr_tprs, *pmeentries, repeats);
+
+    /*****************************************/
+    /* Main loop over all tpr files to test: */
+    /*****************************************/
+    totaltests = nr_tprs*(*pmeentries)*repeats;
+    for (k = 0; k < nr_tprs; k++)
+    {
+        fprintf(fp, "\nIndividual timings for input file %d (%s):\n", k, tpr_names[k]);
+        fprintf(fp, "PME nodes Gcycles ns/day PME/f Remark\n");
+        /* Loop over various numbers of PME nodes: */
+        for (i = 0; i < *pmeentries; i++)
+        {
+            pd = &perfdata[k][i];
+
+            /* The command line is the same for all repeats of this setting,
+             * so it is built (and its memory allocated) only once.
+             * Add -npme and -s to the command line and save it. Note that
+             * the -passall (if set) options requires cmd_args_bench to be
+             * at the end of the command line string */
+            pd->nPMEnodes = nPMEnodes[i];
+            snew(pd->mdrun_cmd_line, cmdline_length);
+            sprintf(pd->mdrun_cmd_line, "%s-npme %d -s %s %s",
+                    cmd_stub, pd->nPMEnodes, tpr_names[k], cmd_args_bench);
+
+            /* To prevent that all benchmarks fail due to a show-stopper argument
+             * on the mdrun command line, we make a quick check with mdrun -h first */
+            if (bFirst)
+            {
+                make_sure_it_runs(pd->mdrun_cmd_line, cmdline_length, fp);
+            }
+            bFirst = FALSE;
+
+            /* Loop over the repeats for each scenario: */
+            for (nr = 0; nr < repeats; nr++)
+            {
+                /* Do a benchmark simulation: */
+                if (repeats > 1)
+                {
+                    sprintf(buf, ", pass %d/%d", nr+1, repeats);
+                }
+                else
+                {
+                    buf[0] = '\0';
+                }
+                fprintf(stdout, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
+                        (100.0*count)/totaltests,
+                        k+1, nr_tprs, i+1, *pmeentries, buf);
+                make_backup(fn_err);
+                sprintf(command, "%s 1> /dev/null 2>%s", pd->mdrun_cmd_line, fn_err);
+                fprintf(stdout, "%s\n", pd->mdrun_cmd_line);
+                gmx_system_call(command);
+
+                /* Collect the performance data from the log file; also check stderr
+                 * for fatal errors */
+                ret = parse_logfile(opt2fn("-bg", nfile, fnm), fn_err,
+                                    pd, nr, presteps, cpt_steps, nnodes);
+                if ((presteps > 0) && (ret == eParselogResetProblem))
+                {
+                    bResetProblem = TRUE;
+                }
+
+                if (-1 == pd->nPMEnodes)
+                {
+                    sprintf(buf, "(%3d)", pd->guessPME);
+                }
+                else
+                {
+                    sprintf(buf, " ");
+                }
+
+                /* Nicer output */
+                if (pd->PME_f_load[nr] > 0.0)
+                {
+                    sprintf(str_PME_f_load, "%12.3f", pd->PME_f_load[nr]);
+                }
+                else
+                {
+                    sprintf(str_PME_f_load, "%s", " - ");
+                }
+
+                /* Write the data we got to disk */
+                fprintf(fp, "%4d%s %12.3f %12.3f %s %s", pd->nPMEnodes,
+                        buf, pd->Gcycles[nr], pd->ns_per_day[nr], str_PME_f_load, ParseLog[ret]);
+                if (!(ret == eParselogOK || ret == eParselogNoDDGrid || ret == eParselogNotFound) )
+                {
+                    fprintf(fp, " Check %s file for problems.", ret == eParselogFatal ? "err" : "log");
+                }
+                fprintf(fp, "\n");
+                fflush(fp);
+                count++;
+
+                /* Do some cleaning up and delete the files we do not need any more */
+                cleanup(fnm, nfile, k, nnodes, pd->nPMEnodes, nr, ret == eParselogFatal);
+
+                /* If the first run with this number of processors already failed, do not try again: */
+                if (pd->Gcycles[0] <= 0.0 && repeats > 1)
+                {
+                    fprintf(stdout, "Skipping remaining passes of unsuccessful setting, see log file for details.\n");
+                    count += repeats-(nr+1);
+                    break;
+                }
+            } /* end of repeats loop */
+        }     /* end of -npme loop */
+    }         /* end of tpr file loop */
+
+    if (bResetProblem)
+    {
+        sep_line(fp);
+        fprintf(fp, "WARNING: The cycle and time step counters could not be reset properly. ");
+        sep_line(fp);
+    }
+    sfree(command);
+    sfree(cmd_stub);
+    /* The values were copied into the perfdata entries, the list can go */
+    sfree(nPMEnodes);
+}
+
+
+/* Sanity checks on the user input.
+ *
+ * Problems that make tuning impossible end in gmx_fatal; questionable
+ * settings only produce a warning or note on stderr. Where the user left
+ * choices open, *ntprs, *rmin and *rmax are filled in or adjusted: on return
+ * rmin and rmax are positive (rcoulomb if they were not set) and *ntprs is
+ * consistent with the requested Coulomb radius range. */
+static void check_input(
+        int             nnodes,
+        int             repeats,
+        int            *ntprs,
+        real           *rmin,
+        real            rcoulomb,
+        real           *rmax,
+        real            maxPMEfraction,
+        real            minPMEfraction,
+        int             npme_fixed,
+        gmx_large_int_t bench_nsteps,
+        const t_filenm *fnm,
+        int             nfile,
+        int             sim_part,
+        int             presteps,
+        int             npargs,
+        t_pargs        *pa)
+{
+    int      ntprs_before;               /* -ntpr before the rmin/rmax adjustment */
+    int      ntprs_needed;               /* minimum -ntpr required by rmin/rmax */
+    gmx_bool bRminDiffers, bRmaxDiffers; /* rmin / rmax differ from rcoulomb */
+
+
+    /* Make sure the input file exists */
+    if (!gmx_fexist(opt2fn("-s", nfile, fnm)))
+    {
+        gmx_fatal(FARGS, "File %s not found.", opt2fn("-s", nfile, fnm));
+    }
+
+    /* Make sure that the checkpoint file is not overwritten during benchmarking */
+    if ( (0 == strcmp(opt2fn("-cpi", nfile, fnm), opt2fn("-bcpo", nfile, fnm)) ) && (sim_part > 1) )
+    {
+        gmx_fatal(FARGS, "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not be identical.\n"
+                  "The checkpoint input file must not be overwritten during the benchmarks.\n");
+    }
+
+    /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
+    if (repeats < 0)
+    {
+        gmx_fatal(FARGS, "Number of repeats < 0!");
+    }
+
+    /* Check number of nodes */
+    if (nnodes < 1)
+    {
+        gmx_fatal(FARGS, "Number of nodes/threads must be a positive integer.");
+    }
+
+    if (*ntprs >= 1)
+    {
+        /* The user chose -ntpr */
+        if (1 == *ntprs)
+        {
+            fprintf(stderr, "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
+        }
+    }
+    else
+    {
+        /* Automatically choose -ntpr if not set */
+        if (nnodes >= 16)
+        {
+            *ntprs = 3;
+            /* Set a reasonable scaling factor for rcoulomb */
+            if (*rmax <= 0)
+            {
+                *rmax = rcoulomb * 1.2;
+            }
+        }
+        else
+        {
+            *ntprs = 1;
+        }
+        fprintf(stderr, "Will test %d tpr file%s.\n", *ntprs, *ntprs == 1 ? "" : "s");
+    }
+
+    /* Make sure that rmin <= rcoulomb <= rmax */
+    if (*rmin <= 0)
+    {
+        *rmin = rcoulomb;
+    }
+    if (*rmax <= 0)
+    {
+        *rmax = rcoulomb;
+    }
+    if (!(*rmin <= *rmax) )
+    {
+        gmx_fatal(FARGS, "Please choose the Coulomb radii such that rmin <= rmax.\n"
+                  "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n", *rmin, *rmax, rcoulomb);
+    }
+
+    /* rmin and rmax do not change any more from here on */
+    bRminDiffers = !is_equal(*rmin, rcoulomb);
+    bRmaxDiffers = !is_equal(*rmax, rcoulomb);
+
+    /* Add a test scenario if rmin or rmax were set but only one tpr was requested */
+    if (1 == *ntprs)
+    {
+        if (bRminDiffers)
+        {
+            (*ntprs)++;
+            fprintf(stderr, "NOTE: Setting -rmin to %g changed -ntpr to %d\n",
+                    *rmin, *ntprs);
+        }
+        else if (bRmaxDiffers)
+        {
+            (*ntprs)++;
+            fprintf(stderr, "NOTE: Setting -rmax to %g changed -ntpr to %d\n",
+                    *rmax, *ntprs);
+        }
+    }
+
+    /* If one of rmin, rmax is set, we need 2 tpr files at minimum,
+     * if both are set, we need 3 tpr files at minimum */
+    ntprs_before = *ntprs;
+    ntprs_needed = 1;
+    if (bRminDiffers)
+    {
+        ntprs_needed++;
+    }
+    if (bRmaxDiffers)
+    {
+        ntprs_needed++;
+    }
+    if (ntprs_needed > 1 && *ntprs < ntprs_needed)
+    {
+        *ntprs = ntprs_needed;
+    }
+    if (ntprs_before != *ntprs)
+    {
+        fprintf(stderr, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs);
+    }
+
+    /* Several tpr files make no sense if we have just a single rc */
+    if (*ntprs > 1 && is_equal(*rmin, rcoulomb) && is_equal(rcoulomb, *rmax))
+    {
+        fprintf(stderr, "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
+                "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
+                "with correspondingly adjusted PME grid settings\n");
+        *ntprs = 1;
+    }
+
+    /* Check whether max and min fraction are within required values */
+    if (maxPMEfraction > 0.5 || maxPMEfraction < 0)
+    {
+        gmx_fatal(FARGS, "-max must be between 0 and 0.5");
+    }
+    if (minPMEfraction > 0.5 || minPMEfraction < 0)
+    {
+        gmx_fatal(FARGS, "-min must be between 0 and 0.5");
+    }
+    if (maxPMEfraction < minPMEfraction)
+    {
+        gmx_fatal(FARGS, "-max must be larger or equal to -min");
+    }
+
+    /* Check whether the number of steps - if it was set - has a reasonable value */
+    if (bench_nsteps < 0)
+    {
+        gmx_fatal(FARGS, "Number of steps must be positive.");
+    }
+
+    if (bench_nsteps > 10000 || bench_nsteps < 100)
+    {
+        fprintf(stderr, "WARNING: steps=");
+        fprintf(stderr, gmx_large_int_pfmt, bench_nsteps);
+        fprintf(stderr, ". Are you sure you want to perform so %s steps for each benchmark?\n", (bench_nsteps < 100) ? "few" : "many");
+    }
+
+    if (presteps < 0)
+    {
+        gmx_fatal(FARGS, "Cannot have a negative number of presteps.\n");
+    }
+
+    /* Check for rcoulomb scaling if more than one .tpr file is tested */
+    if (*ntprs > 1 && (*rmin/rcoulomb < 0.75 || *rmax/rcoulomb > 1.25))
+    {
+        fprintf(stderr, "WARNING: Applying extreme scaling factor. I hope you know what you are doing.\n");
+    }
+
+    /* The remaining checks only apply if a fixed number of PME nodes is set;
+     * in that case we do rcoulomb and PME grid tuning only */
+    if (npme_fixed <= -1)
+    {
+        return;
+    }
+
+    /* No more than 50% of all nodes can be assigned as PME-only nodes. */
+    if (2*npme_fixed > nnodes)
+    {
+        gmx_fatal(FARGS, "Cannot have more than %d PME-only nodes for a total of %d nodes (you chose %d).\n",
+                  nnodes/2, nnodes, npme_fixed);
+    }
+    if ((npme_fixed > 0) && (5*npme_fixed < nnodes))
+    {
+        fprintf(stderr, "WARNING: Only %g percent of the nodes are assigned as PME-only nodes.\n",
+                100.0*((real)npme_fixed / (real)nnodes));
+    }
+    if (opt2parg_bSet("-min", npargs, pa) || opt2parg_bSet("-max", npargs, pa))
+    {
+        fprintf(stderr, "NOTE: The -min, -max, and -npme options have no effect when a\n"
+                " fixed number of PME-only nodes is requested with -fix.\n");
+    }
+}
+
+
+/* Returns TRUE when "opt" is needed at launch time.
+ *
+ * Options beginning with one of the prefixes listed below are never passed
+ * on to the launched simulation; any other option is passed on exactly when
+ * it was set on the command line (bSet). */
+static gmx_bool is_launch_file(char *opt, gmx_bool bSet)
+{
+    /* Benchmark-only files (-b*), the input .tpr (-s*), and the -err* and
+     * -p* files */
+    static const char *const skip_prefix[] = { "-b", "-s", "-err", "-p" };
+    size_t                   ip;
+
+    for (ip = 0; ip < sizeof(skip_prefix)/sizeof(skip_prefix[0]); ip++)
+    {
+        if (0 == strncmp(opt, skip_prefix[ip], strlen(skip_prefix[ip])))
+        {
+            return FALSE;
+        }
+    }
+
+    return bSet;
+}
+
+
+/* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs.
+ *
+ * - options starting with "-s" (the input .tpr) are never included,
+ * - options starting with "-b" (_b_enchmark files) are included when they
+ *   are mandatory or were set,
+ * - of all other options only input files that were set are included
+ *   (additional input like -cpi, -ei). */
+static gmx_bool is_bench_file(char *opt, gmx_bool bSet, gmx_bool bOptional, gmx_bool bIsOutput)
+{
+    if (0 == strncmp(opt, "-s", 2))
+    {
+        return FALSE;
+    }
+
+    if (0 == strncmp(opt, "-b", 2))
+    {
+        return (!bOptional || bSet) ? TRUE : FALSE;
+    }
+
+    return (!bIsOutput && bSet) ? TRUE : FALSE;
+}
+
+
+/* Appends 'buf' to the string '*str', growing the memory of '*str' so that
+ * the result including its terminating '\0' fits. */
+static void add_to_string(char **str, char *buf)
+{
+    int newsize;
+
+
+    /* Old contents + appended text + terminator */
+    newsize = strlen(*str) + strlen(buf) + 1;
+    srenew(*str, newsize);
+
+    /* Copy the new text, terminator included, onto the old end of the string */
+    strcpy(*str + strlen(*str), buf);
+}
+
+
+/* Create the command line for the benchmark as well as for the real run.
+ *
+ * Two argument strings are built (and allocated here): *cmd_args_bench for
+ * the benchmark runs and *cmd_args_launch for the final simulation. Each
+ * gets the switches and file options that apply to it, followed by
+ * extra_args at the very end. */
+static void create_command_line_snippets(
+        gmx_bool  bAppendFiles,
+        gmx_bool  bKeepAndNumCPT,
+        gmx_bool  bResetHWay,
+        int       presteps,
+        int       nfile,
+        t_filenm  fnm[],
+        char     *cmd_args_bench[],  /* command line arguments for benchmark runs */
+        char     *cmd_args_launch[], /* command line arguments for simulation run */
+        char      extra_args[])      /* Add this to the end of the command line */
+{
+    int         ifile;
+    char       *optname;
+    const char *filename;
+    gmx_bool    bSet;
+    char        piece[STRLEN];
+
+
+    /* Start from empty strings so that add_to_string can append to them */
+    snew(*cmd_args_bench, 1);
+    snew(*cmd_args_launch, 1);
+    (*cmd_args_launch)[0] = '\0';
+    (*cmd_args_bench)[0]  = '\0';
+
+
+    /*******************************************/
+    /* 1. Process other command line arguments */
+    /*******************************************/
+    if (presteps > 0)
+    {
+        /* Add equilibration steps to benchmark options */
+        sprintf(piece, "-resetstep %d ", presteps);
+        add_to_string(cmd_args_bench, piece);
+    }
+    /* These switches take effect only at launch time */
+    if (!bAppendFiles)
+    {
+        add_to_string(cmd_args_launch, "-noappend ");
+    }
+    if (bKeepAndNumCPT)
+    {
+        add_to_string(cmd_args_launch, "-cpnum ");
+    }
+    if (bResetHWay)
+    {
+        add_to_string(cmd_args_launch, "-resethway ");
+    }
+
+    /********************/
+    /* 2. Process files */
+    /********************/
+    for (ifile = 0; ifile < nfile; ifile++)
+    {
+        optname  = (char *)fnm[ifile].opt;
+        filename = opt2fn(optname, nfile, fnm);
+        bSet     = opt2bSet(optname, nfile, fnm);
+
+        /* The option together with its file name, as given to g_tune_pme */
+        sprintf(piece, "%s %s ", optname, filename);
+
+        if (is_bench_file(optname, bSet, is_optional(&fnm[ifile]), is_output(&fnm[ifile])) )
+        {
+            /* All options starting with -b* need the 'b' removed,
+             * therefore overwrite the piece. Such options are never launch
+             * files, so the launch string below is not affected */
+            if (0 == strncmp(optname, "-b", 2))
+            {
+                sprintf(piece, "-%s %s ", &optname[2], filename);
+            }
+
+            add_to_string(cmd_args_bench, piece);
+        }
+
+        if (is_launch_file(optname, bSet) )
+        {
+            add_to_string(cmd_args_launch, piece);
+        }
+    }
+
+    add_to_string(cmd_args_bench, extra_args);
+    add_to_string(cmd_args_launch, extra_args);
+}
+
+
+/* Marks every entry of fnm[] whose option string equals 'opt' as set
+ * (adds the ffSET bit to its flag) */
+static void setopt(const char *opt, int nfile, t_filenm fnm[])
+{
+    int idx;
+
+    for (idx = 0; idx < nfile; idx++)
+    {
+        if (0 != strcmp(opt, fnm[idx].opt))
+        {
+            continue;
+        }
+        fnm[idx].flag |= ffSET;
+    }
+}
+
+
+/* Reads the tpr file given with -s and ...
+ * 1. marks the output files that get triggered by a tpr option (pulling,
+ *    free energy, test particle insertion, normal mode analysis) as 'set',
+ *    to allow for a proper cleanup after each tuning run,
+ * 2. stores rcoulomb from the tpr in *rcoulomb,
+ * 3. returns the PME:PP load estimate from pme_load_estimate(). */
+static float inspect_tpr(int nfile, t_filenm fnm[], real *rcoulomb)
+{
+    t_inputrec ir;
+    t_state    state;
+    gmx_mtop_t mtop;
+
+
+    read_tpx_state(opt2fn("-s", nfile, fnm), &ir, &state, NULL, &mtop);
+
+    /* Pulling requested in the .tpr file? */
+    if (epullNO != ir.ePull)
+    {
+        setopt("-pf", nfile, fnm);
+        setopt("-px", nfile, fnm);
+    }
+    /* Free energy simulation requested? */
+    if (efepNO != ir.efep)
+    {
+        setopt("-dhdl", nfile, fnm);
+    }
+    /* Test particle insertion requested? */
+    if (EI_TPI(ir.eI))
+    {
+        setopt("-tpi", nfile, fnm);
+        setopt("-tpid", nfile, fnm);
+    }
+    /* Normal mode analysis requested? */
+    if (eiNM == ir.eI)
+    {
+        setopt("-mtx", nfile, fnm);
+    }
+
+    *rcoulomb = ir.rcoulomb;
+
+    return pme_load_estimate(&mtop, &ir, state.box);
+}
+
+
+/* Couples each optional file option with its benchmark counterpart:
+ * if e.g. -eo is set, -beo gets set as well, and if -beo is set, so is -eo. */
+static void couple_files_options(int nfile, t_filenm fnm[])
+{
+    int   ifile;
+    char *opt;
+    char  partner[20];
+
+
+    for (ifile = 0; ifile < nfile; ifile++)
+    {
+        opt = (char *)fnm[ifile].opt;
+
+        /* Only optional files that are set drag their counterpart along */
+        if (!is_optional(&fnm[ifile]) || 0 == (fnm[ifile].flag & ffSET))
+        {
+            continue;
+        }
+
+        if (0 == strncmp(opt, "-b", 2))
+        {
+            /* Benchmark option -bXYZ: the counterpart is -XYZ */
+            sprintf(partner, "-%s", &opt[2]);
+        }
+        else
+        {
+            /* Regular option -XYZ: the counterpart is -bXYZ */
+            sprintf(partner, "-b%s", &opt[1]);
+        }
+        setopt(partner, nfile, fnm);
+    }
+}
+
+
+/* Returns the current time in seconds: from gettimeofday() (including the
+ * microsecond part) when HAVE_GETTIMEOFDAY is defined, from time() otherwise */
+static double gettime()
+{
+#ifdef HAVE_GETTIMEOFDAY
+    struct timeval now;
+
+    gettimeofday(&now, NULL);
+
+    return (double) now.tv_sec + 1e-6*(double)now.tv_usec;
+#else
+    return (double) time(NULL);
+#endif
+}
+
+
+#define BENCHSTEPS (1000)
+
+/* Entry point of g_tune_pme.
+ *
+ * Parses the command line, assembles the mdrun argument strings for the
+ * benchmark runs and for the real run, writes the benchmark .tpr files,
+ * runs and analyses the benchmarks (unless -nobench), writes the results to
+ * the performance output file (-p) and, after the benchmarks, hands over to
+ * launch_simulation(). Unparsed command line arguments are passed on to
+ * mdrun unchanged.
+ *
+ * Returns 0; errors are reported through gmx_fatal().
+ */
+int gmx_tune_pme(int argc, char *argv[])
+{
+    const char     *desc[] = {
+        "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of processors/threads, this program systematically",
+        "times [TT]mdrun[tt] with various numbers of PME-only nodes and determines",
+        "which setting is fastest. It will also test whether performance can",
+        "be enhanced by shifting load from the reciprocal to the real space",
+        "part of the Ewald sum. ",
+        "Simply pass your [TT].tpr[tt] file to [TT]g_tune_pme[tt] together with other options",
+        "for [TT]mdrun[tt] as needed.[PAR]",
+        "Which executables are used can be set in the environment variables",
+        "MPIRUN and MDRUN. If these are not present, 'mpirun' and 'mdrun'",
+        "will be used as defaults. Note that for certain MPI frameworks you",
+        "need to provide a machine- or hostfile. This can also be passed",
+        "via the MPIRUN variable, e.g.[PAR]",
+        "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt][PAR]",
+        "Please call [TT]g_tune_pme[tt] with the normal options you would pass to",
+        "[TT]mdrun[tt] and add [TT]-np[tt] for the number of processors to perform the",
+        "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
+        "to repeat each test several times to get better statistics. [PAR]",
+        "[TT]g_tune_pme[tt] can test various real space / reciprocal space workloads",
+        "for you. With [TT]-ntpr[tt] you control how many extra [TT].tpr[tt] files will be",
+        "written with enlarged cutoffs and smaller Fourier grids respectively.",
+        "Typically, the first test (number 0) will be with the settings from the input",
+        "[TT].tpr[tt] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
+        "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
+        "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
+        "The remaining [TT].tpr[tt] files will have equally-spaced Coulomb radii (and Fourier "
+        "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
+        "if you just seek the optimal number of PME-only nodes; in that case",
+        "your input [TT].tpr[tt] file will remain unchanged.[PAR]",
+        "For the benchmark runs, the default of 1000 time steps should suffice for most",
+        "MD systems. The dynamic load balancing needs about 100 time steps",
+        "to adapt to local load imbalances, therefore the time step counters",
+        "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
+        "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher value.",
+        "From the 'DD' load imbalance entries in the md.log output file you",
+        "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]"
+        "[TT]g_tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
+        "After calling [TT]mdrun[tt] several times, detailed performance information",
+        "is available in the output file [TT]perf.out.[tt] ",
+        "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
+        "(options [TT]-b[tt]*), these will be automatically deleted after each test.[PAR]",
+        "If you want the simulation to be started automatically with the",
+        "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
+    };
+
+    int             nnodes         = 1;
+    int             repeats        = 2;
+    int             pmeentries     = 0; /* How many values for -npme do we actually test for each tpr file */
+    real            maxPMEfraction = 0.50;
+    real            minPMEfraction = 0.25;
+    int             maxPMEnodes, minPMEnodes;
+    float           guessPMEratio; /* guessed PME:PP ratio based on the tpr file */
+    float           guessPMEnodes;
+    int             npme_fixed     = -2; /* If >= -1, use only this number
+                                          * of PME-only nodes */
+    int             ntprs          = 0;
+    real            rmin           = 0.0, rmax = 0.0; /* min and max value for rcoulomb if scaling is requested */
+    real            rcoulomb       = -1.0; /* Coulomb radius as set in .tpr file */
+    gmx_bool        bScaleRvdw     = TRUE;
+    gmx_large_int_t bench_nsteps   = BENCHSTEPS;
+    gmx_large_int_t new_sim_nsteps = -1; /* -1 indicates: not set by the user */
+    gmx_large_int_t cpt_steps      = 0; /* Step counter in .cpt input file */
+    int             presteps       = 100; /* Do a full cycle reset after presteps steps */
+    gmx_bool        bOverwrite     = FALSE, bKeepTPR;
+    gmx_bool        bLaunch        = FALSE;
+    char           *ExtraArgs      = NULL;
+    char          **tpr_names      = NULL;
+    const char     *simulation_tpr = NULL;
+    int             best_npme, best_tpr;
+    int             sim_part = 1; /* For benchmarks with checkpoint files */
+    char            bbuf[STRLEN];
+
+    /* Default program names if nothing else is found */
+    char        *cmd_mpirun = NULL, *cmd_mdrun = NULL;
+    char        *cmd_args_bench, *cmd_args_launch;
+    char        *cmd_np = NULL;
+
+    t_perf     **perfdata = NULL;
+    t_inputinfo *info;
+    int          i;
+    FILE        *fp;
+    t_commrec   *cr;
+
+    /* Print out how long the tuning took */
+    double          seconds;
+
+    static t_filenm fnm[] = {
+        /* g_tune_pme */
+        { efOUT, "-p", "perf", ffWRITE },
+        { efLOG, "-err", "bencherr", ffWRITE },
+        { efTPX, "-so", "tuned", ffWRITE },
+        /* mdrun: */
+        { efTPX, NULL, NULL, ffREAD },
+        { efTRN, "-o", NULL, ffWRITE },
+        { efXTC, "-x", NULL, ffOPTWR },
+        { efCPT, "-cpi", NULL, ffOPTRD },
+        { efCPT, "-cpo", NULL, ffOPTWR },
+        { efSTO, "-c", "confout", ffWRITE },
+        { efEDR, "-e", "ener", ffWRITE },
+        { efLOG, "-g", "md", ffWRITE },
+        { efXVG, "-dhdl", "dhdl", ffOPTWR },
+        { efXVG, "-field", "field", ffOPTWR },
+        { efXVG, "-table", "table", ffOPTRD },
+        { efXVG, "-tabletf", "tabletf", ffOPTRD },
+        { efXVG, "-tablep", "tablep", ffOPTRD },
+        { efXVG, "-tableb", "table", ffOPTRD },
+        { efTRX, "-rerun", "rerun", ffOPTRD },
+        { efXVG, "-tpi", "tpi", ffOPTWR },
+        { efXVG, "-tpid", "tpidist", ffOPTWR },
+        { efEDI, "-ei", "sam", ffOPTRD },
+        { efXVG, "-eo", "edsam", ffOPTWR },
+        { efGCT, "-j", "wham", ffOPTRD },
+        { efGCT, "-jo", "bam", ffOPTWR },
+        { efXVG, "-ffout", "gct", ffOPTWR },
+        { efXVG, "-devout", "deviatie", ffOPTWR },
+        { efXVG, "-runav", "runaver", ffOPTWR },
+        { efXVG, "-px", "pullx", ffOPTWR },
+        { efXVG, "-pf", "pullf", ffOPTWR },
+        { efXVG, "-ro", "rotation", ffOPTWR },
+        { efLOG, "-ra", "rotangles", ffOPTWR },
+        { efLOG, "-rs", "rotslabs", ffOPTWR },
+        { efLOG, "-rt", "rottorque", ffOPTWR },
+        { efMTX, "-mtx", "nm", ffOPTWR },
+        { efNDX, "-dn", "dipole", ffOPTWR },
+        /* Output files that are deleted after each benchmark run */
+        { efTRN, "-bo", "bench", ffWRITE },
+        { efXTC, "-bx", "bench", ffWRITE },
+        { efCPT, "-bcpo", "bench", ffWRITE },
+        { efSTO, "-bc", "bench", ffWRITE },
+        { efEDR, "-be", "bench", ffWRITE },
+        { efLOG, "-bg", "bench", ffWRITE },
+        { efXVG, "-beo", "benchedo", ffOPTWR },
+        { efXVG, "-bdhdl", "benchdhdl", ffOPTWR },
+        { efXVG, "-bfield", "benchfld", ffOPTWR },
+        { efXVG, "-btpi", "benchtpi", ffOPTWR },
+        { efXVG, "-btpid", "benchtpid", ffOPTWR },
+        { efGCT, "-bjo", "bench", ffOPTWR },
+        { efXVG, "-bffout", "benchgct", ffOPTWR },
+        { efXVG, "-bdevout", "benchdev", ffOPTWR },
+        { efXVG, "-brunav", "benchrnav", ffOPTWR },
+        { efXVG, "-bpx", "benchpx", ffOPTWR },
+        { efXVG, "-bpf", "benchpf", ffOPTWR },
+        { efXVG, "-bro", "benchrot", ffOPTWR },
+        { efLOG, "-bra", "benchrota", ffOPTWR },
+        { efLOG, "-brs", "benchrots", ffOPTWR },
+        { efLOG, "-brt", "benchrott", ffOPTWR },
+        { efMTX, "-bmtx", "benchn", ffOPTWR },
+        { efNDX, "-bdn", "bench", ffOPTWR }
+    };
+
+    gmx_bool        bThreads = FALSE;
+
+    int             nthreads = 1;
+
+    const char     *procstring[] =
+    { NULL, "-np", "-n", "none", NULL };
+    const char     *npmevalues_opt[] =
+    { NULL, "auto", "all", "subset", NULL };
+
+    gmx_bool        bAppendFiles          = TRUE;
+    gmx_bool        bKeepAndNumCPT        = FALSE;
+    gmx_bool        bResetCountersHalfWay = FALSE;
+    gmx_bool        bBenchmark            = TRUE;
+
+    output_env_t    oenv = NULL;
+
+    t_pargs         pa[] = {
+        /***********************/
+        /* g_tune_pme options: */
+        /***********************/
+        { "-np", FALSE, etINT, {&nnodes},
+          "Number of nodes to run the tests on (must be > 2 for separate PME nodes)" },
+        { "-npstring", FALSE, etENUM, {procstring},
+          "Specify the number of processors to [TT]$MPIRUN[tt] using this string"},
+        { "-ntmpi", FALSE, etINT, {&nthreads},
+          "Number of MPI-threads to run the tests on (turns MPI & mpirun off)"},
+        { "-r", FALSE, etINT, {&repeats},
+          "Repeat each test this often" },
+        { "-max", FALSE, etREAL, {&maxPMEfraction},
+          "Max fraction of PME nodes to test with" },
+        { "-min", FALSE, etREAL, {&minPMEfraction},
+          "Min fraction of PME nodes to test with" },
+        { "-npme", FALSE, etENUM, {npmevalues_opt},
+          "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a reasonable subset. "
+          "Auto neglects -min and -max and chooses reasonable values around a guess for npme derived from the .tpr"},
+        { "-fix", FALSE, etINT, {&npme_fixed},
+          "If >= -1, do not vary the number of PME-only nodes, instead use this fixed value and only vary rcoulomb and the PME grid spacing."},
+        { "-rmax", FALSE, etREAL, {&rmax},
+          "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid downscaling)" },
+        { "-rmin", FALSE, etREAL, {&rmin},
+          "If >0, minimal rcoulomb for -ntpr>1" },
+        { "-scalevdw", FALSE, etBOOL, {&bScaleRvdw},
+          "Scale rvdw along with rcoulomb"},
+        { "-ntpr", FALSE, etINT, {&ntprs},
+          "Number of [TT].tpr[tt] files to benchmark. Create this many files with different rcoulomb scaling factors depending on -rmin and -rmax. "
+          "If < 1, automatically choose the number of [TT].tpr[tt] files to test" },
+        { "-steps", FALSE, etGMX_LARGE_INT, {&bench_nsteps},
+          "Take timings for this many steps in the benchmark runs" },
+        { "-resetstep", FALSE, etINT, {&presteps},
+          "Let dlb equilibrate this many steps before timings are taken (reset cycle counters after this many steps)" },
+        { "-simsteps", FALSE, etGMX_LARGE_INT, {&new_sim_nsteps},
+          "If non-negative, perform this many steps in the real run (overwrites nsteps from [TT].tpr[tt], add [TT].cpt[tt] steps)" },
+        { "-launch", FALSE, etBOOL, {&bLaunch},
+          "Launch the real simulation after optimization" },
+        { "-bench", FALSE, etBOOL, {&bBenchmark},
+          "Run the benchmarks or just create the input [TT].tpr[tt] files?" },
+        /******************/
+        /* mdrun options: */
+        /******************/
+        /* We let g_tune_pme parse and understand these options, because we need to
+         * prevent that they appear on the mdrun command line for the benchmarks */
+        { "-append", FALSE, etBOOL, {&bAppendFiles},
+          "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" },
+        { "-cpnum", FALSE, etBOOL, {&bKeepAndNumCPT},
+          "Keep and number checkpoint files (launch only)" },
+        { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
+          "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] (launch only)" }
+    };
+
+#define NFILE asize(fnm)
+
+    /* Remember the start time so that the total tuning time can be reported */
+    seconds = gettime();
+
+    parse_common_args(&argc, argv, PCA_NOEXIT_ON_ARGS,
+                      NFILE, fnm, asize(pa), pa, asize(desc), desc,
+                      0, NULL, &oenv);
+
+    /* Store the remaining unparsed command line entries in a string which
+     * is then attached to the mdrun command line */
+    snew(ExtraArgs, 1);
+    ExtraArgs[0] = '\0';
+    for (i = 1; i < argc; i++) /* argc will now be 1 if everything was understood */
+    {
+        add_to_string(&ExtraArgs, argv[i]);
+        add_to_string(&ExtraArgs, " ");
+    }
+
+    if (opt2parg_bSet("-ntmpi", asize(pa), pa))
+    {
+        bThreads = TRUE;
+        if (opt2parg_bSet("-npstring", asize(pa), pa))
+        {
+            fprintf(stderr, "WARNING: -npstring has no effect when using threads.\n");
+        }
+
+        if (nnodes > 1)
+        {
+            gmx_fatal(FARGS, "Can't run multi-threaded MPI simulation yet!");
+        }
+        /* and now we just set this; a bit of an ugly hack*/
+        nnodes = nthreads;
+    }
+    /* Check for PME:PP ratio and whether tpr triggers additional output files */
+    guessPMEratio = inspect_tpr(NFILE, fnm, &rcoulomb);
+
+    /* Automatically set -beo options if -eo is set etc. */
+    couple_files_options(NFILE, fnm);
+
+    /* Construct the command line arguments for benchmark runs
+     * as well as for the simulation run */
+    if (bThreads)
+    {
+        sprintf(bbuf, " -ntmpi %d ", nthreads);
+    }
+    else
+    {
++        /* This string will be used for MPI runs and will appear after the
++         * mpirun command. */
++        if (strcmp(procstring[0], "none") != 0)
++        {
++            sprintf(bbuf, " %s %d ", procstring[0], nnodes);
++        }
++        else
++        {
++            sprintf(bbuf, " ");
++        }
+    }
+
+    cmd_np = bbuf;
+
+    create_command_line_snippets(bAppendFiles, bKeepAndNumCPT, bResetCountersHalfWay, presteps,
+                                 NFILE, fnm, &cmd_args_bench, &cmd_args_launch, ExtraArgs);
+
+    /* Read in checkpoint file if requested */
+    sim_part = 1;
+    if (opt2bSet("-cpi", NFILE, fnm))
+    {
+        snew(cr, 1);
+        cr->duty = DUTY_PP; /* makes the following routine happy */
+        read_checkpoint_simulation_part(opt2fn("-cpi", NFILE, fnm),
+                                        &sim_part, &cpt_steps, cr,
+                                        FALSE, NFILE, fnm, NULL, NULL);
+        sfree(cr);
+        sim_part++;
+        /* sim_part will now be 1 if no checkpoint file was found */
+        if (sim_part <= 1)
+        {
+            gmx_fatal(FARGS, "Checkpoint file %s not found!", opt2fn("-cpi", NFILE, fnm));
+        }
+    }
+
+    /* Open performance output file and write header info */
+    fp = ffopen(opt2fn("-p", NFILE, fnm), "w");
+
+    /* Make a quick consistency check of command line parameters */
+    check_input(nnodes, repeats, &ntprs, &rmin, rcoulomb, &rmax,
+                maxPMEfraction, minPMEfraction, npme_fixed,
+                bench_nsteps, fnm, NFILE, sim_part, presteps,
+                asize(pa), pa);
+
+    /* Determine the maximum and minimum number of PME nodes to test,
+     * the actual list of settings is built in do_the_tests(). */
+    if ((nnodes > 2) && (npme_fixed < -1))
+    {
+        if (0 == strcmp(npmevalues_opt[0], "auto"))
+        {
+            /* Determine the npme range automatically based on the PME:PP load guess */
+            if (guessPMEratio > 1.0)
+            {
+                /* More PME than PP work, probably we do not need separate PME nodes at all! */
+                maxPMEnodes = nnodes/2;
+                minPMEnodes = nnodes/2;
+            }
+            else
+            {
+                /* PME : PP load is in the range 0..1, let's test around the guess */
+                guessPMEnodes = nnodes/(1.0 + 1.0/guessPMEratio);
+                minPMEnodes   = floor(0.7*guessPMEnodes);
+                maxPMEnodes   = ceil(1.6*guessPMEnodes);
+                maxPMEnodes   = min(maxPMEnodes, nnodes/2);
+            }
+        }
+        else
+        {
+            /* Determine the npme range based on user input */
+            maxPMEnodes = floor(maxPMEfraction*nnodes);
+            minPMEnodes = max(floor(minPMEfraction*nnodes), 0);
+            fprintf(stdout, "Will try runs with %d ", minPMEnodes);
+            if (maxPMEnodes != minPMEnodes)
+            {
+                fprintf(stdout, "- %d ", maxPMEnodes);
+            }
+            fprintf(stdout, "PME-only nodes.\n Note that the automatic number of PME-only nodes and no separate PME nodes are always tested.\n");
+        }
+    }
+    else
+    {
+        maxPMEnodes = 0;
+        minPMEnodes = 0;
+    }
+
+    /* Get the commands we need to set up the runs from environment variables */
+    get_program_paths(bThreads, &cmd_mpirun, cmd_np, &cmd_mdrun, repeats);
+
+    /* Print some header info to file */
+    sep_line(fp);
+    fprintf(fp, "\n P E R F O R M A N C E R E S U L T S\n");
+    sep_line(fp);
+    fprintf(fp, "%s for Gromacs %s\n", ShortProgram(), GromacsVersion());
+    if (!bThreads)
+    {
+        fprintf(fp, "Number of nodes : %d\n", nnodes);
+        fprintf(fp, "The mpirun command is : %s\n", cmd_mpirun);
+        if (strcmp(procstring[0], "none") != 0)
+        {
+            fprintf(fp, "Passing # of nodes via : %s\n", procstring[0]);
+        }
+        else
+        {
+            fprintf(fp, "Not setting number of nodes in system call\n");
+        }
+    }
+    else
+    {
+        fprintf(fp, "Number of threads : %d\n", nnodes);
+    }
+
+    fprintf(fp, "The mdrun command is : %s\n", cmd_mdrun);
+    fprintf(fp, "mdrun args benchmarks : %s\n", cmd_args_bench);
+    fprintf(fp, "Benchmark steps : ");
+    fprintf(fp, gmx_large_int_pfmt, bench_nsteps);
+    fprintf(fp, "\n");
+    fprintf(fp, "dlb equilibration steps : %d\n", presteps);
+    if (sim_part > 1)
+    {
+        fprintf(fp, "Checkpoint time step : ");
+        fprintf(fp, gmx_large_int_pfmt, cpt_steps);
+        fprintf(fp, "\n");
+    }
+    fprintf(fp, "mdrun args at launchtime: %s\n", cmd_args_launch);
+
+    if (new_sim_nsteps >= 0)
+    {
+        bOverwrite = TRUE;
+        fprintf(stderr, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE, fnm));
+        fprintf(stderr, gmx_large_int_pfmt, new_sim_nsteps+cpt_steps);
+        fprintf(stderr, " steps.\n");
+        fprintf(fp, "Simulation steps : ");
+        fprintf(fp, gmx_large_int_pfmt, new_sim_nsteps);
+        fprintf(fp, "\n");
+    }
+    if (repeats > 1)
+    {
+        fprintf(fp, "Repeats for each test : %d\n", repeats);
+    }
+
+    if (npme_fixed >= -1)
+    {
+        fprintf(fp, "Fixing -npme at : %d\n", npme_fixed);
+    }
+
+    fprintf(fp, "Input file : %s\n", opt2fn("-s", NFILE, fnm));
+    fprintf(fp, " PME/PP load estimate : %g\n", guessPMEratio);
+
+    /* Allocate memory for the inputinfo struct: */
+    snew(info, 1);
+    info->nr_inputfiles = ntprs;
+    /* Each array holds one entry per tpr file and is allocated exactly once.
+     * (Allocating inside a loop over the tpr files would leak all but the
+     * last allocation of every array.) */
+    if (ntprs > 0)
+    {
+        snew(info->rcoulomb, ntprs);
+        snew(info->rvdw, ntprs);
+        snew(info->rlist, ntprs);
+        snew(info->rlistlong, ntprs);
+        snew(info->nkx, ntprs);
+        snew(info->nky, ntprs);
+        snew(info->nkz, ntprs);
+        snew(info->fsx, ntprs);
+        snew(info->fsy, ntprs);
+        snew(info->fsz, ntprs);
+    }
+    /* Make alternative tpr files to test: */
+    snew(tpr_names, ntprs);
+    for (i = 0; i < ntprs; i++)
+    {
+        snew(tpr_names[i], STRLEN);
+    }
+
+    /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
+     * different grids could be found. */
+    make_benchmark_tprs(opt2fn("-s", NFILE, fnm), tpr_names, bench_nsteps+presteps,
+                        cpt_steps, rmin, rmax, bScaleRvdw, &ntprs, info, fp);
+
+    /********************************************************************************/
+    /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats */
+    /********************************************************************************/
+    snew(perfdata, ntprs);
+    if (bBenchmark)
+    {
+        do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries,
+                     repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
+                     cmd_args_bench, fnm, NFILE, presteps, cpt_steps);
+
+        fprintf(fp, "\nTuning took%8.1f minutes.\n", (gettime()-seconds)/60.0);
+
+        /* Analyse the results and give a suggestion for optimal settings: */
+        bKeepTPR = analyze_data(fp, opt2fn("-p", NFILE, fnm), perfdata, nnodes, ntprs, pmeentries,
+                                repeats, info, &best_tpr, &best_npme);
+
+        /* Take the best-performing tpr file and enlarge nsteps to original value */
+        if (bKeepTPR && !bOverwrite)
+        {
+            simulation_tpr = opt2fn("-s", NFILE, fnm);
+        }
+        else
+        {
+            simulation_tpr = opt2fn("-so", NFILE, fnm);
+            modify_PMEsettings(bOverwrite ? (new_sim_nsteps+cpt_steps) : info->orig_sim_steps,
+                               info->orig_init_step, tpr_names[best_tpr], simulation_tpr);
+        }
+
+        /* Let's get rid of the temporary benchmark input files */
+        for (i = 0; i < ntprs; i++)
+        {
+            fprintf(stdout, "Deleting temporary benchmark input file %s\n", tpr_names[i]);
+            remove(tpr_names[i]);
+        }
+
+        /* Now start the real simulation if the user requested it ... */
+        launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun,
+                          cmd_args_launch, simulation_tpr, best_npme);
+    }
+    ffclose(fp);
+
+    /* ... or simply print the performance results to screen: */
+    if (!bLaunch)
+    {
+        finalize(opt2fn("-p", NFILE, fnm));
+    }
+
+    return 0;
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include "buildinfo.h"
+
+/* Prints the CUDA compiler, the CUDA compiler flags and the versions of the
+ * CUDA driver and runtime to fp.
+ *
+ * CUDA reports versions as a single integer, 1000*major + 10*minor, so the
+ * minor version is (version % 1000) / 10. Printing version % 100 would show
+ * e.g. 5.5 as "5.50".
+ * If a version query fails, the corresponding value stays 0 and is printed
+ * as "0.0". */
+void gmx_print_version_info_gpu(FILE *fp)
+{
+    int cuda_driver, cuda_runtime;
+
+    fprintf(fp, "CUDA compiler: %s\n", CUDA_NVCC_COMPILER_INFO);
++    fprintf(fp, "CUDA compiler flags:%s\n", CUDA_NVCC_COMPILER_FLAGS);
+    cuda_driver = 0;
+    cudaDriverGetVersion(&cuda_driver);
+    cuda_runtime = 0;
+    cudaRuntimeGetVersion(&cuda_runtime);
+    fprintf(fp, "CUDA driver: %d.%d\n", cuda_driver/1000, (cuda_driver%1000)/10);
+    fprintf(fp, "CUDA runtime: %d.%d\n", cuda_runtime/1000, (cuda_runtime%1000)/10);
+}
--- /dev/null
-
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This file is part of GROMACS.
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+#include "types/enums.h"
+#include "types/hw_info.h"
+#include "types/commrec.h"
+#include "gmx_fatal.h"
+#include "gmx_fatal_collective.h"
+#include "smalloc.h"
+#include "gpu_utils.h"
+#include "statutil.h"
+#include "gmx_detect_hardware.h"
+#include "main.h"
+#include "md_logging.h"
+
++#include "thread_mpi/threads.h"
++
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
- void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
+#include "windows.h"
+#endif
+
+/* The user cannot pass more than 10 different GPU IDs, because each ID is
+ * assumed to be represented by a single digit. However, as multiple
+ * processes can share a GPU, the ID list can still have more than 10 entries.
+ * To account for potential extreme cases we'll set the limit to a pretty
+ * ridiculous number. */
+static unsigned int max_gpu_ids_user = 64;
+
+static const char * invalid_gpuid_hint =
+ "A delimiter-free sequence of valid numeric IDs of available GPUs is expected.";
+
++/* The globally shared hwinfo structure. */
++static gmx_hw_info_t *hwinfo_g;
++/* A reference counter for the hwinfo structure */
++static int n_hwinfo = 0;
++/* A lock to protect the hwinfo structure */
++static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
++
++
+/* FW decl. */
- int npppn, ntmpi_pp, ngpu;
- char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
- char gpu_plural[2];
- gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
++static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
+
+/* Writes the info strings of all detected GPUs into sbuf, one device per
+ * line; each line starts with a blank and the last line has no trailing
+ * newline. sbuf is not bounds-checked here, the caller must provide a
+ * large enough buffer. */
+static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
+{
+    int  dev;
+    int  last = gpu_info->ncuda_dev - 1;
+    char devinfo[STRLEN];
+
+    sbuf[0] = '\0';
+    for (dev = 0; dev <= last; dev++)
+    {
+        get_gpu_device_info_string(devinfo, gpu_info, dev);
+        strcat(sbuf, " ");
+        strcat(sbuf, devinfo);
+        /* Separate the entries, but do not terminate the last one */
+        if (dev != last)
+        {
+            strcat(sbuf, "\n");
+        }
+    }
+}
+
+/* Reports through md_print_warn() how many GPUs were detected (followed by
+ * the per-device info strings) or that none were detected. With real MPI
+ * (GMX_MPI without GMX_THREAD_MPI) the message also names the host. */
+static void print_gpu_detection_stats(FILE                 *fplog,
+                                      const gmx_gpu_info_t *gpu_info,
+                                      const t_commrec      *cr)
+{
+    char hostnote[266], devlist[STRLEN];
+    int  ndetected = gpu_info->ncuda_dev;
+
+#if defined GMX_MPI && !defined GMX_THREAD_MPI
+    /* We only print the detection on one, of possibly multiple, nodes */
+    strcpy(hostnote, " on host ");
+    gmx_gethostname(hostnote+9, 256);
+#else
+    /* We detect all relevant GPUs */
+    hostnote[0] = '\0';
+#endif
+
+    if (ndetected <= 0)
+    {
+        md_print_warn(cr, fplog, "No GPUs detected%s\n", hostnote);
+    }
+    else
+    {
+        sprint_gpus(devlist, gpu_info);
+        md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
+                      ndetected, (ndetected > 1) ? "s" : "", hostnote, devlist);
+    }
+}
+
+/* Reports through md_print_info() which GPUs were selected for this run
+ * and whether the selection was made by the user or automatically. If GPUs
+ * were detected but none is used, a note suggesting the Verlet scheme is
+ * printed instead. */
+static void print_gpu_use_stats(FILE                 *fplog,
+                                const gmx_gpu_info_t *gpu_info,
+                                const t_commrec      *cr)
+{
+    char msg[STRLEN], idstr[STRLEN];
+    int  k;
+    int  nused     = gpu_info->ncuda_dev_use;
+    int  ndetected = gpu_info->ncuda_dev;
+
+    if (ndetected > 0 && nused < 1)
+    {
+        /* Issue note if GPUs are available but not used */
+        sprintf(msg,
+                "%d compatible GPU%s detected in the system, but none will be used.\n"
+                "Consider trying GPU acceleration with the Verlet scheme!",
+                ndetected, (ndetected > 1) ? "s" : "");
+    }
+    else
+    {
+        sprintf(msg, "%d GPU%s %sselected for this run: ",
+                nused, (nused > 1) ? "s" : "",
+                gpu_info->bUserSet ? "user-" : "auto-");
+        /* Append the comma-separated list of device IDs */
+        for (k = 0; k < nused; k++)
+        {
+            sprintf(idstr, "#%d", get_gpu_device_id(gpu_info, k));
+            strcat(msg, idstr);
+            if (k < nused - 1)
+            {
+                strcat(msg, ", ");
+            }
+        }
+    }
+    md_print_info(cr, fplog, "%s\n\n", msg);
+}
+
+/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding
+ * to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment.
+ *
+ * idstr  - the ID string, one decimal digit per GPU ID, no delimiters
+ * nid    - on return, the number of IDs found (the length of idstr)
+ * idlist - on return, idlist[i] holds the ID given by the i-th character;
+ *          must have room for at least strlen(idstr) entries
+ *
+ * Calls gmx_fatal() when more than max_gpu_ids_user IDs are given or when
+ * idstr contains a character that is not a digit. */
+static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
+{
+    int    i;
+    size_t len_idstr;
+
+    len_idstr = strlen(idstr);
+
+    if (len_idstr > max_gpu_ids_user)
+    {
+        /* The arguments are cast to int to match the %d conversions;
+         * passing a size_t to %d is undefined behaviour */
+        gmx_fatal(FARGS, "%d GPU IDs provided, but only at most %d are supported",
+                  (int)len_idstr, (int)max_gpu_ids_user);
+    }
+
+    /* Safe to narrow: len_idstr <= max_gpu_ids_user at this point */
+    *nid = (int)len_idstr;
+
+    for (i = 0; i < *nid; i++)
+    {
+        if (idstr[i] < '0' || idstr[i] > '9')
+        {
+            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n%s\n",
+                      idstr[i], invalid_gpuid_hint);
+        }
+        idlist[i] = idstr[i] - '0';
+    }
+}
+
+/* Check that the launch configuration is consistent with the detected
+ * hardware: compare the number of PP ranks per node (cr->nrank_pp_intranode)
+ * with the number of GPUs selected for use, warn or abort through gmx_fatal()
+ * on a mismatch, and reduce the number of GPUs used where there are more
+ * GPUs than PP ranks.
+ *
+ * Ranks without DUTY_PP return immediately. The checks themselves run only
+ * once per hwinfo: they are serialized by a function-local static mutex and
+ * skipped when hwinfo->bConsistencyChecked is already set.
+ * With GMX_MPI defined and PAR(cr), the ranks of cr->mpi_comm_mygroup
+ * synchronize on a barrier before returning.
+ *
+ * NOTE(review): the gmx_fatal() calls in the guarded section are made while
+ * cons_lock is held -- presumably fine because gmx_fatal() does not return;
+ * confirm.
+ */
+void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
+ const t_commrec *cr, int ntmpi_requested,
+ gmx_bool bUseGPU)
+{
- btMPI = bMPI = FALSE;
- bNthreadsAuto = FALSE;
++ int npppn, ntmpi_pp, ngpu;
++ char sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
++ char gpu_plural[2];
++ gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
++ int ret;
++ static tMPI_Thread_mutex_t cons_lock = TMPI_THREAD_MUTEX_INITIALIZER;
++
+
+ assert(hwinfo);
+ assert(cr);
+
- btMPI = TRUE;
- bNthreadsAuto = (ntmpi_requested < 1);
++ /* Below we only do consistency checks for PP and GPUs,
++ * this is irrelevant for PME only nodes, so in that case we return
++ * here.
++ */
++ if (!(cr->duty & DUTY_PP))
++ {
++ return;
++ }
++
++ /* We run this function only once, but must make sure that all threads
++ that are alive run this function, so they get consistent data. We
++ achieve this by mutual exclusion and returning if the structure is
++ already properly checked & set */
++ ret = tMPI_Thread_mutex_lock(&cons_lock);
++ if (ret != 0)
++ {
++ gmx_fatal(FARGS, "Error locking cons mutex: %s", strerror(errno));
++ }
++
++ if (!hwinfo->bConsistencyChecked)
++ {
++ btMPI = bMPI = FALSE;
++ bNthreadsAuto = FALSE;
+#if defined(GMX_THREAD_MPI)
- bMPI = TRUE;
++ btMPI = TRUE;
++ bNthreadsAuto = (ntmpi_requested < 1);
+#elif defined(GMX_LIB_MPI)
- bGPUBin = TRUE;
++ bMPI = TRUE;
+#endif
+
+#ifdef GMX_GPU
- bGPUBin = FALSE;
++ bGPUBin = TRUE;
+#else
- /* GPU emulation detection is done later, but we need here as well
- * -- uncool, but there's no elegant workaround */
- bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
- bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
++ bGPUBin = FALSE;
+#endif
+
- if (SIMMASTER(cr))
- {
- /* check the acceleration mdrun is compiled with against hardware capabilities */
- /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
- * Might not hurt to add an extra check over MPI. */
++ /* GPU emulation detection is done later, but we need here as well
++ * -- uncool, but there's no elegant workaround */
++ bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL);
++ bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
+
- }
-
- /* Below we only do consistency checks for PP and GPUs,
- * this is irrelevant for PME only nodes, so in that case we return here.
- */
- if (!(cr->duty & DUTY_PP))
- {
- return;
- }
++ /* check the acceleration mdrun is compiled with against hardware
++ capabilities */
++ /* TODO: Here we assume homogeneous hardware which is not necessarily
++ the case! Might not hurt to add an extra check over MPI. */
+ gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
- /* Need to ensure that we have enough GPUs:
- * - need one GPU per PP node
- * - no GPU oversubscription with tMPI
- * => keep on the GPU support, otherwise turn off (or bail if forced)
- * */
- /* number of PP processes per node */
- npppn = cr->nrank_pp_intranode;
-
- pernode[0] = '\0';
- th_or_proc_plural[0] = '\0';
- if (btMPI)
- {
- sprintf(th_or_proc, "thread-MPI thread");
- if (npppn > 1)
+
- sprintf(th_or_proc_plural, "s");
++ /* Need to ensure that we have enough GPUs:
++ * - need one GPU per PP node
++ * - no GPU oversubscription with tMPI
++ * => keep on the GPU support, otherwise turn off (or bail if forced)
++ * */
++ /* number of PP processes per node */
++ npppn = cr->nrank_pp_intranode;
++
++ pernode[0] = '\0';
++ th_or_proc_plural[0] = '\0';
++ if (btMPI)
+ {
- }
- else if (bMPI)
- {
- sprintf(th_or_proc, "MPI process");
- if (npppn > 1)
++ sprintf(th_or_proc, "thread-MPI thread");
++ if (npppn > 1)
++ {
++ sprintf(th_or_proc_plural, "s");
++ }
+ }
- sprintf(th_or_proc_plural, "es");
++ else if (bMPI)
+ {
- sprintf(pernode, " per node");
- }
- else
- {
- /* neither MPI nor tMPI */
- sprintf(th_or_proc, "process");
- }
-
- if (bGPUBin)
- {
- print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
- }
++ sprintf(th_or_proc, "MPI process");
++ if (npppn > 1)
++ {
++ sprintf(th_or_proc_plural, "es");
++ }
++ sprintf(pernode, " per node");
++ }
++ else
++ {
++ /* neither MPI nor tMPI */
++ sprintf(th_or_proc, "process");
+ }
- if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
- {
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
+
- /* number of tMPI threads atuo-adjusted */
- if (btMPI && bNthreadsAuto && SIMMASTER(cr))
++ if (bGPUBin)
++ {
++ print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
++ }
+
- if (npppn < ngpu)
++ if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
+ {
- if (hwinfo->gpu_info.bUserSet)
++ ngpu = hwinfo->gpu_info.ncuda_dev_use;
++ sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++
++ /* number of tMPI threads atuo-adjusted */
++ if (btMPI && bNthreadsAuto)
+ {
- /* The user manually provided more GPUs than threads we could
- * automatically start. */
- gmx_fatal(FARGS,
- "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
- "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
- ngpu, gpu_plural, npppn, th_or_proc_plural,
- ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
- }
- else
- {
- /* There are more GPUs than tMPI threads; we have to limit the number GPUs used. */
- md_print_warn(cr, fplog,
- "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
- " %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
++ if (npppn < ngpu)
+ {
- ShortProgram(), npppn, npppn > 1 ? "s" : "",
- bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
-
- if (cr->rank_pp_intranode == 0)
++ if (hwinfo->gpu_info.bUserSet)
++ {
++ /* The user manually provided more GPUs than threads we
++ could automatically start. */
++ gmx_fatal(FARGS,
++ "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
++ "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
+ ngpu, gpu_plural, npppn, th_or_proc_plural,
- limit_num_gpus_used(hwinfo, npppn);
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++ ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
++ }
++ else
+ {
- }
++ /* There are more GPUs than tMPI threads; we have to
++ limit the number GPUs used. */
++ md_print_warn(cr, fplog,
++ "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
++ " %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
++ ngpu, gpu_plural, npppn,
++ th_or_proc_plural,
++ ShortProgram(), npppn,
++ npppn > 1 ? "s" : "",
++ bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
++
++ if (cr->rank_pp_intranode == 0)
++ {
++ limit_num_gpus_used(hwinfo, npppn);
++ ngpu = hwinfo->gpu_info.ncuda_dev_use;
++ sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++ }
+ }
+ }
+ }
- if (ngpu != npppn)
- {
- if (hwinfo->gpu_info.bUserSet)
+
- gmx_fatal(FARGS,
- "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
- th_or_proc, btMPI ? "s" : "es", pernode,
- ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
- }
- else
- {
- if (ngpu > npppn)
++ if (ngpu != npppn)
+ {
- md_print_warn(cr, fplog,
- "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
- " PP %s%s%s than GPU%s available.\n"
- " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
- ShortProgram(),
- th_or_proc, th_or_proc_plural, pernode, gpu_plural,
- th_or_proc, npppn, gpu_plural, pernode);
-
- if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
- {
- limit_num_gpus_used(hwinfo, npppn);
- ngpu = hwinfo->gpu_info.ncuda_dev_use;
- sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
- }
++ if (hwinfo->gpu_info.bUserSet)
+ {
- /* Avoid duplicate error messages.
- * Unfortunately we can only do this at the physical node
- * level, since the hardware setup and MPI process count
- * might be differ over physical nodes.
- */
- if (cr->rank_pp_intranode == 0)
++ gmx_fatal(FARGS,
++ "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
++ "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
++ th_or_proc, btMPI ? "s" : "es", pernode,
++ ShortProgram(), npppn, th_or_proc,
++ th_or_proc_plural, pernode, ngpu, gpu_plural);
+ }
+ else
+ {
- gmx_fatal(FARGS,
- "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
- "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
- th_or_proc, btMPI ? "s" : "es", pernode,
- ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
++ if (ngpu > npppn)
+ {
- #ifdef GMX_MPI
++ md_print_warn(cr, fplog,
++ "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
++ " PP %s%s%s than GPU%s available.\n"
++ " Each PP %s can use only one GPU, %d GPU%s%s will be used.\n",
++ ShortProgram(), th_or_proc,
++ th_or_proc_plural, pernode, gpu_plural,
++ th_or_proc, npppn, gpu_plural, pernode);
++
++ if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
++ {
++ limit_num_gpus_used(hwinfo, npppn);
++ ngpu = hwinfo->gpu_info.ncuda_dev_use;
++ sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
++ }
+ }
- /* Avoid other ranks to continue after inconsistency */
- MPI_Barrier(cr->mpi_comm_mygroup);
+ else
+ {
- #endif
++ /* Avoid duplicate error messages.
++ * Unfortunately we can only do this at the physical node
++ * level, since the hardware setup and MPI process count
++ * might be differ over physical nodes.
++ */
++ if (cr->rank_pp_intranode == 0)
++ {
++ gmx_fatal(FARGS,
++ "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
++ "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
++ th_or_proc, btMPI ? "s" : "es", pernode,
++ ShortProgram(), npppn, th_or_proc,
++ th_or_proc_plural, pernode, ngpu,
++ gpu_plural);
++ }
+ }
- }
+ }
+ }
- hwinfo->gpu_info.bDevShare = FALSE;
- if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
- {
- int i, j, same_count;
- gmx_bool bSomeSame, bAllDifferent;
+
- same_count = 0; /* number of GPUs shared among ranks */
- bSomeSame = FALSE;
- bAllDifferent = TRUE;
++ {
++ int same_count;
+
- for (i = 0; i < ngpu - 1; i++)
- {
- for (j = i + 1; j < ngpu; j++)
++ same_count = gmx_count_gpu_dev_shared(&(hwinfo->gpu_info));
+
- bSomeSame |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
- bAllDifferent &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
- same_count += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
++ if (btMPI && same_count > 0)
+ {
- /* store the number of shared/oversubscribed GPUs */
- hwinfo->gpu_info.bDevShare = bSomeSame;
++ gmx_fatal(FARGS,
++ "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
++ "Use MPI if you are sure that you want to assign GPU to multiple threads.");
++ }
++
++ if (same_count > 0)
++ {
++ md_print_warn(cr, fplog,
++ "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
++ " multiple %s%s; this should be avoided as it can cause\n"
++ " performance loss.\n",
++ same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
+ }
+ }
++ print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
++ }
++ hwinfo->bConsistencyChecked = TRUE;
++ }
+
- if (btMPI && !bAllDifferent)
- {
- gmx_fatal(FARGS,
- "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
- "Use MPI if you are sure that you want to assign GPU to multiple threads.");
- }
++ ret = tMPI_Thread_mutex_unlock(&cons_lock);
++ if (ret != 0)
++ {
++ gmx_fatal(FARGS, "Error unlocking cons mutex: %s", strerror(errno));
++ }
+
- if (bSomeSame)
++#ifdef GMX_MPI
++ if (PAR(cr))
++ {
++ /* Avoid other ranks to continue after
++ inconsistency */
++ MPI_Barrier(cr->mpi_comm_mygroup);
++ }
++#endif
++
++}
++
++/* Count how many pairs of entries in the list of GPUs selected for use
++ * (gpu_info->cuda_dev_use[0 .. ncuda_dev_use-1]) refer to the same device ID.
++ *
++ * Every pair (i, j) with i < j and equal IDs adds one to the result, so an ID
++ * listed three times contributes three. The list is only inspected when
++ * gpu_info->bUserSet is set; otherwise 0 is returned.
++ */
++int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info)
++{
++ int same_count = 0;
++ int ngpu = gpu_info->ncuda_dev_use;
+
- md_print_warn(cr, fplog,
- "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
- " multiple %s%s; this should be avoided as it can cause\n"
- " performance loss.\n",
- same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
++ if (gpu_info->bUserSet)
++ {
++ int i, j;
++
++ for (i = 0; i < ngpu - 1; i++)
++ {
++ for (j = i + 1; j < ngpu; j++)
+ {
- print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
++ same_count += (gpu_info->cuda_dev_use[i] ==
++ gpu_info->cuda_dev_use[j]);
+ }
+ }
- void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
- const t_commrec *cr,
- gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
- const char *gpu_id)
+ }
++
++ return same_count;
+}
+
++
+/* Return the number of hardware threads supported by the current CPU.
+ * We assume that this is equal with the number of CPUs reported to be
+ * online by the OS at the time of the call.
+ *
+ * The value comes from GetSystemInfo() on Windows, or from sysconf() where
+ * HAVE_SYSCONF is defined (0 if none of the known sysconf names exists).
+ * On platforms with neither facility -1 is returned.
+ * In OpenMP builds a warning is printed when the detected count differs from
+ * gmx_omp_get_num_procs(); fplog and cr are used only for that warning.
+ */
+static int get_nthreads_hw_avail(FILE gmx_unused *fplog, const t_commrec gmx_unused *cr)
+{
+    int ret = 0;
+
+#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
+    /* Windows */
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo( &sysinfo );
+    ret = sysinfo.dwNumberOfProcessors;
+#elif defined HAVE_SYSCONF
+    /* We are probably on Unix.
+     * Now check if we have the argument to use before executing the call
+     */
+#if defined(_SC_NPROCESSORS_ONLN)
+    ret = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    ret = sysconf(_SC_NPROC_ONLN);
+#elif defined(_SC_NPROCESSORS_CONF)
+    ret = sysconf(_SC_NPROCESSORS_CONF);
+#elif defined(_SC_NPROC_CONF)
+    ret = sysconf(_SC_NPROC_CONF);
+#endif /* End of check for sysconf argument values */
+
+#else
+    /* Neither windows nor Unix. No fscking idea how many CPUs we have! */
+    ret = -1;
+#endif
+
+    if (debug)
+    {
+        fprintf(debug, "Detected %d processors, will use this as the number "
+                "of supported hardware threads.\n", ret);
+    }
+
+    /* The guard used to be spelled GMX_OMPENMP, a macro that is never
+     * defined, so this cross-check was silently compiled out. */
+#ifdef GMX_OPENMP
+    if (ret != gmx_omp_get_num_procs())
+    {
+        md_print_warn(cr, fplog,
+                      "Number of CPUs detected (%d) does not match the number reported by OpenMP (%d).\n"
+                      "Consider setting the launch configuration manually!",
+                      ret, gmx_omp_get_num_procs());
+    }
+#endif
+
+    return ret;
+}
+
- assert(hwinfo);
-
- /* detect CPUID info; no fuss, we don't detect system-wide
- * -- sloppy, but that's it for now */
- if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
++/* Detect the hardware (CPUID information, number of hardware threads and,
++ * in GPU-enabled builds, the GPUs) and return the shared hardware
++ * information structure hwinfo_g.
++ *
++ * Detection is performed only when the reference counter n_hwinfo is zero;
++ * every call increments n_hwinfo. The whole body runs with hw_info_lock held.
++ *
++ * fplog, cr      passed on to the warning and statistics printing helpers
++ * bForceUseGPU   gmx_fatal() if the binary lacks GPU support or no compatible
++ *                GPU is available for use
++ * bTryUseGPU     GPU selection is done when this or bForceUseGPU is set
++ * gpu_id         GPU ID string or NULL; the GMX_GPU_ID environment variable
++ *                is used when this is NULL, and setting both is a fatal error
++ */
++gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
++ gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
++ const char *gpu_id)
+{
+ int i;
+ const char *env;
+ char sbuf[STRLEN], stmp[STRLEN];
+ gmx_hw_info_t *hw;
+ gmx_gpu_info_t gpuinfo_auto, gpuinfo_user;
+ gmx_bool bGPUBin;
++ int ret;
+
- gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
++ /* make sure no one else is doing the same thing */
++ ret = tMPI_Thread_mutex_lock(&hw_info_lock);
++ if (ret != 0)
+ {
- /* detect number of hardware threads */
- hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
++ gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
+ }
+
- /* detect GPUs */
- hwinfo->gpu_info.ncuda_dev_use = 0;
- hwinfo->gpu_info.cuda_dev_use = NULL;
- hwinfo->gpu_info.ncuda_dev = 0;
- hwinfo->gpu_info.cuda_dev = NULL;
++ /* only initialize the hwinfo structure if it is not already initalized */
++ if (n_hwinfo == 0)
++ {
++ snew(hwinfo_g, 1);
++ hwinfo_g->bConsistencyChecked = FALSE;
+
- bGPUBin = TRUE;
++ /* detect CPUID info; no fuss, we don't detect system-wide
++ * -- sloppy, but that's it for now */
++ if (gmx_cpuid_init(&hwinfo_g->cpuid_info) != 0)
++ {
++ gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
++ }
++
++ /* detect number of hardware threads */
++ hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
++
++ /* detect GPUs */
++ hwinfo_g->gpu_info.ncuda_dev_use = 0;
++ hwinfo_g->gpu_info.cuda_dev_use = NULL;
++ hwinfo_g->gpu_info.ncuda_dev = 0;
++ hwinfo_g->gpu_info.cuda_dev = NULL;
+
+#ifdef GMX_GPU
- bGPUBin = FALSE;
++ bGPUBin = TRUE;
+#else
- /* Bail if binary is not compiled with GPU acceleration, but this is either
- * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
- if (bForceUseGPU && !bGPUBin)
- {
- gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
- }
- if (gpu_id != NULL && !bGPUBin)
- {
- gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
- }
-
- /* run the detection if the binary was compiled with GPU support */
- if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
- {
- char detection_error[STRLEN];
-
- if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
++ bGPUBin = FALSE;
+#endif
+
- if (detection_error != NULL && detection_error[0] != '\0')
- {
- sprintf(sbuf, ":\n %s\n", detection_error);
- }
- else
- {
- sprintf(sbuf, ".");
- }
- md_print_warn(cr, fplog,
- "NOTE: Error occurred during GPU detection%s"
- " Can not use GPU acceleration, will fall back to CPU kernels.\n",
- sbuf);
++ /* Bail if binary is not compiled with GPU acceleration, but this is either
++ * explicitly (-nb gpu) or implicitly (gpu ID passed) requested. */
++ if (bForceUseGPU && !bGPUBin)
+ {
- }
-
- if (bForceUseGPU || bTryUseGPU)
- {
- env = getenv("GMX_GPU_ID");
- if (env != NULL && gpu_id != NULL)
++ gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
+ }
- gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
++ if (gpu_id != NULL && !bGPUBin)
+ {
- if (env == NULL)
++ gmx_fatal(FARGS, "GPU ID string set, but %s was compiled without GPU support!", ShortProgram());
+ }
- env = gpu_id;
++
++ /* run the detection if the binary was compiled with GPU support */
++ if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION") == NULL)
+ {
- /* parse GPU IDs if the user passed any */
- if (env != NULL)
++ char detection_error[STRLEN];
++
++ if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
++ {
++ if (detection_error != NULL && detection_error[0] != '\0')
++ {
++ sprintf(sbuf, ":\n %s\n", detection_error);
++ }
++ else
++ {
++ sprintf(sbuf, ".");
++ }
++ md_print_warn(cr, fplog,
++ "NOTE: Error occurred during GPU detection%s"
++ " Can not use GPU acceleration, will fall back to CPU kernels.\n",
++ sbuf);
++ }
+ }
+
- int *gpuid, *checkres;
- int nid, res;
++ if (bForceUseGPU || bTryUseGPU)
+ {
- snew(gpuid, max_gpu_ids_user);
- snew(checkres, max_gpu_ids_user);
++ env = getenv("GMX_GPU_ID");
++ if (env != NULL && gpu_id != NULL)
++ {
++ gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time");
++ }
++ if (env == NULL)
++ {
++ env = gpu_id;
++ }
+
- parse_gpu_id_plain_string(env, &nid, gpuid);
++ /* parse GPU IDs if the user passed any */
++ if (env != NULL)
++ {
++ int *gpuid, *checkres;
++ int nid, res;
+
- if (nid == 0)
- {
- gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n", invalid_gpuid_hint);
- }
++ snew(gpuid, max_gpu_ids_user);
++ snew(checkres, max_gpu_ids_user);
+
- res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
++ parse_gpu_id_plain_string(env, &nid, gpuid);
+
- if (!res)
- {
- print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
++ if (nid == 0)
++ {
++ gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
++ invalid_gpuid_hint);
++ }
+
- sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
- for (i = 0; i < nid; i++)
++ res = check_select_cuda_gpus(checkres, &hwinfo_g->gpu_info,
++ gpuid, nid);
+
- if (checkres[i] != egpuCompatible)
++ if (!res)
+ {
- sprintf(stmp, " GPU #%d: %s\n",
- gpuid[i], gpu_detect_res_str[checkres[i]]);
- strcat(sbuf, stmp);
++ print_gpu_detection_stats(fplog, &hwinfo_g->gpu_info, cr);
++
++ sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
++ for (i = 0; i < nid; i++)
+ {
- gmx_fatal(FARGS, "%s", sbuf);
- }
++ if (checkres[i] != egpuCompatible)
++ {
++ sprintf(stmp, " GPU #%d: %s\n",
++ gpuid[i], gpu_detect_res_str[checkres[i]]);
++ strcat(sbuf, stmp);
++ }
+ }
++ gmx_fatal(FARGS, "%s", sbuf);
+ }
- hwinfo->gpu_info.bUserSet = TRUE;
+
- sfree(gpuid);
- sfree(checkres);
- }
- else
- {
- pick_compatible_gpus(&hwinfo->gpu_info);
- hwinfo->gpu_info.bUserSet = FALSE;
- }
++ hwinfo_g->gpu_info.bUserSet = TRUE;
+
- /* decide whether we can use GPU */
- hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
- if (!hwinfo->bCanUseGPU && bForceUseGPU)
- {
- gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
++ sfree(gpuid);
++ sfree(checkres);
++ }
++ else
++ {
++ pick_compatible_gpus(&hwinfo_g->gpu_info);
++ hwinfo_g->gpu_info.bUserSet = FALSE;
++ }
+
- void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
++ /* decide whether we can use GPU */
++ hwinfo_g->bCanUseGPU = (hwinfo_g->gpu_info.ncuda_dev_use > 0);
++ if (!hwinfo_g->bCanUseGPU && bForceUseGPU)
++ {
++ gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
++ }
+ }
+ }
++ /* increase the reference counter */
++ n_hwinfo++;
++
++ ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
++ if (ret != 0)
++ {
++ gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
++ }
++
++ return hwinfo_g;
+}
+
- if (hwinfo)
++/* Cap the number of GPUs marked for use in hwinfo at count.
++ * A count larger than the current number in use leaves hwinfo untouched;
++ * a count below one is reported through gmx_incons(). */
++static void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
+{
+    char msg[STRLEN];
+    int  n_in_use;
+
+    assert(hwinfo);
+
+    n_in_use = hwinfo->gpu_info.ncuda_dev_use;
+
+    /* this function only ever lowers the number of GPUs */
+    if (n_in_use < count)
+    {
+        return;
+    }
+
+    if (count < 1)
+    {
+        sprintf(msg, "Limiting the number of GPUs to <1 doesn't make sense (detected %d, %d requested)!",
+                n_in_use, count);
+        gmx_incons(msg);
+    }
+
+    /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
+    hwinfo->gpu_info.ncuda_dev_use = count;
+}
+
+/* Drop one reference to the shared hardware information structure.
+ *
+ * hwinfo must be the pointer stored in hwinfo_g; anything else is reported
+ * through gmx_incons(). The reference counter n_hwinfo is decremented and,
+ * when it reaches zero, the CPUID data, the GPU data and the structure
+ * itself are released. The whole operation runs with hw_info_lock held.
+ */
+void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
+{
- gmx_cpuid_done(hwinfo->cpuid_info);
- free_gpu_info(&hwinfo->gpu_info);
- sfree(hwinfo);
++    int ret;
++
++    ret = tMPI_Thread_mutex_lock(&hw_info_lock);
++    if (ret != 0)
++    {
++        gmx_fatal(FARGS, "Error locking hwinfo mutex: %s", strerror(errno));
++    }
++
++    /* decrease the reference counter */
++    n_hwinfo--;
++
++
++    if (hwinfo != hwinfo_g)
++    {
++        /* the message now states the condition that is actually tested */
++        gmx_incons("hwinfo != hwinfo_g");
++    }
++
++    if (n_hwinfo < 0)
++    {
++        gmx_incons("n_hwinfo < 0");
++    }
++
++    if (n_hwinfo == 0)
++    {
++        gmx_cpuid_done(hwinfo_g->cpuid_info);
++        free_gpu_info(&hwinfo_g->gpu_info);
++        sfree(hwinfo_g);
++        /* do not leave the global pointing at released memory */
++        hwinfo_g = NULL;
++    }
++
++    ret = tMPI_Thread_mutex_unlock(&hw_info_lock);
++    if (ret != 0)
+    {
++        gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
+    }
+}
--- /dev/null
- /* these are inherently global properties that are shared among all threads
- */
- static const int *locality_order;
- static int rc;
- static gmx_bool have_locality_order = FALSE;
- static tMPI_Thread_mutex_t locality_order_mtx =
- TMPI_THREAD_MUTEX_INITIALIZER;
- static tMPI_Thread_cond_t locality_order_cond =
- TMPI_THREAD_COND_INITIALIZER;
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/syscall.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <stdio.h>
+#include "typedefs.h"
+#include "types/commrec.h"
+#include "types/hw_info.h"
+#include "gmx_cpuid.h"
+#include "gmx_omp.h"
+#include "gmx_omp_nthreads.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "statutil.h"
+#include "gmx_thread_affinity.h"
+
+#include "thread_mpi/threads.h"
+
+
+/* Decide whether threads can be pinned and, if so, with which stride.
+ *
+ * The CPU topology is queried through gmx_cpuid_topology(); when that fails
+ * hwinfo->nthreads_hw_avail is used as the hardware thread count and
+ * *locality_order is set to NULL.
+ *
+ * pin_offset      first logical core to use; negative values are fatal
+ * pin_stride      in/out: 0 on entry requests an automatically chosen stride,
+ *                 which is written back; negative values are fatal
+ * locality_order  out: filled in by gmx_cpuid_topology(), or NULL
+ *
+ * Returns 0 when pinning is possible and -1 (after printing a warning) when
+ * the thread count, offset or stride does not fit the hardware.
+ */
+static int
+get_thread_affinity_layout(FILE *fplog,
+                           const t_commrec *cr,
+                           const gmx_hw_info_t * hwinfo,
+                           int nthreads,
+                           int pin_offset, int * pin_stride,
+                           const int **locality_order)
+{
+    int        n_hw, n_pkg, n_core, smt_width, topo_rc;
+    const int *pkg_ids;
+    const int *core_ids;
+    const int *hwthread_ids;
+    gmx_bool   bAutoStride;
+
+    if (pin_offset < 0)
+    {
+        gmx_fatal(FARGS, "Negative thread pinning offset requested");
+    }
+    if (*pin_stride < 0)
+    {
+        gmx_fatal(FARGS, "Negative thread pinning stride requested");
+    }
+
+    topo_rc = gmx_cpuid_topology(hwinfo->cpuid_info, &n_hw, &n_pkg, &n_core,
+                                 &smt_width,
+                                 &pkg_ids, &core_ids, &hwthread_ids, locality_order);
+
+    if (topo_rc != 0)
+    {
+        /* no usable topology: fall back to the plain hardware thread count */
+        n_hw            = hwinfo->nthreads_hw_avail;
+        *locality_order = NULL;
+
+        if (n_hw <= 0)
+        {
+            /* nothing is known about the hardware, so pinning is impossible */
+            md_print_warn(cr, fplog,
+                          "NOTE: We don't know how many logical cores we have, will not pin threads");
+
+            return -1;
+        }
+    }
+
+    if (n_hw < nthreads)
+    {
+        /* more threads than hardware threads: do not pin */
+        md_print_warn(NULL, fplog,
+                      "WARNING: Oversubscribing the CPU, will not pin threads");
+
+        return -1;
+    }
+
+    if (n_hw < pin_offset + nthreads)
+    {
+        /* the offset pushes the threads past the last hardware thread */
+        md_print_warn(NULL, fplog,
+                      "WARNING: The requested pin offset is too large for the available logical cores,\n"
+                      "         will not pin threads");
+
+        return -1;
+    }
+
+    /* a stride of zero asks us to choose one */
+    bAutoStride = (*pin_stride == 0);
+
+    if (!bAutoStride)
+    {
+        /* The thread with the largest index must still land on an existing
+         * hardware thread for the user-supplied offset and stride. */
+        if (pin_offset + (nthreads-1)*(*pin_stride) >= n_hw)
+        {
+            md_print_warn(NULL, fplog,
+                          "WARNING: The requested pinning stride is too large for the available logical cores,\n"
+                          "         will not pin threads");
+
+            return -1;
+        }
+    }
+    else if (topo_rc == 0 && pin_offset + nthreads*smt_width <= n_hw)
+    {
+        /* topology known and enough room: one thread per physical core */
+        *pin_stride = smt_width;
+    }
+    else
+    {
+        /* We don't know if we have SMT, and if we do, we don't know
+         * if hw threads in the same physical core are consecutive.
+         * Without SMT the pinning layout should not matter too much,
+         * so we assume a consecutive layout and maximally spread out
+         * the threads at equal threads per core.
+         * Note that IBM is the major non-x86 case with cpuid support
+         * and probably threads are already pinned by the queuing system,
+         * so we wouldn't end up here in the first place.
+         */
+        *pin_stride = (n_hw - pin_offset)/nthreads;
+    }
+
+    if (fplog != NULL)
+    {
+        fprintf(fplog, "Pinning threads with a%s logical core stride of %d\n",
+                bAutoStride ? "n auto-selected" : " user-specified",
+                *pin_stride);
+    }
+
+    return 0;
+}
+
+/* Set CPU affinity. Can be important for performance.
+ On some systems (e.g. Cray) CPU Affinity is set by default.
+ But default assigning doesn't work (well) with only some ranks
+ having threads. This causes very low performance.
+ External tools have cumbersome syntax for setting affinity
+ in the case that only some ranks have threads.
+ Thus it is important that GROMACS sets the affinity internally
+ if only PME is using threads.
+ */
+void
+gmx_set_thread_affinity(FILE *fplog,
+ const t_commrec *cr,
+ gmx_hw_opt_t *hw_opt,
+ const gmx_hw_info_t *hwinfo)
+{
+ int nth_affinity_set, thread_id_node, thread_id,
+ nthread_local, nthread_node, nthread_hw_max, nphyscore;
+ int offset;
- #endif /* __APPLE__ */
++ const int *locality_order;
++ int rc;
+
+ if (hw_opt->thread_affinity == threadaffOFF)
+ {
+ /* Nothing to do */
+ return;
+ }
+
+ /* If the tMPI thread affinity setting is not supported encourage the user
+ * to report it as it's either a bug or an exotic platform which we might
+ * want to support. */
+ if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
+ {
+ /* we know Mac OS doesn't support setting thread affinity, so there's
+ no point in warning the user in that case. In any other case
+ the user might be able to do something about it. */
+#ifndef __APPLE__
+ md_print_warn(NULL, fplog,
+ "Can not set thread affinities on the current platform. On NUMA systems this\n"
+ "can cause performance degradation. If you think your platform should support\n"
+ "setting affinities, contact the GROMACS developers.");
- /* hw_opt is shared among tMPI threads, so for thread safety we need to do
- * the layout detection only on master as core_pinning_stride is an in-out
- * parameter and gets auto-set depending on its initial value.
- * This
- * This is not thread-safe with multi-simulations, but that's anyway not
- * supported by tMPI. */
- if (SIMMASTER(cr))
- {
- int ret;
- int i;
-
- ret = tMPI_Thread_mutex_lock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
- rc = get_thread_affinity_layout(fplog, cr, hwinfo,
- nthread_node,
- offset, &hw_opt->core_pinning_stride,
- &locality_order);
- have_locality_order = TRUE;
- ret = tMPI_Thread_cond_broadcast(&locality_order_cond);
- if (ret != 0)
- {
- tMPI_Thread_mutex_unlock(&locality_order_mtx);
- goto locality_order_err;
- }
- ret = tMPI_Thread_mutex_unlock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
- }
- else
- {
- int ret;
- /* all other threads wait for the locality order data. */
- ret = tMPI_Thread_mutex_lock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
-
- while (!have_locality_order)
- {
- ret = tMPI_Thread_cond_wait(&locality_order_cond,
- &locality_order_mtx);
- if (ret != 0)
- {
- tMPI_Thread_mutex_unlock(&locality_order_mtx);
- goto locality_order_err;
- }
- }
- ret = tMPI_Thread_mutex_unlock(&locality_order_mtx);
- if (ret != 0)
- {
- goto locality_order_err;
- }
- }
++#endif /* __APPLE__ */
+ return;
+ }
+
+ /* threads on this MPI process or TMPI thread */
+ if (cr->duty & DUTY_PP)
+ {
+ nthread_local = gmx_omp_nthreads_get(emntNonbonded);
+ }
+ else
+ {
+ nthread_local = gmx_omp_nthreads_get(emntPME);
+ }
+
+ /* map the current process to cores */
+ thread_id_node = 0;
+ nthread_node = nthread_local;
+#ifdef GMX_MPI
+ if (PAR(cr) || MULTISIM(cr))
+ {
+ /* We need to determine a scan of the thread counts in this
+ * compute node.
+ */
+ MPI_Comm comm_intra;
+
+ MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->rank_intranode,
+ &comm_intra);
+ MPI_Scan(&nthread_local, &thread_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
+ /* MPI_Scan is inclusive, but here we need exclusive */
+ thread_id_node -= nthread_local;
+ /* Get the total number of threads on this physical node */
+ MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
+ MPI_Comm_free(&comm_intra);
+ }
+#endif
+
+ if (hw_opt->thread_affinity == threadaffAUTO &&
+ nthread_node != hwinfo->nthreads_hw_avail)
+ {
+ if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: The number of threads is not equal to the number of (logical) cores\n"
+ " and the -pin option is set to auto: will not pin thread to cores.\n"
+ " This can lead to significant performance degradation.\n"
+ " Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
+ }
+
+ return;
+ }
+
+ offset = 0;
+ if (hw_opt->core_pinning_offset != 0)
+ {
+ offset = hw_opt->core_pinning_offset;
+ md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
+ }
+
-
- locality_order_err:
- /* any error in affinity setting shouldn't be fatal, but should generate
- a warning */
- md_print_warn(NULL, fplog,
- "WARNING: Obtaining affinity information failed due to a basic system error: %s.\n"
- " This can cause performance degradation! ",
- strerror(errno));
- return;
++ rc = get_thread_affinity_layout(fplog, cr, hwinfo,
++ nthread_node,
++ offset, &hw_opt->core_pinning_stride,
++ &locality_order);
+
+ if (rc != 0)
+ {
+ /* Incompatible layout, don't pin, warning was already issued */
+ return;
+ }
+
+ /* Set the per-thread affinity. In order to be able to check the success
+ * of affinity settings, we will set nth_affinity_set to 1 on threads
+ * where the affinity setting succeeded and to 0 where it failed.
+ * Reducing these 0/1 values over the threads will give the total number
+ * of threads on which we succeeded.
+ */
+ nth_affinity_set = 0;
+#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
+ reduction(+:nth_affinity_set)
+ {
+ int index, core;
+ gmx_bool setaffinity_ret;
+
+ thread_id = gmx_omp_get_thread_num();
+ thread_id_node += thread_id;
+ index = offset + thread_id_node*hw_opt->core_pinning_stride;
+ if (locality_order != NULL)
+ {
+ core = locality_order[index];
+ }
+ else
+ {
+ core = index;
+ }
+
+ setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
+
+ /* store the per-thread success-values of the setaffinity */
+ nth_affinity_set = (setaffinity_ret == 0);
+
+ if (debug)
+ {
+ fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
+ cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
+ }
+ }
+
+ if (nth_affinity_set > nthread_local)
+ {
+ char msg[STRLEN];
+
+ sprintf(msg, "Looks like we have set affinity for more threads than "
+ "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
+ gmx_incons(msg);
+ }
+ else
+ {
+ /* check & warn if some threads failed to set their affinities */
+ if (nth_affinity_set != nthread_local)
+ {
+ char sbuf1[STRLEN], sbuf2[STRLEN];
+
+ /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
+ sbuf1[0] = sbuf2[0] = '\0';
+ /* Only add rank info if we have more than one rank. */
+ if (cr->nnodes > 1)
+ {
+#ifdef GMX_MPI
+#ifdef GMX_THREAD_MPI
+ sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
+#else /* GMX_LIB_MPI */
+ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
+#endif
+#endif /* GMX_MPI */
+ }
+
+ if (nthread_local > 1)
+ {
+ sprintf(sbuf2, "for %d/%d thread%s ",
+ nthread_local - nth_affinity_set, nthread_local,
+ nthread_local > 1 ? "s" : "");
+ }
+
+ md_print_warn(NULL, fplog,
+ "WARNING: %sAffinity setting %sfailed.\n"
+ " This can cause performance degradation! If you think your setting are\n"
+ " correct, contact the GROMACS developers.",
+ sbuf1, sbuf2);
+ }
+ }
+ return;
+}
+
+/* Inspect the affinity mask of the calling process. If not every one of the
+ * first ncpus CPUs is present in the mask, an affinity was applied from
+ * outside: with -pin auto the internal pinning is then switched off, with
+ * any other (non-off) setting a warning announces that the external mask
+ * will be overridden (printed only on the call with bAfterOpenmpInit set).
+ * This relies on sched_getaffinity(); without HAVE_SCHED_GETAFFINITY the
+ * function does nothing.
+ */
+void
+gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
+                              gmx_hw_opt_t *hw_opt, int ncpus,
+                              gmx_bool bAfterOpenmpInit)
+{
+#ifdef HAVE_SCHED_GETAFFINITY
+    cpu_set_t mask_current;
+    int       i, ret, cpu_count, cpu_set;
+    gmx_bool  bAllSet;
+
+    assert(hw_opt);
+    if (hw_opt->thread_affinity == threadaffOFF)
+    {
+        /* internal pinning is disabled anyway, nothing to check */
+        return;
+    }
+
+    CPU_ZERO(&mask_current);
+    ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current);
+    if (ret != 0)
+    {
+        /* the mask could not be queried: give up silently */
+        if (debug)
+        {
+            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
+        }
+        return;
+    }
+
+    /* The check below only makes sense when the mask does not hold more
+     * CPUs than were detected. CPU_COUNT only exists since glibc 2.6. */
+#ifdef CPU_COUNT
+    if (CPU_COUNT(&mask_current) > ncpus)
+    {
+        if (debug)
+        {
+            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
+                    ncpus, CPU_COUNT(&mask_current));
+        }
+        return;
+    }
+#endif /* CPU_COUNT */
+
+    /* stop scanning at the first CPU that is missing from the mask */
+    bAllSet = TRUE;
+    for (i = 0; bAllSet && (i < ncpus) && (i < CPU_SETSIZE); i++)
+    {
+        bAllSet = (CPU_ISSET(i, &mask_current) != 0);
+    }
+
+    if (bAllSet)
+    {
+        if (debug)
+        {
+            fprintf(debug, "Default affinity mask found\n");
+        }
+        return;
+    }
+
+    if (hw_opt->thread_affinity == threadaffAUTO)
+    {
+        if (bAfterOpenmpInit)
+        {
+            md_print_warn(cr, fplog,
+                          "Non-default thread affinity set probably by the OpenMP library,\n"
+                          "disabling internal thread affinity");
+        }
+        else
+        {
+            md_print_warn(cr, fplog,
+                          "Non-default thread affinity set, disabling internal thread affinity");
+        }
+        hw_opt->thread_affinity = threadaffOFF;
+    }
+    else if (bAfterOpenmpInit)
+    {
+        /* warn only once, at the last check */
+        md_print_warn(cr, fplog,
+                      "Overriding thread affinity set outside %s\n",
+                      ShortProgram());
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Non-default affinity mask found\n");
+    }
+#endif /* HAVE_SCHED_GETAFFINITY */
+}
--- /dev/null
- user = pw->pw_name;
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROningen Mixture of Alchemy and Childrens' Stories
+ */
+/* This file is completely threadsafe - keep it that way! */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "gromacs/utility/gmx_header_config.h"
+
+#ifdef GMX_CRAY_XT3
+#undef HAVE_PWD_H
+#endif
+
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <time.h>
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_PWD_H
+#include <pwd.h>
+#endif
+#include <time.h>
+#include <assert.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "gmx_fatal.h"
+#include "macros.h"
+#include "string2.h"
+#include "futil.h"
+
+/* Strip trailing blanks from s (via rtrim) and report whether the line ends
+ * in the continuation character CONTINUE. When it does, that character is
+ * removed from s and TRUE is returned; otherwise FALSE is returned.
+ */
+int continuing(char *s)
+{
+    int last;
+
+    assert(s);
+
+    rtrim(s);
+    last = (int) strlen(s) - 1;
+    if ((last < 0) || (s[last] != CONTINUE))
+    {
+        return FALSE;
+    }
+
+    /* cut off the continuation mark */
+    s[last] = 0;
+    return TRUE;
+}
+
+
+
+/* Read one line with fgets() into line (buffer size n) and cut it at the
+ * first newline and at the first carriage return. Returns line, or NULL
+ * when fgets() returns NULL. If no newline was read and gmx_eof() does not
+ * report end of file, gmx_fatal() is called because the line did not fit.
+ */
+char *fgets2(char *line, int n, FILE *stream)
+{
+    char *eol;
+    char *cr;
+
+    if (fgets(line, n, stream) == NULL)
+    {
+        return NULL;
+    }
+
+    eol = strchr(line, '\n');
+    if (eol != NULL)
+    {
+        *eol = '\0';
+    }
+    else if (!gmx_eof(stream))
+    {
+        /* A missing newline is only legitimate on the last line of a file;
+         * anywhere else the buffer was too small for the line.
+         */
+        gmx_fatal(FARGS, "An input file contains a line longer than %d characters, while the buffer passed to fgets2 has size %d. The line starts with: '%20.20s'", n, n, line);
+    }
+
+    cr = strchr(line, '\r');
+    if (cr != NULL)
+    {
+        *cr = '\0';
+    }
+
+    return line;
+}
+
+/* Terminate line at the first COMMENTSIGN character, if there is one.
+ * A NULL line is ignored.
+ */
+void strip_comment (char *line)
+{
+    char *mark;
+
+    if (line == NULL)
+    {
+        return;
+    }
+
+    mark = strchr(line, COMMENTSIGN);
+    if (mark != NULL)
+    {
+        *mark = 0;
+    }
+}
+
+/* Convert str to upper case in place. A NULL pointer is ignored, like in
+ * ltrim() and rtrim().
+ */
+void upstring (char *str)
+{
+    char *p;
+
+    if (NULL == str)
+    {
+        return;
+    }
+
+    /* Walk the string once instead of calling strlen() on every iteration.
+     * toupper() requires a value representable as unsigned char (or EOF),
+     * hence the cast for characters with the high bit set.
+     */
+    for (p = str; ('\0' != *p); p++)
+    {
+        *p = (char) toupper((unsigned char) *p);
+    }
+}
+
+/* Remove leading whitespace (as classified by isspace) from str in place by
+ * shifting the remaining characters to the front. A NULL pointer is ignored.
+ */
+void ltrim (char *str)
+{
+    int i, c;
+
+    if (NULL == str)
+    {
+        return;
+    }
+
+    /* isspace() is only defined for unsigned char values and EOF */
+    c = 0;
+    while (('\0' != str[c]) && isspace((unsigned char) str[c]))
+    {
+        c++;
+    }
+    if (c > 0)
+    {
+        for (i = c; ('\0' != str[i]); i++)
+        {
+            str[i-c] = str[i];
+        }
+        str[i-c] = '\0';
+    }
+}
+
+/* Remove trailing spaces and tabs from str in place. A NULL pointer is
+ * ignored. A string that consists only of blanks becomes the empty string.
+ */
+void rtrim (char *str)
+{
+    int nul;
+
+    if (NULL == str)
+    {
+        return;
+    }
+
+    /* nul is -1 for the empty string, so the loop is skipped in that case.
+     * The test must include index 0, otherwise an all-blank string keeps
+     * its first character.
+     */
+    nul = (int) strlen(str) - 1;
+    while ((nul >= 0) && ((str[nul] == ' ') || (str[nul] == '\t')) )
+    {
+        str[nul] = '\0';
+        nul--;
+    }
+}
+
+/* Trim str at both ends, in place: first ltrim(), then rtrim(). */
+void trim (char *str)
+{
+    ltrim(str);
+    rtrim(str);
+}
+
+/* Portable wrapper around the platform's reentrant ctime variant.
+ * The textual form of *clock is produced in a local buffer and at most n-1
+ * characters of it are copied to buf, which is always NUL-terminated.
+ * Returns buf. Nothing is written when buf is NULL or n is not positive.
+ */
+char *
+gmx_ctime_r(const time_t *clock, char *buf, int n)
+{
+    char tmpbuf[STRLEN];
+
+    if (buf == NULL || n <= 0)
+    {
+        /* no room for even the terminator; avoid writing buf[-1] */
+        return buf;
+    }
+
+    /* make sure a failed conversion yields an empty string, not garbage */
+    tmpbuf[0] = '\0';
+
+#ifdef GMX_NATIVE_WINDOWS
+    /* Windows */
+    ctime_s( tmpbuf, STRLEN, clock );
+#elif (defined(__sun))
+    /* Solaris: the third argument is the size of the buffer passed as the
+     * second argument, i.e. of tmpbuf, not of the caller's buf. */
+    ctime_r(clock, tmpbuf, STRLEN);
+#else
+    ctime_r(clock, tmpbuf);
+#endif
+    strncpy(buf, tmpbuf, n-1);
+    buf[n-1] = '\0';
+
+    return buf;
+}
+
+/* Write a banner to out in which every line starts with COMMENTSIGN: the
+ * file name fn, the user name with uid, the host name and the current date.
+ * Items that cannot be determined are printed as "onbekend".
+ */
+void nice_header (FILE *out, const char *fn)
+{
+    const char    *unknown       = "onbekend";
+    const char    *user          = unknown;
+    char           hostname[256] = "";
+    char           datebuf[STRLEN];
+    time_t         now;
+    int            host_rc;
+#ifdef HAVE_PWD_H
+    uid_t          uid;
+    struct passwd *pw;
+#else
+    int            uid;
+#endif
+
+    time(&now);
+
+    /* Collect everything first, then print in one go */
+#ifdef HAVE_PWD_H
+    uid     = getuid();
+    pw      = getpwuid(uid);
+    host_rc = gethostname(hostname, 255);
+    /* getpwuid returns NULL on error (e.g. compute nodes lack /etc/passwd) */
+    if (pw != NULL)
+    {
+        user = pw->pw_name;
+    }
+#else
+    uid     = 0;
+    host_rc = -1;
+#endif
+    gmx_ctime_r(&now, datebuf, STRLEN);
+
+    fprintf(out, "%c\n", COMMENTSIGN);
+    fprintf(out, "%c\tFile '%s' was generated\n", COMMENTSIGN, fn ? fn : unknown);
+    fprintf(out, "%c\tBy user: %s (%d)\n", COMMENTSIGN,
+            user ? user : unknown, (int) uid);
+    fprintf(out, "%c\tOn host: %s\n", COMMENTSIGN,
+            (host_rc == 0) ? hostname : unknown);
+    /* no newline here: the date string is printed as produced by gmx_ctime_r */
+    fprintf(out, "%c\tAt date: %s", COMMENTSIGN, datebuf);
+    fprintf(out, "%c\n", COMMENTSIGN);
+}
+
+
+/* Case-insensitive string comparison that ignores every '-' and '_' in both
+ * strings. Returns 0 when the strings are equal under these rules, otherwise
+ * the difference between the first pair of differing upper-cased characters.
+ */
+int gmx_strcasecmp_min(const char *str1, const char *str2)
+{
+    char ch1, ch2;
+
+    do
+    {
+        /* fetch the next significant character of each string;
+         * toupper() needs an unsigned char value, hence the casts */
+        do
+        {
+            ch1 = toupper((unsigned char) *(str1++));
+        }
+        while ((ch1 == '-') || (ch1 == '_'));
+        do
+        {
+            ch2 = toupper((unsigned char) *(str2++));
+        }
+        while ((ch2 == '-') || (ch2 == '_'));
+
+        if (ch1 != ch2)
+        {
+            return (ch1-ch2);
+        }
+    }
+    while (ch1);
+    return 0;
+}
+
+/* Like gmx_strcasecmp_min(), but the comparison stops as soon as either
+ * string has been advanced by n or more characters from its start; skipped
+ * '-' and '_' characters count towards that limit. At least one pair of
+ * significant characters is always compared. Returns 0 when no difference
+ * was found, otherwise the difference of the first differing upper-cased
+ * characters.
+ */
+int gmx_strncasecmp_min(const char *str1, const char *str2, int n)
+{
+    char        ch1, ch2;
+    const char *start1 = str1;
+    const char *start2 = str2;
+
+    do
+    {
+        /* toupper() needs an unsigned char value, hence the casts */
+        do
+        {
+            ch1 = toupper((unsigned char) *(str1++));
+        }
+        while ((ch1 == '-') || (ch1 == '_'));
+        do
+        {
+            ch2 = toupper((unsigned char) *(str2++));
+        }
+        while ((ch2 == '-') || (ch2 == '_'));
+
+        if (ch1 != ch2)
+        {
+            return (ch1-ch2);
+        }
+    }
+    while (ch1 && (str1-start1 < n) && (str2-start2 < n));
+    return 0;
+}
+
+/* Case-insensitive string comparison. Returns 0 when the strings are equal
+ * ignoring case, otherwise the difference between the first pair of
+ * differing upper-cased characters.
+ */
+int gmx_strcasecmp(const char *str1, const char *str2)
+{
+    char ch1, ch2;
+
+    do
+    {
+        /* toupper() needs an unsigned char value, hence the casts */
+        ch1 = toupper((unsigned char) *(str1++));
+        ch2 = toupper((unsigned char) *(str2++));
+        if (ch1 != ch2)
+        {
+            return (ch1-ch2);
+        }
+    }
+    while (ch1);
+    return 0;
+}
+
+/* Case-insensitive comparison of at most n characters. Returns 0 when n is
+ * zero or when no difference is found within the first n characters,
+ * otherwise the difference between the first pair of differing upper-cased
+ * characters.
+ */
+int gmx_strncasecmp(const char *str1, const char *str2, int n)
+{
+    char ch1, ch2;
+
+    if (n == 0)
+    {
+        return 0;
+    }
+
+    do
+    {
+        /* toupper() needs an unsigned char value, hence the casts */
+        ch1 = toupper((unsigned char) *(str1++));
+        ch2 = toupper((unsigned char) *(str2++));
+        if (ch1 != ch2)
+        {
+            return (ch1-ch2);
+        }
+        n--;
+    }
+    while (ch1 && n);
+    return 0;
+}
+
+/* Return a copy of src in memory obtained with snew(). src must not be NULL. */
+char *gmx_strdup(const char *src)
+{
+    char *copy;
+
+    snew(copy, strlen(src)+1);
+    strcpy(copy, src);
+
+    return copy;
+}
+
+/* Return a NUL-terminated copy, allocated with snew(), of at most the first
+ * n characters of src.
+ */
+char *
+gmx_strndup(const char *src, int n)
+{
+    char *copy;
+    int   ncopy;
+
+    /* copy the whole string unless it is longer than n */
+    ncopy = strlen(src);
+    ncopy = (ncopy > n) ? n : ncopy;
+
+    snew(copy, ncopy+1);
+    strncpy(copy, src, ncopy);
+    copy[ncopy] = 0;
+    return copy;
+}
+
+/* Magic hash init number for Dan J. Bernstein's algorithm.
+ * Do NOT use any other value unless you really know what you are doing.
+ */
+const unsigned int
+ gmx_string_hash_init = 5381;
+
+
+/* Fold the string s into the running hash value hash_init and return the
+ * result. Characters are upper-cased first and only alphanumeric characters
+ * contribute, so the hash is insensitive to case and to punctuation.
+ * Each contributing character c updates the hash as (hash * 33) xor c.
+ */
+unsigned int
+gmx_string_hash_func(const char *s, unsigned int hash_init)
+{
+    int c;
+
+    /* The cast keeps c in the unsigned char range, which is what both
+     * toupper() and isalnum() require. */
+    while ((c = toupper((unsigned char) *s++)) != '\0')
+    {
+        if (isalnum(c))
+        {
+            hash_init = ((hash_init << 5) + hash_init) ^ c; /* (hash * 33) xor c */
+        }
+    }
+    return hash_init;
+}
+
+/* Match str against a wildcard pattern: '*' matches any (possibly empty)
+ * sequence of characters and '?' matches exactly one character; every other
+ * pattern character must equal the corresponding character of str.
+ * Returns 0 when the whole of str matches, GMX_NO_WCMATCH otherwise.
+ * A '*' is resolved by calling this function recursively on the remaining
+ * pattern for each candidate position in str.
+ */
+int
+gmx_wcmatch(const char *pattern, const char *str)
+{
+ while (*pattern)
+ {
+ if (*pattern == '*')
+ {
+ /* Skip multiple wildcards in a sequence */
+ while (*pattern == '*' || *pattern == '?')
+ {
+ /* Advance first: this loop is always entered on a '*', and the
+ * test below looks at the character that follows the wildcard
+ * just consumed. */
+ ++pattern;
+ /* For ?, we need to check that there are characters left
+ * in str. */
+ if (*pattern == '?')
+ {
+ if (*str == 0)
+ {
+ return GMX_NO_WCMATCH;
+ }
+ else
+ {
+ ++str;
+ }
+ }
+ }
+ /* If the pattern ends after the star, we have a match */
+ if (*pattern == 0)
+ {
+ return 0;
+ }
+ /* Match the rest against each possible suffix of str */
+ while (*str)
+ {
+ /* Only do the recursive call if the first character
+ * matches. We don't have to worry about wildcards here,
+ * since we have processed them above. */
+ if (*pattern == *str)
+ {
+ int rc;
+ /* Match the suffix, and return if a match or an error */
+ rc = gmx_wcmatch(pattern, str);
+ if (rc != GMX_NO_WCMATCH)
+ {
+ return rc;
+ }
+ }
+ ++str;
+ }
+ /* If no suffix of str matches, we don't have a match */
+ return GMX_NO_WCMATCH;
+ }
+ else if ((*pattern == '?' && *str != 0) || *pattern == *str)
+ {
+ ++str;
+ }
+ else
+ {
+ return GMX_NO_WCMATCH;
+ }
+ ++pattern;
+ }
+ /* When the pattern runs out, we have a match if the string has ended. */
+ return (*str == 0) ? 0 : GMX_NO_WCMATCH;
+}
+
+/* Return a copy of buf, allocated with snew()/srenew(), in which spaces are
+ * replaced by newlines to wrap the text at line_width and in which indent
+ * spaces are inserted after each newline. When bIndentFirst is set, the
+ * first line is indented as well. buf itself is not modified.
+ * NOTE(review): the caller presumably owns the returned buffer and has to
+ * free it - confirm against the callers.
+ */
+char *wrap_lines(const char *buf, int line_width, int indent, gmx_bool bIndentFirst)
+{
+ char *b2;
+ int i, i0, i2, j, b2len, lspace = 0, l2space = 0;
+ gmx_bool bFirst, bFitsOnLine;
+
+ /* characters are copied from buf to b2 with possible spaces changed
+ * into newlines and extra space added for indentation.
+ * i indexes buf (source buffer) and i2 indexes b2 (destination buffer)
+ * i0 points to the beginning of the current line (in buf, source)
+ * lspace and l2space point to the last space on the current line
+ * bFirst is set to prevent indentation of first line
+ * bFitsOnLine says if the first space occurred before line_width, if
+ * that is not the case, we have a word longer than line_width which
+ * will also not fit on the next line, so we might as well keep it on
+ * the current line (where it also won't fit, but looks better)
+ */
+
+ b2 = NULL;
+ /* initial size: the text, its terminator and one indentation; the buffer
+ * grows by indent for every newline that gets indentation added */
+ b2len = strlen(buf)+1+indent;
+ snew(b2, b2len);
+ i0 = i2 = 0;
+ if (bIndentFirst)
+ {
+ for (i2 = 0; (i2 < indent); i2++)
+ {
+ b2[i2] = ' ';
+ }
+ }
+ bFirst = TRUE;
+ do
+ {
+ l2space = -1;
+ /* find the last space before end of line */
+ for (i = i0; ((i-i0 < line_width) || (l2space == -1)) && (buf[i]); i++)
+ {
+ b2[i2++] = buf[i];
+ /* remember the position of a space */
+ if (buf[i] == ' ')
+ {
+ lspace = i;
+ l2space = i2-1;
+ }
+ /* if we have a newline before the line is full, reset counters */
+ if (buf[i] == '\n' && buf[i+1])
+ {
+ i0 = i+1;
+ b2len += indent;
+ srenew(b2, b2len);
+ /* add indentation after the newline */
+ for (j = 0; (j < indent); j++)
+ {
+ b2[i2++] = ' ';
+ }
+ }
+ }
+ /* If we are at the last newline, copy it */
+ if (buf[i] == '\n' && !buf[i+1])
+ {
+ b2[i2++] = buf[i++];
+ }
+ /* if we're not at the end of the string */
+ if (buf[i])
+ {
+ /* check if one word does not fit on the line */
+ bFitsOnLine = (i-i0 <= line_width);
+ /* reset line counters to just after the space */
+ i0 = lspace+1;
+ i2 = l2space+1;
+ /* if the words fit on the line, and we're beyond the indentation part */
+ if ( (bFitsOnLine) && (l2space >= indent) )
+ {
+ /* start a new line */
+ b2[l2space] = '\n';
+ /* and add indentation */
+ if (indent)
+ {
+ if (bFirst)
+ {
+ /* from the first wrap on, the indentation takes up part of
+ * every line, so less room is left for text */
+ line_width -= indent;
+ bFirst = FALSE;
+ }
+ b2len += indent;
+ srenew(b2, b2len);
+ for (j = 0; (j < indent); j++)
+ {
+ b2[i2++] = ' ';
+ }
+ /* no extra spaces after indent; */
+ while (buf[i0] == ' ')
+ {
+ i0++;
+ }
+ }
+ }
+ }
+ }
+ while (buf[i]);
+ b2[i2] = '\0';
+
+ return b2;
+}
+
+/* Split str at every occurrence of sep and return the non-empty pieces as a
+ * NULL-terminated array of newly allocated (snew) strings. Consecutive
+ * separators do not produce empty pieces. Returns NULL when str is NULL.
+ */
+char **split(char sep, const char *str)
+{
+    char      **tokens = NULL;
+    const char *p;
+    int         nsep = 0, ntok = 0, len;
+
+    if (str == NULL)
+    {
+        return NULL;
+    }
+
+    /* the number of separators bounds the number of pieces */
+    for (p = str; (*p != '\0'); p++)
+    {
+        if (*p == sep)
+        {
+            nsep++;
+        }
+    }
+    snew(tokens, nsep+2);
+
+    p = str;
+    while (*p != '\0')
+    {
+        /* skip a run of separators */
+        while ((*p != '\0') && (*p == sep))
+        {
+            p++;
+        }
+        if (*p == '\0')
+        {
+            break;
+        }
+
+        /* room for the remainder of the string is enough for this piece */
+        snew(tokens[ntok], 1+strlen(p));
+        for (len = 0; (*p != '\0') && (*p != sep); p++, len++)
+        {
+            tokens[ntok][len] = *p;
+        }
+        tokens[ntok][len] = '\0';
+        ntok++;
+    }
+    tokens[ntok] = NULL;
+
+    return tokens;
+}
+
+
+/* Convert the decimal number at the start of str to a gmx_large_int_t.
+ * Leading whitespace is skipped and a single leading '-' is accepted.
+ * On return *endptr points to the first character after the number.
+ * When str contains no digits at that position, *endptr is set to str and 0
+ * is returned. When the value does not fit, *endptr is set to str, errno is
+ * set to ERANGE and 0 is returned. For a NULL str, *endptr is set to NULL
+ * and 0 is returned. endptr itself must not be NULL.
+ */
+gmx_large_int_t
+str_to_large_int_t(const char *str, char **endptr)
+{
+    int              sign = 1;
+    gmx_large_int_t  val  = 0;
+    char             ch;
+    const char      *p;
+    const char      *first_digit;
+
+    p = str;
+    if (p == NULL)
+    {
+        *endptr = NULL;
+        return 0;
+    }
+
+    /* Strip off initial white space; isspace() needs an unsigned char value */
+    while (isspace((unsigned char) *p))
+    {
+        p++;
+    }
+
+    if (*p == '-')
+    {
+        p++;
+        sign *= -1;
+    }
+
+    first_digit = p;
+    while ( ((ch = *p) != '\0') && isdigit((unsigned char) ch) )
+    {
+        /* Important to add sign here, so we dont overflow in final multiplication */
+        ch  = (ch-'0')*sign;
+        val = val*10 + ch;
+        /* NOTE(review): this check relies on the value wrapping around on
+         * overflow, which the C standard does not guarantee for signed
+         * types - consider a limit-based check. */
+        if (ch != val%10)
+        {
+            /* Some sort of overflow has occured, set endptr to original string */
+            *endptr = (char *)str;
+            errno   = ERANGE;
+            return(0);
+        }
+        p++;
+    }
+
+    /* Conform to ISO C99 - return original pointer if string does not contain a number */
+    if (p == first_digit)
+    {
+        *endptr = (char *)str;
+        return 0;
+    }
+
+    *endptr = (char *)p;
+
+    return val;
+}
+
+/* Replacement for strsep(): return the token that starts at *stringp and
+ * ends before the first character contained in delim. That delimiter is
+ * overwritten with a NUL and *stringp is moved just past it. When no
+ * delimiter is left, the rest of the string is returned and *stringp is set
+ * to NULL. Returns NULL when *stringp is already NULL.
+ */
+char *gmx_strsep(char **stringp, const char *delim)
+{
+    char   *token = *stringp;
+    size_t  span;
+
+    if (token == NULL)
+    {
+        return NULL;
+    }
+
+    /* length of the leading part that is free of delimiters */
+    span = strcspn(token, delim);
+    if (token[span] == '\0')
+    {
+        /* last token: nothing left to scan */
+        *stringp = NULL;
+    }
+    else
+    {
+        token[span] = '\0';
+        *stringp    = token + span + 1;
+    }
+
+    return token;
+}
--- /dev/null
- void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
- const t_commrec *cr,
- gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
- const char *gpu_id);
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This file is part of GROMACS.
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * GROup of MAchos and Cynical Suckers
+ */
+
+#ifndef GMX_HARDWARE_DETECT_H
+#define GMX_HARDWARE_DETECT_H
+
+#include "types/hw_info.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* fixes auto-indentation problems */
+#endif
+
++/* the init and consistency functions depend on commrec that may not be
++ consistent in cuda because MPI types don't exist there. */
++#ifndef __CUDACC__
++#include "types/commrec.h"
++/* return a pointer to a global hwinfo structure. */
++gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
++ gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
++ const char *gpu_id);
+
+/* NOTE(review): presumably releases the structure obtained from
+ gmx_detect_hardware() - confirm against the implementation. */
+void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);
+
++/* Check the thread count + GPU assignment. This function must
++ either be run by all threads that persist (i.e. all tmpi threads),
++ or be run before they are created. */
+void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
+ const t_commrec *cr, int ntmpi_requsted,
+ gmx_bool bUseGPU);
++#endif
++
++
++/* Check whether a GPU is shared among ranks, and return the number of shared
++ gpus
++
++ gpu_info = the GPU info struct
++
++ returns: The number of GPUs shared among ranks, or 0 */
++int gmx_count_gpu_dev_shared(const gmx_gpu_info_t *gpu_info);
++
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* GMX_HARDWARE_DETECT_H */
--- /dev/null
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gromacs Runs On Most of All Computer Systems
+ */
+
+#ifndef _maths_h
+#define _maths_h
+
+#include <math.h>
+#include "types/simple.h"
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#ifndef M_PI_2
+#define M_PI_2 1.57079632679489661923
+#endif
+
+#ifndef M_2PI
+#define M_2PI 6.28318530717958647692
+#endif
+
+#ifndef M_SQRT2
+#define M_SQRT2 sqrt(2.0)
+#endif
+
+#ifndef M_1_PI
+#define M_1_PI 0.31830988618379067154
+#endif
+
+#ifndef M_FLOAT_1_SQRTPI /* used in CUDA kernels */
+/* 1.0 / sqrt(M_PI) */
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+#endif
+
+#ifndef M_1_SQRTPI
+/* 1.0 / sqrt(M_PI) */
+#define M_1_SQRTPI 0.564189583547756
+#endif
+
+#ifndef M_2_SQRTPI
+/* 2.0 / sqrt(M_PI) */
+#define M_2_SQRTPI 1.128379167095513
+#endif
+
+int gmx_nint(real a);
+real sign(real x, real y);
+
+real cuberoot (real a);
+double gmx_erfd(double x);
+double gmx_erfcd(double x);
+float gmx_erff(float x);
+float gmx_erfcf(float x);
+#ifdef GMX_DOUBLE
+#define gmx_erf(x) gmx_erfd(x)
+#define gmx_erfc(x) gmx_erfcd(x)
+#else
+#define gmx_erf(x) gmx_erff(x)
+#define gmx_erfc(x) gmx_erfcf(x)
+#endif
+
+gmx_bool gmx_isfinite(real x);
+
+/*! \brief Check if two numbers are within a relative tolerance
+ *
+ * The two numbers are considered equal when their difference is at most
+ * the tolerance times the mean of their magnitudes, i.e. when
+ * fabs(f1-f2) <= tol*0.5*(fabs(f1)+fabs(f2)).
+ *
+ * To test whether two floating-point numbers are almost identical, pass
+ * GMX_REAL_EPS as tolerance, or GMX_DOUBLE_EPS when the test should be done
+ * in double regardless of the Gromacs precision.
+ *
+ * When comparing the results of two algorithms the tolerance normally has
+ * to be relaxed considerably, because many operations (e.g. summation)
+ * accumulate floating-point errors.
+ *
+ * \param f1 First number to compare
+ * \param f2 Second number to compare
+ * \param tol Tolerance to use
+ *
+ * \return 1 if the relative difference is within tolerance, 0 if not.
+ */
+static int
+gmx_within_tol(double f1,
+               double f2,
+               double tol)
+{
+    /* "<=" rather than "<": with f1==f2==0 both sides are zero and the
+     * numbers must still compare as equal */
+    return (fabs(f1-f2) <= tol*0.5*(fabs(f1)+fabs(f2))) ? 1 : 0;
+}
+
+
+
+/**
+ * Check if the magnitude of a number is at most a preset safe minimum
+ * value, currently defined as GMX_REAL_MIN/GMX_REAL_EPS.
+ *
+ * If a number is smaller than this value we risk numerical overflow
+ * if any number larger than 1.0/GMX_REAL_EPS is divided by it.
+ *
+ * The magnitude is compared with the threshold directly. A relative
+ * comparison against 0.0 through gmx_within_tol() would scale the threshold
+ * by the number itself and could therefore only succeed for exactly zero.
+ *
+ * \return 1 if 'almost' numerically zero, 0 otherwise.
+ */
+static int
+gmx_numzero(double a)
+{
+    return (fabs(a) <= GMX_REAL_MIN/GMX_REAL_EPS) ? 1 : 0;
+}
+
+
+/* Base-2 logarithm of x, computed as the natural logarithm of x multiplied
+ * by 1/ln(2).
+ */
+static real
+gmx_log2(real x)
+{
+    const real inv_ln2 = 1.0/log( 2.0 );
+
+    return log( x ) * inv_ln2;
+}
+
+/*! \brief Multiply two large ints
+ *
+ * Returns true when overflow did not occur.
+ */
+gmx_bool
+check_int_multiply_for_overflow(gmx_large_int_t a,
+ gmx_large_int_t b,
+ gmx_large_int_t *result);
+
++/* Greatest common divisor of p and q, computed with Euclid's algorithm.
++ * Returns p unchanged when q is 0.
++ */
++static int gmx_greatest_common_divisor(int p, int q)
++{
++    int rem;
++
++    for (; q != 0; q = rem)
++    {
++        rem = p % q;
++        p   = q;
++    }
++    return p;
++}
++
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _maths_h */
--- /dev/null
- gmx_gpu_info_t *gpu_info, int my_gpu_index,
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef NBNXN_CUDA_DATA_MGMT_H
+#define NBNXN_CUDA_DATA_MGMT_H
+
+#include "types/simple.h"
+#include "types/interaction_const.h"
+#include "types/nbnxn_cuda_types_ext.h"
+#include "types/hw_info.h"
+
+#ifdef GMX_GPU
+#define FUNC_TERM ;
+#define FUNC_QUALIFIER
+#else
+#define FUNC_TERM {}
+#define FUNC_QUALIFIER static
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! Initializes the data structures related to CUDA nonbonded calculations. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init(FILE *fplog,
+ nbnxn_cuda_ptr_t *p_cu_nb,
++ const gmx_gpu_info_t *gpu_info, int my_gpu_index,
+ /* true if both local and non-local are done on GPU */
+ gmx_bool bLocalAndNonlocal) FUNC_TERM
+
+/*! Initializes simulation constant data. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb,
+ const interaction_const_t *ic,
+ const nonbonded_verlet_group_t *nbv_group) FUNC_TERM
+
+/*! Initializes pair-list data for GPU, called at every pair search step. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
+ const nbnxn_pairlist_t *h_nblist,
+ int iloc) FUNC_TERM
+
+/*! Initializes atom-data on the GPU, called at every pair search step. */
+FUNC_QUALIFIER
+void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
+ const nbnxn_atomdata_t *atomdata) FUNC_TERM
+
+/*! \brief Update parameters during PP-PME load balancing. */
+FUNC_QUALIFIER
+void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
+ const interaction_const_t *ic) FUNC_TERM
+
+/*! Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
+FUNC_QUALIFIER
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
+ const nbnxn_atomdata_t *nbatom) FUNC_TERM
+
+/*! Clears GPU outputs: nonbonded force, shift force and energy. */
+FUNC_QUALIFIER
+void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb,
+ int flags) FUNC_TERM
+
+/*! Frees all GPU resources used for the nonbonded calculations. */
+FUNC_QUALIFIER
+void nbnxn_cuda_free(FILE *fplog,
+ nbnxn_cuda_ptr_t cu_nb) FUNC_TERM
+
+/*! Returns the GPU timings structure or NULL if GPU is not used or timing is off.
+ When GMX_GPU is not defined, this header provides a static stub that
+ always returns NULL. */
+FUNC_QUALIFIER
+wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+#ifdef GMX_GPU
+;
+#else
+{
+ return NULL;
+}
+#endif
+
+/*! Resets nonbonded GPU timings. */
+FUNC_QUALIFIER
+void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb) FUNC_TERM
+
+/*! Calculates the minimum size of proximity lists to improve SM load balance
+ with CUDA non-bonded kernels.
+ When GMX_GPU is not defined, this header provides a static stub that
+ always returns -1. */
+FUNC_QUALIFIER
+int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+#ifdef GMX_GPU
+;
+#else
+{
+ return -1;
+}
+#endif
+
++/*! Returns if analytical Ewald CUDA kernels are used.
++ When GMX_GPU is not defined, this header provides a static stub that
++ always returns FALSE. */
++FUNC_QUALIFIER
++gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
++#ifdef GMX_GPU
++;
++#else
++{
++ return FALSE;
++}
++#endif
++
+#ifdef __cplusplus
+}
+#endif
+
+#undef FUNC_TERM
+#undef FUNC_QUALIFIER
+
+#endif /* NBNXN_CUDA_DATA_MGMT_H */
--- /dev/null
- #define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMESWITCH))
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GRoups of Organic Molecules in ACtion for Science
+ */
+
+#ifndef ENUMS_H_
+#define ENUMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* fixes auto-indentation problems */
+#endif
+
+/* note: these enums should correspond to the names in gmxlib/names.c */
+
+enum {
+ epbcXYZ, epbcNONE, epbcXY, epbcSCREW, epbcNR
+};
+
+enum {
+ etcNO, etcBERENDSEN, etcNOSEHOOVER, etcYES, etcANDERSEN, etcANDERSENMASSIVE, etcVRESCALE, etcNR
+}; /* yes is an alias for berendsen */
+
+#define ETC_ANDERSEN(e) (((e) == etcANDERSENMASSIVE) || ((e) == etcANDERSEN))
+
+enum {
+ epcNO, epcBERENDSEN, epcPARRINELLORAHMAN, epcISOTROPIC, epcMTTK, epcNR
+}; /* isotropic is an alias for berendsen */
+
+/* trotter decomposition extended variable parts */
+enum {
+ etrtNONE, etrtNHC, etrtBAROV, etrtBARONHC, etrtNHC2, etrtBAROV2, etrtBARONHC2,
+ etrtVELOCITY1, etrtVELOCITY2, etrtPOSITION, etrtSKIPALL, etrtNR
+};
+
+/* sequenced parts of the trotter decomposition */
+enum {
+ ettTSEQ0, ettTSEQ1, ettTSEQ2, ettTSEQ3, ettTSEQ4, ettTSEQMAX
+};
+
+enum {
+ epctISOTROPIC, epctSEMIISOTROPIC, epctANISOTROPIC,
+ epctSURFACETENSION, epctNR
+};
+
+enum {
+ erscNO, erscALL, erscCOM, erscNR
+};
+
+enum {
+ ecutsGROUP, ecutsVERLET, ecutsNR
+};
+
+/* Coulomb / VdW interaction modifiers.
+ * grompp replaces eintmodPOTSHIFT_VERLET by eintmodPOTSHIFT or eintmodNONE.
+ * Exactcutoff is only used by Reaction-field-zero, and is not user-selectable.
+ */
+enum eintmod {
+ eintmodPOTSHIFT_VERLET, eintmodPOTSHIFT, eintmodNONE, eintmodPOTSWITCH, eintmodEXACTCUTOFF, eintmodNR
+};
+
+/*
+ * eelGB_NOTUSED used to be GB, but to enable generalized born with different
+ * forms of electrostatics (RF, switch, etc.) in the future it is now selected
+ * separately (through the implicit_solvent option).
+ */
+enum {
+ eelCUT, eelRF, eelGRF, eelPME, eelEWALD, eelP3M_AD,
+ eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB_NOTUSED, eelRF_NEC, eelENCADSHIFT,
+ eelPMEUSER, eelPMESWITCH, eelPMEUSERSWITCH, eelRF_ZERO, eelNR
+};
+
+/* Ewald geometry */
+enum {
+ eewg3D, eewg3DC, eewgNR
+};
+
+/* Reaction-field variants of the electrostatics type */
+#define EEL_RF(e) ((e) == eelRF || (e) == eelGRF || (e) == eelRF_NEC || (e) == eelRF_ZERO)
+
+/* PME-style variants; eelP3M_AD is counted in this set as well */
+#define EEL_PME(e) ((e) == eelPME || (e) == eelPMEUSER || (e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH || (e) == eelP3M_AD)
+/* Everything in EEL_PME, plus eelPOISSON and eelEWALD */
+#define EEL_FULL(e) ((e) == eelPOISSON || (e) == eelEWALD || EEL_PME(e))
+
+/* Switch/shift variants, including the two PME *SWITCH types */
+#define EEL_SWITCHED(e) ((e) == eelSWITCH || (e) == eelSHIFT || (e) == eelENCADSHIFT || (e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
+
++#define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == eelPMEUSERSWITCH)
+
+/* Switched types and eelRF_ZERO are treated as zero at the cut-off */
+#define EEL_IS_ZERO_AT_CUTOFF(e) ((e) == eelRF_ZERO || EEL_SWITCHED(e))
+
+/* As above, additionally accepting eelUSER and eelPMEUSER */
+#define EEL_MIGHT_BE_ZERO_AT_CUTOFF(e) ((e) == eelUSER || (e) == eelPMEUSER || EEL_IS_ZERO_AT_CUTOFF(e))
+
+enum {
+ evdwCUT, evdwSWITCH, evdwSHIFT, evdwUSER, evdwENCADSHIFT, evdwNR
+};
+
+/* VdW types with a switch or shift: evdwSWITCH, evdwSHIFT, evdwENCADSHIFT */
+#define EVDW_SWITCHED(e) ((e) == evdwSHIFT || (e) == evdwSWITCH || (e) == evdwENCADSHIFT)
+
+/* For VdW exactly the switched types count as zero at the cut-off */
+#define EVDW_IS_ZERO_AT_CUTOFF(e) (EVDW_SWITCHED(e))
+
+/* evdwUSER is additionally accepted as possibly zero at the cut-off */
+#define EVDW_MIGHT_BE_ZERO_AT_CUTOFF(e) ((e) == evdwUSER || EVDW_IS_ZERO_AT_CUTOFF(e))
+
+enum {
+ ensGRID, ensSIMPLE, ensNR
+};
+
+/* eiVV is normal velocity verlet -- eiVVAK uses 1/2*(KE(t-dt/2)+KE(t+dt/2)) as the kinetic energy, and the half step kinetic
+ energy for temperature control */
+
+enum {
+ eiMD, eiSteep, eiCG, eiBD, eiSD2, eiNM, eiLBFGS, eiTPI, eiTPIC, eiSD1, eiVV, eiVVAK, eiNR
+};
+/* Integrator classification helpers; e is one of the ei* enumerators above */
+#define EI_VV(e) ((e) == eiVVAK || (e) == eiVV)
+#define EI_MD(e) (EI_VV(e) || (e) == eiMD)
+#define EI_SD(e) ((e) == eiSD2 || (e) == eiSD1)
+#define EI_RANDOM(e) ((e) == eiBD || EI_SD(e))
+/* the EI_RANDOM integrators above may not conserve momenta */
+/* EI_MD plus EI_RANDOM, i.e. eiMD, eiVV, eiVVAK, eiSD1, eiSD2 and eiBD */
+#define EI_DYNAMICS(e) (EI_MD(e) || EI_RANDOM(e))
+#define EI_ENERGY_MINIMIZATION(e) ((e) == eiCG || (e) == eiSteep || (e) == eiLBFGS)
+#define EI_TPI(e) ((e) == eiTPIC || (e) == eiTPI)
+
+/* EI_MD and EI_SD integrators; eiBD is deliberately not in this set */
+#define EI_STATE_VELOCITY(e) (EI_SD(e) || EI_MD(e))
+
+enum {
+ econtLINCS, econtSHAKE, econtNR
+};
+
+enum {
+ edrNone, edrSimple, edrEnsemble, edrNR
+};
+
+enum {
+ edrwConservative, edrwEqual, edrwNR
+};
+
+/* Combination rule things */
+enum {
+ eCOMB_NONE, eCOMB_GEOMETRIC, eCOMB_ARITHMETIC, eCOMB_GEOM_SIG_EPS, eCOMB_NR
+};
+
+/* NBF selection */
+enum {
+ eNBF_NONE, eNBF_LJ, eNBF_BHAM, eNBF_NR
+};
+
+/* simulated tempering methods */
+enum {
+ esimtempGEOMETRIC, esimtempEXPONENTIAL, esimtempLINEAR, esimtempNR
+};
+/* FEP selection */
+enum {
+ efepNO, efepYES, efepSTATIC, efepSLOWGROWTH, efepEXPANDED, efepNR
+};
+/* if efepNO, there are no evaluations at other states.
+ if efepYES, treated equivalently to efepSTATIC.
+ if efepSTATIC, then lambdas do not change during the simulation.
+ if efepSLOWGROWTH, then the states change monotonically throughout the simulation.
+ if efepEXPANDED, then expanded ensemble simulations are occurring.
+ */
+
+/* FEP coupling types */
+enum {
+ efptFEP, efptMASS, efptCOUL, efptVDW, efptBONDED, efptRESTRAINT, efptTEMPERATURE, efptNR
+};
+
+/* How the lambda weights are calculated:
+ elamstatsMETROPOLIS = using the metropolis criteria
+ elamstatsBARKER = using the Barker criteria for transition weights - also called unoptimized Bennett
+ elamstatsMINVAR = using Barker + minimum variance for weights
+ elamstatsWL = Wang-Landau (using visitation counts)
+ elamstatsWWL = Weighted Wang-Landau (using optimized gibbs weighted visitation counts)
+ */
+enum {
+ elamstatsNO, elamstatsMETROPOLIS, elamstatsBARKER, elamstatsMINVAR, elamstatsWL, elamstatsWWL, elamstatsNR
+};
+
+#define ELAMSTATS_EXPANDED(e) ((e) > elamstatsNO)
+
+#define EWL(e) ((e) == elamstatsWL || (e) == elamstatsWWL)
+
+/* How moves in lambda are calculated:
+ elmcmoveMETROPOLIS - using the Metropolis criteria, and 50% up and down
+ elmcmoveBARKER - using the Barker criteria, and 50% up and down
+ elmcmoveGIBBS - computing the transition using the marginalized probabilities of the lambdas
+ elmcmoveMETGIBBS - computing the transition using the metropolized version of Gibbs (Monte Carlo Strategies in Scientific computing, Liu, p. 134)
+ */
+enum {
+ elmcmoveNO, elmcmoveMETROPOLIS, elmcmoveBARKER, elmcmoveGIBBS, elmcmoveMETGIBBS, elmcmoveNR
+};
+
+/* how we decide whether weights have reached equilibrium
+ elmceqNO - never stop, weights keep going
+ elmceqYES - fix the weights from the beginning; no movement
+ elmceqWLDELTA - stop when the WL-delta falls below a certain level
+ elmceqNUMATLAM - stop when we have a certain number of samples at every step
+ elmceqSTEPS - stop when we've run a certain total number of steps
+ elmceqSAMPLES - stop when we've run a certain total number of samples
+ elmceqRATIO - stop when the ratio of samples (lowest to highest) is sufficiently large
+ */
+enum {
+ elmceqNO, elmceqYES, elmceqWLDELTA, elmceqNUMATLAM, elmceqSTEPS, elmceqSAMPLES, elmceqRATIO, elmceqNR
+};
+
+/* separate_dhdl_file selection */
+enum
+{
+ /* NOTE: YES is the first one. Do NOT interpret this one as a gmx_bool */
+ esepdhdlfileYES, esepdhdlfileNO, esepdhdlfileNR
+};
+
+/* dhdl_derivatives selection */
+enum
+{
+ /* NOTE: YES is the first one. Do NOT interpret this one as a gmx_bool */
+ edhdlderivativesYES, edhdlderivativesNO, edhdlderivativesNR
+};
+
+/* Solvent model */
+enum {
+ esolNO, esolSPC, esolTIP4P, esolNR
+};
+
+/* Dispersion correction */
+enum {
+ edispcNO, edispcEnerPres, edispcEner, edispcAllEnerPres, edispcAllEner, edispcNR
+};
+
+/* Shell types, for completion stuff */
+enum {
+ eshellCSH, eshellBASH, eshellZSH, eshellNR
+};
+
+/* Center of mass motion selection */
+enum {
+ ecmLINEAR, ecmANGULAR, ecmNO, ecmNR
+};
+
+/* New version of simulated annealing */
+enum {
+ eannNO, eannSINGLE, eannPERIODIC, eannNR
+};
+
+/* Implicit solvent algorithms */
+enum {
+ eisNO, eisGBSA, eisNR
+};
+
+/* Algorithms for calculating GB radii */
+enum {
+ egbSTILL, egbHCT, egbOBC, egbNR
+};
+
+enum {
+ esaAPPROX, esaNO, esaSTILL, esaNR
+};
+
+/* Wall types */
+enum {
+ ewt93, ewt104, ewtTABLE, ewt126, ewtNR
+};
+
+/* Pull stuff */
+enum {
+ epullNO, epullUMBRELLA, epullCONSTRAINT, epullCONST_F, epullNR
+};
+
+enum {
+ epullgDIST, epullgDIR, epullgCYL, epullgPOS, epullgDIRPBC, epullgNR
+};
+
+#define PULL_CYL(pull) ((pull)->eGeom == epullgCYL)
+
+/* Enforced rotation groups */
+enum {
+ erotgISO, erotgISOPF,
+ erotgPM, erotgPMPF,
+ erotgRM, erotgRMPF,
+ erotgRM2, erotgRM2PF,
+ erotgFLEX, erotgFLEXT,
+ erotgFLEX2, erotgFLEX2T,
+ erotgNR
+};
+
+enum {
+ erotgFitRMSD, erotgFitNORM, erotgFitPOT, erotgFitNR
+};
+
+/* QMMM */
+enum {
+ eQMmethodAM1, eQMmethodPM3, eQMmethodRHF,
+ eQMmethodUHF, eQMmethodDFT, eQMmethodB3LYP, eQMmethodMP2, eQMmethodCASSCF, eQMmethodB3LYPLAN,
+ eQMmethodDIRECT, eQMmethodNR
+};
+
+enum {
+ eQMbasisSTO3G, eQMbasisSTO3G2, eQMbasis321G,
+ eQMbasis321Gp, eQMbasis321dGp, eQMbasis621G,
+ eQMbasis631G, eQMbasis631Gp, eQMbasis631dGp,
+ eQMbasis6311G, eQMbasisNR
+};
+
+enum {
+ eQMMMschemenormal, eQMMMschemeoniom, eQMMMschemeNR
+};
+
+enum {
+ eMultentOptName, eMultentOptNo, eMultentOptLast, eMultentOptNR
+};
+
+/* flat-bottom posres geometries */
+enum {
+ efbposresZERO, efbposresSPHERE, efbposresCYLINDER, efbposresX, efbposresY, efbposresZ,
+ efbposresNR
+};
+
+enum {
+ eAdressOff, eAdressConst, eAdressXSplit, eAdressSphere, eAdressNR
+};
+
+enum {
+ eAdressICOff, eAdressICThermoForce, eAdressICNR
+};
+
+enum {
+ eAdressSITEcom, eAdressSITEcog, eAdressSITEatom, eAdressSITEatomatom, eAdressSITENR
+};
+
+
+/* The interactions contained in a (possibly merged) table
+ * for computing electrostatic, VDW repulsion and/or VDW dispersion
+ * contributions.
+ */
+enum gmx_table_interaction
+{
+ GMX_TABLE_INTERACTION_ELEC,
+ GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
+ GMX_TABLE_INTERACTION_VDWEXPREP_VDWDISP,
+ GMX_TABLE_INTERACTION_VDWDISP,
+ GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP,
+ GMX_TABLE_INTERACTION_ELEC_VDWEXPREP_VDWDISP,
+ GMX_TABLE_INTERACTION_ELEC_VDWDISP,
+ GMX_TABLE_INTERACTION_NR
+};
+
+/* Different formats for table data. Cubic spline tables are typically stored
+ * with the four Y,F,G,H intermediate values (check tables.c for format), which
+ * makes it easy to load with a single 4-way SIMD instruction too.
+ * Linear tables only need one value per table point, or two if both V and F
+ * are calculated. However, with SIMD instructions this makes the loads unaligned,
+ * and in that case we store the data as F, D=F(i+1)-F(i), V, and then a blank value,
+ * which again makes it possible to load as a single instruction.
+ */
+enum gmx_table_format
+{
+ GMX_TABLE_FORMAT_CUBICSPLINE_YFGH,
+ GMX_TABLE_FORMAT_LINEAR_VF,
+ GMX_TABLE_FORMAT_LINEAR_V,
+ GMX_TABLE_FORMAT_LINEAR_F,
+ GMX_TABLE_FORMAT_LINEAR_FDV0,
+ GMX_TABLE_FORMAT_NR
+};
+
+/* Neighborlist geometry type.
+ * Kernels will compute interactions between two particles,
+ * 3-center water, 4-center water or coarse-grained beads.
+ */
+enum gmx_nblist_kernel_geometry
+{
+ GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE,
+ GMX_NBLIST_GEOMETRY_WATER3_PARTICLE,
+ GMX_NBLIST_GEOMETRY_WATER3_WATER3,
+ GMX_NBLIST_GEOMETRY_WATER4_PARTICLE,
+ GMX_NBLIST_GEOMETRY_WATER4_WATER4,
+ GMX_NBLIST_GEOMETRY_CG_CG,
+ GMX_NBLIST_GEOMETRY_NR
+};
+
+/* Types of electrostatics calculations available inside nonbonded kernels.
+ * Note that these do NOT necessarily correspond to the user selections in the MDP file;
+ * many interactions for instance map to tabulated kernels.
+ */
+enum gmx_nbkernel_elec
+{
+ GMX_NBKERNEL_ELEC_NONE,
+ GMX_NBKERNEL_ELEC_COULOMB,
+ GMX_NBKERNEL_ELEC_REACTIONFIELD,
+ GMX_NBKERNEL_ELEC_CUBICSPLINETABLE,
+ GMX_NBKERNEL_ELEC_GENERALIZEDBORN,
+ GMX_NBKERNEL_ELEC_EWALD,
+ GMX_NBKERNEL_ELEC_NR
+};
+
+/* Types of vdw calculations available inside nonbonded kernels.
+ * Note that these do NOT necessarily correspond to the user selections in the MDP file;
+ * many interactions for instance map to tabulated kernels.
+ */
+enum gmx_nbkernel_vdw
+{
+ GMX_NBKERNEL_VDW_NONE,
+ GMX_NBKERNEL_VDW_LENNARDJONES,
+ GMX_NBKERNEL_VDW_BUCKINGHAM,
+ GMX_NBKERNEL_VDW_CUBICSPLINETABLE,
+ GMX_NBKERNEL_VDW_NR
+};
+/* Types of interactions inside the neighborlist
+ */
+enum gmx_nblist_interaction_type
+{
+ GMX_NBLIST_INTERACTION_STANDARD,
+ GMX_NBLIST_INTERACTION_FREE_ENERGY,
+ GMX_NBLIST_INTERACTION_ADRESS,
+ GMX_NBLIST_INTERACTION_NR
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ENUMS_H_ */
--- /dev/null
- int ePBC;
- gmx_bool bMolPBC;
- int rc_scaling;
- rvec posres_com;
- rvec posres_comB;
-
- gmx_hw_info_t *hwinfo;
- gmx_bool use_cpu_acceleration;
+/*
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GRoups of Organic Molecules in ACtion for Science
+ */
+
+#include "ns.h"
+#include "genborn.h"
+#include "qmmmrec.h"
+#include "idef.h"
+#include "nb_verlet.h"
+#include "interaction_const.h"
+#include "hw_info.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* fixes auto-indentation problems */
+#endif
+
+/* Abstract type for PME that is defined only in the routines that use it. */
+typedef struct gmx_pme *gmx_pme_t;
+
+
+
+/* Structure describing the data in a single table */
+typedef struct
+{
+ enum gmx_table_interaction interaction; /* Types of interactions stored in this table */
+ enum gmx_table_format format; /* Interpolation type and data format */
+
+ real r; /* range of the table */
+ int n; /* n+1 is the number of table points */
+ real scale; /* distance (nm) between two table points */
+ real scale_exp; /* distance for exponential part of VdW table, not always used */
+ real * data; /* the actual table data */
+
+ /* Some information about the table layout. This can also be derived from the interpolation
+ * type and the table interactions, but it is convenient to have here for sanity checks, and it makes it
+ * much easier to access the tables in the nonbonded kernels when we can set the data from variables.
+ * It is always true that stride = formatsize*ninteractions
+ */
+ int formatsize; /* Number of fp variables for each table point (1 for F, 2 for VF, 4 for YFGH, etc.) */
+ int ninteractions; /* Number of interactions in table, 1 for coul-only, 3 for coul+rep+disp. */
+ int stride; /* Distance to next table point (number of fp variables per table point in total) */
+} t_forcetable;
+
+typedef struct
+{
+ t_forcetable table_elec;
+ t_forcetable table_vdw;
+ t_forcetable table_elec_vdw;
+
+ /* The actual neighbor lists, short and long range, see enum above
+ * for definition of neighborlist indices.
+ */
+ t_nblist nlist_sr[eNL_NR];
+ t_nblist nlist_lr[eNL_NR];
+} t_nblists;
+
+/* Accessor macros for the packed cginfo word stored in the forcerec.
+ *
+ * Bit layout, as encoded by the masks and shifts below:
+ *   bits  0-15  GID
+ *   bit  16     EXCL_INTRA
+ *   bit  17     EXCL_INTER
+ *   bits 18-19  SOLOPT
+ *   bit  20     CONSTR
+ *   bit  21     SETTLE
+ *   bit  22     BOND_INTER
+ *   bit  23     HAS_VDW
+ *   bit  24     HAS_Q
+ *   bits 25-30  NATOMS
+ *
+ * The maximum cg size in cginfo is 63
+ * because we only have space for 6 bits in cginfo,
+ * this cg size entry is actually only read with domain decomposition.
+ * But there is a smaller limit due to the t_excl data structure
+ * which is defined in nblist.h.
+ *
+ * The SET_CGINFO_* macros assign the updated word back to their cgi
+ * argument, which must therefore be an lvalue. Their replacement lists are
+ * fully parenthesized so each use expands to one self-contained expression.
+ * The gid/opt arguments are OR-ed in without masking, so callers must pass
+ * values that fit the field width.
+ * The single-bit GET_CGINFO_* macros yield the masked bit itself, i.e. zero
+ * or (1<<bit), not zero or one.
+ */
+#define SET_CGINFO_GID(cgi, gid) ((cgi) = (((cgi) & ~65535) | (gid)))
+#define GET_CGINFO_GID(cgi) ((cgi) & 65535)
+#define SET_CGINFO_EXCL_INTRA(cgi) ((cgi) = ((cgi) | (1<<16)))
+#define GET_CGINFO_EXCL_INTRA(cgi) ((cgi) & (1<<16))
+#define SET_CGINFO_EXCL_INTER(cgi) ((cgi) = ((cgi) | (1<<17)))
+#define GET_CGINFO_EXCL_INTER(cgi) ((cgi) & (1<<17))
+#define SET_CGINFO_SOLOPT(cgi, opt) ((cgi) = (((cgi) & ~(3<<18)) | ((opt)<<18)))
+#define GET_CGINFO_SOLOPT(cgi) (((cgi)>>18) & 3)
+#define SET_CGINFO_CONSTR(cgi) ((cgi) = ((cgi) | (1<<20)))
+#define GET_CGINFO_CONSTR(cgi) ((cgi) & (1<<20))
+#define SET_CGINFO_SETTLE(cgi) ((cgi) = ((cgi) | (1<<21)))
+#define GET_CGINFO_SETTLE(cgi) ((cgi) & (1<<21))
+/* This bit is only used with bBondComm in the domain decomposition */
+#define SET_CGINFO_BOND_INTER(cgi) ((cgi) = ((cgi) | (1<<22)))
+#define GET_CGINFO_BOND_INTER(cgi) ((cgi) & (1<<22))
+#define SET_CGINFO_HAS_VDW(cgi) ((cgi) = ((cgi) | (1<<23)))
+#define GET_CGINFO_HAS_VDW(cgi) ((cgi) & (1<<23))
+#define SET_CGINFO_HAS_Q(cgi) ((cgi) = ((cgi) | (1<<24)))
+#define GET_CGINFO_HAS_Q(cgi) ((cgi) & (1<<24))
+#define SET_CGINFO_NATOMS(cgi, opt) ((cgi) = (((cgi) & ~(63<<25)) | ((opt)<<25)))
+#define GET_CGINFO_NATOMS(cgi) (((cgi)>>25) & 63)
+
+
+/* Value to be used in mdrun for an infinite cut-off.
+ * Since we need to compare with the cut-off squared,
+ * this value should be slightly smaller than sqrt(GMX_FLOAT_MAX).
+ */
+#define GMX_CUTOFF_INF 1E+18
+
+/* enums for the neighborlist type */
+enum {
+ enbvdwNONE, enbvdwLJ, enbvdwBHAM, enbvdwTAB, enbvdwNR
+};
+/* OOR is "one over r" -- standard coul */
+enum {
+ enbcoulNONE, enbcoulOOR, enbcoulRF, enbcoulTAB, enbcoulGB, enbcoulFEWALD, enbcoulNR
+};
+
+enum {
+ egCOULSR, egLJSR, egBHAMSR, egCOULLR, egLJLR, egBHAMLR,
+ egCOUL14, egLJ14, egGB, egNR
+};
+
+typedef struct {
+ int nener; /* The number of energy group pairs */
+ real *ener[egNR]; /* Energy terms for each pair of groups */
+} gmx_grppairener_t;
+
+typedef struct {
+ real term[F_NRE]; /* The energies for all different interaction types */
+ gmx_grppairener_t grpp;
+ double dvdl_lin[efptNR]; /* Contributions to dvdl with linear lam-dependence */
+ double dvdl_nonlin[efptNR]; /* Idem, but non-linear dependence */
+ int n_lambda;
+ int fep_state; /*current fep state -- just for printing */
+ double *enerpart_lambda; /* Partial energy for lambda and flambda[] */
+ real foreign_term[F_NRE]; /* alternate array for storing foreign lambda energies */
+ gmx_grppairener_t foreign_grpp; /* alternate array for storing foreign lambda energies */
+} gmx_enerdata_t;
+/* The idea is that dvdl terms with linear lambda dependence will be added
+ * automatically to enerpart_lambda. Terms with non-linear lambda dependence
+ * should explicitly determine the energies at foreign lambda points
+ * when n_lambda > 0.
+ */
+
+/* One block of cginfo words together with the index bookkeeping for it.
+ * NOTE(review): the per-field notes below are inferred from the field names
+ * only; confirm them against the code that fills and reads this struct.
+ */
+typedef struct {
+ int cg_start; /* presumably the first index covered by this block - TODO confirm */
+ int cg_end; /* presumably the end of the covered index range - TODO confirm */
+ int cg_mod; /* presumably a modulus used when indexing cginfo - TODO confirm */
+ int *cginfo; /* array of packed cginfo words (see the SET_/GET_CGINFO_* macros) */
+} cginfo_mb_t;
+
+
+/* ewald table type */
+typedef struct ewald_tab *ewald_tab_t;
+
+typedef struct {
+ rvec *f;
+ int f_nalloc;
+ unsigned red_mask; /* Mask for marking which parts of f are filled */
+ rvec *fshift;
+ real ener[F_NRE];
+ gmx_grppairener_t grpp;
+ real Vcorr;
+ real dvdl[efptNR];
+ tensor vir;
+} f_thread_t;
+
+typedef struct {
+ interaction_const_t *ic;
+
+ /* Domain Decomposition */
+ gmx_bool bDomDec;
+
+ /* PBC stuff */
++ int ePBC;
++ gmx_bool bMolPBC;
++ int rc_scaling;
++ rvec posres_com;
++ rvec posres_comB;
++
++ const gmx_hw_info_t *hwinfo;
++ gmx_bool use_cpu_acceleration;
+
+ /* Interactions calculated in kernels. In many cases this is similar to
+ * the electrostatics settings in the inputrecord, but the difference is that
+ * these variables always specify the actual interaction in the kernel - if
+ * we are tabulating reaction-field the inputrec will say reaction-field, but
+ * the kernel interaction will say cubic-spline-table. To be safe we also
+ * have a kernel-specific setting for the modifiers - if the interaction is
+ * tabulated we already included the inputrec modification there, so the kernel
+ * modification setting will say 'none' in that case.
+ */
+ int nbkernel_elec_interaction;
+ int nbkernel_vdw_interaction;
+ int nbkernel_elec_modifier;
+ int nbkernel_vdw_modifier;
+
+ /* Use special N*N kernels? */
+ gmx_bool bAllvsAll;
+ /* Private work data */
+ void *AllvsAll_work;
+ void *AllvsAll_workgb;
+
+ /* Cut-Off stuff.
+ * Infinite cut-off's will be GMX_CUTOFF_INF (unlike in t_inputrec: 0).
+ */
+ real rlist, rlistlong;
+
+ /* Dielectric constant resp. multiplication factor for charges */
+ real zsquare, temp;
+ real epsilon_r, epsilon_rf, epsfac;
+
+ /* Constants for reaction fields */
+ real kappa, k_rf, c_rf;
+
+ /* Charge sum and dipole for topology A/B ([0]/[1]) for Ewald corrections */
+ double qsum[2];
+ double q2sum[2];
+ rvec mu_tot[2];
+
+ /* Dispersion correction stuff */
+ int eDispCorr;
+
+ /* The shift of the shift or user potentials */
+ real enershiftsix;
+ real enershifttwelve;
+ /* Integrated differences for energy and virial with cut-off functions */
+ real enerdiffsix;
+ real enerdifftwelve;
+ real virdiffsix;
+ real virdifftwelve;
+ /* Constant for long range dispersion correction (average dispersion)
+ * for topology A/B ([0]/[1]) */
+ real avcsix[2];
+ /* Constant for long range repulsion term. Relative difference of about
+ * 0.1 percent with 0.8 nm cutoffs. But hey, it's cheap anyway...
+ */
+ real avctwelve[2];
+
+ /* Fudge factors */
+ real fudgeQQ;
+
+ /* Table stuff */
+ gmx_bool bcoultab;
+ gmx_bool bvdwtab;
+ /* The normal tables are in the nblists struct(s) below */
+ t_forcetable tab14; /* for 1-4 interactions only */
+
+ /* PPPM & Shifting stuff */
+ int coulomb_modifier;
+ real rcoulomb_switch, rcoulomb;
+ real *phi;
+
+ /* VdW stuff */
+ int vdw_modifier;
+ double reppow;
+ real rvdw_switch, rvdw;
+ real bham_b_max;
+
+ /* Free energy */
+ int efep;
+ real sc_alphavdw;
+ real sc_alphacoul;
+ int sc_power;
+ real sc_r_power;
+ real sc_sigma6_def;
+ real sc_sigma6_min;
+ gmx_bool bSepDVDL;
+
+ /* NS Stuff */
+ int eeltype;
+ int vdwtype;
+ int cg0, hcg;
+ /* solvent_opt contains the enum for the most common solvent
+ * in the system, which will be optimized.
+ * It can be set to esolNO to disable all water optimization */
+ int solvent_opt;
+ int nWatMol;
+ gmx_bool bGrid;
+ gmx_bool bExcl_IntraCGAll_InterCGNone;
+ cginfo_mb_t *cginfo_mb;
+ int *cginfo;
+ rvec *cg_cm;
+ int cg_nalloc;
+ rvec *shift_vec;
+
+ /* The neighborlists including tables */
+ int nnblists;
+ int *gid2nblists;
+ t_nblists *nblists;
+
+ int cutoff_scheme; /* group- or Verlet-style cutoff */
+ gmx_bool bNonbonded; /* true if nonbonded calculations are *not* turned off */
+ nonbonded_verlet_t *nbv;
+
+ /* The wall tables (if used) */
+ int nwall;
+ t_forcetable **wall_tab;
+
+ /* The number of charge groups participating in do_force_lowlevel */
+ int ncg_force;
+ /* The number of atoms participating in do_force_lowlevel */
+ int natoms_force;
+ /* The number of atoms participating in force and constraints */
+ int natoms_force_constr;
+ /* The allocation size of vectors of size natoms_force */
+ int nalloc_force;
+
+ /* Twin Range stuff, f_twin has size natoms_force */
+ gmx_bool bTwinRange;
+ int nlr;
+ rvec *f_twin;
+
+ /* Forces that should not enter into the virial summation:
+ * PPPM/PME/Ewald/posres
+ */
+ gmx_bool bF_NoVirSum;
+ int f_novirsum_n;
+ int f_novirsum_nalloc;
+ rvec *f_novirsum_alloc;
+ /* Pointer that points to f_novirsum_alloc when pressure is calculated,
+ * points to the normal force vectors when pressure is not requested.
+ */
+ rvec *f_novirsum;
+
+ /* Long-range forces and virial for PPPM/PME/Ewald */
+ gmx_pme_t pmedata;
+ tensor vir_el_recip;
+
+ /* PME/Ewald stuff */
+ gmx_bool bEwald;
+ real ewaldcoeff;
+ ewald_tab_t ewald_table;
+
+ /* Virial Stuff */
+ rvec *fshift;
+ rvec vir_diag_posres;
+ dvec vir_wall_z;
+
+ /* Non bonded Parameter lists */
+ int ntype; /* Number of atom types */
+ gmx_bool bBHAM;
+ real *nbfp;
+
+ /* Energy group pair flags */
+ int *egp_flags;
+
+ /* xmdrun flexible constraints */
+ real fc_stepsize;
+
+ /* Generalized born implicit solvent */
+ gmx_bool bGB;
+ /* Generalized born stuff */
+ real gb_epsilon_solvent;
+ /* Table data for GB */
+ t_forcetable gbtab;
+ /* VdW radius for each atomtype (dim is thus ntype) */
+ real *atype_radius;
+ /* Effective radius (derived from effective volume) for each type */
+ real *atype_vol;
+ /* Implicit solvent - surface tension for each atomtype */
+ real *atype_surftens;
+ /* Implicit solvent - radius for GB calculation */
+ real *atype_gb_radius;
+ /* Implicit solvent - overlap for HCT model */
+ real *atype_S_hct;
+ /* Generalized born interaction data */
+ gmx_genborn_t *born;
+
+ /* Table scale for GB */
+ real gbtabscale;
+ /* Table range for GB */
+ real gbtabr;
+ /* GB neighborlists (the sr list will contain for each atom all other atoms
+ * (for use in the SA calculation) and the lr list will contain
+ * for each atom all atoms 1-4 or greater (for use in the GB calculation)
+ */
+ t_nblist gblist_sr;
+ t_nblist gblist_lr;
+ t_nblist gblist;
+
+ /* Inverse square root of the Born radii for implicit solvent */
+ real *invsqrta;
+ /* Derivatives of the potential with respect to the Born radii */
+ real *dvda;
+ /* Derivatives of the Born radii with respect to coordinates */
+ real *dadx;
+ real *dadx_rawptr;
+ int nalloc_dadx; /* Allocated size of dadx */
+
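+ /* If > 0 signals Test Particle Insertion;
+ * the value is the number of atoms of the molecule to insert.
+ * Only the energy difference due to the addition of the last molecule
+ * should be calculated.
+ * NOTE(review): this comment describes a count, yet the field below is
+ * declared gmx_bool - confirm whether int was intended.
+ */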
+ gmx_bool n_tpi;
+
+ /* Neighbor searching stuff */
+ gmx_ns_t ns;
+
+ /* QMMM stuff */
+ gmx_bool bQMMM;
+ t_QMMMrec *qr;
+
+ /* QM-MM neighborlists */
+ t_nblist QMMMlist;
+
+ /* Limit for printing large forces, negative is don't print */
+ real print_force;
+
+ /* coarse load balancing time measurement */
+ double t_fnbf;
+ double t_wait;
+ int timesteps;
+
+ /* parameter needed for AdResS simulation */
+ int adress_type;
+ gmx_bool badress_tf_full_box;
+ real adress_const_wf;
+ real adress_ex_width;
+ real adress_hy_width;
+ int adress_icor;
+ int adress_site;
+ rvec adress_refs;
+ int n_adress_tf_grps;
+ int * adress_tf_table_index;
+ int *adress_group_explicit;
+ t_forcetable * atf_tabs;
+ real adress_ex_forcecap;
+ gmx_bool adress_do_hybridpairs;
+
+ /* User determined parameters, copied from the inputrec */
+ int userint1;
+ int userint2;
+ int userint3;
+ int userint4;
+ real userreal1;
+ real userreal2;
+ real userreal3;
+ real userreal4;
+
+ /* Thread local force and energy data */
+ /* FIXME move to bonded_thread_data_t */
+ int nthreads;
+ int red_ashift;
+ int red_nblock;
+ f_thread_t *f_t;
+
+ /* Exclusion load distribution over the threads */
+ int *excl_load;
+} t_forcerec;
+
+/* Index helpers for the flattened nbfp parameter array: (ai, aj) select a
+ * pair entry and ntp is the multiplier applied to ai.
+ * C6/C12 address two consecutive values per pair; BHAMC/BHAMA/BHAMB address
+ * three consecutive values per pair, in that order.
+ * Each macro expands to a parenthesized array element, so it can be used
+ * both to read and to assign the parameter.
+ *
+ * Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
+ * been scaled by 6.0 or 12.0 to save flops in the kernels. We have corrected this everywhere
+ * in the code, but beware if you are using these macros externally.
+ */
+#define C6(nbfp, ntp, ai, aj) ((nbfp)[2*((ntp)*(ai)+(aj))])
+#define C12(nbfp, ntp, ai, aj) ((nbfp)[2*((ntp)*(ai)+(aj))+1])
+#define BHAMC(nbfp, ntp, ai, aj) ((nbfp)[3*((ntp)*(ai)+(aj))])
+#define BHAMA(nbfp, ntp, ai, aj) ((nbfp)[3*((ntp)*(ai)+(aj))+1])
+#define BHAMB(nbfp, ntp, ai, aj) ((nbfp)[3*((ntp)*(ai)+(aj))+2])
+
+#ifdef __cplusplus
+}
+#endif
--- /dev/null
- gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
- gmx_bool bDevShare; /* true if any of the devices is shared by
- (t)MPI ranks, with auto-detection always FALSE */
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This file is part of GROMACS.
+ * Copyright (c) 2012-
+ *
+ * Written by the Gromacs development team under coordination of
+ * David van der Spoel, Berk Hess, and Erik Lindahl.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org
+ *
+ * And Hey:
+ * Gromacs Runs On Most of All Computer Systems
+ */
+
+#ifndef HWINFO_H
+#define HWINFO_H
+
+#include "simple.h"
+#include "nbnxn_cuda_types_ext.h"
+#include "../gmx_cpuid.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* fixes auto-indentation problems */
+#endif
+
+/* Possible results of the GPU detection/check.
+ *
+ * The egpuInsane value means that during the sanity checks an error
+ * occurred that indicates malfunctioning of the device, driver, or
+ * incompatible driver/runtime. */
+typedef enum
+{
+ egpuCompatible = 0, egpuNonexistent, egpuIncompatible, egpuInsane
+} e_gpu_detect_res_t;
+
+/* Textual names of the GPU detection/check results (see e_gpu_detect_res_t).
+ * The entries are listed in the same order as the e_gpu_detect_res_t
+ * enumerators, so "inexistent" is the text that lines up with
+ * egpuNonexistent. Keep both lists in step when adding a result.
+ * Being a static array in a header, every file including it gets its own copy. */
+static const char * const gpu_detect_res_str[] =
+{
+ "compatible", "inexistent", "incompatible", "insane"
+};
+
+/* GPU device information -- for now with only CUDA devices.
+ * The gmx_hardware_detect module initializes it. */
+typedef struct
+{
- * It is initialized by gmx_detect_hardware(). */
++ gmx_bool bUserSet; /* true if the GPUs in cuda_dev_use are manually provided by the user */
+
+ int ncuda_dev_use; /* number of devices selected to be used */
+ int *cuda_dev_use; /* index of the devices selected to be used */
+ int ncuda_dev; /* total number of devices detected */
+ cuda_dev_info_ptr_t cuda_dev; /* devices detected in the system (per node) */
+} gmx_gpu_info_t;
+
+/* Hardware information structure with CPU and GPU information.
- gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */
- gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
++ * It is initialized by gmx_detect_hardware().
++ * NOTE: this structure may only contain structures that are globally valid
++ * (i.e. must be able to be shared among all threads) */
+typedef struct
+{
- gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
- NOTE: this will only detect the CPU thread 0 of the
- current process runs on. */
- int nthreads_hw_avail; /* Number of hardware threads available; this number
- is based on the number of CPUs reported as available
- by the OS at the time of detection. */
++ gmx_bool bCanUseGPU; /* True if compatible GPUs are detected during hardware detection */
++ gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */
+
++ gmx_cpuid_t cpuid_info; /* CPUID information about CPU detected;
++ NOTE: this will only detect the CPU thread 0 of the
++ current process runs on. */
++ int nthreads_hw_avail; /* Number of hardware threads available; this number
++ is based on the number of CPUs reported as available
++ by the OS at the time of detection. */
++ gmx_bool bConsistencyChecked; /* whether
++ gmx_check_hw_runconf_consistency()
++ has been run with this hw_info */
+} gmx_hw_info_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HWINFO_H */
--- /dev/null
- Name: libgromacs
+libdir=@LIB_INSTALL_DIR@
+includedir=@INCL_INSTALL_DIR@
+
++Name: libgromacs@GMX_LIBS_SUFFIX@
+Description: Gromacs library
+URL: http://www.gromacs.org
+Version: @PROJECT_VERSION@
+Requires: @PKG_FFT@ @PKG_XML@ @PKG_GSL@
+Libs.private: @CMAKE_THREAD_LIBS_INIT@ @PKG_DL_LIBS@ @OpenMP_LINKER_FLAGS@
+Libs: -L${libdir} -lgromacs@GMX_LIBS_SUFFIX@ @PKG_FFT_LIBS@ -lm
+Cflags: -I${includedir} @PKG_CFLAGS@
+
--- /dev/null
- snew(fr->hwinfo, 1);
- gmx_detect_hardware(fp, fr->hwinfo, cr,
- FALSE, FALSE, NULL);
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include "sysstuff.h"
+#include "typedefs.h"
+#include "vec.h"
+#include "maths.h"
+#include "macros.h"
+#include "smalloc.h"
+#include "macros.h"
+#include "gmx_fatal.h"
+#include "gmx_fatal_collective.h"
+#include "physics.h"
+#include "force.h"
+#include "tables.h"
+#include "nonbonded.h"
+#include "invblock.h"
+#include "names.h"
+#include "network.h"
+#include "pbc.h"
+#include "ns.h"
+#include "mshift.h"
+#include "txtdump.h"
+#include "coulomb.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "qmmm.h"
+#include "copyrite.h"
+#include "mtop_util.h"
+#include "nbnxn_search.h"
+#include "nbnxn_atomdata.h"
+#include "nbnxn_consts.h"
+#include "statutil.h"
+#include "gmx_omp_nthreads.h"
+#include "gmx_detect_hardware.h"
+
+#ifdef _MSC_VER
+/* MSVC definition for __cpuid() */
+#include <intrin.h>
+#endif
+
+#include "types/nbnxn_cuda_types_ext.h"
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+#include "pmalloc_cuda.h"
+
+/* Allocate a single t_forcerec with snew() and return it to the caller.
+ * No field is assigned here beyond whatever snew() itself does.
+ */
+t_forcerec *mk_forcerec(void)
+{
+ t_forcerec *fr;
+
+ snew(fr, 1);
+
+ return fr;
+}
+
+#ifdef DEBUG
+/* Debug helper: print the non-bonded parameter matrix, one type pair per line.
+ * The Buckingham c, LJ c6 and LJ c12 entries are divided by 6.0, 6.0 and 12.0
+ * respectively before printing; a and b are printed as stored.
+ */
+static void pr_nbfp(FILE *fp, real *nbfp, gmx_bool bBHAM, int atnr)
+{
+    int ti, tj;
+
+    for (ti = 0; ti < atnr; ti++)
+    {
+        for (tj = 0; tj < atnr; tj++)
+        {
+            fprintf(fp, "%2d - %2d", ti, tj);
+            if (!bBHAM)
+            {
+                /* Lennard-Jones pair */
+                fprintf(fp, " c6=%10g, c12=%10g\n",
+                        C6(nbfp, atnr, ti, tj)/6.0,
+                        C12(nbfp, atnr, ti, tj)/12.0);
+            }
+            else
+            {
+                /* Buckingham pair */
+                fprintf(fp, " a=%10g, b=%10g, c=%10g\n",
+                        BHAMA(nbfp, atnr, ti, tj),
+                        BHAMB(nbfp, atnr, ti, tj),
+                        BHAMC(nbfp, atnr, ti, tj)/6.0);
+            }
+        }
+    }
+}
+#endif
+
+/* Build the flat non-bonded parameter table from the force-field parameters.
+ *
+ * idef->iparams is consumed in (i, j) order over the atnr x atnr type pairs.
+ * With bBHAM the table holds three values per pair (a, b, c), otherwise two
+ * (c6, c12). The c, c6 and c12 entries are stored multiplied by their
+ * derivative prefactors (6.0, 6.0 and 12.0 respectively).
+ *
+ * Returns the array allocated with snew().
+ */
+static real *mk_nbfp(const gmx_ffparams_t *idef, gmx_bool bBHAM)
+{
+    real *nbfp;
+    int   ntypes, ti, tj, ip;
+
+    ntypes = idef->atnr;
+    ip     = 0;
+    if (!bBHAM)
+    {
+        snew(nbfp, 2*ntypes*ntypes);
+        for (ti = 0; ti < ntypes; ti++)
+        {
+            for (tj = 0; tj < ntypes; tj++)
+            {
+                /* nbfp now includes the 6.0/12.0 derivative prefactors */
+                C6(nbfp, ntypes, ti, tj)  = idef->iparams[ip].lj.c6*6.0;
+                C12(nbfp, ntypes, ti, tj) = idef->iparams[ip].lj.c12*12.0;
+                ip++;
+            }
+        }
+    }
+    else
+    {
+        snew(nbfp, 3*ntypes*ntypes);
+        for (ti = 0; ti < ntypes; ti++)
+        {
+            for (tj = 0; tj < ntypes; tj++)
+            {
+                BHAMA(nbfp, ntypes, ti, tj) = idef->iparams[ip].bham.a;
+                BHAMB(nbfp, ntypes, ti, tj) = idef->iparams[ip].bham.b;
+                /* nbfp now includes the 6.0 derivative prefactor */
+                BHAMC(nbfp, ntypes, ti, tj) = idef->iparams[ip].bham.c*6.0;
+                ip++;
+            }
+        }
+    }
+
+    return nbfp;
+}
+
+/* This routine sets fr->solvent_opt to the most common solvent in the
+ * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
+ * the fr->solvent_type array with the correct type (or esolNO).
+ *
+ * Charge groups that fulfill the conditions but are not identical to the
+ * most common one will be marked as esolNO in the solvent_type array.
+ *
+ * TIP3p is identical to SPC for these purposes, so we call it
+ * SPC in the arrays (Apologies to Bill Jorgensen ;-)
+ *
+ * NOTE: QM particles should never become an optimized solvent, not even
+ * if the QM subsystem consists of only a single charge group.
+ */
+
+/* One distinct solvent candidate (types + charges) found by check_solvent_cg() */
+typedef struct
+{
+ int model; /* esolSPC or esolTIP4P, as assigned in check_solvent_cg() */
+ int count; /* running sum of the nmol values of all matching charge groups */
+ int vdwtype[4]; /* atom type per atom; only the first 3 entries are filled for esolSPC */
+ real charge[4]; /* charge per atom; only the first 3 entries are filled for esolSPC */
+} solvent_parameters_t;
+
+/* Classify charge group cg0 of molecule type molt as a water-like solvent
+ * candidate and record the result in *cg_sp.
+ *
+ * *cg_sp is first set to -1 and stays -1 when the charge group is rejected:
+ * when cginfo reports inter-cg exclusions or incomplete intra-cg exclusions,
+ * when it does not have 3 or 4 atoms, when one of its atoms has a QM group
+ * number below qm_grps->nr - 1, when an atom is perturbed, or when the
+ * SPC/TIP4P charge and VdW pattern checks at the end fail.
+ *
+ * Otherwise *cg_sp becomes the index of the matching entry in the list
+ * *solvent_parameters_p. An existing entry with identical types and charges
+ * has its count increased by nmol; if none matches, a new entry is appended
+ * with srenew(), *n_solvent_parameters is incremented and the (possibly
+ * moved) list pointer is written back to *solvent_parameters_p.
+ *
+ * qm_grpnr may be NULL, in which case the QM test is skipped; it is indexed
+ * with the molecule-local atom numbers of the charge group.
+ */
+static void
+check_solvent_cg(const gmx_moltype_t *molt,
+ int cg0,
+ int nmol,
+ const unsigned char *qm_grpnr,
+ const t_grps *qm_grps,
+ t_forcerec * fr,
+ int *n_solvent_parameters,
+ solvent_parameters_t **solvent_parameters_p,
+ int cginfo,
+ int *cg_sp)
+{
+ /* NOTE(review): excl is declared but not used in this function */
+ const t_blocka * excl;
+ t_atom *atom;
+ int j, k;
+ int j0, j1, nj;
+ gmx_bool perturbed;
+ gmx_bool has_vdw[4];
+ gmx_bool match;
+ real tmp_charge[4];
+ int tmp_vdwtype[4];
+ int tjA;
+ gmx_bool qm;
+ solvent_parameters_t *solvent_parameters;
+
+ /* We use a list with parameters for each solvent type.
+ * Every time we discover a new molecule that fulfills the basic
+ * conditions for a solvent we compare with the previous entries
+ * in these lists. If the parameters are the same we just increment
+ * the counter for that type, and otherwise we create a new type
+ * based on the current molecule.
+ *
+ * Once we've finished going through all molecules we check which
+ * solvent is most common, and mark all those molecules while we
+ * clear the flag on all others.
+ */
+
+ solvent_parameters = *solvent_parameters_p;
+
+ /* Mark the cg first as non optimized */
+ *cg_sp = -1;
+
+ /* Check if this cg has no exclusions with atoms in other charge groups
+ * and all atoms inside the charge group excluded.
+ * We only have 3 or 4 atom solvent loops.
+ */
+ if (GET_CGINFO_EXCL_INTER(cginfo) ||
+ !GET_CGINFO_EXCL_INTRA(cginfo))
+ {
+ return;
+ }
+
+ /* Get the index of the first atom in this charge group (j0)
+ * and of the first atom after it (j1).
+ */
+ j0 = molt->cgs.index[cg0];
+ j1 = molt->cgs.index[cg0+1];
+
+ /* Number of atoms in this charge group */
+ nj = j1 - j0;
+
+ if (debug)
+ {
+ fprintf(debug,
+ "Moltype '%s': there are %d atoms in this charge group\n",
+ *molt->name, nj);
+ }
+
+ /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
+ * otherwise skip it.
+ */
+ if (nj < 3 || nj > 4)
+ {
+ return;
+ }
+
+ /* Check if we are doing QM on this group */
+ qm = FALSE;
+ if (qm_grpnr != NULL)
+ {
+ for (j = j0; j < j1 && !qm; j++)
+ {
+ qm = (qm_grpnr[j] < qm_grps->nr - 1);
+ }
+ }
+ /* Cannot use solvent optimization with QM */
+ if (qm)
+ {
+ return;
+ }
+
+ atom = molt->atoms.atom;
+
+ /* Still looks like a solvent, time to check parameters */
+
+ /* If it is perturbed (free energy) we can't use the solvent loops,
+ * so then we just skip to the next molecule.
+ */
+ perturbed = FALSE;
+
+ for (j = j0; j < j1 && !perturbed; j++)
+ {
+ perturbed = PERTURBED(atom[j]);
+ }
+
+ if (perturbed)
+ {
+ return;
+ }
+
+ /* Now it's only a question if the VdW and charge parameters
+ * are OK. Before doing the check we compare and see if they are
+ * identical to a possible previous solvent type.
+ * First we assign the current types and charges.
+ */
+ for (j = 0; j < nj; j++)
+ {
+ tmp_vdwtype[j] = atom[j0+j].type;
+ tmp_charge[j] = atom[j0+j].q;
+ }
+
+ /* Does it match any previous solvent type? */
+ for (k = 0; k < *n_solvent_parameters; k++)
+ {
+ match = TRUE;
+
+
+ /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
+ if ( (solvent_parameters[k].model == esolSPC && nj != 3) ||
+ (solvent_parameters[k].model == esolTIP4P && nj != 4) )
+ {
+ match = FALSE;
+ }
+
+ /* Check that types & charges match for all atoms in molecule */
+ for (j = 0; j < nj && match == TRUE; j++)
+ {
+ if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
+ {
+ match = FALSE;
+ }
+ if (tmp_charge[j] != solvent_parameters[k].charge[j])
+ {
+ match = FALSE;
+ }
+ }
+ if (match == TRUE)
+ {
+ /* Congratulations! We have a matched solvent.
+ * Flag it with this type for later processing.
+ */
+ *cg_sp = k;
+ solvent_parameters[k].count += nmol;
+
+ /* We are done with this charge group. The list was not
+ * reallocated, so *solvent_parameters_p is still valid.
+ */
+ return;
+ }
+ }
+
+ /* If we get here, we have a tentative new solvent type.
+ * Before we add it we must check that it fulfills the requirements
+ * of the solvent optimized loops. First determine which atoms have
+ * VdW interactions.
+ */
+ for (j = 0; j < nj; j++)
+ {
+ has_vdw[j] = FALSE;
+ tjA = tmp_vdwtype[j];
+
+ /* Go through all other types and see if any have non-zero
+ * VdW parameters when combined with this one.
+ */
+ for (k = 0; k < fr->ntype && (has_vdw[j] == FALSE); k++)
+ {
+ /* We already checked that the atoms weren't perturbed,
+ * so we only need to check state A now.
+ */
+ if (fr->bBHAM)
+ {
+ has_vdw[j] = (has_vdw[j] ||
+ (BHAMA(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ (BHAMB(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ (BHAMC(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ }
+ else
+ {
+ /* Standard LJ */
+ has_vdw[j] = (has_vdw[j] ||
+ (C6(fr->nbfp, fr->ntype, tjA, k) != 0.0) ||
+ (C12(fr->nbfp, fr->ntype, tjA, k) != 0.0));
+ }
+ }
+ }
+
+ /* Now we know all we need to make the final check and assignment. */
+ if (nj == 3)
+ {
+ /* So, is it an SPC?
+ * For this we require that all atoms have charge,
+ * the charges on atom 2 & 3 should be the same, and only
+ * atom 1 might have VdW.
+ */
+ if (has_vdw[1] == FALSE &&
+ has_vdw[2] == FALSE &&
+ tmp_charge[0] != 0 &&
+ tmp_charge[1] != 0 &&
+ tmp_charge[2] == tmp_charge[1])
+ {
+ srenew(solvent_parameters, *n_solvent_parameters+1);
+ solvent_parameters[*n_solvent_parameters].model = esolSPC;
+ solvent_parameters[*n_solvent_parameters].count = nmol;
+ for (k = 0; k < 3; k++)
+ {
+ solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ solvent_parameters[*n_solvent_parameters].charge[k] = tmp_charge[k];
+ }
+
+ *cg_sp = *n_solvent_parameters;
+ (*n_solvent_parameters)++;
+ }
+ }
+ else if (nj == 4)
+ {
+ /* Or could it be a TIP4P?
+ * For this we require that atoms 2,3,4 have charge, but not atom 1.
+ * Only atom 1 might have VdW.
+ */
+ if (has_vdw[1] == FALSE &&
+ has_vdw[2] == FALSE &&
+ has_vdw[3] == FALSE &&
+ tmp_charge[0] == 0 &&
+ tmp_charge[1] != 0 &&
+ tmp_charge[2] == tmp_charge[1] &&
+ tmp_charge[3] != 0)
+ {
+ srenew(solvent_parameters, *n_solvent_parameters+1);
+ solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
+ solvent_parameters[*n_solvent_parameters].count = nmol;
+ for (k = 0; k < 4; k++)
+ {
+ solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
+ solvent_parameters[*n_solvent_parameters].charge[k] = tmp_charge[k];
+ }
+
+ *cg_sp = *n_solvent_parameters;
+ (*n_solvent_parameters)++;
+ }
+ }
+
+ /* Hand the list pointer back; srenew() above may have changed it */
+ *solvent_parameters_p = solvent_parameters;
+}
+
+/* Determine the most common water-like solvent type and flag its charge groups.
+ *
+ * Every charge group stored in cginfo_mb is classified with
+ * check_solvent_cg(). The solvent type with the highest count wins: its model
+ * is stored in fr->solvent_opt and written into the cginfo entry of every
+ * charge group of that type, while all other entries get esolNO.
+ * fr->nWatMol accumulates, for each flagged cginfo entry, the number of
+ * molecules that entry stands for.
+ *
+ * fp         log file for the summary line, may be NULL
+ * mtop       global topology (read only)
+ * fr         force record; nWatMol and solvent_opt are written
+ * cginfo_mb  per-molecule-block charge group info; the solvent field is set
+ */
+static void
+check_solvent(FILE         * fp,
+              const gmx_mtop_t * mtop,
+              t_forcerec * fr,
+              cginfo_mb_t *cginfo_mb)
+{
+    const t_block        *cgs;
+    const gmx_moltype_t  *molt;
+    int                   mb, mol, cg_mol, at_offset, am, cgm, i, nmol_ch, nmol;
+    int                   n_solvent_parameters;
+    solvent_parameters_t *solvent_parameters;
+    int                 **cg_sp;
+    int                   bestsp, bestsol;
+
+    if (debug)
+    {
+        fprintf(debug, "Going to determine what solvent types we have.\n");
+    }
+
+    n_solvent_parameters = 0;
+    solvent_parameters   = NULL;
+    /* Allocate temporary array for solvent type */
+    snew(cg_sp, mtop->nmolblock);
+
+    at_offset = 0;
+    for (mb = 0; mb < mtop->nmolblock; mb++)
+    {
+        molt = &mtop->moltype[mtop->molblock[mb].type];
+        cgs  = &molt->cgs;
+        /* Here we have to loop over all individual molecules
+         * because we need to check for QMMM particles.
+         */
+        snew(cg_sp[mb], cginfo_mb[mb].cg_mod);
+        /* Number of molecules with their own cginfo entries, and the number
+         * of real molecules each of those entries stands for.
+         */
+        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr;
+        nmol    = mtop->molblock[mb].nmol/nmol_ch;
+        for (mol = 0; mol < nmol_ch; mol++)
+        {
+            cgm = mol*cgs->nr;
+            am  = mol*cgs->index[cgs->nr];
+            for (cg_mol = 0; cg_mol < cgs->nr; cg_mol++)
+            {
+                check_solvent_cg(molt, cg_mol, nmol,
+                                 mtop->groups.grpnr[egcQMMM] ?
+                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
+                                 &mtop->groups.grps[egcQMMM],
+                                 fr,
+                                 &n_solvent_parameters, &solvent_parameters,
+                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
+                                 &cg_sp[mb][cgm+cg_mol]);
+            }
+        }
+        /* grpnr[] is indexed by global atom number, so the offset of the
+         * next block must skip ALL molecules of this block, not just one.
+         */
+        at_offset += mtop->molblock[mb].nmol*cgs->index[cgs->nr];
+    }
+
+    /* Puh! We finished going through all charge groups.
+     * Now find the most common solvent model.
+     */
+
+    /* Most common solvent this far; -2 can never equal a cg_sp value,
+     * which is either -1 or a valid list index.
+     */
+    bestsp = -2;
+    for (i = 0; i < n_solvent_parameters; i++)
+    {
+        if (bestsp == -2 ||
+            solvent_parameters[i].count > solvent_parameters[bestsp].count)
+        {
+            bestsp = i;
+        }
+    }
+
+    if (bestsp >= 0)
+    {
+        bestsol = solvent_parameters[bestsp].model;
+    }
+    else
+    {
+        bestsol = esolNO;
+    }
+
+#ifdef DISABLE_WATER_NLIST
+    bestsol = esolNO;
+#endif
+
+    fr->nWatMol = 0;
+    for (mb = 0; mb < mtop->nmolblock; mb++)
+    {
+        cgs  = &mtop->moltype[mtop->molblock[mb].type].cgs;
+        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
+        for (i = 0; i < cginfo_mb[mb].cg_mod; i++)
+        {
+            if (cg_sp[mb][i] == bestsp)
+            {
+                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], bestsol);
+                fr->nWatMol += nmol;
+            }
+            else
+            {
+                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i], esolNO);
+            }
+        }
+        sfree(cg_sp[mb]);
+    }
+    sfree(cg_sp);
+
+    if (bestsol != esolNO && fp != NULL)
+    {
+        fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
+                esol_names[bestsol],
+                solvent_parameters[bestsp].count);
+    }
+
+    sfree(solvent_parameters);
+    fr->solvent_opt = bestsol;
+}
+
+/* Per-atom constraint classification used while building cginfo:
+ * no constraint, part of a regular constraint, or part of a SETTLE.
+ */
+enum {
+ acNONE = 0, acCONSTRAINT, acSETTLE
+};
+
+/* Build the per-molecule-block charge group info array.
+ *
+ * For every molecule block a cginfo_mb_t entry is filled with the charge
+ * group range (cg_start, cg_end), the period cg_mod of the cginfo array and
+ * the cginfo flags themselves: energy group id, constraint/SETTLE presence,
+ * intra/inter charge group exclusion state, presence of VdW and charge, and
+ * the charge group size. cg_mod is the number of charge groups of a single
+ * molecule when the energy group and QMMM group of every atom are identical
+ * for all molecules in the block, and nmol times that otherwise.
+ *
+ * *bExcl_IntraCGAll_InterCGNone is set to TRUE only when every charge group
+ * has all intra-cg pairs excluded and no inter-cg exclusions.
+ *
+ * After the flags are set, check_solvent() is called; solvent optimization
+ * is then switched off again when the environment variable GMX_NO_SOLV_OPT
+ * is set or when bNoSolvOpt is TRUE.
+ *
+ * Calls gmx_fatal() when a charge group exceeds MAX_CHARGEGROUP_SIZE.
+ * Returns the array allocated with snew() (mtop->nmolblock entries).
+ */
+static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
+ t_forcerec *fr, gmx_bool bNoSolvOpt,
+ gmx_bool *bExcl_IntraCGAll_InterCGNone)
+{
+ const t_block *cgs;
+ const t_blocka *excl;
+ const gmx_moltype_t *molt;
+ const gmx_molblock_t *molb;
+ cginfo_mb_t *cginfo_mb;
+ gmx_bool *type_VDW;
+ int *cginfo;
+ int cg_offset, a_offset, cgm, am;
+ int mb, m, ncg_tot, cg, a0, a1, gid, ai, j, aj, excl_nalloc;
+ int *a_con;
+ int ftype;
+ int ia;
+ gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
+
+ /* NOTE(review): ncg_tot is assigned here but not read in this function */
+ ncg_tot = ncg_mtop(mtop);
+ snew(cginfo_mb, mtop->nmolblock);
+
+ /* type_VDW[ai]: atom type ai has VdW interactions with at least one type.
+ * With Buckingham every type is marked; the C6/C12 lookups are then
+ * skipped through short-circuit evaluation.
+ */
+ snew(type_VDW, fr->ntype);
+ for (ai = 0; ai < fr->ntype; ai++)
+ {
+ type_VDW[ai] = FALSE;
+ for (j = 0; j < fr->ntype; j++)
+ {
+ type_VDW[ai] = type_VDW[ai] ||
+ fr->bBHAM ||
+ C6(fr->nbfp, fr->ntype, ai, j) != 0 ||
+ C12(fr->nbfp, fr->ntype, ai, j) != 0;
+ }
+ }
+
+ *bExcl_IntraCGAll_InterCGNone = TRUE;
+
+ /* bExcl is a scratch array with one flag per atom of the current
+ * charge group; it is grown on demand below.
+ */
+ excl_nalloc = 10;
+ snew(bExcl, excl_nalloc);
+ cg_offset = 0;
+ a_offset = 0;
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ molb = &mtop->molblock[mb];
+ molt = &mtop->moltype[molb->type];
+ cgs = &molt->cgs;
+ excl = &molt->excls;
+
+ /* Check if the cginfo is identical for all molecules in this block.
+ * If so, we only need an array of the size of one molecule.
+ * Otherwise we make an array of #mol times #cgs per molecule.
+ */
+ bId = TRUE;
+ am = 0;
+ for (m = 0; m < molb->nmol; m++)
+ {
+ am = m*cgs->index[cgs->nr];
+ for (cg = 0; cg < cgs->nr; cg++)
+ {
+ a0 = cgs->index[cg];
+ a1 = cgs->index[cg+1];
+ /* Compare molecule m against the first molecule of the block */
+ if (ggrpnr(&mtop->groups, egcENER, a_offset+am+a0) !=
+ ggrpnr(&mtop->groups, egcENER, a_offset +a0))
+ {
+ bId = FALSE;
+ }
+ if (mtop->groups.grpnr[egcQMMM] != NULL)
+ {
+ for (ai = a0; ai < a1; ai++)
+ {
+ if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
+ mtop->groups.grpnr[egcQMMM][a_offset +ai])
+ {
+ bId = FALSE;
+ }
+ }
+ }
+ }
+ }
+
+ cginfo_mb[mb].cg_start = cg_offset;
+ cginfo_mb[mb].cg_end = cg_offset + molb->nmol*cgs->nr;
+ cginfo_mb[mb].cg_mod = (bId ? 1 : molb->nmol)*cgs->nr;
+ snew(cginfo_mb[mb].cginfo, cginfo_mb[mb].cg_mod);
+ cginfo = cginfo_mb[mb].cginfo;
+
+ /* Set constraints flags for constrained atoms */
+ snew(a_con, molt->atoms.nr);
+ for (ftype = 0; ftype < F_NRE; ftype++)
+ {
+ if (interaction_function[ftype].flags & IF_CONSTRAINT)
+ {
+ int nral;
+
+ nral = NRAL(ftype);
+ /* Each ilist entry is one parameter index followed by nral atoms */
+ for (ia = 0; ia < molt->ilist[ftype].nr; ia += 1+nral)
+ {
+ int a;
+
+ for (a = 0; a < nral; a++)
+ {
+ a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
+ (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
+ }
+ }
+ }
+ }
+
+ for (m = 0; m < (bId ? 1 : molb->nmol); m++)
+ {
+ cgm = m*cgs->nr;
+ am = m*cgs->index[cgs->nr];
+ for (cg = 0; cg < cgs->nr; cg++)
+ {
+ a0 = cgs->index[cg];
+ a1 = cgs->index[cg+1];
+
+ /* Store the energy group in cginfo */
+ gid = ggrpnr(&mtop->groups, egcENER, a_offset+am+a0);
+ SET_CGINFO_GID(cginfo[cgm+cg], gid);
+
+ /* Check the intra/inter charge group exclusions */
+ if (a1-a0 > excl_nalloc)
+ {
+ excl_nalloc = a1 - a0;
+ srenew(bExcl, excl_nalloc);
+ }
+ /* bExclIntraAll: all intra cg interactions excluded
+ * bExclInter: any inter cg interactions excluded
+ */
+ bExclIntraAll = TRUE;
+ bExclInter = FALSE;
+ bHaveVDW = FALSE;
+ bHaveQ = FALSE;
+ for (ai = a0; ai < a1; ai++)
+ {
+ /* Check VDW and electrostatic interactions */
+ bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
+ type_VDW[molt->atoms.atom[ai].typeB]);
+ bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 ||
+ molt->atoms.atom[ai].qB != 0);
+
+ /* Clear the exclusion list for atom ai */
+ for (aj = a0; aj < a1; aj++)
+ {
+ bExcl[aj-a0] = FALSE;
+ }
+ /* Loop over all the exclusions of atom ai */
+ for (j = excl->index[ai]; j < excl->index[ai+1]; j++)
+ {
+ aj = excl->a[j];
+ if (aj < a0 || aj >= a1)
+ {
+ bExclInter = TRUE;
+ }
+ else
+ {
+ bExcl[aj-a0] = TRUE;
+ }
+ }
+ /* Check if ai excludes a0 to a1 */
+ for (aj = a0; aj < a1; aj++)
+ {
+ if (!bExcl[aj-a0])
+ {
+ bExclIntraAll = FALSE;
+ }
+ }
+
+ switch (a_con[ai])
+ {
+ case acCONSTRAINT:
+ SET_CGINFO_CONSTR(cginfo[cgm+cg]);
+ break;
+ case acSETTLE:
+ SET_CGINFO_SETTLE(cginfo[cgm+cg]);
+ break;
+ default:
+ break;
+ }
+ }
+ if (bExclIntraAll)
+ {
+ SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
+ }
+ if (bExclInter)
+ {
+ SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
+ }
+ if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
+ {
+ /* The size in cginfo is currently only read with DD */
+ gmx_fatal(FARGS, "A charge group has size %d which is larger than the limit of %d atoms", a1-a0, MAX_CHARGEGROUP_SIZE);
+ }
+ if (bHaveVDW)
+ {
+ SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
+ }
+ if (bHaveQ)
+ {
+ SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
+ }
+ /* Store the charge group size */
+ SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
+
+ if (!bExclIntraAll || bExclInter)
+ {
+ *bExcl_IntraCGAll_InterCGNone = FALSE;
+ }
+ }
+ }
+
+ sfree(a_con);
+
+ /* Offsets are global: advance past all molecules of this block */
+ cg_offset += molb->nmol*cgs->nr;
+ a_offset += molb->nmol*cgs->index[cgs->nr];
+ }
+ sfree(bExcl);
+
+ /* the solvent optimizer is called after the QM is initialized,
+ * because we don't want to have the QM subsystem to become an
+ * optimized solvent
+ */
+
+ check_solvent(fplog, mtop, fr, cginfo_mb);
+
+ if (getenv("GMX_NO_SOLV_OPT"))
+ {
+ if (fplog)
+ {
+ fprintf(fplog, "Found environment variable GMX_NO_SOLV_OPT.\n"
+ "Disabling all solvent optimization\n");
+ }
+ fr->solvent_opt = esolNO;
+ }
+ if (bNoSolvOpt)
+ {
+ fr->solvent_opt = esolNO;
+ }
+ /* When optimization ended up disabled, also clear the per-cg flags
+ * that check_solvent() may have set.
+ */
+ if (!fr->solvent_opt)
+ {
+ for (mb = 0; mb < mtop->nmolblock; mb++)
+ {
+ for (cg = 0; cg < cginfo_mb[mb].cg_mod; cg++)
+ {
+ SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg], esolNO);
+ }
+ }
+ }
+
+ return cginfo_mb;
+}
+
+/* Expand the periodic per-block cginfo arrays into one flat array.
+ *
+ * The result has cgi_mb[nmb-1].cg_end entries; entry cg is taken from the
+ * block whose cg_end is the first one above cg, at position
+ * (cg - cg_start) modulo cg_mod of that block.
+ */
+static int *cginfo_expand(int nmb, cginfo_mb_t *cgi_mb)
+{
+    const cginfo_mb_t *blk;
+    int               *cginfo;
+    int                total, icg;
+
+    total = cgi_mb[nmb-1].cg_end;
+    snew(cginfo, total);
+
+    blk = cgi_mb;
+    for (icg = 0; icg < total; icg++)
+    {
+        /* Move on to the block that contains charge group icg */
+        while (blk->cg_end <= icg)
+        {
+            blk++;
+        }
+        cginfo[icg] = blk->cginfo[(icg - blk->cg_start) % blk->cg_mod];
+    }
+
+    return cginfo;
+}
+
+/* Sum the charges and squared charges of the whole system into fr.
+ *
+ * fr->qsum[0]/q2sum[0] receive the totals for topology A (atom field q).
+ * When fr->efep != efepNO, fr->qsum[1]/q2sum[1] receive the totals for
+ * topology B (atom field qB); otherwise they are copies of the A values.
+ * The total charge(s) are printed to log when log is not NULL.
+ */
+static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
+{
+    double         qsum, q2sum, q;
+    int            mb, nmol, i;
+    const t_atoms *atoms;
+
+    qsum  = 0;
+    q2sum = 0;
+    for (mb = 0; mb < mtop->nmolblock; mb++)
+    {
+        nmol  = mtop->molblock[mb].nmol;
+        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+        for (i = 0; i < atoms->nr; i++)
+        {
+            q      = atoms->atom[i].q;
+            qsum  += nmol*q;
+            q2sum += nmol*q*q;
+        }
+    }
+    fr->qsum[0]  = qsum;
+    fr->q2sum[0] = q2sum;
+    if (fr->efep != efepNO)
+    {
+        qsum  = 0;
+        q2sum = 0;
+        for (mb = 0; mb < mtop->nmolblock; mb++)
+        {
+            nmol  = mtop->molblock[mb].nmol;
+            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+            for (i = 0; i < atoms->nr; i++)
+            {
+                q      = atoms->atom[i].qB;
+                qsum  += nmol*q;
+                q2sum += nmol*q*q;
+            }
+        }
+        /* Store the B totals once, after all blocks have been summed, so
+         * they are also defined when there are no molecule blocks.
+         */
+        fr->qsum[1]  = qsum;
+        fr->q2sum[1] = q2sum;
+    }
+    else
+    {
+        fr->qsum[1]  = fr->qsum[0];
+        fr->q2sum[1] = fr->q2sum[0];
+    }
+    if (log)
+    {
+        if (fr->efep == efepNO)
+        {
+            fprintf(log, "System total charge: %.3f\n", fr->qsum[0]);
+        }
+        else
+        {
+            fprintf(log, "System total charge, top. A: %.3f top. B: %.3f\n",
+                    fr->qsum[0], fr->qsum[1]);
+        }
+    }
+}
+
+/* Recompute the reaction-field constants (kappa, k_rf, c_rf) for the given
+ * box. Nothing is done unless fr->eeltype is eelGRF. The log argument is
+ * not used.
+ */
+void update_forcerec(FILE *log, t_forcerec *fr, matrix box)
+{
+    if (fr->eeltype != eelGRF)
+    {
+        return;
+    }
+
+    calc_rffac(NULL, fr->eeltype,
+               fr->epsilon_r, fr->epsilon_rf,
+               fr->rcoulomb, fr->temp, fr->zsquare, box,
+               &fr->kappa, &fr->k_rf, &fr->c_rf);
+}
+
+/* Compute the average dispersion (C6) and repulsion (C12) parameters used
+ * for the long-range dispersion correction and store them in
+ * fr->avcsix[] and fr->avctwelve[].
+ *
+ * Index 0 is computed from the A-state atom types; index 1 (B-state types)
+ * is only computed when fr->efep != efepNO.
+ *
+ * Without test-particle insertion (fr->n_tpi == 0) the average runs over all
+ * atom pairs, counted per type pair, minus the excluded pairs. With
+ * fr->n_tpi > 0 only the pairs between the first n_tpi atoms of the last
+ * molecule block and the rest of the system are counted.
+ *
+ * When no pairs remain, both averages are set to zero; the warning about it
+ * is only printed when fplog is not NULL.
+ *
+ * The parameter table fr->nbfp stores values including the 6.0/12.0
+ * derivative prefactors, which are divided out here.
+ */
+void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
+{
+    const t_atoms  *atoms, *atoms_tpi;
+    const t_blocka *excl;
+    int             mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, nexcl, q;
+#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
+    long long int   npair, npair_ij, tmpi, tmpj;
+#else
+    double          npair, npair_ij, tmpi, tmpj;
+#endif
+    double          csix, ctwelve;
+    int             ntp, *typecount;
+    gmx_bool        bBHAM;
+    real           *nbfp;
+
+    ntp   = fr->ntype;
+    bBHAM = fr->bBHAM;
+    nbfp  = fr->nbfp;
+
+    for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
+    {
+        csix    = 0;
+        ctwelve = 0;
+        npair   = 0;
+        nexcl   = 0;
+        if (!fr->n_tpi)
+        {
+            /* Count the types so we avoid natoms^2 operations */
+            snew(typecount, ntp);
+            for (mb = 0; mb < mtop->nmolblock; mb++)
+            {
+                nmol  = mtop->molblock[mb].nmol;
+                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+                for (i = 0; i < atoms->nr; i++)
+                {
+                    tpi = (q == 0 ? atoms->atom[i].type : atoms->atom[i].typeB);
+                    typecount[tpi] += nmol;
+                }
+            }
+            for (tpi = 0; tpi < ntp; tpi++)
+            {
+                for (tpj = tpi; tpj < ntp; tpj++)
+                {
+                    tmpi = typecount[tpi];
+                    tmpj = typecount[tpj];
+                    if (tpi != tpj)
+                    {
+                        npair_ij = tmpi*tmpj;
+                    }
+                    else
+                    {
+                        npair_ij = tmpi*(tmpi - 1)/2;
+                    }
+                    if (bBHAM)
+                    {
+                        /* nbfp now includes the 6.0 derivative prefactor */
+                        csix    += npair_ij*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+                    }
+                    else
+                    {
+                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
+                        csix    += npair_ij*   C6(nbfp, ntp, tpi, tpj)/6.0;
+                        ctwelve += npair_ij*  C12(nbfp, ntp, tpi, tpj)/12.0;
+                    }
+                    npair += npair_ij;
+                }
+            }
+            sfree(typecount);
+            /* Subtract the excluded pairs.
+             * The main reason for subtracting exclusions is that in some cases
+             * some combinations might never occur and the parameters could have
+             * any value. These unused values should not influence the dispersion
+             * correction.
+             */
+            for (mb = 0; mb < mtop->nmolblock; mb++)
+            {
+                nmol  = mtop->molblock[mb].nmol;
+                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
+                for (i = 0; (i < atoms->nr); i++)
+                {
+                    tpi = (q == 0 ? atoms->atom[i].type : atoms->atom[i].typeB);
+                    j1  = excl->index[i];
+                    j2  = excl->index[i+1];
+                    for (j = j1; j < j2; j++)
+                    {
+                        k = excl->a[j];
+                        /* Count every excluded pair only once */
+                        if (k > i)
+                        {
+                            tpj = (q == 0 ? atoms->atom[k].type : atoms->atom[k].typeB);
+                            if (bBHAM)
+                            {
+                                /* nbfp now includes the 6.0 derivative prefactor */
+                                csix -= nmol*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+                            }
+                            else
+                            {
+                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
+                                csix    -= nmol*C6 (nbfp, ntp, tpi, tpj)/6.0;
+                                ctwelve -= nmol*C12(nbfp, ntp, tpi, tpj)/12.0;
+                            }
+                            nexcl += nmol;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            /* Only correct for the interaction of the test particle
+             * with the rest of the system.
+             */
+            atoms_tpi =
+                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
+
+            npair = 0;
+            for (mb = 0; mb < mtop->nmolblock; mb++)
+            {
+                nmol  = mtop->molblock[mb].nmol;
+                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
+                for (j = 0; j < atoms->nr; j++)
+                {
+                    nmolc = nmol;
+                    /* Remove the interaction of the test charge group
+                     * with itself.
+                     */
+                    if (mb == mtop->nmolblock-1)
+                    {
+                        nmolc--;
+
+                        if (mb == 0 && nmol == 1)
+                        {
+                            gmx_fatal(FARGS, "Old format tpr with TPI, please generate a new tpr file");
+                        }
+                    }
+                    tpj = (q == 0 ? atoms->atom[j].type : atoms->atom[j].typeB);
+                    for (i = 0; i < fr->n_tpi; i++)
+                    {
+                        tpi = (q == 0 ? atoms_tpi->atom[i].type : atoms_tpi->atom[i].typeB);
+                        if (bBHAM)
+                        {
+                            /* nbfp now includes the 6.0 derivative prefactor */
+                            csix    += nmolc*BHAMC(nbfp, ntp, tpi, tpj)/6.0;
+                        }
+                        else
+                        {
+                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
+                            csix    += nmolc*C6 (nbfp, ntp, tpi, tpj)/6.0;
+                            ctwelve += nmolc*C12(nbfp, ntp, tpi, tpj)/12.0;
+                        }
+                        npair += nmolc;
+                    }
+                }
+            }
+        }
+        if (npair - nexcl <= 0)
+        {
+            /* No pairs left: never divide by the (non-positive) pair count,
+             * whether or not there is a log file to warn in.
+             */
+            if (fplog)
+            {
+                fprintf(fplog, "\nWARNING: There are no atom pairs for dispersion correction\n\n");
+            }
+            csix    = 0;
+            ctwelve = 0;
+        }
+        else
+        {
+            csix    /= npair - nexcl;
+            ctwelve /= npair - nexcl;
+        }
+        if (debug)
+        {
+            fprintf(debug, "Counted %d exclusions\n", nexcl);
+            fprintf(debug, "Average C6 parameter is: %10g\n", (double)csix);
+            fprintf(debug, "Average C12 parameter is: %10g\n", (double)ctwelve);
+        }
+        fr->avcsix[q]    = csix;
+        fr->avctwelve[q] = ctwelve;
+    }
+    if (fplog != NULL)
+    {
+        if (fr->eDispCorr == edispcAllEner ||
+            fr->eDispCorr == edispcAllEnerPres)
+        {
+            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+                    fr->avcsix[0], fr->avctwelve[0]);
+        }
+        else
+        {
+            fprintf(fplog, "Long Range LJ corr.: <C6> %10.4e\n", fr->avcsix[0]);
+        }
+    }
+}
+
+
+/* Find the largest Buckingham b parameter over all atom type pairs that
+ * occur between the molecule types of mtop and store it in fr->bham_b_max
+ * (which starts at 0). The smallest b is tracked too, but only for the log
+ * line. Only A-state atom types are considered.
+ *
+ * Calls gmx_fatal() when an atom type index is not below fr->ntype.
+ */
+static void set_bham_b_max(FILE *fplog, t_forcerec *fr,
+                           const gmx_mtop_t *mtop)
+{
+    const t_atoms *atomsA, *atomsB;
+    real          *table;
+    real           bcur, bsmallest;
+    int            ntypes, mtA, mtB, ia, ib, typeA, typeB;
+
+    if (fplog)
+    {
+        fprintf(fplog, "Determining largest Buckingham b parameter for table\n");
+    }
+    table  = fr->nbfp;
+    ntypes = fr->ntype;
+
+    /* -1 marks "no value seen yet" for the minimum */
+    bsmallest      = -1;
+    fr->bham_b_max = 0;
+    for (mtA = 0; mtA < mtop->nmoltype; mtA++)
+    {
+        atomsA = &mtop->moltype[mtA].atoms;
+        for (ia = 0; ia < atomsA->nr; ia++)
+        {
+            typeA = atomsA->atom[ia].type;
+            if (typeA >= ntypes)
+            {
+                gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", ia, typeA, ntypes);
+            }
+
+            /* Pair with every atom of this and all later molecule types */
+            for (mtB = mtA; mtB < mtop->nmoltype; mtB++)
+            {
+                atomsB = &mtop->moltype[mtB].atoms;
+                for (ib = 0; ib < atomsB->nr; ib++)
+                {
+                    typeB = atomsB->atom[ib].type;
+                    if (typeB >= ntypes)
+                    {
+                        gmx_fatal(FARGS, "Atomtype[%d] = %d, maximum = %d", ib, typeB, ntypes);
+                    }
+                    bcur = BHAMB(table, ntypes, typeA, typeB);
+                    if (bcur > fr->bham_b_max)
+                    {
+                        fr->bham_b_max = bcur;
+                    }
+                    if (bsmallest == -1 || bcur < bsmallest)
+                    {
+                        bsmallest = bcur;
+                    }
+                }
+            }
+        }
+    }
+    if (fplog)
+    {
+        fprintf(fplog, "Buckingham b parameters, min: %g, max: %g\n",
+                bsmallest, fr->bham_b_max);
+    }
+}
+
+/* Read the non-bonded interaction table and fill the three tables in nbl.
+ *
+ * When tabfn is NULL only a debug message is written and nbl is left
+ * untouched. When both eg1 and eg2 are given, the file name is derived from
+ * tabfn by replacing its extension with "_<eg1>_<eg2>.<ext>".
+ *
+ * make_tables() produces the combined table nbl->table_elec_vdw; its data is
+ * then copied into a separate electrostatics table (table_elec) and a
+ * separate VdW table (table_vdw).
+ */
+static void make_nbf_tables(FILE *fp, const output_env_t oenv,
+ t_forcerec *fr, real rtab,
+ const t_commrec *cr,
+ const char *tabfn, char *eg1, char *eg2,
+ t_nblists *nbl)
+{
+ char buf[STRLEN];
+ int i, j;
+
+ if (tabfn == NULL)
+ {
+ if (debug)
+ {
+ fprintf(debug, "No table file name passed, can not read table, can not do non-bonded interactions\n");
+ }
+ return;
+ }
+
+ /* NOTE(review): unbounded sprintf into buf[STRLEN]; assumes the file
+ * name (plus group names) fits - confirm against callers.
+ */
+ sprintf(buf, "%s", tabfn);
+ if (eg1 && eg2)
+ {
+ /* Append the two energy group names; the write starts at the '.'
+ * of tabfn, assuming tabfn ends in "." followed by the xvg extension.
+ */
+ sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "_%s_%s.%s",
+ eg1, eg2, ftp2ext(efXVG));
+ }
+ nbl->table_elec_vdw = make_tables(fp, oenv, fr, MASTER(cr), buf, rtab, 0);
+ /* Copy the contents of the table to separate coulomb and LJ tables too,
+ * to improve cache performance.
+ */
+ /* For performance reasons we want the table data to be aligned.
+ * The pointers could be freed but currently aren't.
+ * NOTE(review): this comment used to say 16-byte, while snew_aligned()
+ * below is called with 32 - confirm the intended alignment.
+ */
+ nbl->table_elec.interaction = GMX_TABLE_INTERACTION_ELEC;
+ nbl->table_elec.format = nbl->table_elec_vdw.format;
+ nbl->table_elec.r = nbl->table_elec_vdw.r;
+ nbl->table_elec.n = nbl->table_elec_vdw.n;
+ nbl->table_elec.scale = nbl->table_elec_vdw.scale;
+ nbl->table_elec.scale_exp = nbl->table_elec_vdw.scale_exp;
+ nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
+ nbl->table_elec.ninteractions = 1;
+ nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
+ snew_aligned(nbl->table_elec.data, nbl->table_elec.stride*(nbl->table_elec.n+1), 32);
+
+ nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
+ nbl->table_vdw.format = nbl->table_elec_vdw.format;
+ nbl->table_vdw.r = nbl->table_elec_vdw.r;
+ nbl->table_vdw.n = nbl->table_elec_vdw.n;
+ nbl->table_vdw.scale = nbl->table_elec_vdw.scale;
+ nbl->table_vdw.scale_exp = nbl->table_elec_vdw.scale_exp;
+ nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
+ nbl->table_vdw.ninteractions = 2;
+ nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
+ snew_aligned(nbl->table_vdw.data, nbl->table_vdw.stride*(nbl->table_vdw.n+1), 32);
+
+ /* Split each combined table point of 12 values: the first 4 go to the
+ * electrostatics table, the remaining 8 to the VdW table.
+ * NOTE(review): the literal strides 4/8/12 presumably require
+ * formatsize == 4 - confirm.
+ */
+ for (i = 0; i <= nbl->table_elec_vdw.n; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
+ }
+ for (j = 0; j < 8; j++)
+ {
+ nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
+ }
+ }
+}
+
+/* Count how often each bonded table number is referenced by interactions of
+ * type ftype1 or ftype2 in all molecule types of mtop.
+ *
+ * *count is grown with srenew() so that it can be indexed by every table
+ * number encountered (new entries start at 0) and *ncount is updated to the
+ * new length. Calls gmx_fatal() on a negative table number.
+ */
+static void count_tables(int ftype1, int ftype2, const gmx_mtop_t *mtop,
+                         int *ncount, int **count)
+{
+    const t_ilist *il;
+    int            mt, ftype, entry_len, ia, inew, tabnr;
+
+    for (mt = 0; mt < mtop->nmoltype; mt++)
+    {
+        for (ftype = 0; ftype < F_NRE; ftype++)
+        {
+            if (ftype != ftype1 && ftype != ftype2)
+            {
+                continue;
+            }
+
+            il = &mtop->moltype[mt].ilist[ftype];
+            /* One parameter index followed by the atoms of the interaction */
+            entry_len = 1 + NRAL(ftype);
+            for (ia = 0; ia < il->nr; ia += entry_len)
+            {
+                tabnr = mtop->ffparams.iparams[il->iatoms[ia]].tab.table;
+                if (tabnr < 0)
+                {
+                    gmx_fatal(FARGS, "A bonded table number is smaller than 0: %d\n", tabnr);
+                }
+                if (tabnr >= *ncount)
+                {
+                    srenew(*count, tabnr+1);
+                    for (inew = *ncount; inew <= tabnr; inew++)
+                    {
+                        (*count)[inew] = 0;
+                    }
+                    *ncount = tabnr+1;
+                }
+                (*count)[tabnr]++;
+            }
+        }
+    }
+}
+
+/* Create the bonded tables referenced by interactions of type ftype1/ftype2.
+ *
+ * For every table number i that is actually used, the file name is built
+ * from basefn by replacing its extension with "_<tabext><i>.<ext>" and the
+ * table is read with make_bonded_table().
+ *
+ * Returns an array indexed by table number, or NULL when no table is
+ * referenced at all.
+ */
+static bondedtable_t *make_bonded_tables(FILE *fplog,
+                                         int ftype1, int ftype2,
+                                         const gmx_mtop_t *mtop,
+                                         const char *basefn, const char *tabext)
+{
+    bondedtable_t *tab;
+    char           tabfn[STRLEN];
+    int           *count;
+    int            ncount, itab;
+
+    ncount = 0;
+    count  = NULL;
+    count_tables(ftype1, ftype2, mtop, &ncount, &count);
+
+    if (ncount <= 0)
+    {
+        return NULL;
+    }
+
+    snew(tab, ncount);
+    for (itab = 0; itab < ncount; itab++)
+    {
+        if (count[itab] <= 0)
+        {
+            /* Table number not used by any interaction */
+            continue;
+        }
+        sprintf(tabfn, "%s", basefn);
+        sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1, "_%s%d.%s",
+                tabext, itab, ftp2ext(efXVG));
+        tab[itab] = make_bonded_table(fplog, tabfn, NRAL(ftype1)-2);
+    }
+    sfree(count);
+
+    return tab;
+}
+
+/* Store the charge group and atom ranges in fr and grow the force buffers
+ * that depend on them.
+ *
+ * fr->f_twin is reallocated (only when fr->bTwinRange is set) once
+ * natoms_force_constr exceeds fr->nalloc_force. When fr->bF_NoVirSum is set,
+ * fr->f_novirsum_alloc is grown to hold natoms_f_novirsum entries; otherwise
+ * fr->f_novirsum_n is set to 0.
+ */
+void forcerec_set_ranges(t_forcerec *fr,
+                         int ncg_home, int ncg_force,
+                         int natoms_force,
+                         int natoms_force_constr, int natoms_f_novirsum)
+{
+    fr->cg0 = 0;
+    fr->hcg = ncg_home;
+
+    /* fr->ncg_force is unused in the standard code,
+     * but it can be useful for modified code dealing with charge groups.
+     */
+    fr->ncg_force           = ncg_force;
+    fr->natoms_force        = natoms_force;
+    fr->natoms_force_constr = natoms_force_constr;
+
+    if (fr->natoms_force_constr > fr->nalloc_force)
+    {
+        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
+
+        if (fr->bTwinRange)
+        {
+            srenew(fr->f_twin, fr->nalloc_force);
+        }
+    }
+
+    if (!fr->bF_NoVirSum)
+    {
+        fr->f_novirsum_n = 0;
+        return;
+    }
+
+    fr->f_novirsum_n = natoms_f_novirsum;
+    if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
+    {
+        fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
+        srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
+    }
+}
+
+/* Map a cut-off of exactly 0 to GMX_CUTOFF_INF; any other value is
+ * returned unchanged.
+ */
+static real cutoff_inf(real cutoff)
+{
+    if (cutoff != 0)
+    {
+        return cutoff;
+    }
+
+    cutoff = GMX_CUTOFF_INF;
+    return cutoff;
+}
+
+/* Load one thermoforce table per AdResS thermoforce group into fr->atf_tabs.
+ *
+ * The file name for each group is built from tabfn by replacing its
+ * extension with "tf_<energy group name>.<ext>". Calls gmx_fatal() when
+ * tabfn is NULL.
+ */
+static void make_adress_tf_tables(FILE *fp, const output_env_t oenv,
+                                  t_forcerec *fr, const t_inputrec *ir,
+                                  const char *tabfn, const gmx_mtop_t *mtop,
+                                  matrix box)
+{
+    char buf[STRLEN];
+    int  itf, egrp;
+
+    if (tabfn == NULL)
+    {
+        gmx_fatal(FARGS, "No thermoforce table file given. Use -tabletf to specify a file\n");
+        return;
+    }
+
+    snew(fr->atf_tabs, ir->adress->n_tf_grps);
+
+    sprintf(buf, "%s", tabfn);
+    for (itf = 0; itf < ir->adress->n_tf_grps; itf++)
+    {
+        /* get energy group index */
+        egrp = ir->adress->tf_table_index[itf];
+        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1, "tf_%s.%s",
+                *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[egrp]]),
+                ftp2ext(efXVG));
+        if (fp)
+        {
+            fprintf(fp, "loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[itf], buf);
+        }
+        fr->atf_tabs[itf] = make_atf_table(fp, oenv, fr, buf, box);
+    }
+}
+
+/* Decide whether the all-vs-all kernels can be used for this run.
+ *
+ * They require: no cut-offs (rlist, rcoulomb and rvdw all 0), no PBC, plain
+ * cut-off VdW and Coulomb types, no free-energy perturbation, either no
+ * implicit solvent or GBSA with the Still, HCT or OBC algorithm, and the
+ * environment variable GMX_NO_ALLVSALL not being set. With more than one
+ * energy group the kernels are not used either; in that case a note is
+ * written to stderr (master only) and to fp when bPrintNote is set.
+ */
+gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
+                          gmx_bool bPrintNote, t_commrec *cr, FILE *fp)
+{
+    gmx_bool bAllvsAll, bSolventOK;
+
+    bSolventOK =
+        (ir->implicit_solvent == eisNO ||
+         (ir->implicit_solvent == eisGBSA &&
+          (ir->gb_algorithm == egbSTILL ||
+           ir->gb_algorithm == egbHCT   ||
+           ir->gb_algorithm == egbOBC)));
+
+    bAllvsAll =
+        (ir->rlist == 0 &&
+         ir->rcoulomb == 0 &&
+         ir->rvdw == 0 &&
+         ir->ePBC == epbcNONE &&
+         ir->vdwtype == evdwCUT &&
+         ir->coulombtype == eelCUT &&
+         ir->efep == efepNO &&
+         bSolventOK &&
+         getenv("GMX_NO_ALLVSALL") == NULL);
+
+    if (bAllvsAll && ir->opts.ngener > 1)
+    {
+        const char *note = "NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
+
+        bAllvsAll = FALSE;
+        if (bPrintNote)
+        {
+            if (MASTER(cr))
+            {
+                fprintf(stderr, "\n%s\n", note);
+            }
+            if (fp != NULL)
+            {
+                fprintf(fp, "\n%s\n", note);
+            }
+        }
+    }
+
+    if (bAllvsAll && fp && MASTER(cr))
+    {
+        fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+    }
+
+    return bAllvsAll;
+}
+
+
+/* Set up the per-thread force/energy buffers used for the bondeds.
+ *
+ * fr->nthreads is taken from gmx_omp_nthreads_get(emntBonded). With more
+ * than one thread, fr->f_t gets one entry per thread; entries 1..nthreads-1
+ * receive a shift-force array of SHIFTS elements and egNR energy arrays of
+ * nenergrp*nenergrp elements each, while their force buffer starts out
+ * empty. Entry 0 is left untouched because thread 0 works on the global
+ * force and energy arrays.
+ */
+static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
+{
+    int thread, term;
+
+    /* These thread local data structures are used for bondeds only */
+    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+
+    if (fr->nthreads <= 1)
+    {
+        /* Single thread: the global arrays are all that is needed */
+        return;
+    }
+
+    snew(fr->f_t, fr->nthreads);
+
+    /* Start at 1: thread 0 uses the global force and energy arrays */
+    for (thread = 1; thread < fr->nthreads; thread++)
+    {
+        fr->f_t[thread].f        = NULL;
+        fr->f_t[thread].f_nalloc = 0;
+
+        snew(fr->f_t[thread].fshift, SHIFTS);
+
+        fr->f_t[thread].grpp.nener = nenergrp*nenergrp;
+        for (term = 0; term < egNR; term++)
+        {
+            snew(fr->f_t[thread].grpp.ener[term], fr->f_t[thread].grpp.nener);
+        }
+    }
+}
+
+
+/* Choose the CPU nbnxn kernel type and the Ewald exclusion treatment.
+ *
+ * The defaults are the plain-C 4x4 kernel with tabulated Ewald exclusion
+ * correction. When GMX_NBNXN_SIMD is defined, a SIMD kernel flavour is
+ * selected at compile time (2xNN takes precedence over 4xN when both are
+ * compiled in, except for reaction-field / plain cut-off electrostatics
+ * with AVX-256) and can then be overridden at run time through the
+ * environment variables GMX_NBNXN_SIMD_4XN and GMX_NBNXN_SIMD_2XNN;
+ * requesting a flavour that was not compiled in is a fatal error.
+ * Likewise GMX_NBNXN_EWALD_TABLE and GMX_NBNXN_EWALD_ANALYTICAL override
+ * the exclusion treatment; the analytical one is checked last and
+ * therefore wins when both are set.
+ *
+ * fp, cr and cpuid_info are not referenced in this function body.
+ * The results are written to *kernel_type and *ewald_excl.
+ */
+static void pick_nbnxn_kernel_cpu(FILE *fp,
+ const t_commrec *cr,
+ const gmx_cpuid_t cpuid_info,
+ const t_inputrec *ir,
+ int *kernel_type,
+ int *ewald_excl)
+{
+ *kernel_type = nbnxnk4x4_PlainC;
+ *ewald_excl = ewaldexclTable;
+
+#ifdef GMX_NBNXN_SIMD
+ {
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ /* We expect the 2xNN kernels to be faster in most cases */
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#endif
+
+#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
+ if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+ {
+ /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ * 10% with HT, 50% without HT, but extra zeros interactions
+ * can compensate. As we currently don't detect the actual use
+ * of HT, switch to 4x8 to avoid a potential performance hit.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+ }
+#endif
+ /* Run-time overrides of the kernel flavour; 2xNN is checked last */
+ if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
+ {
+#ifdef GMX_NBNXN_SIMD_4XN
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+#else
+ gmx_fatal(FARGS, "SIMD 4xN kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
+ }
+ if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
+ {
+#ifdef GMX_NBNXN_SIMD_2XNN
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
+#else
+ gmx_fatal(FARGS, "SIMD 2x(N+N) kernels requested, but Gromacs has been compiled without support for these kernels");
+#endif
+ }
+
+ /* Analytical Ewald exclusion correction is only an option in the
+ * x86 SIMD kernel. This is faster in single precision
+ * on Bulldozer and slightly faster on Sandy Bridge.
+ */
+#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+ *ewald_excl = ewaldexclAnalytical;
+#endif
+ /* Run-time overrides of the exclusion treatment */
+ if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
+ {
+ *ewald_excl = ewaldexclTable;
+ }
+ if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
+ {
+ *ewald_excl = ewaldexclAnalytical;
+ }
+
+ }
+#endif /* GMX_NBNXN_SIMD */
+}
+
+
+/* Return a short human-readable name for an nbnxn kernel type.
+ *
+ * For the two SIMD kernel types the name reflects the instruction set
+ * the kernels were compiled for ("SSE2", "SSE4.1", "AVX-128", "AVX-256",
+ * or the generic "SIMD" on non-x86 builds); without GMX_NBNXN_SIMD they
+ * are reported as "not available".
+ *
+ * nbnxnkNR and any unknown value trigger gmx_fatal(); NULL is the value
+ * left for the caller should gmx_fatal() return.
+ */
+const char *lookup_nbnxn_kernel_name(int kernel_type)
+{
+    const char *returnvalue = NULL;
+    switch (kernel_type)
+    {
+        case nbnxnkNotSet: returnvalue     = "not set"; break;
+        case nbnxnk4x4_PlainC: returnvalue = "plain C"; break;
+#ifndef GMX_NBNXN_SIMD
+        case nbnxnk4xN_SIMD_4xN: returnvalue  = "not available"; break;
+        case nbnxnk4xN_SIMD_2xNN: returnvalue = "not available"; break;
+#else
+#ifdef GMX_X86_SSE2
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
+            /* x86 SIMD intrinsics can be converted to either SSE or AVX depending
+             * on compiler flags. As we use nearly identical intrinsics, using an AVX
+             * compiler flag without an AVX macro effectively results in AVX kernels.
+             * For gcc we check for __AVX__
+             * At least a check for icc should be added (if there is a macro)
+             */
+#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
+#ifndef GMX_X86_SSE4_1
+        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE2"; break;
+        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE2"; break;
+#else
+        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SSE4.1"; break;
+        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SSE4.1"; break;
+#endif
+#else
+        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-128"; break;
+        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-128"; break;
+#endif
+#endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+        case nbnxnk4xN_SIMD_4xN: returnvalue  = "AVX-256"; break;
+        case nbnxnk4xN_SIMD_2xNN: returnvalue = "AVX-256"; break;
+#endif
+#else   /* not GMX_X86_SSE2 */
+        case nbnxnk4xN_SIMD_4xN: returnvalue  = "SIMD"; break;
+        case nbnxnk4xN_SIMD_2xNN: returnvalue = "SIMD"; break;
+#endif  /* GMX_X86_SSE2 */
+#endif  /* GMX_NBNXN_SIMD */
+        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
+        case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
+
+        case nbnxnkNR:
+        default:
+            gmx_fatal(FARGS, "Illegal kernel type selected");
+            returnvalue = NULL;
+            break;
+    }
+    return returnvalue;
+}
+
+/* Select the nbnxn kernel type and Ewald exclusion treatment.
+ *
+ * Precedence: GPU emulation (plain-C 8x8x8 kernel), then the CUDA kernel,
+ * then a CPU kernel chosen by pick_nbnxn_kernel_cpu() when CPU
+ * acceleration is enabled, and finally the plain-C 4x4 kernel.
+ * *ewald_excl is the tabulated treatment unless the CPU picker changes it.
+ *
+ * When bDoNonbonded is set, the emulation warning is issued and, if fp is
+ * not NULL, the selected kernel is reported in the log.
+ */
+static void pick_nbnxn_kernel(FILE                *fp,
+                              const t_commrec     *cr,
+                              const gmx_hw_info_t *hwinfo,
+                              gmx_bool             use_cpu_acceleration,
+                              gmx_bool             bUseGPU,
+                              gmx_bool             bEmulateGPU,
+                              const t_inputrec    *ir,
+                              int                 *kernel_type,
+                              int                 *ewald_excl,
+                              gmx_bool             bDoNonbonded)
+{
+    assert(kernel_type);
+
+    *kernel_type = nbnxnkNotSet;
+    *ewald_excl  = ewaldexclTable;
+
+    if (bEmulateGPU)
+    {
+        *kernel_type = nbnxnk8x8x8_PlainC;
+
+        if (bDoNonbonded)
+        {
+            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
+        }
+    }
+    else if (bUseGPU)
+    {
+        *kernel_type = nbnxnk8x8x8_CUDA;
+    }
+    else if (use_cpu_acceleration)
+    {
+        /* Neither GPU nor emulation: let the CPU picker decide */
+        pick_nbnxn_kernel_cpu(fp, cr, hwinfo->cpuid_info, ir,
+                              kernel_type, ewald_excl);
+    }
+    else
+    {
+        *kernel_type = nbnxnk4x4_PlainC;
+    }
+
+    if (!bDoNonbonded || fp == NULL)
+    {
+        return;
+    }
+
+    fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
+            lookup_nbnxn_kernel_name(*kernel_type),
+            nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
+            nbnxn_kernel_to_cj_size(*kernel_type));
+}
+
+/* Decide whether the non-bondeds run on a GPU, in GPU emulation, or neither.
+ *
+ * *bEmulateGPU is set when the environment variable GMX_EMULATE_GPU is
+ * defined, or when non-bonded calculations are switched off
+ * (!bDoNonbonded) while hwinfo reports a usable GPU; the latter avoids
+ * GPU initialization, data movement and cleanup for a run that would
+ * not use the GPU anyway.
+ *
+ * *bUseGPU is set only when hwinfo reports a usable GPU and emulation
+ * was not selected; in that case the GPU is initialized here and a
+ * failure to do so is fatal.
+ *
+ * fp is not used in this function.
+ */
+static void pick_nbnxn_resources(FILE                *fp,
+                                 const t_commrec     *cr,
+                                 const gmx_hw_info_t *hwinfo,
+                                 gmx_bool             bDoNonbonded,
+                                 gmx_bool            *bUseGPU,
+                                 gmx_bool            *bEmulateGPU)
+{
+    char gpu_err_str[STRLEN];
+
+    *bUseGPU = FALSE;
+
+    /* Emulation works even when mdrun was built without GPU support,
+     * since the environment variable alone is enough to request it.
+     */
+    *bEmulateGPU = (getenv("GMX_EMULATE_GPU") != NULL ||
+                    (!bDoNonbonded && hwinfo->bCanUseGPU));
+
+    if (!hwinfo->bCanUseGPU || *bEmulateGPU)
+    {
+        /* No real GPU run: nothing to initialize */
+        return;
+    }
+
+    /* Each PP node uses the device at its intra-node rank in the list
+     * of detected/selected GPUs. The caller is expected to have made
+     * sure enough GPUs are present, so a failure here is fatal.
+     */
+    if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+    {
+        gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
+                  cr->nodeid,
+                  get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+                  gpu_err_str);
+    }
+
+    /* Here we actually turn on hardware GPU acceleration */
+    *bUseGPU = TRUE;
+}
+
+/* Tell whether the "simple" (CPU-style) table layout is in use.
+ *
+ * With the group cut-off scheme this is always TRUE. With the Verlet
+ * scheme the answer is taken from the kernel type of one nbv group:
+ * group 0 when the group argument is negative, otherwise the last group
+ * (nbv->ngrp - 1). Any other scheme is reported through gmx_incons().
+ */
+gmx_bool uses_simple_tables(int                 cutoff_scheme,
+                            nonbonded_verlet_t *nbv,
+                            int                 group)
+{
+    if (cutoff_scheme == ecutsGROUP)
+    {
+        return TRUE;
+    }
+
+    if (cutoff_scheme == ecutsVERLET)
+    {
+        int grp_index;
+
+        assert(NULL != nbv && NULL != nbv->grp);
+
+        if (group < 0)
+        {
+            grp_index = 0;
+        }
+        else
+        {
+            grp_index = nbv->ngrp - 1;
+        }
+        return nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
+    }
+
+    gmx_incons("unimplemented");
+
+    return TRUE;
+}
+
+/* (Re)build the Ewald correction tables stored in ic.
+ *
+ * For simple tables the scale comes from ewald_spline3_table_scale() and
+ * the table covers the larger of rtab and ic->rcoulomb; otherwise the
+ * fixed size GPU_EWALD_COULOMB_FORCE_TABLE_SIZE is used and the scale
+ * follows from it. The previous tables in ic are freed before the new
+ * ones are allocated and filled.
+ */
+static void init_ewald_f_table(interaction_const_t *ic,
+                               gmx_bool             bUsesSimpleTables,
+                               real                 rtab)
+{
+    if (!bUsesSimpleTables)
+    {
+        ic->tabq_size  = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
+        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
+        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
+    }
+    else
+    {
+        real table_range;
+
+        /* With a spacing of 0.0005 we are at the force summation accuracy
+         * for the SSE kernels for "normal" atomistic simulations.
+         */
+        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
+                                                   ic->rcoulomb);
+
+        table_range = ic->rcoulomb;
+        if (rtab > ic->rcoulomb)
+        {
+            table_range = rtab;
+        }
+        ic->tabq_size = (int)(table_range*ic->tabq_scale) + 2;
+    }
+
+    /* Drop whatever tables were there before */
+    sfree_aligned(ic->tabq_coul_FDV0);
+    sfree_aligned(ic->tabq_coul_F);
+    sfree_aligned(ic->tabq_coul_V);
+
+    /* Create the original table data in FDV0 */
+    snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
+    snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
+    snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
+
+    table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
+                                ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
+}
+
+/* Initialize the tables held in the interaction constants.
+ *
+ * Only Ewald-type electrostatics (eelEWALD or any PME variant) need
+ * tables here; for those the Ewald correction tables are (re)built with
+ * init_ewald_f_table() and, when fp is not NULL, their spacing and size
+ * are written to the log. For all other electrostatics types this
+ * function does nothing.
+ */
+void init_interaction_const_tables(FILE                *fp,
+                                   interaction_const_t *ic,
+                                   gmx_bool             bUsesSimpleTables,
+                                   real                 rtab)
+{
+    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+    {
+        init_ewald_f_table(ic, bUsesSimpleTables, rtab);
+
+        if (fp != NULL)
+        {
+            fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
+                    1/ic->tabq_scale, ic->tabq_size);
+        }
+    }
+}
+
+/* Allocate an interaction_const_t and fill it from the force record.
+ *
+ * Cut-offs, Lennard-Jones and electrostatics parameters are copied from
+ * fr; the potential-shift constants are derived from them when the
+ * corresponding modifier is eintmodPOTSHIFT and are zero otherwise.
+ * The new structure is stored in *interaction_const, passed to the CUDA
+ * module when fr->nbv exists and uses a GPU, and finally its tables are
+ * set up through init_interaction_const_tables().
+ *
+ * fp may be NULL, in which case the potential shifts are not logged.
+ */
+void init_interaction_const(FILE                 *fp,
+                            interaction_const_t **interaction_const,
+                            const t_forcerec     *fr,
+                            real                  rtab)
+{
+    interaction_const_t *ic;
+    gmx_bool             bCoulombPotShift;
+
+    snew(ic, 1);
+
+    /* Just allocate something so we can free it */
+    snew_aligned(ic->tabq_coul_FDV0, 16, 32);
+    snew_aligned(ic->tabq_coul_F, 16, 32);
+    snew_aligned(ic->tabq_coul_V, 16, 32);
+
+    ic->rlist     = fr->rlist;
+    ic->rlistlong = fr->rlistlong;
+
+    /* Lennard-Jones */
+    ic->rvdw      = fr->rvdw;
+    ic->sh_invrc6 = (fr->vdw_modifier == eintmodPOTSHIFT) ? pow(ic->rvdw, -6.0) : 0;
+
+    /* Electrostatics */
+    ic->eeltype   = fr->eeltype;
+    ic->rcoulomb  = fr->rcoulomb;
+    ic->epsilon_r = fr->epsilon_r;
+    ic->epsfac    = fr->epsfac;
+
+    bCoulombPotShift = (fr->coulomb_modifier == eintmodPOTSHIFT);
+
+    /* Ewald */
+    ic->ewaldcoeff = fr->ewaldcoeff;
+    ic->sh_ewald   = bCoulombPotShift ? gmx_erfc(ic->ewaldcoeff*ic->rcoulomb) : 0;
+
+    /* Reaction-field */
+    if (!EEL_RF(ic->eeltype))
+    {
+        /* For plain cut-off we might use the reaction-field kernels */
+        ic->epsilon_rf = ic->epsilon_r;
+        ic->k_rf       = 0;
+        ic->c_rf       = bCoulombPotShift ? 1/ic->rcoulomb : 0;
+    }
+    else
+    {
+        ic->epsilon_rf = fr->epsilon_rf;
+        ic->k_rf       = fr->k_rf;
+        ic->c_rf       = fr->c_rf;
+    }
+
+    if (fp != NULL)
+    {
+        fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
+                sqr(ic->sh_invrc6), ic->sh_invrc6);
+        if (ic->eeltype == eelCUT)
+        {
+            fprintf(fp, ", Coulomb %.3f", ic->c_rf);
+        }
+        else if (EEL_PME(ic->eeltype))
+        {
+            fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
+        }
+        fprintf(fp, "\n");
+    }
+
+    *interaction_const = ic;
+
+    if (fr->nbv != NULL && fr->nbv->bUseGPU)
+    {
+        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+    }
+
+    init_interaction_const_tables(fp, ic,
+                                  uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1),
+                                  rtab);
+}
+
+/* Allocate and initialize the Verlet-scheme non-bonded data structure.
+ *
+ * Steps, in order:
+ *  - decide on GPU / GPU emulation with pick_nbnxn_resources();
+ *  - pick a kernel for each interaction group (one group, or two with
+ *    domain decomposition: local and non-local);
+ *  - when a GPU is used, initialize the CUDA data and set the pair-list
+ *    balancing parameter min_ci_balanced, either from the environment
+ *    variable GMX_NB_MIN_CI or from nbnxn_cuda_min_ci_balanced();
+ *  - initialize the pair search, the pair-list sets and the atom data.
+ *
+ * The result is stored in *nb_verlet. nbpu_opt may be NULL; the value
+ * "gpu_cpu" selects a CPU kernel for the non-local group.
+ */
+static void init_nb_verlet(FILE *fp,
+ nonbonded_verlet_t **nb_verlet,
+ const t_inputrec *ir,
+ const t_forcerec *fr,
+ const t_commrec *cr,
+ const char *nbpu_opt)
+{
+ nonbonded_verlet_t *nbv;
+ int i;
+ char *env;
+ gmx_bool bEmulateGPU, bHybridGPURun = FALSE;
+
+ nbnxn_alloc_t *nb_alloc;
+ nbnxn_free_t *nb_free;
+
+ snew(nbv, 1);
+
+ pick_nbnxn_resources(fp, cr, fr->hwinfo,
+ fr->bNonbonded,
+ &nbv->bUseGPU,
+ &bEmulateGPU);
+
+ nbv->nbs = NULL;
+
+ /* Two groups (local + non-local) with domain decomposition, else one */
+ nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
+ for (i = 0; i < nbv->ngrp; i++)
+ {
+ nbv->grp[i].nbl_lists.nnbl = 0;
+ nbv->grp[i].nbat = NULL;
+ nbv->grp[i].kernel_type = nbnxnkNotSet;
+
+ if (i == 0) /* local */
+ {
+ pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
+ nbv->bUseGPU, bEmulateGPU,
+ ir,
+ &nbv->grp[i].kernel_type,
+ &nbv->grp[i].ewald_excl,
+ fr->bNonbonded);
+ }
+ else /* non-local */
+ {
+ if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
+ {
+ /* Use GPU for local, select a CPU kernel for non-local */
+ pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
+ FALSE, FALSE,
+ ir,
+ &nbv->grp[i].kernel_type,
+ &nbv->grp[i].ewald_excl,
+ fr->bNonbonded);
+
+ bHybridGPURun = TRUE;
+ }
+ else
+ {
+ /* Use the same kernel for local and non-local interactions */
+ nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
+ nbv->grp[i].ewald_excl = nbv->grp[0].ewald_excl;
+ }
+ }
+ }
+
+ if (nbv->bUseGPU)
+ {
+ /* init the NxN GPU data; the last argument tells whether we'll have
+ * both local and non-local NB calculation on GPU */
+ nbnxn_cuda_init(fp, &nbv->cu_nbv,
+ &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ (nbv->ngrp > 1) && !bHybridGPURun);
+
+ if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+ {
+ char *end;
+
+ /* The whole string must parse as a positive decimal integer */
+ nbv->min_ci_balanced = strtol(env, &end, 10);
+ if (!end || (*end != 0) || nbv->min_ci_balanced <= 0)
+ {
+ gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+ else
+ {
+ nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
+ if (debug)
+ {
+ fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
+ nbv->min_ci_balanced);
+ }
+ }
+ }
+ else
+ {
+ nbv->min_ci_balanced = 0;
+ }
+
+ *nb_verlet = nbv;
+
+ nbnxn_init_search(&nbv->nbs,
+ DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
+ DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
+ gmx_omp_nthreads_get(emntNonbonded));
+
+ for (i = 0; i < nbv->ngrp; i++)
+ {
+ /* The allocator choice depends on the kernel of the local group
+ * (index 0) and is therefore the same for every group;
+ * pmalloc/pfree are presumably the page-locked host memory
+ * routines needed for GPU transfers - TODO confirm.
+ */
+ if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
+ {
+ nb_alloc = &pmalloc;
+ nb_free = &pfree;
+ }
+ else
+ {
+ nb_alloc = NULL;
+ nb_free = NULL;
+ }
+
+ nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
+ nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ /* 8x8x8 "non-simple" lists are ATM always combined */
+ !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
+ nb_alloc, nb_free);
+
+ /* A group gets its own atom data unless it runs the same kernel
+ * type as the local group, in which case the local data is shared.
+ */
+ if (i == 0 ||
+ nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
+ {
+ snew(nbv->grp[i].nbat, 1);
+ nbnxn_atomdata_init(fp,
+ nbv->grp[i].nbat,
+ nbv->grp[i].kernel_type,
+ fr->ntype, fr->nbfp,
+ ir->opts.ngener,
+ nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+ nb_alloc, nb_free);
+ }
+ else
+ {
+ nbv->grp[i].nbat = nbv->grp[0].nbat;
+ }
+ }
+}
+
+void init_forcerec(FILE *fp,
+ const output_env_t oenv,
+ t_forcerec *fr,
+ t_fcdata *fcd,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ matrix box,
+ gmx_bool bMolEpot,
+ const char *tabfn,
+ const char *tabafn,
+ const char *tabpfn,
+ const char *tabbfn,
+ const char *nbpu_opt,
+ gmx_bool bNoSolvOpt,
+ real print_force)
+{
+ int i, j, m, natoms, ngrp, negp_pp, negptable, egi, egj;
+ real rtab;
+ char *env;
+ double dbl;
+ rvec box_size;
+ const t_block *cgs;
+ gmx_bool bGenericKernelOnly;
+ gmx_bool bTab, bSep14tab, bNormalnblists;
+ t_nblists *nbl;
+ int *nm_ind, egp_flags;
+
+ if (fr->hwinfo == NULL)
+ {
+ /* Detect hardware, gather information.
+ * In mdrun, hwinfo has already been set before calling init_forcerec.
+ * Here we ignore GPUs, as tools will not use them anyhow.
+ */
- fr->bAllvsAll = FALSE;
++ fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ }
+
+ /* By default we turn acceleration on, but it might be turned off further down... */
+ fr->use_cpu_acceleration = TRUE;
+
+ fr->bDomDec = DOMAINDECOMP(cr);
+
+ natoms = mtop->natoms;
+
+ if (check_box(ir->ePBC, box))
+ {
+ gmx_fatal(FARGS, check_box(ir->ePBC, box));
+ }
+
+ /* Test particle insertion ? */
+ if (EI_TPI(ir->eI))
+ {
+ /* Set to the size of the molecule to be inserted (the last one) */
+ /* Because of old style topologies, we have to use the last cg
+ * instead of the last molecule type.
+ */
+ cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
+ fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
+ if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1])
+ {
+ gmx_fatal(FARGS, "The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
+ }
+ }
+ else
+ {
+ fr->n_tpi = 0;
+ }
+
+ /* Copy AdResS parameters */
+ if (ir->bAdress)
+ {
+ fr->adress_type = ir->adress->type;
+ fr->adress_const_wf = ir->adress->const_wf;
+ fr->adress_ex_width = ir->adress->ex_width;
+ fr->adress_hy_width = ir->adress->hy_width;
+ fr->adress_icor = ir->adress->icor;
+ fr->adress_site = ir->adress->site;
+ fr->adress_ex_forcecap = ir->adress->ex_forcecap;
+ fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
+
+
+ snew(fr->adress_group_explicit, ir->adress->n_energy_grps);
+ for (i = 0; i < ir->adress->n_energy_grps; i++)
+ {
+ fr->adress_group_explicit[i] = ir->adress->group_explicit[i];
+ }
+
+ fr->n_adress_tf_grps = ir->adress->n_tf_grps;
+ snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
+ for (i = 0; i < fr->n_adress_tf_grps; i++)
+ {
+ fr->adress_tf_table_index[i] = ir->adress->tf_table_index[i];
+ }
+ copy_rvec(ir->adress->refs, fr->adress_refs);
+ }
+ else
+ {
+ fr->adress_type = eAdressOff;
+ fr->adress_do_hybridpairs = FALSE;
+ }
+
+ /* Copy the user determined parameters */
+ fr->userint1 = ir->userint1;
+ fr->userint2 = ir->userint2;
+ fr->userint3 = ir->userint3;
+ fr->userint4 = ir->userint4;
+ fr->userreal1 = ir->userreal1;
+ fr->userreal2 = ir->userreal2;
+ fr->userreal3 = ir->userreal3;
+ fr->userreal4 = ir->userreal4;
+
+ /* Shell stuff */
+ fr->fc_stepsize = ir->fc_stepsize;
+
+ /* Free energy */
+ fr->efep = ir->efep;
+ fr->sc_alphavdw = ir->fepvals->sc_alpha;
+ if (ir->fepvals->bScCoul)
+ {
+ fr->sc_alphacoul = ir->fepvals->sc_alpha;
+ fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min, 6);
+ }
+ else
+ {
+ fr->sc_alphacoul = 0;
+ fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
+ }
+ fr->sc_power = ir->fepvals->sc_power;
+ fr->sc_r_power = ir->fepvals->sc_r_power;
+ fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma, 6);
+
+ env = getenv("GMX_SCSIGMA_MIN");
+ if (env != NULL)
+ {
+ dbl = 0;
+ sscanf(env, "%lf", &dbl);
+ fr->sc_sigma6_min = pow(dbl, 6);
+ if (fp)
+ {
+ fprintf(fp, "Setting the minimum soft core sigma to %g nm\n", dbl);
+ }
+ }
+
+ fr->bNonbonded = TRUE;
+ if (getenv("GMX_NO_NONBONDED") != NULL)
+ {
+ /* turn off non-bonded calculations */
+ fr->bNonbonded = FALSE;
+ md_print_warn(cr, fp,
+ "Found environment variable GMX_NO_NONBONDED.\n"
+ "Disabling nonbonded calculations.\n");
+ }
+
+ bGenericKernelOnly = FALSE;
+
+ /* We now check in the NS code whether a particular combination of interactions
+ * can be used with water optimization, and disable it if that is not the case.
+ */
+
+ if (getenv("GMX_NB_GENERIC") != NULL)
+ {
+ if (fp != NULL)
+ {
+ fprintf(fp,
+ "Found environment variable GMX_NB_GENERIC.\n"
+ "Disabling all interaction-specific nonbonded kernels, will only\n"
+ "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
+ }
+ bGenericKernelOnly = TRUE;
+ }
+
+ if (bGenericKernelOnly == TRUE)
+ {
+ bNoSolvOpt = TRUE;
+ }
+
+ if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+ {
+ fr->use_cpu_acceleration = FALSE;
+ if (fp != NULL)
+ {
+ fprintf(fp,
+ "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
+ "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+ }
+ }
+
+ fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
+
+ /* Check if we can/should do all-vs-all kernels */
+ fr->bAllvsAll = can_use_allvsall(ir, mtop, FALSE, NULL, NULL);
+ fr->AllvsAll_work = NULL;
+ fr->AllvsAll_workgb = NULL;
+
+ /* All-vs-all kernels have not been implemented in 4.6, and
+ * the SIMD group kernels are also buggy in this case. Non-accelerated
+ * group kernels are OK. See Redmine #1249. */
+ if (fr->bAllvsAll)
+ {
++ fr->bAllvsAll = FALSE;
+ fr->use_cpu_acceleration = FALSE;
+ if (fp != NULL)
+ {
+ fprintf(fp,
+ "\nYour simulation settings would have triggered the efficient all-vs-all\n"
+ "kernels in GROMACS 4.5, but these have not been implemented in GROMACS\n"
+ "4.6. Also, we can't use the accelerated SIMD kernels here because\n"
+ "of an unfixed bug. The reference C kernels are correct, though, so\n"
+ "we are proceeding by disabling all CPU architecture-specific\n"
+ "(e.g. SSE2/SSE4/AVX) routines. If performance is important, please\n"
+ "use GROMACS 4.5.7 or try cutoff-scheme = Verlet.\n\n");
+ }
+ }
+
+ /* Neighbour searching stuff */
+ fr->cutoff_scheme = ir->cutoff_scheme;
+ fr->bGrid = (ir->ns_type == ensGRID);
+ fr->ePBC = ir->ePBC;
+
+ /* Determine if we will do PBC for distances in bonded interactions */
+ if (fr->ePBC == epbcNONE)
+ {
+ fr->bMolPBC = FALSE;
+ }
+ else
+ {
+ if (!DOMAINDECOMP(cr))
+ {
+ /* The group cut-off scheme and SHAKE assume charge groups
+ * are whole, but not using molpbc is faster in most cases.
+ */
+ if (fr->cutoff_scheme == ecutsGROUP ||
+ (ir->eConstrAlg == econtSHAKE &&
+ (gmx_mtop_ftype_count(mtop, F_CONSTR) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0)))
+ {
+ fr->bMolPBC = ir->bPeriodicMols;
+ }
+ else
+ {
+ fr->bMolPBC = TRUE;
+ if (getenv("GMX_USE_GRAPH") != NULL)
+ {
+ fr->bMolPBC = FALSE;
+ if (fp)
+ {
+ fprintf(fp, "\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
+ }
+ }
+ }
+ }
+ else
+ {
+ fr->bMolPBC = dd_bonded_molpbc(cr->dd, fr->ePBC);
+ }
+ }
+ fr->bGB = (ir->implicit_solvent == eisGBSA);
+
+ fr->rc_scaling = ir->refcoord_scaling;
+ copy_rvec(ir->posres_com, fr->posres_com);
+ copy_rvec(ir->posres_comB, fr->posres_comB);
+ fr->rlist = cutoff_inf(ir->rlist);
+ fr->rlistlong = cutoff_inf(ir->rlistlong);
+ fr->eeltype = ir->coulombtype;
+ fr->vdwtype = ir->vdwtype;
+
+ fr->coulomb_modifier = ir->coulomb_modifier;
+ fr->vdw_modifier = ir->vdw_modifier;
+
+ /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
+ switch (fr->eeltype)
+ {
+ case eelCUT:
+ fr->nbkernel_elec_interaction = (fr->bGB) ? GMX_NBKERNEL_ELEC_GENERALIZEDBORN : GMX_NBKERNEL_ELEC_COULOMB;
+ break;
+
+ case eelRF:
+ case eelGRF:
+ case eelRF_NEC:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ break;
+
+ case eelRF_ZERO:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
+ fr->coulomb_modifier = eintmodEXACTCUTOFF;
+ break;
+
+ case eelSWITCH:
+ case eelSHIFT:
+ case eelUSER:
+ case eelENCADSHIFT:
+ case eelPMESWITCH:
+ case eelPMEUSER:
+ case eelPMEUSERSWITCH:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ break;
+
+ case eelPME:
+ case eelEWALD:
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
+ break;
+
+ default:
+ gmx_fatal(FARGS, "Unsupported electrostatic interaction: %s", eel_names[fr->eeltype]);
+ break;
+ }
+
+ /* Vdw: Translate from mdp settings to kernel format */
+ switch (fr->vdwtype)
+ {
+ case evdwCUT:
+ if (fr->bBHAM)
+ {
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
+ }
+ else
+ {
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
+ }
+ break;
+
+ case evdwSWITCH:
+ case evdwSHIFT:
+ case evdwUSER:
+ case evdwENCADSHIFT:
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ break;
+
+ default:
+ gmx_fatal(FARGS, "Unsupported vdw interaction: %s", evdw_names[fr->vdwtype]);
+ break;
+ }
+
+ /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
+ fr->nbkernel_elec_modifier = fr->coulomb_modifier;
+ fr->nbkernel_vdw_modifier = fr->vdw_modifier;
+
+ fr->bTwinRange = fr->rlistlong > fr->rlist;
+ fr->bEwald = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
+
+ fr->reppow = mtop->ffparams.reppow;
+
+ if (ir->cutoff_scheme == ecutsGROUP)
+ {
+ fr->bvdwtab = (fr->vdwtype != evdwCUT ||
+ !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+ /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
+ fr->bcoultab = !(fr->eeltype == eelCUT ||
+ fr->eeltype == eelEWALD ||
+ fr->eeltype == eelPME ||
+ fr->eeltype == eelRF ||
+ fr->eeltype == eelRF_ZERO);
+
+ /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
+ * going to be faster to tabulate the interaction than calling the generic kernel.
+ */
+ if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
+ {
+ if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
+ {
+ fr->bcoultab = TRUE;
+ }
+ }
+ else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
+ ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
+ fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
+ (fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
+ {
+ if (fr->rcoulomb != fr->rvdw)
+ {
+ fr->bcoultab = TRUE;
+ }
+ }
+
+ if (getenv("GMX_REQUIRE_TABLES"))
+ {
+ fr->bvdwtab = TRUE;
+ fr->bcoultab = TRUE;
+ }
+
+ if (fp)
+ {
+ fprintf(fp, "Table routines are used for coulomb: %s\n", bool_names[fr->bcoultab]);
+ fprintf(fp, "Table routines are used for vdw: %s\n", bool_names[fr->bvdwtab ]);
+ }
+
+ if (fr->bvdwtab == TRUE)
+ {
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
+ fr->nbkernel_vdw_modifier = eintmodNONE;
+ }
+ if (fr->bcoultab == TRUE)
+ {
+ fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
+ fr->nbkernel_elec_modifier = eintmodNONE;
+ }
+ }
+
+ if (ir->cutoff_scheme == ecutsVERLET)
+ {
+ if (!gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+ {
+ gmx_fatal(FARGS, "Cut-off scheme %S only supports LJ repulsion power 12", ecutscheme_names[ir->cutoff_scheme]);
+ }
+ fr->bvdwtab = FALSE;
+ fr->bcoultab = FALSE;
+ }
+
+ /* Tables are used for direct ewald sum */
+ if (fr->bEwald)
+ {
+ if (EEL_PME(ir->coulombtype))
+ {
+ if (fp)
+ {
+ fprintf(fp, "Will do PME sum in reciprocal space.\n");
+ }
+ if (ir->coulombtype == eelP3M_AD)
+ {
+ please_cite(fp, "Hockney1988");
+ please_cite(fp, "Ballenegger2012");
+ }
+ else
+ {
+ please_cite(fp, "Essmann95a");
+ }
+
+ if (ir->ewald_geometry == eewg3DC)
+ {
+ if (fp)
+ {
+ fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
+ }
+ please_cite(fp, "In-Chul99a");
+ }
+ }
+ fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ init_ewald_tab(&(fr->ewald_table), ir, fp);
+ if (fp)
+ {
+ fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
+ 1/fr->ewaldcoeff);
+ }
+ }
+
+ /* Electrostatics */
+ fr->epsilon_r = ir->epsilon_r;
+ fr->epsilon_rf = ir->epsilon_rf;
+ fr->fudgeQQ = mtop->ffparams.fudgeQQ;
+ fr->rcoulomb_switch = ir->rcoulomb_switch;
+ fr->rcoulomb = cutoff_inf(ir->rcoulomb);
+
+ /* Parameters for generalized RF */
+ fr->zsquare = 0.0;
+ fr->temp = 0.0;
+
+ if (fr->eeltype == eelGRF)
+ {
+ init_generalized_rf(fp, mtop, ir, fr);
+ }
+ else if (fr->eeltype == eelSHIFT)
+ {
+ for (m = 0; (m < DIM); m++)
+ {
+ box_size[m] = box[m][m];
+ }
+
+ if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
+ {
+ set_shift_consts(fr->rcoulomb_switch, fr->rcoulomb, box_size);
+ }
+ }
+
+ fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
+ gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
+ IR_ELEC_FIELD(*ir) ||
+ (fr->adress_icor != eAdressICOff)
+ );
+
+ if (fr->cutoff_scheme == ecutsGROUP &&
+ ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
+ {
+ /* Count the total number of charge groups */
+ fr->cg_nalloc = ncg_mtop(mtop);
+ srenew(fr->cg_cm, fr->cg_nalloc);
+ }
+ if (fr->shift_vec == NULL)
+ {
+ snew(fr->shift_vec, SHIFTS);
+ }
+
+ if (fr->fshift == NULL)
+ {
+ snew(fr->fshift, SHIFTS);
+ }
+
+ if (fr->nbfp == NULL)
+ {
+ fr->ntype = mtop->ffparams.atnr;
+ fr->nbfp = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+ }
+
+ /* Copy the energy group exclusions */
+ fr->egp_flags = ir->opts.egp_flags;
+
+ /* Van der Waals stuff */
+ fr->rvdw = cutoff_inf(ir->rvdw);
+ fr->rvdw_switch = ir->rvdw_switch;
+ if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
+ {
+ if (fr->rvdw_switch >= fr->rvdw)
+ {
+ gmx_fatal(FARGS, "rvdw_switch (%f) must be < rvdw (%f)",
+ fr->rvdw_switch, fr->rvdw);
+ }
+ if (fp)
+ {
+ fprintf(fp, "Using %s Lennard-Jones, switch between %g and %g nm\n",
+ (fr->eeltype == eelSWITCH) ? "switched" : "shifted",
+ fr->rvdw_switch, fr->rvdw);
+ }
+ }
+
+ if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
+ {
+ gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
+ }
+
+ if (fp)
+ {
+ fprintf(fp, "Cut-off's: NS: %g Coulomb: %g %s: %g\n",
+ fr->rlist, fr->rcoulomb, fr->bBHAM ? "BHAM" : "LJ", fr->rvdw);
+ }
+
+ fr->eDispCorr = ir->eDispCorr;
+ if (ir->eDispCorr != edispcNO)
+ {
+ set_avcsixtwelve(fp, fr, mtop);
+ }
+
+ if (fr->bBHAM)
+ {
+ set_bham_b_max(fp, fr, mtop);
+ }
+
+ fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
+
+ /* Copy the GBSA data (radius, volume and surftens for each
+ * atomtype) from the topology atomtype section to forcerec.
+ */
+ snew(fr->atype_radius, fr->ntype);
+ snew(fr->atype_vol, fr->ntype);
+ snew(fr->atype_surftens, fr->ntype);
+ snew(fr->atype_gb_radius, fr->ntype);
+ snew(fr->atype_S_hct, fr->ntype);
+
+ if (mtop->atomtypes.nr > 0)
+ {
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_radius[i] = mtop->atomtypes.radius[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_vol[i] = mtop->atomtypes.vol[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
+ }
+ for (i = 0; i < fr->ntype; i++)
+ {
+ fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
+ }
+ }
+
+ /* Generate the GB table if needed */
+ if (fr->bGB)
+ {
+#ifdef GMX_DOUBLE
+ fr->gbtabscale = 2000;
+#else
+ fr->gbtabscale = 500;
+#endif
+
+ fr->gbtabr = 100;
+ fr->gbtab = make_gb_table(fp, oenv, fr, tabpfn, fr->gbtabscale);
+
+ init_gb(&fr->born, cr, fr, ir, mtop, ir->rgbradii, ir->gb_algorithm);
+
+ /* Copy local gb data (for dd, this is done in dd_partition_system) */
+ if (!DOMAINDECOMP(cr))
+ {
+ make_local_gb(cr, fr->born, ir->gb_algorithm);
+ }
+ }
+
+ /* Set the charge scaling */
+ if (fr->epsilon_r != 0)
+ {
+ fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
+ }
+ else
+ {
+ /* eps = 0 is infinite dieletric: no coulomb interactions */
+ fr->epsfac = 0;
+ }
+
+ /* Reaction field constants */
+ if (EEL_RF(fr->eeltype))
+ {
+ calc_rffac(fp, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+ fr->rcoulomb, fr->temp, fr->zsquare, box,
+ &fr->kappa, &fr->k_rf, &fr->c_rf);
+ }
+
+ set_chargesum(fp, fr, mtop);
+
+ /* if we are using LR electrostatics, and they are tabulated,
+ * the tables will contain modified coulomb interactions.
+ * Since we want to use the non-shifted ones for 1-4
+ * coulombic interactions, we must have an extra set of tables.
+ */
+
+ /* Construct tables.
+ * A little unnecessary to make both vdw and coul tables sometimes,
+ * but what the heck... */
+
+ bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
+
+ bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
+ fr->bBHAM || fr->bEwald) &&
+ (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
+
+ negp_pp = ir->opts.ngener - ir->nwall;
+ negptable = 0;
+ if (!bTab)
+ {
+ bNormalnblists = TRUE;
+ fr->nnblists = 1;
+ }
+ else
+ {
+ bNormalnblists = (ir->eDispCorr != edispcNO);
+ for (egi = 0; egi < negp_pp; egi++)
+ {
+ for (egj = egi; egj < negp_pp; egj++)
+ {
+ egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ if (!(egp_flags & EGP_EXCL))
+ {
+ if (egp_flags & EGP_TABLE)
+ {
+ negptable++;
+ }
+ else
+ {
+ bNormalnblists = TRUE;
+ }
+ }
+ }
+ }
+ if (bNormalnblists)
+ {
+ fr->nnblists = negptable + 1;
+ }
+ else
+ {
+ fr->nnblists = negptable;
+ }
+ if (fr->nnblists > 1)
+ {
+ snew(fr->gid2nblists, ir->opts.ngener*ir->opts.ngener);
+ }
+ }
+
+ if (ir->adress)
+ {
+ fr->nnblists *= 2;
+ }
+
+ snew(fr->nblists, fr->nnblists);
+
+ /* This code automatically gives table length tabext without cut-off's,
+ * in that case grompp should already have checked that we do not need
+ * normal tables and we only generate tables for 1-4 interactions.
+ */
+ rtab = ir->rlistlong + ir->tabext;
+
+ if (bTab)
+ {
+ /* make tables for ordinary interactions */
+ if (bNormalnblists)
+ {
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
+ if (ir->adress)
+ {
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
+ }
+ if (!bSep14tab)
+ {
+ fr->tab14 = fr->nblists[0].table_elec_vdw;
+ }
+ m = 1;
+ }
+ else
+ {
+ m = 0;
+ }
+ if (negptable > 0)
+ {
+ /* Read the special tables for certain energy group pairs */
+ nm_ind = mtop->groups.grps[egcENER].nm_ind;
+ for (egi = 0; egi < negp_pp; egi++)
+ {
+ for (egj = egi; egj < negp_pp; egj++)
+ {
+ egp_flags = ir->opts.egp_flags[GID(egi, egj, ir->opts.ngener)];
+ if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL))
+ {
+ nbl = &(fr->nblists[m]);
+ if (fr->nnblists > 1)
+ {
+ fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = m;
+ }
+ /* Read the table file with the two energy groups names appended */
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ *mtop->groups.grpname[nm_ind[egi]],
+ *mtop->groups.grpname[nm_ind[egj]],
+ &fr->nblists[m]);
+ if (ir->adress)
+ {
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn,
+ *mtop->groups.grpname[nm_ind[egi]],
+ *mtop->groups.grpname[nm_ind[egj]],
+ &fr->nblists[fr->nnblists/2+m]);
+ }
+ m++;
+ }
+ else if (fr->nnblists > 1)
+ {
+ fr->gid2nblists[GID(egi, egj, ir->opts.ngener)] = 0;
+ }
+ }
+ }
+ }
+ }
+ if (bSep14tab)
+ {
+ /* generate extra tables with plain Coulomb for 1-4 interactions only */
+ fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
+ GMX_MAKETABLES_14ONLY);
+ }
+
+ /* Read AdResS Thermo Force table if needed */
+ if (fr->adress_icor == eAdressICThermoForce)
+ {
+ /* old todo replace */
+
+ if (ir->adress->n_tf_grps > 0)
+ {
+ make_adress_tf_tables(fp, oenv, fr, ir, tabfn, mtop, box);
+
+ }
+ else
+ {
+ /* load the default table */
+ snew(fr->atf_tabs, 1);
+ fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp, oenv, fr, tabafn, box);
+ }
+ }
+
+ /* Wall stuff */
+ fr->nwall = ir->nwall;
+ if (ir->nwall && ir->wall_type == ewtTABLE)
+ {
+ make_wall_tables(fp, oenv, ir, tabfn, &mtop->groups, fr);
+ }
+
+ if (fcd && tabbfn)
+ {
+ fcd->bondtab = make_bonded_tables(fp,
+ F_TABBONDS, F_TABBONDSNC,
+ mtop, tabbfn, "b");
+ fcd->angletab = make_bonded_tables(fp,
+ F_TABANGLES, -1,
+ mtop, tabbfn, "a");
+ fcd->dihtab = make_bonded_tables(fp,
+ F_TABDIHS, -1,
+ mtop, tabbfn, "d");
+ }
+ else
+ {
+ if (debug)
+ {
+ fprintf(debug, "No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
+ }
+ }
+
+ /* QM/MM initialization if requested
+ */
+ if (ir->bQMMM)
+ {
+ fprintf(stderr, "QM/MM calculation requested.\n");
+ }
+
+ fr->bQMMM = ir->bQMMM;
+ fr->qr = mk_QMMMrec();
+
+ /* Set all the static charge group info */
+ fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
+ &fr->bExcl_IntraCGAll_InterCGNone);
+ if (DOMAINDECOMP(cr))
+ {
+ fr->cginfo = NULL;
+ }
+ else
+ {
+ fr->cginfo = cginfo_expand(mtop->nmolblock, fr->cginfo_mb);
+ }
+
+ if (!DOMAINDECOMP(cr))
+ {
+ /* When using particle decomposition, the effect of the second argument,
+ * which sets fr->hcg, is corrected later in do_md and init_em.
+ */
+ forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
+ mtop->natoms, mtop->natoms, mtop->natoms);
+ }
+
+ fr->print_force = print_force;
+
+
+ /* coarse load balancing vars */
+ fr->t_fnbf = 0.;
+ fr->t_wait = 0.;
+ fr->timesteps = 0;
+
+ /* Initialize neighbor search */
+ init_ns(fp, cr, &fr->ns, fr, mtop, box);
+
+ if (cr->duty & DUTY_PP)
+ {
+ gmx_nonbonded_setup(fp, fr, bGenericKernelOnly);
+ /*
+ if (ir->bAdress)
+ {
+ gmx_setup_adress_kernels(fp,bGenericKernelOnly);
+ }
+ */
+ }
+
+ /* Initialize the thread working data for bonded interactions */
+ init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+
+ snew(fr->excl_load, fr->nthreads+1);
+
+ if (fr->cutoff_scheme == ecutsVERLET)
+ {
+ if (ir->rcoulomb != ir->rvdw)
+ {
+ gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
+ }
+
+ init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
+ }
+
+ /* fr->ic is used both by verlet and group kernels (to some extent) now */
+ init_interaction_const(fp, &fr->ic, fr, rtab);
+ if (ir->eDispCorr != edispcNO)
+ {
+ calc_enervirdiff(fp, ir->eDispCorr, fr);
+ }
+}
+
+/* Debug-print helpers: each one writes "<argument text>: <value>" to fp.
+ * The label is produced by stringifying the macro argument, so the exact
+ * spelling of the argument at the call site is part of the output.
+ * All macro parameters are parenthesized where they are evaluated so that
+ * the three helpers expand consistently.
+ */
+#define pr_real(fp, r) fprintf((fp), "%s: %e\n",#r, (r))
+#define pr_int(fp, i) fprintf((fp), "%s: %d\n",#i, (i))
+#define pr_bool(fp, b) fprintf((fp), "%s: %s\n",#b, bool_names[(b)])
+
+/* Print a short summary of the force record to fp and flush the stream.
+ *
+ * fp  stream the summary is written to
+ * fr  force record whose fields are printed
+ * cr  not used by this function
+ *
+ * Printed, in order: rlist, rcoulomb, fudgeQQ, bGrid, bTwinRange, the size
+ * of the combined elec+vdw table of every entry in fr->nblists,
+ * rcoulomb_switch and rcoulomb (a second time).
+ */
+void pr_forcerec(FILE *fp, t_forcerec *fr, t_commrec *cr)
+{
+    /* The index must stay named "i": pr_int() stringifies its argument,
+     * so the name appears literally in the printed label.
+     */
+    int i;
+
+    pr_real(fp, fr->rlist);
+    pr_real(fp, fr->rcoulomb);
+    pr_real(fp, fr->fudgeQQ);
+    pr_bool(fp, fr->bGrid);
+    pr_bool(fp, fr->bTwinRange);
+    /*pr_int(fp,fr->cg0);
+       pr_int(fp,fr->hcg);*/
+    for (i = 0; i < fr->nnblists; i++)
+    {
+        pr_int(fp, fr->nblists[i].table_elec_vdw.n);
+    }
+    pr_real(fp, fr->rcoulomb_switch);
+    pr_real(fp, fr->rcoulomb);
+
+    fflush(fp);
+}
+
+/* Fill fr->excl_load with the atom boundaries used to split the exclusion
+ * work over fr->nthreads threads.
+ *
+ * With particle decomposition (cr != NULL and PARTDECOMP(cr)) the two
+ * boundaries excl_load[0] and excl_load[1] are taken from pd_at_range()
+ * and nothing else is done.
+ *
+ * Otherwise only exclusion entries (i, j) with j > i are counted, and
+ * excl_load[t] is set to the first atom index at which the running count
+ * has reached (total*t)/nthreads, with excl_load[0] = 0.
+ */
+void forcerec_set_excl_load(t_forcerec *fr,
+                            const gmx_localtop_t *top, const t_commrec *cr)
+{
+    const int *excl_index, *excl_atom;
+    int        thread, atom, k, ncount_total, ncount_done, ncount_goal;
+
+    if (cr != NULL && PARTDECOMP(cr))
+    {
+        /* No OpenMP with particle decomposition */
+        pd_at_range(cr, &fr->excl_load[0], &fr->excl_load[1]);
+
+        return;
+    }
+
+    excl_index = top->excls.index;
+    excl_atom  = top->excls.a;
+
+    /* First pass: total number of counted exclusion entries */
+    ncount_total = 0;
+    for (atom = 0; atom < top->excls.nr; atom++)
+    {
+        for (k = excl_index[atom]; k < excl_index[atom+1]; k++)
+        {
+            ncount_total += (excl_atom[k] > atom) ? 1 : 0;
+        }
+    }
+
+    /* Second pass: walk the atoms once, placing one boundary per thread */
+    fr->excl_load[0] = 0;
+    ncount_done      = 0;
+    atom             = 0;
+    for (thread = 1; thread <= fr->nthreads; thread++)
+    {
+        ncount_goal = (ncount_total*thread)/fr->nthreads;
+        for (; atom < top->excls.nr && ncount_done < ncount_goal; atom++)
+        {
+            for (k = excl_index[atom]; k < excl_index[atom+1]; k++)
+            {
+                if (excl_atom[k] > atom)
+                {
+                    ncount_done++;
+                }
+            }
+        }
+        fr->excl_load[thread] = atom;
+    }
+}
--- /dev/null
- #define _nsnxn_atomdata_h
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef _nbnxn_atomdata_h
++#define _nbnxn_atomdata_h
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Default nbnxn allocation routine, allocates 32 byte aligned,
+ * which works for plain C and aligned SSE and AVX loads/stores.
+ */
+void nbnxn_alloc_aligned(void **ptr, size_t nbytes);
+
+/* Free function for memory allocated with nbnxn_alloc_aligned */
+void nbnxn_free_aligned(void *ptr);
+
+/* Reallocation wrapper function for nbnxn data structures */
+void nbnxn_realloc_void(void **ptr,
+ int nbytes_copy, int nbytes_new,
+ nbnxn_alloc_t *ma,
+ nbnxn_free_t *mf);
+
+/* Reallocate the nbnxn_atomdata_t for a size of n atoms */
+void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n);
+
+/* Copy na rvec elements from x to xnb using nbatFormat, start dest a0,
+ * and fills up to na_round using cx,cy,cz.
+ */
+void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
+ rvec *x, int nbatFormat, real *xnb, int a0,
+ int cx, int cy, int cz);
+
+/* Initialize the non-bonded atom data structure.
+ * The enum for nbatXFormat is in the file defining nbnxn_atomdata_t.
+ * Copy the ntypes*ntypes*2 sized nbfp non-bonded parameter list
+ * to the atom data structure.
+ */
+void nbnxn_atomdata_init(FILE *fp,
+ nbnxn_atomdata_t *nbat,
+ int nb_kernel_type,
+ int ntype, const real *nbfp,
+ int n_energygroups,
+ int nout,
+ nbnxn_alloc_t *alloc,
+ nbnxn_free_t *free);
+
+/* Copy the atom data to the non-bonded atom data structure */
+void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat,
+ int locality,
+ const nbnxn_search_t nbs,
+ const t_mdatoms *mdatoms,
+ const int *atinfo);
+
+/* Copy the shift vectors to nbat */
+void nbnxn_atomdata_copy_shiftvec(gmx_bool dynamic_box,
+ rvec *shift_vec,
+ nbnxn_atomdata_t *nbat);
+
+/* Copy x to nbat->x.
+ * FillLocal tells if the local filler particle coordinates should be zeroed.
+ */
+void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
+ int locality,
+ gmx_bool FillLocal,
+ rvec *x,
+ nbnxn_atomdata_t *nbat);
+
+/* Add the forces stored in nbat to f, zeros the forces in nbat */
+void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
+ int locality,
+ const nbnxn_atomdata_t *nbat,
+ rvec *f);
+
+/* Add the fshift force stored in nbat to fshift */
+void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
+ rvec *fshift);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
- gmx_gpu_info_t *gpu_info, int my_gpu_index,
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include <cuda.h>
+
+#include "gmx_fatal.h"
+#include "smalloc.h"
+#include "tables.h"
+#include "typedefs.h"
+#include "types/nb_verlet.h"
+#include "types/interaction_const.h"
+#include "types/force_flags.h"
+#include "../nbnxn_consts.h"
++#include "gmx_detect_hardware.h"
+
+#include "nbnxn_cuda_types.h"
+#include "../../gmxlib/cuda_tools/cudautils.cuh"
+#include "nbnxn_cuda_data_mgmt.h"
+#include "pmalloc_cuda.h"
+#include "gpu_utils.h"
+
+/* When true, the timing events in init_timers are created with
+ * cudaEventBlockingSync instead of cudaEventDefault, which
+ * makes the CPU thread block */
+static bool bUseCudaEventBlockingSync = false;
+
+/* Heuristically determined factor for the Fermi architecture: multiplying
+ * this constant by the number of multiprocessors on the current device
+ * gives the minimum size of the ci lists.
+ */
+static unsigned int gpu_min_ci_balanced_factor = 40;
+
+/* Functions from nbnxn_cuda.cu */
+extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
+extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref();
+extern const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref();
+
+/* Local replacement for md_print_warn from md_logging.c, which we should
+ * actually be using, but we can't include mpi.h in CUDA code.
+ *
+ * Prints the printf-style message, preceded and followed by a newline,
+ * first to stderr and then to fplog. Nothing is printed when fplog is NULL.
+ */
+static void md_print_warn(FILE       *fplog,
+                          const char *fmt, ...)
+{
+    FILE    *dest[2];
+    va_list  args;
+
+    /* We should only print to stderr on the master node; in most cases
+     * fplog is only set on the master node, so returning here works.
+     */
+    if (fplog == NULL)
+    {
+        return;
+    }
+
+    dest[0] = stderr;
+    dest[1] = fplog;
+
+    /* The argument list has to be restarted for every stream */
+    for (int s = 0; s < 2; s++)
+    {
+        va_start(args, fmt);
+        fprintf(dest[s], "\n");
+        vfprintf(dest[s], fmt, args);
+        fprintf(dest[s], "\n");
+        va_end(args);
+    }
+}
+
+
+/* Fw. decl. */
+static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
+
+
+/*! Fills a host-side Ewald Coulomb force table, copies it to the GPU and
+    records the table size and scale in nbp. On the first call (no device
+    table stored in nbp yet) the device array is allocated, saved to nbp and
+    bound to the Coulomb table texture; on later calls the existing array is
+    only re-uploaded.
+ */
+static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp)
+{
+    const int    npoints = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
+    float       *h_table, *d_table;
+    double       scale;
+    cudaError_t  stat;
+
+    /* Subtract 2 iso 1 to avoid access out of range due to rounding */
+    scale = (npoints - 2) / sqrt(nbp->rcoulomb_sq);
+
+    /* Build the table in host memory obtained from pmalloc */
+    pmalloc((void**)&h_table, npoints*sizeof(*h_table));
+    table_spline3_fill_ewald_lr(h_table, NULL, NULL, npoints,
+                                1/scale, nbp->ewald_beta);
+
+    d_table = nbp->coulomb_tab;
+    if (d_table == NULL)
+    {
+        /* First time: allocate the device array, remember it in nbp and
+           bind the texture to it. */
+        stat = cudaMalloc((void **)&d_table, npoints*sizeof(*d_table));
+        CU_RET_ERR(stat, "cudaMalloc failed on coul_tab");
+
+        nbp->coulomb_tab = d_table;
+
+        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(),
+                               d_table, &cd, npoints*sizeof(*d_table));
+        CU_RET_ERR(stat, "cudaBindTexture on coul_tab failed");
+    }
+
+    cu_copy_H2D(d_table, h_table, npoints*sizeof(*d_table));
+
+    nbp->coulomb_tab_size  = npoints;
+    nbp->coulomb_tab_scale = scale;
+
+    pfree(h_table);
+}
+
+
+/*! First-time setup of the atomdata structure: stores the number of atom
+    types and allocates the device buffers for the shift vectors, the shift
+    forces and the two energy outputs. The per-atom arrays are not allocated
+    here; they only get filled at pair-search. */
+static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
+{
+    cudaError_t stat;
+
+    ad->ntypes            = ntypes;
+    ad->bShiftVecUploaded = false;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    ad->natoms = -1;
+    ad->nalloc = -1;
+
+    /* initialize to NULL pointers to data that is not allocated here and will
+       need reallocation in nbnxn_cuda_init_atomdata */
+    ad->xq = NULL;
+    ad->f  = NULL;
+
+    /* shift vectors and shift forces: SHIFTS elements each */
+    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
+    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
+
+    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
+    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
+
+    /* energy outputs: one element each */
+    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
+    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
+    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
+    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
+}
+
+/*! Chooses one of the four Ewald kernel flavors: analytical or tabulated,
+    each with a single or a twin cut-off.
+    Analytical Ewald is the default on SM 3.0 and later, tabulated on earlier
+    GPUs; the environment variables GMX_CUDA_NB_ANA_EWALD and
+    GMX_CUDA_NB_TAB_EWALD override this (setting both is reported through
+    gmx_incons). Twin cut-off kernels are used when bTwinCut is set or when
+    GMX_CUDA_NB_EWALD_TWINCUT is defined. */
+static int pick_ewald_kernel_type(bool                   bTwinCut,
+                                  const cuda_dev_info_t *dev_info)
+{
+    /* Benchmarking/development environment variables to force the use of
+       analytical or tabulated Ewald kernel. */
+    const bool bWantAnalytical = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
+    const bool bWantTabulated  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+
+    if (bWantAnalytical && bWantTabulated)
+    {
+        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
+                   "requested through environment variables.");
+    }
+
+    /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+    const bool bAnalytical =
+        !bWantTabulated && (bWantAnalytical || dev_info->prop.major >= 3);
+
+    if (debug)
+    {
+        if (bAnalytical)
+        {
+            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
+        }
+        else
+        {
+            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
+        }
+    }
+
+    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+       forces it (use it for debugging/benchmarking only). */
+    const bool bTwin = bTwinCut || (getenv("GMX_CUDA_NB_EWALD_TWINCUT") != NULL);
+
+    if (bAnalytical)
+    {
+        return bTwin ? eelCuEWALD_ANA_TWIN : eelCuEWALD_ANA;
+    }
+
+    return bTwin ? eelCuEWALD_TAB_TWIN : eelCuEWALD_TAB;
+}
+
+
+/*! Fills the nonbonded parameter structure from the interaction constants:
+    copies the scalar parameters and squared cut-offs, selects the GPU
+    electrostatics kernel type, generates the Ewald force table when a
+    tabulated Ewald kernel was selected, and uploads the nbfp parameter
+    array from nbat to the GPU and binds it to its texture. */
+static void init_nbparam(cu_nbparam_t              *nbp,
+                         const interaction_const_t *ic,
+                         const nbnxn_atomdata_t    *nbat,
+                         const cuda_dev_info_t     *dev_info)
+{
+    cudaError_t stat;
+    /* two parameters per pair of atom types */
+    const int   nbfp_len = 2*nbat->ntype*nbat->ntype;
+
+    /* scalar interaction parameters */
+    nbp->ewald_beta = ic->ewaldcoeff;
+    nbp->sh_ewald   = ic->sh_ewald;
+    nbp->epsfac     = ic->epsfac;
+    nbp->two_k_rf   = 2.0 * ic->k_rf;
+    nbp->c_rf       = ic->c_rf;
+    nbp->sh_invrc6  = ic->sh_invrc6;
+
+    /* squared cut-off distances */
+    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq    = ic->rlist * ic->rlist;
+
+    /* map the electrostatics type onto the GPU kernel flavor */
+    if (ic->eeltype == eelCUT)
+    {
+        nbp->eeltype = eelCuCUT;
+    }
+    else if (EEL_RF(ic->eeltype))
+    {
+        nbp->eeltype = eelCuRF;
+    }
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
+    {
+        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+        nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
+    }
+    else
+    {
+        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
+    }
+
+    /* generate table for PME; only the tabulated kernels need it */
+    nbp->coulomb_tab = NULL;
+    switch (nbp->eeltype)
+    {
+        case eelCuEWALD_TAB:
+        case eelCuEWALD_TAB_TWIN:
+            init_ewald_coulomb_force_table(nbp);
+            break;
+        default:
+            break;
+    }
+
+    /* upload the nbfp array and bind it to its texture */
+    stat = cudaMalloc((void **)&nbp->nbfp, nbfp_len*sizeof(*nbp->nbfp));
+    CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
+    cu_copy_H2D(nbp->nbfp, nbat->nbfp, nbfp_len*sizeof(*nbp->nbfp));
+
+    cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+    stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+                           nbp->nbfp, &cd, nbfp_len*sizeof(*nbp->nbfp));
+    CU_RET_ERR(stat, "cudaBindTexture on nbfp failed");
+}
+
+/*! Updates the GPU nonbonded parameters from ic: resets the squared rlist
+ *  and rcoulomb and the Ewald coefficient, re-selects the Ewald kernel type
+ *  (twin cut-off when rcoulomb differs from rvdw) and re-generates the GPU
+ *  Ewald force table. */
+void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
+                                         const interaction_const_t *ic)
+{
+    cu_nbparam_t *param    = cu_nb->nbparam;
+    const bool    bTwinCut = (ic->rcoulomb != ic->rvdw);
+
+    param->ewald_beta  = ic->ewaldcoeff;
+    param->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+    param->rlist_sq    = ic->rlist * ic->rlist;
+
+    param->eeltype = pick_ewald_kernel_type(bTwinCut, cu_nb->dev_info);
+
+    /* The table depends on the cut-off and Ewald coefficient set above */
+    init_ewald_coulomb_force_table(param);
+}
+
+/*! Puts the pair list data structure into its "nothing allocated yet"
+    state: NULL array pointers, -1 for every count and allocation size, and
+    no pruning requested. */
+static void init_plist(cu_plist_t *pl)
+{
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    pl->na_c = -1;
+
+    /* The arrays below are not allocated here and will need reallocation
+       in nbnxn_cuda_init_pairlist, hence the NULL pointers. */
+    pl->sci         = NULL;
+    pl->nsci        = -1;
+    pl->sci_nalloc  = -1;
+
+    pl->cj4         = NULL;
+    pl->ncj4        = -1;
+    pl->cj4_nalloc  = -1;
+
+    pl->excl        = NULL;
+    pl->nexcl       = -1;
+    pl->excl_nalloc = -1;
+
+    pl->bDoPrune = false;
+}
+
+/*! Creates the CUDA events of the timer data structure: one start/stop pair
+    for the atom data transfer and, per stream, start/stop pairs for the
+    nonbonded kernel, the pair list H2D transfer, the nonbonded H2D transfer
+    and the nonbonded D2H transfer. Events are created with
+    cudaEventBlockingSync when bUseCudaEventBlockingSync is set, otherwise
+    with cudaEventDefault. */
+static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
+{
+    /* The non-local counters/stream (second in the array) are needed only with DD. */
+    const int   nstreams = bUseTwoStreams ? 2 : 1;
+    int         flags;
+    cudaError_t stat;
+
+    if (bUseCudaEventBlockingSync)
+    {
+        flags = cudaEventBlockingSync;
+    }
+    else
+    {
+        flags = cudaEventDefault;
+    }
+
+    stat = cudaEventCreateWithFlags(&(t->start_atdat), flags);
+    CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
+    stat = cudaEventCreateWithFlags(&(t->stop_atdat), flags);
+    CU_RET_ERR(stat, "cudaEventCreate on stop_atdat failed");
+
+    for (int s = 0; s < nstreams; s++)
+    {
+        /* nonbonded kernel */
+        stat = cudaEventCreateWithFlags(&(t->start_nb_k[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on start_nb_k failed");
+        stat = cudaEventCreateWithFlags(&(t->stop_nb_k[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_k failed");
+
+        /* pair list host-to-device transfer */
+        stat = cudaEventCreateWithFlags(&(t->start_pl_h2d[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on start_pl_h2d failed");
+        stat = cudaEventCreateWithFlags(&(t->stop_pl_h2d[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on stop_pl_h2d failed");
+
+        /* nonbonded host-to-device transfer */
+        stat = cudaEventCreateWithFlags(&(t->start_nb_h2d[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on start_nb_h2d failed");
+        stat = cudaEventCreateWithFlags(&(t->stop_nb_h2d[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_h2d failed");
+
+        /* nonbonded device-to-host transfer */
+        stat = cudaEventCreateWithFlags(&(t->start_nb_d2h[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on start_nb_d2h failed");
+        stat = cudaEventCreateWithFlags(&(t->stop_nb_d2h[s]), flags);
+        CU_RET_ERR(stat, "cudaEventCreate on stop_nb_d2h failed");
+    }
+}
+
+/*! Resets the timings data structure: every accumulated time and every
+    call counter, including all 2x2 entries of ktime, is set to zero. */
+static void init_timings(wallclock_gpu_t *t)
+{
+    /* transfer times and counters */
+    t->nb_h2d_t = 0.0;
+    t->nb_d2h_t = 0.0;
+    t->pl_h2d_t = 0.0;
+    t->nb_c     = 0;
+    t->pl_h2d_c = 0;
+
+    /* kernel times: visit the 2x2 array through a single flat index */
+    for (int k = 0; k < 2*2; k++)
+    {
+        t->ktime[k / 2][k % 2].t = 0.0;
+        t->ktime[k / 2][k % 2].c = 0;
+    }
+}
+
+/* Decide which kernel version to use (default or legacy) based on:
+ * - CUDA version used for compilation
+ * - non-bonded kernel selector environment variables
+ * - GPU architecture version
+ *
+ * fplog    log file handed to md_print_warn for the notes issued here
+ * devinfo  device whose compute capability (prop.major) is inspected
+ *
+ * Returns eNbnxnCuKLegacy or eNbnxnCuKDefault. Requesting both kernels
+ * through GMX_CUDA_NB_LEGACY and GMX_CUDA_NB_DEFAULT is a fatal error.
+ */
+static int pick_nbnxn_kernel_version(FILE            *fplog,
+                                     cuda_dev_info_t *devinfo)
+{
+    bool bForceLegacyKernel, bForceDefaultKernel, bCUDA40, bCUDA32;
+    /* Start with an empty string so the buffer is never read uninitialized
+       when compiling with a CUDA version other than 3.2 or 4.0. */
+    char sbuf[STRLEN] = "";
+    int  kver;
+
+    /* Legacy kernel (former k2), kept for backward compatibility as it is
+       faster than the default with CUDA 3.2/4.0 on Fermi (not on Kepler). */
+    bForceLegacyKernel  = (getenv("GMX_CUDA_NB_LEGACY") != NULL);
+    /* default kernel (former k3). */
+    bForceDefaultKernel = (getenv("GMX_CUDA_NB_DEFAULT") != NULL);
+
+    if (bForceLegacyKernel && bForceDefaultKernel)
+    {
+        gmx_fatal(FARGS, "Multiple CUDA non-bonded kernels requested; to manually pick a kernel set only one \n"
+                  "of the following environment variables: \n"
+                  "GMX_CUDA_NB_DEFAULT, GMX_CUDA_NB_LEGACY");
+    }
+
+    bCUDA32 = bCUDA40 = false;
+#if CUDA_VERSION == 3200
+    bCUDA32 = true;
+    sprintf(sbuf, "3.2");
+#elif CUDA_VERSION == 4000
+    bCUDA40 = true;
+    sprintf(sbuf, "4.0");
+#endif
+
+    /* default is default ;) */
+    kver = eNbnxnCuKDefault;
+
+    /* Consider switching to legacy kernels only on Fermi */
+    if (devinfo->prop.major < 3 && (bCUDA32 || bCUDA40))
+    {
+        /* use legacy kernel unless something else is forced by an env. var */
+        if (bForceDefaultKernel)
+        {
+            md_print_warn(fplog,
+                          "NOTE: CUDA %s compilation detected; with this compiler version the legacy\n"
+                          " non-bonded kernels perform best. However, the default kernels were\n"
+                          " selected by the GMX_CUDA_NB_DEFAULT environment variable.\n"
+                          " For best performance upgrade your CUDA toolkit.\n",
+                          sbuf);
+        }
+        else
+        {
+            kver = eNbnxnCuKLegacy;
+        }
+    }
+    else
+    {
+        /* issue note if the non-default kernel is forced by an env. var */
+        if (bForceLegacyKernel)
+        {
+            md_print_warn(fplog,
+                          "NOTE: Legacy non-bonded CUDA kernels selected by the GMX_CUDA_NB_LEGACY\n"
+                          " env. var. Consider using the default kernels which should be faster!\n");
+
+            kver = eNbnxnCuKLegacy;
+        }
+    }
+
+    return kver;
+}
+
+/*! Allocates and initializes the CUDA nonbonded data structure and returns
+ *  it through p_cu_nb (nothing is done when p_cu_nb is NULL).
+ *
+ *  fplog              log file handed to md_print_warn for the notes issued here
+ *  p_cu_nb            output: receives the newly allocated structure
+ *  gpu_info           detected GPU information, must not be NULL
+ *  my_gpu_index       index passed to get_gpu_device_id to select the device
+ *  bLocalAndNonlocal  when set, a second (non-local) pair list and stream
+ *                     are created as well
+ *
+ *  Besides the allocations, this creates the streams and synchronization
+ *  events, decides between cudaStreamSynchronize and polling wait, decides
+ *  whether GPU timing is enabled, and selects the kernel version and cache
+ *  configuration for the device.
+ */
+void nbnxn_cuda_init(FILE *fplog,
+                     nbnxn_cuda_ptr_t *p_cu_nb,
- bool bShouldUsePollSync = (bX86 && bTMPIAtomics && !gpu_info->bDevShare);
++                     const gmx_gpu_info_t *gpu_info, int my_gpu_index,
+                     gmx_bool bLocalAndNonlocal)
+{
+    cudaError_t stat;
+    nbnxn_cuda_ptr_t nb;
+    char sbuf[STRLEN];
+    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+    int cuda_drv_ver;
+
+    assert(gpu_info);
+
+    if (p_cu_nb == NULL) return;
+
+    snew(nb, 1);
+    snew(nb->atdat, 1);
+    snew(nb->nbparam, 1);
+    snew(nb->plist[eintLocal], 1);
+    if (bLocalAndNonlocal)
+    {
+        snew(nb->plist[eintNonlocal], 1);
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    snew(nb->timers, 1);
+    snew(nb->timings, 1);
+
+    /* init nbst */
+    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
+    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
+    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+
+    init_plist(nb->plist[eintLocal]);
+
+    /* local/non-local GPU streams */
+    stat = cudaStreamCreate(&nb->stream[eintLocal]);
+    CU_RET_ERR(stat, "cudaStreamCreate on stream[eintLocal] failed");
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[eintNonlocal]);
+        stat = cudaStreamCreate(&nb->stream[eintNonlocal]);
+        CU_RET_ERR(stat, "cudaStreamCreate on stream[eintNonlocal] failed");
+    }
+
+    /* init events for synchronization (timing disabled for performance reasons!) */
+    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
+    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
+    stat = cudaEventCreateWithFlags(&nb->misc_ops_done, cudaEventDisableTiming);
+    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_done failed");
+
+    /* set device info, just point it to the right GPU among the detected ones */
+    nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, my_gpu_index)];
+
+    /* On GPUs with ECC enabled, cudaStreamSynchronize shows a large overhead
+     * (which increases with shorter time/step) caused by a known CUDA driver bug.
+     * To work around the issue we'll use an (admittedly fragile) memory polling
+     * waiting to preserve performance. This requires support for atomic
+     * operations and only works on x86/x86_64.
+     * With polling wait event-timing also needs to be disabled.
+     *
+     * The overhead is greatly reduced in API v5.0 drivers and the improvement
+     * is independent of runtime version. Hence, with API v5.0 drivers and later
+     * we won't switch to polling.
+     *
+     * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI
+     * ranks, so we will also disable it in that case.
+     */
+
+    bStreamSync = getenv("GMX_CUDA_STREAMSYNC") != NULL;
+    bNoStreamSync = getenv("GMX_NO_CUDA_STREAMSYNC") != NULL;
+
+#ifdef TMPI_ATOMICS
+    bTMPIAtomics = true;
+#else
+    bTMPIAtomics = false;
+#endif
+
+#if defined(i386) || defined(__x86_64__)
+    bX86 = true;
+#else
+    bX86 = false;
+#endif
+
+    if (bStreamSync && bNoStreamSync)
+    {
+        gmx_fatal(FARGS, "Conflicting environment variables: both GMX_CUDA_STREAMSYNC and GMX_NO_CUDA_STREAMSYNC defined");
+    }
+
+    stat = cudaDriverGetVersion(&cuda_drv_ver);
+    CU_RET_ERR(stat, "cudaDriverGetVersion failed");
+
+    bOldDriver = (cuda_drv_ver < 5000);
+
+    if ((nb->dev_info->prop.ECCEnabled == 1) && bOldDriver)
+    {
+        /* Polling wait should be used instead of cudaStreamSynchronize only if:
+         * - ECC is ON & driver is old (checked above),
+         * - we're on x86/x86_64,
+         * - atomics are available, and
+         * - GPUs are not being shared.
+         */
++        bool bShouldUsePollSync = (bX86 && bTMPIAtomics &&
++                                   (gmx_count_gpu_dev_shared(gpu_info) < 1));
+
+        if (bStreamSync)
+        {
+            nb->bUseStreamSync = true;
+
+            /* only warn if polling should be used */
+            if (bShouldUsePollSync)
+            {
+                md_print_warn(fplog,
+                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, but\n"
+                              " cudaStreamSynchronize waiting is forced by the GMX_CUDA_STREAMSYNC env. var.\n");
+            }
+        }
+        else
+        {
+            nb->bUseStreamSync = !bShouldUsePollSync;
+
+            if (bShouldUsePollSync)
+            {
+                md_print_warn(fplog,
+                              "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0, known to\n"
+                              " cause performance loss. Switching to the alternative polling GPU wait.\n"
+                              " If you encounter issues, switch back to standard GPU waiting by setting\n"
+                              " the GMX_CUDA_STREAMSYNC environment variable.\n");
+            }
+            else
+            {
+                /* Tell the user that the ECC+old driver combination can be bad */
+                sprintf(sbuf,
+                        "NOTE: Using a GPU with ECC enabled and CUDA driver API version <5.0.\n"
+                        " A known bug in this driver version can cause performance loss.\n"
+                        " However, the polling wait workaround can not be used because\n%s\n"
+                        " Consider updating the driver or turning ECC off.",
+                        (bX86 && bTMPIAtomics) ?
+                        " GPU(s) are being oversubscribed." :
+                        " atomic operations are not supported by the platform/CPU+compiler.");
+                /* Pass the prepared text as an argument, never as the format
+                   string, so that it can not be interpreted as conversions. */
+                md_print_warn(fplog, "%s", sbuf);
+            }
+        }
+    }
+    else
+    {
+        if (bNoStreamSync)
+        {
+            nb->bUseStreamSync = false;
+
+            md_print_warn(fplog,
+                          "NOTE: Polling wait for GPU synchronization requested by GMX_NO_CUDA_STREAMSYNC\n");
+        }
+        else
+        {
+            /* no/off ECC, cudaStreamSynchronize not turned off by env. var. */
+            nb->bUseStreamSync = true;
+        }
+    }
+
+    /* CUDA timing disabled as event timers don't work:
+       - with multiple streams = domain-decomposition;
+       - with the polling waiting hack (without cudaStreamSynchronize);
+       - when turned off by GMX_DISABLE_CUDA_TIMING.
+     */
+    nb->bDoTime = (!nb->bUseTwoStreams && nb->bUseStreamSync &&
+                   (getenv("GMX_DISABLE_CUDA_TIMING") == NULL));
+
+    if (nb->bDoTime)
+    {
+        init_timers(nb->timers, nb->bUseTwoStreams);
+        init_timings(nb->timings);
+    }
+
+    /* set the kernel type for the current GPU */
+    nb->kernel_ver = pick_nbnxn_kernel_version(fplog, nb->dev_info);
+    /* pick L1 cache configuration */
+    nbnxn_cuda_set_cacheconfig(nb->dev_info);
+
+    *p_cu_nb = nb;
+
+    if (debug)
+    {
+        fprintf(debug, "Initialized CUDA data structures.\n");
+    }
+}
+
+/*! Initializes the atom data and nonbonded parameter structures of cu_nb
+ *  from the atom data of the first entry of nbv_group and from ic, then
+ *  clears the energy and shift force outputs on the GPU. */
+void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
+                           const interaction_const_t      *ic,
+                           const nonbonded_verlet_group_t *nbv_group)
+{
+    /* Both initializers take their input from the first group's atom data */
+    const nbnxn_atomdata_t *nbat = nbv_group[0].nbat;
+
+    init_atomdata_first(cu_nb->atdat, nbat->ntype);
+    init_nbparam(cu_nb->nbparam, ic, nbat, cu_nb->dev_info);
+
+    /* clear energy and shift force outputs */
+    nbnxn_cuda_clear_e_fshift(cu_nb);
+}
+
+/*! Copies the host pair list h_plist into the device pair list selected by
+ *  iloc, using the stream with the same index. The sci, cj4 and excl arrays
+ *  are each handed to cu_realloc_buffered. When timing is enabled the
+ *  transfer is bracketed by the start/stop pl_h2d events. The number of
+ *  atoms per cell is recorded on first use and must not change afterwards
+ *  (reported through gmx_incons). Finally the list is flagged for pruning.
+ */
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
+                              const nbnxn_pairlist_t *h_plist,
+                              int                     iloc)
+{
+    cu_plist_t   *d_plist = cu_nb->plist[iloc];
+    cudaStream_t  stream  = cu_nb->stream[iloc];
+    const bool    bTiming = cu_nb->bDoTime;
+    cudaError_t   stat;
+    char          msg[STRLEN];
+
+    if (d_plist->na_c < 0)
+    {
+        /* first use: remember the #atoms per cell */
+        d_plist->na_c = h_plist->na_ci;
+    }
+    else if (d_plist->na_c != h_plist->na_ci)
+    {
+        sprintf(msg, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+                d_plist->na_c, h_plist->na_ci);
+        gmx_incons(msg);
+    }
+
+    if (bTiming)
+    {
+        stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
+        CU_RET_ERR(stat, "cudaEventRecord failed");
+    }
+
+    /* sci list */
+    cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci,
+                        sizeof(*d_plist->sci),
+                        &d_plist->nsci, &d_plist->sci_nalloc,
+                        h_plist->nsci, stream, true);
+
+    /* cj4 list */
+    cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4,
+                        sizeof(*d_plist->cj4),
+                        &d_plist->ncj4, &d_plist->cj4_nalloc,
+                        h_plist->ncj4, stream, true);
+
+    /* exclusions */
+    cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl,
+                        sizeof(*d_plist->excl),
+                        &d_plist->nexcl, &d_plist->excl_nalloc,
+                        h_plist->nexcl, stream, true);
+
+    if (bTiming)
+    {
+        stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
+        CU_RET_ERR(stat, "cudaEventRecord failed");
+    }
+
+    /* need to prune the pair list during the next step */
+    d_plist->bDoPrune = true;
+}
+
+/*! Uploads the shift vectors of \p nbatom to the GPU on the local stream.
+ *
+ *  The copy is skipped when the box is not dynamic and the vectors have
+ *  already been uploaded once.
+ */
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
+                                const nbnxn_atomdata_t *nbatom)
+{
+    cu_atomdata_t *d_atdat = cu_nb->atdat;
+    cudaStream_t   stream  = cu_nb->stream[eintLocal];
+
+    if (!nbatom->bDynamicBox && d_atdat->bShiftVecUploaded)
+    {
+        /* static box and the device copy is already in place */
+        return;
+    }
+
+    cu_copy_H2D_async(d_atdat->shift_vec, nbatom->shift_vec,
+                      SHIFTS * sizeof(*d_atdat->shift_vec), stream);
+    d_atdat->bShiftVecUploaded = true;
+}
+
+/*! Zeroes the first \p natoms_clear entries of the GPU nonbonded force
+ *  output array with an asynchronous memset on the local stream.
+ */
+static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
+{
+    cu_atomdata_t *d_atdat = cu_nb->atdat;
+    cudaError_t    stat;
+
+    stat = cudaMemsetAsync(d_atdat->f, 0, natoms_clear * sizeof(*d_atdat->f),
+                           cu_nb->stream[eintLocal]);
+    CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
+}
+
+/*! Zeroes the GPU shift force array and the LJ and electrostatic energy
+ *  outputs with asynchronous memsets on the local stream.
+ */
+static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
+{
+    cu_atomdata_t *d_atdat = cu_nb->atdat;
+    cudaStream_t   stream  = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+
+    /* shift forces: SHIFTS entries */
+    stat = cudaMemsetAsync(d_atdat->fshift, 0, SHIFTS * sizeof(*d_atdat->fshift), stream);
+    CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
+
+    /* energies: a single element each */
+    stat = cudaMemsetAsync(d_atdat->e_lj, 0, sizeof(*d_atdat->e_lj), stream);
+    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
+    stat = cudaMemsetAsync(d_atdat->e_el, 0, sizeof(*d_atdat->e_el), stream);
+    CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
+}
+
+/*! Clears the GPU force output for all atoms and, when \p flags contains
+ *  GMX_FORCE_VIRIAL, the shift force and energy outputs as well.
+ */
+void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
+{
+    const bool bClearEnergyAndShift = ((flags & GMX_FORCE_VIRIAL) != 0);
+
+    nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
+
+    /* the shift force array and energies only need clearing if
+       they were used as outputs in the current step */
+    if (bClearEnergyAndShift)
+    {
+        nbnxn_cuda_clear_e_fshift(cu_nb);
+    }
+}
+
+/*! Prepares the device atom data for the atom count in \p nbat.
+ *
+ *  Grows the f, xq and atom_types device arrays when the host atom count
+ *  exceeds the current allocation (zeroing the new force array), stores
+ *  the total and local atom counts, and uploads the atom types on the
+ *  local stream. When timing is enabled the work is bracketed by the
+ *  atdat start/stop events.
+ */
+void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
+                              const nbnxn_atomdata_t *nbat)
+{
+    cudaError_t    stat;
+    cu_atomdata_t *adat   = cu_nb->atdat;
+    cu_timers_t   *t      = cu_nb->timers;
+    cudaStream_t   stream = cu_nb->stream[eintLocal];
+    bool           bTimed = cu_nb->bDoTime;
+    int            natoms = nbat->natoms;
+
+    if (bTimed)
+    {
+        /* time async copy */
+        stat = cudaEventRecord(t->start_atdat, stream);
+        CU_RET_ERR(stat, "cudaEventRecord failed");
+    }
+
+    /* (re)allocate when more atoms have to be stored than there is room for;
+       an allocation size of -1 marks arrays that were never allocated */
+    if (natoms > adat->nalloc)
+    {
+        int new_nalloc = over_alloc_small(natoms);
+
+        if (adat->nalloc != -1)
+        {
+            /* release the previous, too small, arrays first */
+            cu_free_buffered(adat->f, &adat->natoms, &adat->nalloc);
+            cu_free_buffered(adat->xq);
+            cu_free_buffered(adat->atom_types);
+        }
+
+        stat = cudaMalloc((void **)&adat->f, new_nalloc*sizeof(*adat->f));
+        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
+        stat = cudaMalloc((void **)&adat->xq, new_nalloc*sizeof(*adat->xq));
+        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
+
+        stat = cudaMalloc((void **)&adat->atom_types, new_nalloc*sizeof(*adat->atom_types));
+        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
+
+        adat->nalloc = new_nalloc;
+
+        /* the new force output array must start out zeroed, all of it */
+        nbnxn_cuda_clear_f(cu_nb, new_nalloc);
+    }
+
+    adat->natoms       = natoms;
+    adat->natoms_local = nbat->natoms_local;
+
+    cu_copy_H2D_async(adat->atom_types, nbat->type,
+                      natoms*sizeof(*adat->atom_types), stream);
+
+    if (bTimed)
+    {
+        stat = cudaEventRecord(t->stop_atdat, stream);
+        CU_RET_ERR(stat, "cudaEventRecord failed");
+    }
+}
+
+/*! Releases all GPU and host resources held by \p cu_nb.
+ *
+ *  Unbinds the textures, destroys the synchronization and (if timing was
+ *  enabled) timing events, frees the device arrays of the atom data,
+ *  the nonbonded parameters and the pair list(s), and finally frees the
+ *  host-side structures including \p cu_nb itself. A NULL \p cu_nb is a
+ *  no-op. \p fplog is not used here.
+ */
+void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
+{
+    cudaError_t      stat;
+    cu_atomdata_t   *atdat;
+    cu_nbparam_t    *nbparam;
+    cu_plist_t      *plist, *plist_nl;
+    cu_timers_t     *timers;
+
+    if (cu_nb == NULL) return;
+
+    atdat       = cu_nb->atdat;
+    nbparam     = cu_nb->nbparam;
+    plist       = cu_nb->plist[eintLocal];
+    plist_nl    = cu_nb->plist[eintNonlocal];
+    timers      = cu_nb->timers;
+
+    /* the Coulomb table and its texture only exist for tabulated Ewald */
+    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
+    {
+        stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+        CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
+        cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
+    }
+
+    stat = cudaEventDestroy(cu_nb->nonlocal_done);
+    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
+    stat = cudaEventDestroy(cu_nb->misc_ops_done);
+    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
+
+    if (cu_nb->bDoTime)
+    {
+        stat = cudaEventDestroy(timers->start_atdat);
+        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
+        stat = cudaEventDestroy(timers->stop_atdat);
+        CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
+
+        /* The non-local counters/stream (second in the array) are needed only with DD. */
+        for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
+        {
+            stat = cudaEventDestroy(timers->start_nb_k[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
+            stat = cudaEventDestroy(timers->stop_nb_k[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_k");
+
+            stat = cudaEventDestroy(timers->start_pl_h2d[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_pl_h2d");
+            stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
+
+            /* NOTE(review): the streams are only destroyed on this timing-enabled
+             * path; presumably they are created regardless of timing - confirm
+             * against the init code and move this out of the branch if so. */
+            stat = cudaStreamDestroy(cu_nb->stream[i]);
+            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
+
+            stat = cudaEventDestroy(timers->start_nb_h2d[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_h2d");
+            stat = cudaEventDestroy(timers->stop_nb_h2d[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_h2d");
+
+            stat = cudaEventDestroy(timers->start_nb_d2h[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_d2h");
+            stat = cudaEventDestroy(timers->stop_nb_d2h[i]);
+            CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_nb_d2h");
+        }
+    }
+
+    /* this is the nbfp texture, so report it as such on failure */
+    stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
+    CU_RET_ERR(stat, "cudaUnbindTexture on nbfp failed");
+    cu_free_buffered(nbparam->nbfp);
+
+    stat = cudaFree(atdat->shift_vec);
+    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
+    stat = cudaFree(atdat->fshift);
+    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
+
+    stat = cudaFree(atdat->e_lj);
+    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
+    stat = cudaFree(atdat->e_el);
+    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
+
+    cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
+    cu_free_buffered(atdat->xq);
+    cu_free_buffered(atdat->atom_types, &atdat->ntypes);
+
+    cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
+    cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
+    cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
+    if (cu_nb->bUseTwoStreams)
+    {
+        cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
+        cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
+        /* pass the non-local list's own allocation counter, not the local one */
+        cu_free_buffered(plist_nl->excl, &plist_nl->nexcl, &plist_nl->excl_nalloc);
+    }
+
+    /* host-side bookkeeping; cu_nb must go last as the fields above hang off it */
+    sfree(atdat);
+    sfree(nbparam);
+    sfree(plist);
+    if (cu_nb->bUseTwoStreams)
+    {
+        sfree(plist_nl);
+    }
+    sfree(timers);
+    sfree(cu_nb->timings);
+    sfree(cu_nb);
+
+    if (debug)
+    {
+        fprintf(debug, "Cleaned up CUDA data structures.\n");
+    }
+}
+
+/*! Makes the stream of locality \p iloc wait for the stop_atdat event,
+ *  i.e. for the end of the work bracketed by that event.
+ */
+void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
+{
+    cudaError_t stat;
+
+    stat = cudaStreamWaitEvent(cu_nb->stream[iloc], cu_nb->timers->stop_atdat, 0);
+    CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
+}
+
+/*! Returns the GPU timing data, or NULL when \p cu_nb is NULL or timing
+ *  is turned off.
+ */
+wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+{
+    if (cu_nb == NULL || !cu_nb->bDoTime)
+    {
+        return NULL;
+    }
+
+    return cu_nb->timings;
+}
+
+/*! Re-initializes the GPU timing data; does nothing when timing is off. */
+void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
+{
+    if (!cu_nb->bDoTime)
+    {
+        return;
+    }
+
+    init_timings(cu_nb->timings);
+}
+
+/*! Returns gpu_min_ci_balanced_factor times the multiprocessor count of
+ *  the device in use, or 0 when \p cu_nb is NULL.
+ */
+int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+{
+    if (cu_nb == NULL)
+    {
+        return 0;
+    }
+
+    return gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount;
+}
++
++/*! Tells whether the electrostatics type in use is one of the two
++ *  analytical Ewald variants (plain or twin cut-off).
++ */
++gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
++{
++    const cu_nbparam_t *nbp = cu_nb->nbparam;
++
++    return (nbp->eeltype == eelCuEWALD_ANA ||
++            nbp->eeltype == eelCuEWALD_ANA_TWIN);
++}
--- /dev/null
- #define _nsnxn_search_h
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifndef _nbnxn_search_h
++#define _nbnxn_search_h
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Returns the j-cluster size for kernel of type nb_kernel_type */
+int nbnxn_kernel_to_cj_size(int nb_kernel_type);
+
+/* Tells if the pair-list corresponding to nb_kernel_type is simple.
+ * Returns FALSE for super-sub type pair-list.
+ */
+gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type);
+
+/* Due to the cluster size the effective pair-list is longer than
+ * that of a simple atom pair-list. This function gives the extra distance.
+ */
+real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density);
+
+/* Allocates and initializes a pair search data structure */
+void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
+ ivec *n_dd_cells,
+ gmx_domdec_zones_t *zones,
+ int nthread_max);
+
+/* Put the atoms on the pair search grid.
+ * Only atoms a0 to a1 in x are put on the grid.
+ * The atom_density is used to determine the grid size.
+ * When atom_density=-1, the density is determined from a1-a0 and the corners.
+ * With domain decomposition part of the n particles might have migrated,
+ * but have not been removed yet. This count is given by nmoved.
+ * When move[i] < 0 particle i has migrated and will not be put on the grid.
+ * Without domain decomposition move will be NULL.
+ */
+void nbnxn_put_on_grid(nbnxn_search_t nbs,
+ int ePBC, matrix box,
+ int dd_zone,
+ rvec corner0, rvec corner1,
+ int a0, int a1,
+ real atom_density,
+ const int *atinfo,
+ rvec *x,
+ int nmoved, int *move,
+ int nb_kernel_type,
+ nbnxn_atomdata_t *nbat);
+
+/* As nbnxn_put_on_grid, but for the non-local atoms
+ * with domain decomposition. Should be called after calling
+ * nbnxn_put_on_grid for the local atoms / home zone.
+ */
+void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
+ const gmx_domdec_zones_t *zones,
+ const int *atinfo,
+ rvec *x,
+ int nb_kernel_type,
+ nbnxn_atomdata_t *nbat);
+
+/* Add simple grid type information to the local super/sub grid */
+void nbnxn_grid_add_simple(nbnxn_search_t nbs,
+ nbnxn_atomdata_t *nbat);
+
+/* Return the number of x and y cells in the local grid */
+void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy);
+
+/* Return the order indices *a of the atoms on the ns grid, size n */
+void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n);
+
+/* Renumber the atom indices on the grid to consecutive order */
+void nbnxn_set_atomorder(nbnxn_search_t nbs);
+
+/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */
+void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
+ gmx_bool simple, gmx_bool combined,
+ nbnxn_alloc_t *alloc,
+ nbnxn_free_t *free);
+
+/* Make a pair-list with radius rlist, store it in nbl_list.
+ * The parameter min_ci_balanced sets the minimum required
+ * number of roughly equally sized ci blocks in the list.
+ * When set >0 ci lists will be chopped up when the estimate
+ * for the number of equally sized lists is below min_ci_balanced.
+ */
+void nbnxn_make_pairlist(const nbnxn_search_t nbs,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ real rlist,
+ int min_ci_balanced,
+ nbnxn_pairlist_set_t *nbl_list,
+ int iloc,
+ int nb_kernel_type,
+ t_nrnb *nrnb);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
- char
- *buf;
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 4.5
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Groningen Machine for Chemical Simulation
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include "sysstuff.h"
+#include "typedefs.h"
+#include "macros.h"
+#include "smalloc.h"
+#include "physics.h"
+#include "macros.h"
+#include "vec.h"
+#include "force.h"
+#include "invblock.h"
+#include "confio.h"
+#include "names.h"
+#include "network.h"
+#include "pbc.h"
+#include "ns.h"
+#include "nrnb.h"
+#include "bondf.h"
+#include "mshift.h"
+#include "txtdump.h"
+#include "qmmm.h"
+#include <stdio.h>
+#include <string.h>
+#include "gmx_fatal.h"
+#include "typedefs.h"
+#include <stdlib.h>
+
+/* ORCA interface routines */
+
+/* Reads the ORCA settings from the environment and stores them in qm:
+ * $BASENAME goes to qm->orca_basename and $ORCA_PATH to qm->orca_dir
+ * (first whitespace-delimited token of each). A missing variable is fatal.
+ * Any old BASENAME.out file is removed, since output is appended to it.
+ * cr and mm are not used here.
+ */
+void init_orca(t_commrec *cr, t_QMrec *qm, t_MMrec *mm)
+{
- gmx_fatal(FARGS, "no $BASENAME\n");
++    char *buf;
+    char  *outname;
+    size_t len;
++
+    /* ORCA settings on the system.
+     * buf points into the environment: it must neither be allocated
+     * beforehand (that memory would leak) nor written to.
+     */
+    buf = getenv("BASENAME");
+    if (buf)
+    {
+        /* keep the historical minimum of 200 bytes, but never less than needed */
+        len = strlen(buf) + 1;
+        snew(qm->orca_basename, len < 200 ? 200 : len);
+        sscanf(buf, "%s", qm->orca_basename);
+    }
+    else
+    {
- fprintf(stderr, "%s", buf);
++        gmx_fatal(FARGS, "$BASENAME not set\n");
+    }
+
+    /* ORCA directory on the system */
+    buf = getenv("ORCA_PATH");
- gmx_fatal(FARGS, "no $ORCA_PATH, check manual\n");
+
+    if (buf)
+    {
+        len = strlen(buf) + 1;
+        snew(qm->orca_dir, len < 200 ? 200 : len);
+        sscanf(buf, "%s", qm->orca_dir);
+    }
+    else
+    {
- fprintf(stderr, "%s...\n", qm->orca_dir);
- fprintf(stderr, "orca initialised...\n");
++        gmx_fatal(FARGS, "$ORCA_PATH not set, check manual\n");
+    }
+
- int
- i;
- t_QMMMrec
- *QMMMrec;
- FILE
- *out, *pcFile, *addInputFile, *LJCoeff;
- char
- *buf, *orcaInput, *addInputFilename, *LJCoeffFilename,
- *pcFilename, *exclInName, *exclOutName;
++    fprintf(stderr, "Setting ORCA path to: %s...\n", qm->orca_dir);
++    fprintf(stderr, "ORCA initialised...\n\n");
+    /* since we append the output to the BASENAME.out file,
+       we should delete an existent old out-file here.
+       The name is built in its own buffer, sized for ".out" plus the
+       terminator, rather than in the string returned by getenv(). */
+    snew(outname, strlen(qm->orca_basename) + 5);
+    sprintf(outname, "%s.out", qm->orca_basename);
+    remove(outname);
+    sfree(outname);
+}
+
+
+/* Writes the ORCA input file BASENAME.inp for the current QM/MM state.
+ *
+ * The file consists of the job keywords (depending on qm->bTS / qm->bOPT),
+ * the user-supplied contents of BASENAME.ORCAINFO (fatal if that file is
+ * missing), optional constraints on frontier atoms plus a BASENAME.LJ
+ * coefficient file for optimizations, the charge, multiplicity and QM
+ * coordinates (divided by 0.1), and, unless the ONIOM scheme is used or
+ * there are no MM atoms, a BASENAME.pc point charge file.
+ * All scratch buffers are released before returning. step is not used.
+ */
+void write_orca_input(int step, t_forcerec *fr, t_QMrec *qm, t_MMrec *mm)
+{
- fprintf(out, "#input-file generated by gromacs\n");
++    int        i;
++    t_QMMMrec *QMMMrec;
++    FILE      *out, *pcFile, *addInputFile, *LJCoeff;
++    char      *buf, *orcaInput, *addInputFilename, *LJCoeffFilename, *pcFilename, *exclInName, *exclOutName;
+    size_t     namelen;
++
+    QMMMrec = fr->qr;
+
+    /* room for the basename plus the longest suffix used below; never
+     * less than the 200 bytes that were used historically */
+    namelen = strlen(qm->orca_basename) + 16;
+    if (namelen < 200)
+    {
+        namelen = 200;
+    }
++
+    /* write the first part of the input-file */
+    snew(orcaInput, namelen);
+    sprintf(orcaInput, "%s.inp", qm->orca_basename);
+    out = fopen(orcaInput, "w");
+    if (out == NULL)
+    {
+        gmx_fatal(FARGS, "Could not open ORCA input file %s for writing\n", orcaInput);
+    }
++
+    snew(addInputFilename, namelen);
+    sprintf(addInputFilename, "%s.ORCAINFO", qm->orca_basename);
+    addInputFile = fopen(addInputFilename, "r");
- fprintf(stderr, "No information on the calculation given in <%s>\n", addInputFilename);
- gmx_call("qm_orca.c");
++
++    fprintf(out, "#input-file generated by GROMACS\n");
++
+    if (qm->bTS)
+    {
+        fprintf(out, "!QMMMOpt TightSCF\n");
+        fprintf(out, "%s\n", "%geom TS_Search EF end");
+    }
+    else if (qm->bOPT)
+    {
+        fprintf(out, "!QMMMOpt TightSCF\n");
+    }
+    else
+    {
+        fprintf(out, "!EnGrad TightSCF\n");
+    }
++
+    /* here we include the insertion of the additional orca-input */
+    snew(buf, 200);
+    if (addInputFile != NULL)
+    {
+        /* loop on fgets itself: it ends on EOF as well as on a read error */
+        while (fgets(buf, 200, addInputFile) != NULL)
+        {
+            fputs(buf, out);
+        }
+        fclose(addInputFile);
+    }
+    else
+    {
- /* write charge and multiplicity
- */
++        gmx_fatal(FARGS, "No information on the calculation given in %s\n", addInputFilename);
+    }
++
+    if (qm->bTS || qm->bOPT)
+    {
+        /* freeze the frontier QM atoms and Link atoms. This is
+         * important only if a full QM subsystem optimization is done
+         * with a frozen MM environmeent. For dynamics, or gromacs's own
+         * optimization routines this is not important.
+         */
+        int didStart = 0;
+        for (i = 0; i < qm->nrQMatoms; i++)
+        {
+            if (qm->frontatoms[i])
+            {
+                if (!didStart)
+                {
+                    fprintf(out, "%s\n", "%geom");
+                    fprintf(out, " Constraints \n");
+                    didStart = 1;
+                }
+                fprintf(out, " {C %d C}\n", i); /* counting from 0 */
+            }
+        }
+        if (didStart)
+        {
+            fprintf(out, " end\n end\n");
+        }
+        if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
+        {
+            /* ORCA reads the exclusions from LJCoeffFilename.Excl,
+             * so we have to rename the file
+             */
+            snew(exclInName, 200);
+            snew(exclOutName, namelen);
+            sprintf(exclInName, "QMMMexcl.dat");
+            sprintf(exclOutName, "%s.LJ.Excl", qm->orca_basename);
+            rename(exclInName, exclOutName);
+            sfree(exclInName);
+            sfree(exclOutName);
+
+            snew(LJCoeffFilename, namelen);
+            sprintf(LJCoeffFilename, "%s.LJ", qm->orca_basename);
+            fprintf(out, "%s%s%s\n", "%LJCOEFFICIENTS \"", LJCoeffFilename, "\"");
+            /* make a file with information on the C6 and C12 coefficients */
+            LJCoeff = fopen(LJCoeffFilename, "w");
+            if (LJCoeff == NULL)
+            {
+                gmx_fatal(FARGS, "Could not open file %s for writing\n", LJCoeffFilename);
+            }
+            fprintf(LJCoeff, "%d\n", qm->nrQMatoms);
+            for (i = 0; i < qm->nrQMatoms; i++)
+            {
+#ifdef GMX_DOUBLE
+                fprintf(LJCoeff, "%10.7lf %10.7lf\n", qm->c6[i], qm->c12[i]);
+#else
+                fprintf(LJCoeff, "%10.7f %10.7f\n", qm->c6[i], qm->c12[i]);
+#endif
+            }
+            fprintf(LJCoeff, "%d\n", mm->nrMMatoms);
+            for (i = 0; i < mm->nrMMatoms; i++)
+            {
+#ifdef GMX_DOUBLE
+                fprintf(LJCoeff, "%10.7lf %10.7lf\n", mm->c6[i], mm->c12[i]);
+#else
+                fprintf(LJCoeff, "%10.7f %10.7f\n", mm->c6[i], mm->c12[i]);
+#endif
+            }
+            fclose(LJCoeff);
+            sfree(LJCoeffFilename);
+        }
+    }
- /* write the QM coordinates
- */
++
++    /* write charge and multiplicity */
+    fprintf(out, "*xyz %2d%2d\n", qm->QMcharge, qm->multiplicity);
- /* write the MM point charge data
- */
++
++    /* write the QM coordinates */
+    for (i = 0; i < qm->nrQMatoms; i++)
+    {
+        int atomNr;
+        /* an atomic number of 0 is written as 1 */
+        if (qm->atomicnumberQM[i] == 0)
+        {
+            atomNr = 1;
+        }
+        else
+        {
+            atomNr = qm->atomicnumberQM[i];
+        }
+#ifdef GMX_DOUBLE
+        fprintf(out, "%3d %10.7lf %10.7lf %10.7lf\n",
+                atomNr,
+                qm->xQM[i][XX]/0.1,
+                qm->xQM[i][YY]/0.1,
+                qm->xQM[i][ZZ]/0.1);
+#else
+        fprintf(out, "%3d %10.7f %10.7f %10.7f\n",
+                atomNr,
+                qm->xQM[i][XX]/0.1,
+                qm->xQM[i][YY]/0.1,
+                qm->xQM[i][ZZ]/0.1);
+#endif
+    }
+    fprintf(out, "*\n");
++
++    /* write the MM point charge data */
+    if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
+    {
+        /* name of the point charge file */
+        snew(pcFilename, namelen);
+        sprintf(pcFilename, "%s.pc", qm->orca_basename);
+        fprintf(out, "%s%s%s\n", "%pointcharges \"", pcFilename, "\"");
+        pcFile = fopen(pcFilename, "w");
+        if (pcFile == NULL)
+        {
+            gmx_fatal(FARGS, "Could not open file %s for writing\n", pcFilename);
+        }
+        fprintf(pcFile, "%d\n", mm->nrMMatoms);
+        for (i = 0; i < mm->nrMMatoms; i++)
+        {
+#ifdef GMX_DOUBLE
+            fprintf(pcFile, "%8.4lf %10.7lf %10.7lf %10.7lf\n",
+                    mm->MMcharges[i],
+                    mm->xMM[i][XX]/0.1,
+                    mm->xMM[i][YY]/0.1,
+                    mm->xMM[i][ZZ]/0.1);
+#else
+            fprintf(pcFile, "%8.4f %10.7f %10.7f %10.7f\n",
+                    mm->MMcharges[i],
+                    mm->xMM[i][XX]/0.1,
+                    mm->xMM[i][YY]/0.1,
+                    mm->xMM[i][ZZ]/0.1);
+#endif
+        }
+        fprintf(pcFile, "\n");
+        fclose(pcFile);
+        sfree(pcFilename);
+    }
+    fprintf(out, "\n");
+
+    fclose(out);
+
+    /* this routine runs for every QM/MM evaluation: do not leak the scratch buffers */
+    sfree(buf);
+    sfree(addInputFilename);
+    sfree(orcaInput);
+} /* write_orca_input */
+
+/* Reads the results of an ORCA run and returns the QM energy as read
+ * from BASENAME.engrad (no unit conversion is done here).
+ *
+ * QMgrad receives one gradient per QM atom from BASENAME.engrad.
+ * MMgrad receives one gradient per MM atom from BASENAME.pcgrad, unless
+ * the ONIOM scheme is used or there are no MM atoms.
+ * For optimizations (qm->bTS or qm->bOPT) the QM coordinates in qm->xQM
+ * are replaced by those in BASENAME.xyz, multiplied by 0.1.
+ * A missing or truncated file is fatal. step is not used.
+ */
+real read_orca_output(rvec QMgrad[], rvec MMgrad[], int step, t_forcerec *fr,
+                      t_QMrec *qm, t_MMrec *mm)
+{
+    int
+        i, j;
+    char
+        buf[300], tmp[300], orca_xyzFilename[300], orca_pcgradFilename[300], orca_engradFilename[300];
+    real
+        QMener;
+    FILE
+       *xyz, *pcgrad, *engrad;
+    int k;
+    t_QMMMrec
+       *QMMMrec;
+    QMMMrec = fr->qr;
+    /* in case of an optimization, the coordinates are printed in the
+     * xyz file, the energy and gradients for the QM part are stored in the engrad file
+     * and the gradients for the point charges are stored in the pc file.
+     */
+
+    /* we need the new xyz coordinates of the QM atoms only for separate QM-optimization
+     */
+
+    if (qm->bTS || qm->bOPT)
+    {
+        sprintf(orca_xyzFilename, "%s.xyz", qm->orca_basename);
+        xyz = fopen(orca_xyzFilename, "r");
+        if (xyz == NULL)
+        {
+            gmx_fatal(FARGS, "Could not open ORCA output file %s", orca_xyzFilename);
+        }
+        /* skip the two header lines */
+        if (fgets(buf, 300, xyz) == NULL)
+        {
+            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+        }
+        if (fgets(buf, 300, xyz) == NULL)
+        {
+            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+        }
+        for (i = 0; i < qm->nrQMatoms; i++)
+        {
+            if (fgets(buf, 300, xyz) == NULL)
+            {
+                gmx_fatal(FARGS, "Unexpected end of ORCA output");
+            }
+            /* The atom label is read as a string in both precisions: with
+             * %d a non-numeric label stops the conversion before any
+             * coordinate is assigned, while the scaling below still runs.
+             * tmp is as large as buf, so %s cannot overflow it.
+             */
+#ifdef GMX_DOUBLE
+            sscanf(buf, "%s%lf%lf%lf\n",
+                   tmp,
+                   &qm->xQM[i][XX],
+                   &qm->xQM[i][YY],
+                   &qm->xQM[i][ZZ]);
+#else
+            sscanf(buf, "%s%f%f%f\n",
+                   tmp,
+                   &qm->xQM[i][XX],
+                   &qm->xQM[i][YY],
+                   &qm->xQM[i][ZZ]);
+#endif
+            for (j = 0; j < DIM; j++)
+            {
+                qm->xQM[i][j] *= 0.1;
+            }
+        }
+        fclose(xyz);
+    }
+    sprintf(orca_engradFilename, "%s.engrad", qm->orca_basename);
+    engrad = fopen(orca_engradFilename, "r");
+    if (engrad == NULL)
+    {
+        gmx_fatal(FARGS, "Could not open ORCA output file %s", orca_engradFilename);
+    }
+    /* we read the energy and the gradient for the qm-atoms from the engrad file
+     */
+    /* we can skip the first seven lines
+     */
+    for (j = 0; j < 7; j++)
+    {
+        if (fgets(buf, 300, engrad) == NULL)
+        {
+            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+        }
+    }
+    /* now comes the energy
+     */
+    if (fgets(buf, 300, engrad) == NULL)
+    {
+        gmx_fatal(FARGS, "Unexpected end of ORCA output");
+    }
+#ifdef GMX_DOUBLE
+    sscanf(buf, "%lf\n", &QMener);
+#else
+    sscanf(buf, "%f\n", &QMener);
+#endif
+    /* we can skip the next three lines
+     */
+    for (j = 0; j < 3; j++)
+    {
+        if (fgets(buf, 300, engrad) == NULL)
+        {
+            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+        }
+    }
+    /* next lines contain the gradients of the QM atoms
+     * now comes the gradient, one value per line:
+     * (atom1 x \n atom1 y \n atom1 z \n atom2 x ...
+     */
+
+    for (i = 0; i < 3*qm->nrQMatoms; i++)
+    {
+        k = i/3; /* atom index; i%3 selects the component */
+        if (fgets(buf, 300, engrad) == NULL)
+        {
+            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+        }
+#ifdef GMX_DOUBLE
+        if (i%3 == 0)
+        {
+            sscanf(buf, "%lf\n", &QMgrad[k][XX]);
+        }
+        else if (i%3 == 1)
+        {
+            sscanf(buf, "%lf\n", &QMgrad[k][YY]);
+        }
+        else if (i%3 == 2)
+        {
+            sscanf(buf, "%lf\n", &QMgrad[k][ZZ]);
+        }
+#else
+        if (i%3 == 0)
+        {
+            sscanf(buf, "%f\n", &QMgrad[k][XX]);
+        }
+        else if (i%3 == 1)
+        {
+            sscanf(buf, "%f\n", &QMgrad[k][YY]);
+        }
+        else if (i%3 == 2)
+        {
+            sscanf(buf, "%f\n", &QMgrad[k][ZZ]);
+        }
+#endif
+    }
+    fclose(engrad);
+    /* read the gradients on the MM point charges
+     */
+    if (QMMMrec->QMMMscheme != eQMMMschemeoniom && mm->nrMMatoms)
+    {
+        sprintf(orca_pcgradFilename, "%s.pcgrad", qm->orca_basename);
+        pcgrad = fopen(orca_pcgradFilename, "r");
+        if (pcgrad == NULL)
+        {
+            gmx_fatal(FARGS, "Could not open ORCA output file %s", orca_pcgradFilename);
+        }
+
+        /* we read the gradient for the mm-atoms from the pcgrad file
+         */
+        /* we can skip the first line
+         */
+        if (fgets(buf, 300, pcgrad) == NULL)
+        {
+            gmx_fatal(FARGS, "Unexpected end of ORCA output");
+        }
+        for (i = 0; i < mm->nrMMatoms; i++)
+        {
+            if (fgets(buf, 300, pcgrad) == NULL)
+            {
+                gmx_fatal(FARGS, "Unexpected end of ORCA output");
+            }
+#ifdef GMX_DOUBLE
+            sscanf(buf, "%lf%lf%lf\n",
+                   &MMgrad[i][XX],
+                   &MMgrad[i][YY],
+                   &MMgrad[i][ZZ]);
+#else
+            sscanf(buf, "%f%f%f\n",
+                   &MMgrad[i][XX],
+                   &MMgrad[i][YY],
+                   &MMgrad[i][ZZ]);
+#endif
+        }
+        fclose(pcgrad);
+    }
+    return(QMener);
+}
+
+/* Runs the ORCA binary "orca" located in orca_dir through system(),
+ * reading basename.inp and appending the output to basename.out.
+ * A non-zero return value of system() is fatal.
+ * step and exe are not used.
+ */
+void do_orca(int step, char *exe, char *orca_dir, char *basename)
+{
+
+    /* make the call to the orca binary through system()
+     * The location of the binary is set through the
+     * environment.
+     */
+    char *buf;
+
+    /* size the command line from its parts instead of using a fixed
+     * 100 byte buffer: the directory and (twice) the basename can be long.
+     * 32 covers "/orca ", ".inp >> ", ".out" and the terminator.
+     */
+    snew(buf, strlen(orca_dir) + 2*strlen(basename) + 32);
+    sprintf(buf, "%s/%s %s.inp >> %s.out",
+            orca_dir,
+            "orca",
+            basename,
+            basename);
+    fprintf(stderr, "Calling '%s'\n", buf);
+    if (system(buf) != 0)
+    {
+        gmx_fatal(FARGS, "Call to '%s' failed\n", buf);
+    }
+    sfree(buf);
+}
+
+/* Performs one ORCA calculation: writes the input, runs ORCA and reads
+ * the output back.
+ *
+ * The QM gradients are stored, scaled by HARTREE_BOHR2MD, in the first
+ * qm->nrQMatoms entries of f and fshift; the MM gradients follow them.
+ * Returns the QM energy multiplied by HARTREE2KJ*AVOGADRO.
+ * A static counter of the number of calls is passed on as step.
+ * cr is not used here.
+ */
+real call_orca(t_commrec *cr,  t_forcerec *fr,
+               t_QMrec *qm, t_MMrec *mm, rvec f[], rvec fshift[])
+{
+    /* normal orca jobs */
+    static int
+        step = 0;
+    int
+        i, j;
+    real
+        QMener;
+    rvec
+       *QMgrad, *MMgrad;
+    char
+       *exe;
+
+    snew(exe, 30);
+    sprintf(exe, "%s", "orca");
+    snew(QMgrad, qm->nrQMatoms);
+    snew(MMgrad, mm->nrMMatoms);
+
+    write_orca_input(step, fr, qm, mm);
+    do_orca(step, exe, qm->orca_dir, qm->orca_basename);
+    QMener = read_orca_output(QMgrad, MMgrad, step, fr, qm, mm);
+    /* put the QMMM forces in the force array and to the fshift
+     */
+    for (i = 0; i < qm->nrQMatoms; i++)
+    {
+        for (j = 0; j < DIM; j++)
+        {
+            f[i][j]      = HARTREE_BOHR2MD*QMgrad[i][j];
+            fshift[i][j] = HARTREE_BOHR2MD*QMgrad[i][j];
+        }
+    }
+    /* the MM atoms are stored after the QM atoms */
+    for (i = 0; i < mm->nrMMatoms; i++)
+    {
+        for (j = 0; j < DIM; j++)
+        {
+            f[i+qm->nrQMatoms][j]      = HARTREE_BOHR2MD*MMgrad[i][j];
+            fshift[i+qm->nrQMatoms][j] = HARTREE_BOHR2MD*MMgrad[i][j];
+        }
+    }
+    QMener = QMener*HARTREE2KJ*AVOGADRO;
+    step++;
+    /* release everything allocated with snew above; the gradient
+     * arrays used to be leaked on every call */
+    sfree(QMgrad);
+    sfree(MMgrad);
+    sfree(exe);
+    return(QMener);
+} /* call_orca */
+
+/* end of orca sub routines */
--- /dev/null
- if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef GMX_CRAY_XT3
+#include <catamount/dclock.h>
+#endif
+
+
+#include <stdio.h>
+#include <time.h>
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#include <math.h>
+#include "typedefs.h"
+#include "string2.h"
+#include "gmxfio.h"
+#include "smalloc.h"
+#include "names.h"
+#include "confio.h"
+#include "mvdata.h"
+#include "txtdump.h"
+#include "pbc.h"
+#include "chargegroup.h"
+#include "vec.h"
+#include <time.h>
+#include "nrnb.h"
+#include "mshift.h"
+#include "mdrun.h"
+#include "sim_util.h"
+#include "update.h"
+#include "physics.h"
+#include "main.h"
+#include "mdatoms.h"
+#include "force.h"
+#include "bondf.h"
+#include "pme.h"
+#include "disre.h"
+#include "orires.h"
+#include "network.h"
+#include "calcmu.h"
+#include "constr.h"
+#include "xvgr.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "copyrite.h"
+#include "pull_rotation.h"
+#include "gmx_random.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "gmx_wallcycle.h"
+#include "genborn.h"
+#include "nbnxn_atomdata.h"
+#include "nbnxn_search.h"
+#include "nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
+#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
+#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#include "adress.h"
+#include "qmmm.h"
+
+#include "nbnxn_cuda_data_mgmt.h"
+#include "nbnxn_cuda/nbnxn_cuda.h"
+
+#if 0
+typedef struct gmx_timeprint {
+
+} t_gmx_timeprint;
+#endif
+
+/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
+char *
+gmx_ctime_r(const time_t *clock, char *buf, int n);
+
+
+/* Returns the current time in seconds as a double. With gettimeofday
+ * available the microsecond part is included, otherwise the result of
+ * time() is returned.
+ */
+double
+gmx_gettime()
+{
+    double         seconds;
+#ifdef HAVE_GETTIMEOFDAY
+    struct timeval now;
+
+    gettimeofday(&now, NULL);
+
+    seconds = (double) now.tv_sec + 1e-6*(double)now.tv_usec;
+#else
+    seconds = time(NULL);
+#endif
+
+    return seconds;
+}
+
+
+#define difftime(end, start) ((double)(end)-(double)(start))
+
+/* Prints the current step to out and, once step >= ir->nstlist, an
+ * estimate based on the average time per step so far: the expected
+ * finish date (remaining time of 300 s or more), the remaining run time
+ * in seconds, or, when ir->nsteps is negative, the performance in ns/day.
+ * Updates runtime->last and runtime->time_per_step. out is flushed.
+ */
+void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
+                t_inputrec *ir, t_commrec *cr)
+{
+    time_t finish;
+    char   timebuf[STRLEN];
+    double dt;
+    char   buf[48];
+    size_t len;
+
+    /* the carriage return is skipped for parallel runs without thread-MPI,
+     * which end the line with a newline instead (see below) */
+#ifndef GMX_THREAD_MPI
+    if (!PAR(cr))
+#endif
+    {
+        fprintf(out, "\r");
+    }
+    fprintf(out, "step %s", gmx_step_str(step, buf));
+    if ((step >= ir->nstlist))
+    {
+        runtime->last          = gmx_gettime();
+        dt                     = difftime(runtime->last, runtime->real);
+        runtime->time_per_step = dt/(step - ir->init_step + 1);
+
+        /* from here on dt is the estimated remaining run time */
+        dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
+
+        if (ir->nsteps >= 0)
+        {
+            if (dt >= 300)
+            {
+                finish = (time_t) (runtime->last + dt);
+                gmx_ctime_r(&finish, timebuf, STRLEN);
+                /* Drop the last character (the newline) in place: timebuf
+                 * can hold STRLEN characters, so it is not copied into the
+                 * much smaller buf, and an empty string is left alone.
+                 */
+                len = strlen(timebuf);
+                if (len > 0)
+                {
+                    timebuf[len-1] = '\0';
+                }
+                fprintf(out, ", will finish %s", timebuf);
+            }
+            else
+            {
+                fprintf(out, ", remaining runtime: %5d s    ", (int)dt);
+            }
+        }
+        else
+        {
+            fprintf(out, " performance: %.1f ns/day    ",
+                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
+        }
+    }
+#ifndef GMX_THREAD_MPI
+    if (PAR(cr))
+    {
+        fprintf(out, "\n");
+    }
+#endif
+
+    fflush(out);
+}
+
+#ifdef NO_CLOCK
+#define clock() -1
+#endif
+
+/* Samples the process time counter, stores the new reading in
+ * runtime->proc and returns the time in seconds that passed since the
+ * previous reading. Uses dclock() when GMX_CRAY_XT3 is defined and
+ * clock() otherwise. A negative difference is returned as 0.
+ */
+static double set_proctime(gmx_runtime_t *runtime)
+{
+ double diff;
+#ifdef GMX_CRAY_XT3
+ double prev;
+
+ prev = runtime->proc;
+ runtime->proc = dclock();
+
+ /* the difference of two dclock() readings is used as is */
+ diff = runtime->proc - prev;
+#else
+ clock_t prev;
+
+ prev = runtime->proc;
+ runtime->proc = clock();
+
+ /* clock() counts ticks: convert the difference to seconds */
+ diff = (double)(runtime->proc - prev)/(double)CLOCKS_PER_SEC;
+#endif
+ if (diff < 0)
+ {
+ /* The counter has probably looped, ignore this data */
+ diff = 0;
+ }
+
+ return diff;
+}
+
+/* Start timing: record the current wall-clock time in runtime->real,
+ * take a first processor-time sample and zero all accumulated values.
+ */
+void runtime_start(gmx_runtime_t *runtime)
+{
+    /* Wall-clock reference point */
+    runtime->real = gmx_gettime();
+
+    /* Prime the processor-time counter; the interval returned for this
+     * first sample is not used.
+     */
+    runtime->proc = 0;
+    (void) set_proctime(runtime);
+
+    /* Nothing has been accumulated or reported yet */
+    runtime->time_per_step = 0;
+    runtime->last          = 0;
+    runtime->proctime      = 0;
+    runtime->realtime      = 0;
+}
+
+/* Stop timing: add the processor time used since the last sample to
+ * runtime->proctime, store the wall-clock time elapsed since runtime->real
+ * in runtime->realtime, and move runtime->real to the current time.
+ */
+void runtime_end(gmx_runtime_t *runtime)
+{
+    const double stop          = gmx_gettime();
+    const double proc_interval = set_proctime(runtime);
+
+    runtime->proctime += proc_interval;
+    runtime->realtime  = stop - runtime->real;
+    runtime->real      = stop;
+}
+
+/* Add the processor time used since the previous sample to
+ * runtime->proctime.
+ */
+void runtime_upd_proc(gmx_runtime_t *runtime)
+{
+    const double proc_interval = set_proctime(runtime);
+
+    runtime->proctime += proc_interval;
+}
+
+/* Write "<title> on node <nodeid> <date and time>" to fplog.
+ * The time printed is runtime->real when runtime is given and the current
+ * time otherwise. Nothing happens when fplog is NULL.
+ */
+void print_date_and_time(FILE *fplog, int nodeid, const char *title,
+                         const gmx_runtime_t *runtime)
+{
+    char   stamp[STRLEN];
+    char   printable[STRLEN];
+    time_t when;
+    int    n;
+
+    if (!fplog)
+    {
+        return;
+    }
+
+    when = (runtime != NULL) ? (time_t) runtime->real : (time_t) gmx_gettime();
+    gmx_ctime_r(&when, stamp, STRLEN);
+
+    /* Keep everything up to the first character below ' ', which removes
+     * the newline that ends the formatted time.
+     */
+    n = 0;
+    while (stamp[n] >= ' ')
+    {
+        printable[n] = stamp[n];
+        n++;
+    }
+    printable[n] = '\0';
+
+    fprintf(fplog, "%s on node %d %s\n", title, nodeid, printable);
+}
+
+/* Add flr[i] to f[i] for every i in [start, end).
+ * When gmx_debug_at is set, both input ranges are first written to the
+ * debug stream under the labels "fsr" and "flr".
+ */
+static void sum_forces(int start, int end, rvec f[], rvec flr[])
+{
+    int atom;
+
+    if (gmx_debug_at)
+    {
+        const int count = end - start;
+
+        pr_rvecs(debug, 0, "fsr", f+start, count);
+        pr_rvecs(debug, 0, "flr", flr+start, count);
+    }
+
+    atom = start;
+    while (atom < end)
+    {
+        rvec_inc(f[atom], flr[atom]);
+        atom++;
+    }
+}
+
+/*
+ * calc_f_el adds the forces due to an external electric field to f.
+ *
+ * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
+ *
+ * Et[] holds, per dimension, the parameters of the time dependent factor:
+ * with three parameters a cosine with a Gaussian envelope centred at a[1],
+ * with any other positive count a plain cosine, and with none a factor 1.
+ * Ex[] holds, per dimension, the parameters of the spatial part; only
+ * a[0], a constant field strength, is used. A dimension without Ex
+ * parameters gets no field at all.
+ * The coordinate array x is not used, and no energy contribution is
+ * computed or returned.
+ *
+ * When fp is not NULL, one line with the time and the three field
+ * components (divided by FIELDFAC) is written to it, tagged "#FIELD".
+ *
+ * WARNING:
+ * There can be problems with the virial.
+ * Since the field is not self-consistent this is unavoidable.
+ * For neutral molecules the virial is correct within this approximation.
+ * For neutral systems with many charged molecules the error is small.
+ * But for systems with a net charge or a few charged molecules
+ * the error can be significant when the field is high.
+ * Solution: implement a self-consistent electric field into PME.
+ */
+static void calc_f_el(FILE *fp, int start, int homenr,
+                      real charge[], rvec x[], rvec f[],
+                      t_cosines Ex[], t_cosines Et[], double t)
+{
+    rvec      Ext;
+    real      pulse_center;
+    int       atom, d;
+    const int atom_end = start + homenr;
+
+    for (d = 0; d < DIM; d++)
+    {
+        /* Time dependent factor */
+        if (Et[d].n == 3)
+        {
+            pulse_center = Et[d].a[1];
+            Ext[d]       = cos(Et[d].a[0]*(t-pulse_center))*exp(-sqr(t-pulse_center)/(2.0*sqr(Et[d].a[2])));
+        }
+        else if (Et[d].n > 0)
+        {
+            Ext[d] = cos(Et[d].a[0]*t);
+        }
+        else
+        {
+            Ext[d] = 1.0;
+        }
+
+        if (Ex[d].n <= 0)
+        {
+            /* No field along this dimension */
+            Ext[d] = 0;
+            continue;
+        }
+
+        /* Convert the field strength from V/nm to MD-units */
+        Ext[d] *= Ex[d].a[0]*FIELDFAC;
+        for (atom = start; atom < atom_end; atom++)
+        {
+            f[atom][d] += charge[atom]*Ext[d];
+        }
+    }
+
+    if (fp != NULL)
+    {
+        fprintf(fp, "%10g %10g %10g %10g #FIELD\n", t,
+                Ext[XX]/FIELDFAC, Ext[YY]/FIELDFAC, Ext[ZZ]/FIELDFAC);
+    }
+}
+
+/* Compute the partial virial vir_part for the atoms
+ * [start, start+homenr).
+ * vir_part is cleared, then receives the shift-vector/shift-force term
+ * (calc_vir), the term from f_calc_vir over the given atom range, the
+ * position restraint diagonal stored in fr->vir_diag_posres and the wall
+ * term stored in fr->vir_wall_z (added to the ZZ column).
+ * Flop counts are registered in nrnb; the result is dumped to the debug
+ * stream when that is open.
+ */
+static void calc_virial(FILE *fplog, int start, int homenr, rvec x[], rvec f[],
+                        tensor vir_part, t_graph *graph, matrix box,
+                        t_nrnb *nrnb, const t_forcerec *fr, int ePBC)
+{
+    int d;
+
+    /* The short-range virial from surrounding boxes */
+    clear_mat(vir_part);
+    calc_vir(fplog, SHIFTS, fr->shift_vec, fr->fshift, vir_part, ePBC == epbcSCREW, box);
+    inc_nrnb(nrnb, eNR_VIRIAL, SHIFTS);
+
+    /* Calculate partial virial, for local atoms only, based on short range.
+     * Total virial is computed in global_stat, called from do_md
+     */
+    f_calc_vir(fplog, start, start+homenr, x, f, vir_part, graph, box);
+    inc_nrnb(nrnb, eNR_VIRIAL, homenr);
+
+    for (d = 0; d < DIM; d++)
+    {
+        /* Position restraint contribution, then the wall contribution */
+        vir_part[d][d]  += fr->vir_diag_posres[d];
+        vir_part[d][ZZ] += fr->vir_wall_z[d];
+    }
+
+    if (debug)
+    {
+        pr_rvecs(debug, 0, "vir_part", vir_part, DIM);
+    }
+}
+
+/* Evaluate the position restraints and book-keep the results.
+ *
+ * The restraint forces are added to fr->f_novirsum and the virial
+ * diagonal to fr->vir_diag_posres (the f argument is not used here).
+ * The energy goes into enerd->term[F_POSRES] and dV/dlambda into
+ * enerd->dvdl_nonlin[efptRESTRAINT]. With bSepDVDL both are also printed
+ * to fplog.
+ * When foreign lambda values are present (ir->fepvals->n_lambda > 0) and
+ * flags requests GMX_FORCE_DHDL, the restraint energy is re-evaluated
+ * for every entry of enerd->enerpart_lambda without touching forces or
+ * virial.
+ */
+static void posres_wrapper(FILE *fplog,
+                           int flags,
+                           gmx_bool bSepDVDL,
+                           t_inputrec *ir,
+                           t_nrnb *nrnb,
+                           gmx_localtop_t *top,
+                           matrix box, rvec x[],
+                           rvec f[],
+                           gmx_enerdata_t *enerd,
+                           real *lambda,
+                           t_forcerec *fr)
+{
+    t_pbc pbc;
+    real  v, dvdl;
+    int   i;
+
+    /* Position restraints always require full pbc */
+    set_pbc(&pbc, ir->ePBC, box);
+    dvdl = 0;
+    v    = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+                  top->idef.iparams_posres,
+                  (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+                  ir->ePBC == epbcNONE ? NULL : &pbc,
+                  lambda[efptRESTRAINT], &dvdl,
+                  fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+    if (bSepDVDL)
+    {
+        fprintf(fplog, sepdvdlformat,
+                interaction_function[F_POSRES].longname, v, dvdl);
+    }
+    enerd->term[F_POSRES] += v;
+    /* If just the force constant changes, the FEP term is linear,
+     * but if k changes, it is not.
+     */
+    enerd->dvdl_nonlin[efptRESTRAINT] += dvdl;
+    inc_nrnb(nrnb, eNR_POSRES, top->idef.il[F_POSRES].nr/2);
+
+    if ((ir->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
+    {
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            /* dvdl_dum only absorbs the derivative output of these extra
+             * evaluations, so that the real dvdl above is not overwritten.
+             */
+            real dvdl_dum = 0, lambda_dum;
+
+            /* Entry 0 is the current lambda, entry i the (i-1)-th foreign one */
+            lambda_dum = (i == 0 ? lambda[efptRESTRAINT] : ir->fepvals->all_lambda[efptRESTRAINT][i-1]);
+            v          = posres(top->idef.il[F_POSRES].nr, top->idef.il[F_POSRES].iatoms,
+                                top->idef.iparams_posres,
+                                (const rvec*)x, NULL, NULL,
+                                ir->ePBC == epbcNONE ? NULL : &pbc, lambda_dum, &dvdl_dum,
+                                fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
+            enerd->enerpart_lambda[i] += v;
+        }
+    }
+}
+
+/* Evaluate the COM pulling potential.
+ * The energy returned by pull_potential is added to
+ * enerd->term[F_COM_PULL] and its dV/dlambda to
+ * enerd->dvdl_lin[efptRESTRAINT]; x, f and vir_force are handed on to
+ * pull_potential. With bSepDVDL the accumulated energy term and dV/dlambda
+ * are printed to fplog.
+ */
+static void pull_potential_wrapper(FILE *fplog,
+                                   gmx_bool bSepDVDL,
+                                   t_commrec *cr,
+                                   t_inputrec *ir,
+                                   matrix box, rvec x[],
+                                   rvec f[],
+                                   tensor vir_force,
+                                   t_mdatoms *mdatoms,
+                                   gmx_enerdata_t *enerd,
+                                   real *lambda,
+                                   double t)
+{
+    t_pbc pbc;
+    real  dvdl = 0;
+    real  epull;
+
+    /* Calculate the center of mass forces, this requires communication,
+     * which is why pull_potential is called close to other communication.
+     * The virial contribution is calculated directly,
+     * which is why we call pull_potential after calc_virial.
+     */
+    set_pbc(&pbc, ir->ePBC, box);
+    epull = pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
+                           cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
+    enerd->term[F_COM_PULL] += epull;
+
+    if (bSepDVDL)
+    {
+        fprintf(fplog, sepdvdlformat, "Com pull", enerd->term[F_COM_PULL], dvdl);
+    }
+    enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+}
+
+/* Receive the long-range forces, virial, energy and dV/dlambda through
+ * gmx_pme_receive_f.
+ * Forces go into fr->f_novirsum and the virial into fr->vir_el_recip;
+ * the energy is added to enerd->term[F_COUL_RECIP] and dV/dlambda to
+ * enerd->dvdl_lin[efptCOUL]. The ewcPPDURINGPME counter is stopped on
+ * entry and the wait is timed under ewcPP_PMEWAITRECVF; cycle counts are
+ * passed on to the domain decomposition book-keeping.
+ */
+static void pme_receive_force_ener(FILE *fplog,
+                                   gmx_bool bSepDVDL,
+                                   t_commrec *cr,
+                                   gmx_wallcycle_t wcycle,
+                                   gmx_enerdata_t *enerd,
+                                   t_forcerec *fr)
+{
+    real  e_recip;
+    real  dvdl_recip = 0;
+    float cycles_pp_during_pme, cycles_pme_node;
+
+    cycles_pp_during_pme = wallcycle_stop(wcycle, ewcPPDURINGPME);
+    dd_cycles_add(cr->dd, cycles_pp_during_pme, ddCyclPPduringPME);
+
+    /* In case of node-splitting, the PP nodes receive the long-range
+     * forces, virial and energy from the PME nodes here.
+     */
+    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
+    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_recip, &dvdl_recip,
+                      &cycles_pme_node);
+    if (bSepDVDL)
+    {
+        fprintf(fplog, sepdvdlformat, "PME mesh", e_recip, dvdl_recip);
+    }
+    enerd->term[F_COUL_RECIP] += e_recip;
+    enerd->dvdl_lin[efptCOUL] += dvdl_recip;
+
+    /* The PME cycle count is only passed on when cycle counting is on */
+    if (wcycle)
+    {
+        dd_cycles_add(cr->dd, cycles_pme_node, ddCyclPME);
+    }
+    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
+}
+
+/* Report every home atom whose force norm is at least pforce, or whose
+ * squared force norm is NaN.
+ * One line per such atom is written to fp with the step, the atom number
+ * obtained from ddglatnr, the coordinates and the force norm.
+ */
+static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
+                               gmx_large_int_t step, real pforce, rvec *x, rvec *f)
+{
+    int        atom;
+    real       fnorm2;
+    char       stepstr[STEPSTRSIZE];
+    const real limit2   = sqr(pforce);
+    const int  atom_end = md->start+md->homenr;
+
+    for (atom = md->start; atom < atom_end; atom++)
+    {
+        gmx_bool bTooLarge, bNotANumber;
+
+        fnorm2      = norm2(f[atom]);
+        bTooLarge   = (fnorm2 >= limit2);
+        /* We also catch NAN, if the compiler does not optimize this away. */
+        bNotANumber = (fnorm2 != fnorm2);
+
+        if (bTooLarge || bNotANumber)
+        {
+            fprintf(fp, "step %s atom %6d x %8.3f %8.3f %8.3f force %12.5e\n",
+                    gmx_step_str(step, stepstr),
+                    ddglatnr(cr->dd, atom), x[atom][XX], x[atom][YY], x[atom][ZZ], sqrt(fnorm2));
+        }
+    }
+}
+
+/* Final processing of the forces kept in the separate fr->f_novirsum
+ * array.
+ * When fr->bF_NoVirSum is set: with virtual sites these forces are spread
+ * to the constructing atoms, and when the virial is requested they are
+ * added to f, with fr->vir_el_recip added to vir_force for full
+ * electrostatics. Independently of that, large forces are reported on
+ * stderr when fr->print_force is not negative.
+ */
+static void post_process_forces(FILE *fplog,
+                                t_commrec *cr,
+                                gmx_large_int_t step,
+                                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                                gmx_localtop_t *top,
+                                matrix box, rvec x[],
+                                rvec f[],
+                                tensor vir_force,
+                                t_mdatoms *mdatoms,
+                                t_graph *graph,
+                                t_forcerec *fr, gmx_vsite_t *vsite,
+                                int flags)
+{
+    const gmx_bool bNoVirSum = fr->bF_NoVirSum;
+    const int      virial    = (flags & GMX_FORCE_VIRIAL);
+
+    if (bNoVirSum && vsite)
+    {
+        /* Spread the mesh force on virtual sites to the other particles...
+         * This is parallelized. MPI communication is performed
+         * if the constructing atoms aren't local.
+         */
+        wallcycle_start(wcycle, ewcVSITESPREAD);
+        spread_vsite_f(fplog, vsite, x, fr->f_novirsum, NULL,
+                       virial, fr->vir_el_recip,
+                       nrnb,
+                       &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+        wallcycle_stop(wcycle, ewcVSITESPREAD);
+    }
+
+    if (bNoVirSum && virial)
+    {
+        /* Now add the forces, this is local */
+        if (fr->bDomDec)
+        {
+            sum_forces(0, fr->f_novirsum_n, f, fr->f_novirsum);
+        }
+        else
+        {
+            sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
+                       f, fr->f_novirsum);
+        }
+
+        if (EEL_FULL(fr->eeltype))
+        {
+            /* Add the mesh contribution to the virial */
+            m_add(vir_force, fr->vir_el_recip, vir_force);
+        }
+
+        if (debug)
+        {
+            pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
+        }
+    }
+
+    if (fr->print_force >= 0)
+    {
+        print_large_forces(stderr, mdatoms, cr, step, fr->print_force, x, f);
+    }
+}
+
+/* Run (or, for CUDA, launch) the Verlet-scheme non-bonded kernel for one
+ * interaction locality and register the matching flop counts.
+ *
+ * fr         force record; fr->nbv->grp[ilocality] selects kernel and lists
+ * ic         interaction constants; ic->eeltype selects the flop counter
+ * enerd      energy data; the CPU kernels get its group energy arrays
+ * flags      GMX_FORCE_* bits; without GMX_FORCE_NONBONDED nothing is done
+ * ilocality  index into fr->nbv->grp
+ * clearF     passed through to the CPU kernels
+ * nrnb       flop accounting, always updated after the kernel call
+ * wcycle     cycle counters; ewcsNONBONDED wraps the non-CUDA kernels
+ */
+static void do_nb_verlet(t_forcerec *fr,
+ interaction_const_t *ic,
+ gmx_enerdata_t *enerd,
+ int flags, int ilocality,
+ int clearF,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle)
+{
+ int nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
+ char *env;
+ nonbonded_verlet_group_t *nbvg;
++ gmx_bool bCUDA;
+
+ if (!(flags & GMX_FORCE_NONBONDED))
+ {
+ /* skip non-bonded calculation */
+ return;
+ }
+
+ nbvg = &fr->nbv->grp[ilocality];
+
+ /* This routine only handles the Verlet cut-off scheme */
+ if (fr->cutoff_scheme != ecutsVERLET)
+ {
+ gmx_incons("Invalid cut-off scheme passed!");
+ }
+
- if (nbvg->kernel_type != nbnxnk8x8x8_CUDA)
++ bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
++
+ /* CUDA kernel launch overhead is already timed separately */
++ if (!bCUDA)
+ {
+ wallcycle_sub_start(wcycle, ewcsNONBONDED);
+ }
+ /* Dispatch on the kernel type of this locality's group */
+ switch (nbvg->kernel_type)
+ {
+ case nbnxnk4x4_PlainC:
+ nbnxn_kernel_ref(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ case nbnxnk4xN_SIMD_4xN:
+ nbnxn_kernel_simd_4xn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+ case nbnxnk4xN_SIMD_2xNN:
+ nbnxn_kernel_simd_2xnn(&nbvg->nbl_lists,
+ nbvg->nbat, ic,
+ nbvg->ewald_excl,
+ fr->shift_vec,
+ flags,
+ clearF,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ case nbnxnk8x8x8_CUDA:
+ /* Only the launch happens here; no energy or shift-force buffers are passed */
+ nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+ break;
+
+ case nbnxnk8x8x8_PlainC:
+ /* Uses the first pair list only and writes to the first nbat output buffer */
+ nbnxn_kernel_gpu_ref(nbvg->nbl_lists.nbl[0],
+ nbvg->nbat, ic,
+ fr->shift_vec,
+ flags,
+ clearF,
+ nbvg->nbat->out[0].f,
+ fr->fshift[0],
+ enerd->grpp.ener[egCOULSR],
+ fr->bBHAM ?
+ enerd->grpp.ener[egBHAMSR] :
+ enerd->grpp.ener[egLJSR]);
+ break;
+
+ default:
+ gmx_incons("Invalid nonbonded kernel type passed!");
+
+ }
- else if (nbvg->ewald_excl == ewaldexclTable)
++ if (!bCUDA)
+ {
+ wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+ }
+
+ /* Pick the flop counter for the LJ+Coulomb pairs: reaction-field/plain
+ * cut-off, analytical Ewald, or (all remaining cases) tabulated Ewald.
+ */
+ if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
+ {
+ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
+ }
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
++ else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
++ (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+ {
- enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
++ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
+ }
+ else
+ {
++ enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
+ }
+ enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
+ if (flags & GMX_FORCE_ENERGY)
+ {
+ /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
+ enr_nbnxn_kernel_ljc += 1;
+ enr_nbnxn_kernel_lj += 1;
+ }
+
+ inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
+ nbvg->nbl_lists.natpair_ljq);
+ inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
+ nbvg->nbl_lists.natpair_lj);
+ /* The Coulomb-only counter is found by offsetting the LJ+Coulomb index;
+ * NOTE(review): presumably this relies on both eNR_NBNXN_* groups being
+ * declared in the same order - confirm against the eNR enum.
+ */
+ inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
+ nbvg->nbl_lists.natpair_q);
+}
+
+/* Compute all forces and energies for one step with the Verlet cut-off
+ * scheme.
+ *
+ * Order of work, as visible in the body:
+ *  1. update the force record and local dipole (state changed), shift
+ *     vectors, and put atoms in the box when needed;
+ *  2. send coordinates to separate PME nodes (GMX_MPI only);
+ *  3. on search steps: grid the atoms and build the local, and with domain
+ *     decomposition the non-local, pair lists; otherwise copy the
+ *     coordinates into the nbnxn atom data;
+ *  4. with a GPU: launch the local and non-local kernels and the
+ *     device-to-host force copies;
+ *  5. reset energies and force buffers, run the CPU non-bonded kernels
+ *     when no GPU (or GPU emulation) is used, then position restraints and
+ *     do_force_lowlevel;
+ *  6. wait for / reduce the non-local and local non-bonded forces,
+ *     communicate forces, apply the electric field, spread virtual-site
+ *     forces, compute the virial, pulling and enforced rotation;
+ *  7. receive the PME results, post-process forces and sum the potential
+ *     energy terms.
+ *
+ * f receives the forces, vir_force the virial (cleared on entry), mu_tot
+ * the total dipole (interpolated in lambda when free-energy perturbation
+ * is active) and enerd the energy terms. flags is a set of GMX_FORCE_*
+ * bits that selects what is computed. The groups argument is not used in
+ * this routine.
+ */
+void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
+ t_inputrec *inputrec,
+ gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_localtop_t *top,
+ gmx_mtop_t *mtop,
+ gmx_groups_t *groups,
+ matrix box, rvec x[], history_t *hist,
+ rvec f[],
+ tensor vir_force,
+ t_mdatoms *mdatoms,
+ gmx_enerdata_t *enerd, t_fcdata *fcd,
+ real *lambda, t_graph *graph,
+ t_forcerec *fr, interaction_const_t *ic,
+ gmx_vsite_t *vsite, rvec mu_tot,
+ double t, FILE *field, gmx_edsam_t ed,
+ gmx_bool bBornRadii,
+ int flags)
+{
+ int cg0, cg1, i, j;
+ int start, homenr;
+ int nb_kernel_type;
+ double mu[2*DIM];
+ gmx_bool bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+ gmx_bool bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
+ gmx_bool bDiffKernels = FALSE;
+ matrix boxs;
+ rvec vzero, box_diag;
+ real e, v, dvdl;
+ float cycles_pme, cycles_force;
+ nonbonded_verlet_t *nbv;
+
+ cycles_force = 0;
+ nbv = fr->nbv;
+ nb_kernel_type = fr->nbv->grp[0].kernel_type;
+
+ start = mdatoms->start;
+ homenr = mdatoms->homenr;
+
+ bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+
+ clear_mat(vir_force);
+
+ cg0 = 0;
+ if (DOMAINDECOMP(cr))
+ {
+ cg1 = cr->dd->ncg_tot;
+ }
+ else
+ {
+ cg1 = top->cgs.nr;
+ }
+ if (fr->n_tpi > 0)
+ {
+ cg1--;
+ }
+
+ /* Decode the request flags once */
+ bStateChanged = (flags & GMX_FORCE_STATECHANGED);
+ bNS = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+ bFillGrid = (bNS && bStateChanged);
+ bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
+ bDoLongRange = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DO_LR));
+ bDoForces = (flags & GMX_FORCE_FORCES);
+ bSepLRF = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
+ bUseGPU = fr->nbv->bUseGPU;
+ /* The plain-C 8x8x8 kernel follows the GPU code path on the CPU */
+ bUseOrEmulGPU = bUseGPU || (nbv->grp[0].kernel_type == nbnxnk8x8x8_PlainC);
+
+ if (bStateChanged)
+ {
+ update_forcerec(fplog, fr, box);
+
+ if (NEED_MUTOT(*inputrec))
+ {
+ /* Calculate total (local) dipole moment in a temporary common array.
+ * This makes it possible to sum them over nodes faster.
+ */
+ calc_mu(start, homenr,
+ x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+ mu, mu+DIM);
+ }
+ }
+
+ if (fr->ePBC != epbcNONE)
+ {
+ /* Compute shift vectors every step,
+ * because of pressure coupling or box deformation!
+ */
+ if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+ {
+ calc_shifts(box, fr->shift_vec);
+ }
+
+ if (bCalcCGCM)
+ {
+ put_atoms_in_box_omp(fr->ePBC, box, homenr, x);
+ inc_nrnb(nrnb, eNR_SHIFTX, homenr);
+ }
+ else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+ {
+ unshift_self(graph, box, x);
+ }
+ }
+
+ nbnxn_atomdata_copy_shiftvec(flags & GMX_FORCE_DYNAMICBOX,
+ fr->shift_vec, nbv->grp[0].nbat);
+
+#ifdef GMX_MPI
+ if (!(cr->duty & DUTY_PME))
+ {
+ /* Send particle coordinates to the pme nodes.
+ * Since this is only implemented for domain decomposition
+ * and domain decomposition does not use the graph,
+ * we do not need to worry about shifting.
+ */
+
+ wallcycle_start(wcycle, ewcPP_PMESENDX);
+
+ bBS = (inputrec->nwall == 2);
+ if (bBS)
+ {
+ /* With two walls the box sent to PME is scaled along z */
+ copy_mat(box, boxs);
+ svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+ }
+
+ gmx_pme_send_x(cr, bBS ? boxs : box, x,
+ mdatoms->nChargePerturbed, lambda[efptCOUL],
+ (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+
+ wallcycle_stop(wcycle, ewcPP_PMESENDX);
+ }
+#endif /* GMX_MPI */
+
+ /* do gridding for pair search */
+ if (bNS)
+ {
+ if (graph && bStateChanged)
+ {
+ /* Calculate intramolecular shift vectors to make molecules whole */
+ mk_mshift(fplog, graph, fr->ePBC, box, x);
+ }
+
+ clear_rvec(vzero);
+ box_diag[XX] = box[XX][XX];
+ box_diag[YY] = box[YY][YY];
+ box_diag[ZZ] = box[ZZ][ZZ];
+
+ wallcycle_start(wcycle, ewcNS);
+ if (!fr->bDomDec)
+ {
+ wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
+ nbnxn_put_on_grid(nbv->nbs, fr->ePBC, box,
+ 0, vzero, box_diag,
+ 0, mdatoms->homenr, -1, fr->cginfo, x,
+ 0, NULL,
+ nbv->grp[eintLocal].kernel_type,
+ nbv->grp[eintLocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
+ }
+ else
+ {
+ wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
+ nbnxn_put_on_grid_nonlocal(nbv->nbs, domdec_zones(cr->dd),
+ fr->cginfo, x,
+ nbv->grp[eintNonlocal].kernel_type,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
+ }
+
+ /* Fill the atom data; once when local and non-local share one nbat */
+ if (nbv->ngrp == 1 ||
+ nbv->grp[eintNonlocal].nbat == nbv->grp[eintLocal].nbat)
+ {
+ nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatAll,
+ nbv->nbs, mdatoms, fr->cginfo);
+ }
+ else
+ {
+ nbnxn_atomdata_set(nbv->grp[eintLocal].nbat, eatLocal,
+ nbv->nbs, mdatoms, fr->cginfo);
+ nbnxn_atomdata_set(nbv->grp[eintNonlocal].nbat, eatAll,
+ nbv->nbs, mdatoms, fr->cginfo);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+
+ /* initialize the GPU atom data and copy shift vector */
+ if (bUseGPU)
+ {
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ /* do local pair search */
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcNS);
+ wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
+ nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
+ &top->excls,
+ ic->rlist,
+ nbv->min_ci_balanced,
+ &nbv->grp[eintLocal].nbl_lists,
+ eintLocal,
+ nbv->grp[eintLocal].kernel_type,
+ nrnb);
+ wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
+
+ if (bUseGPU)
+ {
+ /* initialize local pair-list on the GPU */
+ nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ nbv->grp[eintLocal].nbl_lists.nbl[0],
+ eintLocal);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+ else
+ {
+ /* No search this step: only refresh the local coordinates */
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, FALSE, x,
+ nbv->grp[eintLocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ /* launch local nonbonded F on GPU */
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ nrnb, wcycle);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ /* Communicate coordinates and sum dipole if necessary +
+ do non-local pair search */
+ if (DOMAINDECOMP(cr))
+ {
+ bDiffKernels = (nbv->grp[eintNonlocal].kernel_type !=
+ nbv->grp[eintLocal].kernel_type);
+
+ if (bDiffKernels)
+ {
+ /* With GPU+CPU non-bonded calculations we need to copy
+ * the local coordinates to the non-local nbat struct
+ * (in CPU format) as the non-local kernel call also
+ * calculates the local - non-local interactions.
+ */
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatLocal, TRUE, x,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bNS)
+ {
+ wallcycle_start_nocount(wcycle, ewcNS);
+ wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+
+ if (bDiffKernels)
+ {
+ nbnxn_grid_add_simple(nbv->nbs, nbv->grp[eintNonlocal].nbat);
+ }
+
+ nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
+ &top->excls,
+ ic->rlist,
+ nbv->min_ci_balanced,
+ &nbv->grp[eintNonlocal].nbl_lists,
+ eintNonlocal,
+ nbv->grp[eintNonlocal].kernel_type,
+ nrnb);
+
+ wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+
+ if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+ {
+ /* initialize non-local pair-list on the GPU */
+ nbnxn_cuda_init_pairlist(nbv->cu_nbv,
+ nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+ eintNonlocal);
+ }
+ wallcycle_stop(wcycle, ewcNS);
+ }
+ else
+ {
+ wallcycle_start(wcycle, ewcMOVEX);
+ dd_move_x(cr->dd, box, x);
+
+ /* When we don't need the total dipole we sum it in global_stat */
+ if (bStateChanged && NEED_MUTOT(*inputrec))
+ {
+ gmx_sumd(2*DIM, mu, cr);
+ }
+ wallcycle_stop(wcycle, ewcMOVEX);
+
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ nbnxn_atomdata_copy_x_to_nbat_x(nbv->nbs, eatNonlocal, FALSE, x,
+ nbv->grp[eintNonlocal].nbat);
+ wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (bUseGPU && !bDiffKernels)
+ {
+ wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
+ /* launch non-local nonbonded F on GPU */
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
+ nrnb, wcycle);
+ cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+ }
+
+ if (bUseGPU)
+ {
+ /* launch D2H copy-back F */
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ if (DOMAINDECOMP(cr) && !bDiffKernels)
+ {
+ nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal);
+ }
+ nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
+ flags, eatLocal);
+ cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+
+ if (bStateChanged && NEED_MUTOT(*inputrec))
+ {
+ if (PAR(cr))
+ {
+ gmx_sumd(2*DIM, mu, cr);
+ }
+
+ /* Unpack the two dipole vectors stored back to back in mu */
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ fr->mu_tot[i][j] = mu[i*DIM + j];
+ }
+ }
+ }
+ if (fr->efep == efepNO)
+ {
+ copy_rvec(fr->mu_tot[0], mu_tot);
+ }
+ else
+ {
+ /* Interpolate linearly in lambda between the two dipole vectors */
+ for (j = 0; j < DIM; j++)
+ {
+ mu_tot[j] =
+ (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] +
+ lambda[efptCOUL]*fr->mu_tot[1][j];
+ }
+ }
+
+ /* Reset energies */
+ reset_enerdata(&(inputrec->opts), fr, bNS, enerd, MASTER(cr));
+ clear_rvecs(SHIFTS, fr->fshift);
+
+ if (DOMAINDECOMP(cr))
+ {
+ if (!(cr->duty & DUTY_PME))
+ {
+ wallcycle_start(wcycle, ewcPPDURINGPME);
+ dd_force_flop_start(cr->dd, nrnb);
+ }
+ }
+
+ if (inputrec->bRot)
+ {
+ /* Enforced rotation has its own cycle counter that starts after the collective
+ * coordinates have been communicated. It is added to ddCyclF to allow
+ * for proper load-balancing */
+ wallcycle_start(wcycle, ewcROT);
+ do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+ wallcycle_stop(wcycle, ewcROT);
+ }
+
+ /* Start the force cycle counter.
+ * This counter is stopped in do_force_lowlevel.
+ * No parallel communication should occur while this counter is running,
+ * since that will interfere with the dynamic load balancing.
+ */
+ wallcycle_start(wcycle, ewcFORCE);
+ if (bDoForces)
+ {
+ /* Reset forces for which the virial is calculated separately:
+ * PME/Ewald forces if necessary */
+ if (fr->bF_NoVirSum)
+ {
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ fr->f_novirsum = fr->f_novirsum_alloc;
+ if (fr->bDomDec)
+ {
+ clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+ }
+ else
+ {
+ clear_rvecs(homenr, fr->f_novirsum+start);
+ }
+ }
+ else
+ {
+ /* We are not calculating the pressure so we do not need
+ * a separate array for forces that do not contribute
+ * to the pressure.
+ */
+ fr->f_novirsum = f;
+ }
+ }
+
+ /* Clear the short- and long-range forces */
+ clear_rvecs(fr->natoms_force_constr, f);
+ if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+ {
+ clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+ }
+
+ clear_rvec(fr->vir_diag_posres);
+ }
+
+ if (inputrec->ePull == epullCONSTRAINT)
+ {
+ clear_pull_forces(inputrec->pull);
+ }
+
+ /* We calculate the non-bonded forces, when done on the CPU, here.
+ * We do this before calling do_force_lowlevel, as in there bondeds
+ * forces are calculated before PME, which does communication.
+ * With this order, non-bonded and bonded force calculation imbalance
+ * can be balanced out by the domain decomposition load balancing.
+ */
+
+ if (!bUseOrEmulGPU)
+ {
+ /* Maybe we should move this into do_force_lowlevel */
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
+ nrnb, wcycle);
+ }
+
+ if (!bUseOrEmulGPU || bDiffKernels)
+ {
+ int aloc;
+
+ if (DOMAINDECOMP(cr))
+ {
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
+ bDiffKernels ? enbvClearFYes : enbvClearFNo,
+ nrnb, wcycle);
+ }
+
+ /* aloc selects the group whose CPU-side output is reduced below */
+ if (!bUseOrEmulGPU)
+ {
+ aloc = eintLocal;
+ }
+ else
+ {
+ aloc = eintNonlocal;
+ }
+
+ /* Add all the non-bonded force to the normal force array.
+ * This can be split into a local and a non-local part when overlapping
+ * communication with calculation with domain decomposition.
+ */
+ cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatAll, nbv->grp[aloc].nbat, f);
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+
+ /* if there are multiple fshift output buffers reduce them */
+ if ((flags & GMX_FORCE_VIRIAL) &&
+ nbv->grp[aloc].nbl_lists.nnbl > 1)
+ {
+ nbnxn_atomdata_add_nbat_fshift_to_fshift(nbv->grp[aloc].nbat,
+ fr->fshift);
+ }
+ }
+
+ /* update QMMMrec, if necessary */
+ if (fr->bQMMM)
+ {
+ update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ }
+
+ if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ {
+ posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ f, enerd, lambda, fr);
+ }
+
+ /* Compute the bonded and non-bonded energies and optionally forces */
+ do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
+ x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
+ &(top->atomtypes), bBornRadii, box,
+ inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
+ flags, &cycles_pme);
+
+ if (bSepLRF)
+ {
+ if (do_per_step(step, inputrec->nstcalclr))
+ {
+ /* Add the long range forces to the short range forces */
+ for (i = 0; i < fr->natoms_force_constr; i++)
+ {
+ rvec_add(fr->f_twin[i], f[i], f[i]);
+ }
+ }
+ }
+
+ cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+
+ if (ed)
+ {
+ do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+ }
+
+ if (bUseOrEmulGPU && !bDiffKernels)
+ {
+ /* wait for non-local forces (or calculate in emulation mode) */
+ if (DOMAINDECOMP(cr))
+ {
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
+ nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ nbv->grp[eintNonlocal].nbat,
+ flags, eatNonlocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
+ nrnb, wcycle);
+ cycles_force += wallcycle_stop(wcycle, ewcFORCE);
+ }
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ /* skip the reduction if there was no non-local work to do;
+ * the test must look at the non-local pair list, since that is
+ * the list whose forces are reduced here.
+ */
+ if (nbv->grp[eintNonlocal].nbl_lists.nbl[0]->nsci > 0)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatNonlocal,
+ nbv->grp[eintNonlocal].nbat, f);
+ }
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ cycles_force += wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+
+ if (bDoForces)
+ {
+ /* Communicate the forces */
+ if (PAR(cr))
+ {
+ wallcycle_start(wcycle, ewcMOVEF);
+ if (DOMAINDECOMP(cr))
+ {
+ dd_move_f(cr->dd, f, fr->fshift);
+ /* Do we need to communicate the separate force array
+ * for terms that do not contribute to the single sum virial?
+ * Position restraints and electric fields do not introduce
+ * inter-cg forces, only full electrostatics methods do.
+ * When we do not calculate the virial, fr->f_novirsum = f,
+ * so we have already communicated these forces.
+ */
+ if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+ (flags & GMX_FORCE_VIRIAL))
+ {
+ dd_move_f(cr->dd, fr->f_novirsum, NULL);
+ }
+ if (bSepLRF)
+ {
+ /* We should not update the shift forces here,
+ * since f_twin is already included in f.
+ */
+ dd_move_f(cr->dd, fr->f_twin, NULL);
+ }
+ }
+ wallcycle_stop(wcycle, ewcMOVEF);
+ }
+ }
+
+ if (bUseOrEmulGPU)
+ {
+ /* wait for local forces (or calculate in emulation mode) */
+ if (bUseGPU)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ nbnxn_cuda_wait_gpu(nbv->cu_nbv,
+ nbv->grp[eintLocal].nbat,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ /* now clear the GPU outputs while we finish the step on the CPU */
+
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
+ nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
+ }
+ else
+ {
+ wallcycle_start_nocount(wcycle, ewcFORCE);
+ do_nb_verlet(fr, ic, enerd, flags, eintLocal,
+ DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ nrnb, wcycle);
+ wallcycle_stop(wcycle, ewcFORCE);
+ }
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ if (nbv->grp[eintLocal].nbl_lists.nbl[0]->nsci > 0)
+ {
+ /* skip the reduction if there was no local work to do */
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->grp[eintLocal].nbat, f);
+ }
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ dd_force_flop_stop(cr->dd, nrnb);
+ if (wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ }
+ }
+
+ if (bDoForces)
+ {
+ if (IR_ELEC_FIELD(*inputrec))
+ {
+ /* Compute forces due to electric field */
+ calc_f_el(MASTER(cr) ? field : NULL,
+ start, homenr, mdatoms->chargeA, x, fr->f_novirsum,
+ inputrec->ex, inputrec->et, t);
+ }
+
+ /* If we have NoVirSum forces, but we do not calculate the virial,
+ * we sum fr->f_novirsum=f later.
+ */
+ if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+ {
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(fplog, vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+
+ if (bSepLRF)
+ {
+ wallcycle_start(wcycle, ewcVSITESPREAD);
+ spread_vsite_f(fplog, vsite, x, fr->f_twin, NULL, FALSE, NULL,
+ nrnb,
+ &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+ wallcycle_stop(wcycle, ewcVSITESPREAD);
+ }
+ }
+
+ if (flags & GMX_FORCE_VIRIAL)
+ {
+ /* Calculation of the virial must be done after vsites! */
+ calc_virial(fplog, mdatoms->start, mdatoms->homenr, x, f,
+ vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+ }
+ }
+
+ if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+ {
+ pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+ f, vir_force, mdatoms, enerd, lambda, t);
+ }
+
+ /* Add the forces from enforced rotation potentials (if any) */
+ if (inputrec->bRot)
+ {
+ wallcycle_start(wcycle, ewcROTadd);
+ enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+ wallcycle_stop(wcycle, ewcROTadd);
+ }
+
+ if (PAR(cr) && !(cr->duty & DUTY_PME))
+ {
+ /* In case of node-splitting, the PP nodes receive the long-range
+ * forces, virial and energy from the PME nodes here.
+ */
+ pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+ }
+
+ if (bDoForces)
+ {
+ post_process_forces(fplog, cr, step, nrnb, wcycle,
+ top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+ flags);
+ }
+
+ /* Sum the potential energy terms from group contributions */
+ sum_epot(&(inputrec->opts), &(enerd->grpp), enerd->term);
+}
+
+/* Force calculation for one step with the group cut-off scheme
+ * (do_force dispatches here for ecutsGROUP).
+ *
+ * In order, this routine: updates the force record and the local dipole when
+ * the state changed, puts charge groups in the box / computes their centers,
+ * sends coordinates to the PME nodes and communicates x, updates the AdResS
+ * weights, resets the energies, optionally runs neighbour searching,
+ * evaluates position restraints and calls do_force_lowlevel for the bonded
+ * and non-bonded interactions, communicates forces, spreads virtual-site
+ * forces, computes the virial, adds pull and enforced-rotation
+ * contributions, receives the PME results and finally sums the potential
+ * energy terms.
+ *
+ * flags is a bit mask of GMX_FORCE_* values selecting what is computed.
+ * vir_force is cleared on entry; f is cleared and filled when
+ * GMX_FORCE_FORCES is set; mu_tot is written when NEED_MUTOT holds.
+ */
+void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
+                        t_inputrec *inputrec,
+                        gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                        gmx_localtop_t *top,
+                        gmx_mtop_t *mtop,
+                        gmx_groups_t *groups,
+                        matrix box, rvec x[], history_t *hist,
+                        rvec f[],
+                        tensor vir_force,
+                        t_mdatoms *mdatoms,
+                        gmx_enerdata_t *enerd, t_fcdata *fcd,
+                        real *lambda, t_graph *graph,
+                        t_forcerec *fr, gmx_vsite_t *vsite, rvec mu_tot,
+                        double t, FILE *field, gmx_edsam_t ed,
+                        gmx_bool bBornRadii,
+                        int flags)
+{
+    int      cg0, cg1, i, j;
+    int      start, homenr;
+    double   mu[2*DIM];
+    gmx_bool bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+    gmx_bool bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
+    gmx_bool bDoAdressWF;
+    matrix   boxs;
+    rvec     vzero, box_diag;
+    real     e, v, dvdlambda[efptNR];
+    t_pbc    pbc;
+    float    cycles_pme, cycles_force;
+
+    start  = mdatoms->start;
+    homenr = mdatoms->homenr;
+
+    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
+
+    clear_mat(vir_force);
+
+    /* Determine the range [cg0, cg1) of charge groups handled here */
+    if (PARTDECOMP(cr))
+    {
+        pd_cg_range(cr, &cg0, &cg1);
+    }
+    else
+    {
+        cg0 = 0;
+        if (DOMAINDECOMP(cr))
+        {
+            cg1 = cr->dd->ncg_tot;
+        }
+        else
+        {
+            cg1 = top->cgs.nr;
+        }
+        if (fr->n_tpi > 0)
+        {
+            cg1--;
+        }
+    }
+
+    bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
+    bNS            = (flags & GMX_FORCE_NS) && (fr->bAllvsAll == FALSE);
+    /* Should we update the long-range neighborlists at this step? */
+    bDoLongRangeNS = fr->bTwinRange && bNS;
+    /* Should we perform the long-range nonbonded evaluation inside the neighborsearching? */
+    bFillGrid      = (bNS && bStateChanged);
+    bCalcCGCM      = (bFillGrid && !DOMAINDECOMP(cr));
+    bDoForces      = (flags & GMX_FORCE_FORCES);
+    bDoPotential   = (flags & GMX_FORCE_ENERGY);
+    bSepLRF        = ((inputrec->nstcalclr > 1) && bDoForces &&
+                      (flags & GMX_FORCE_SEPLRF) && (flags & GMX_FORCE_DO_LR));
+
+    /* should probably move this to the forcerec since it doesn't change */
+    bDoAdressWF = ((fr->adress_type != eAdressOff));
+
+    if (bStateChanged)
+    {
+        update_forcerec(fplog, fr, box);
+
+        if (NEED_MUTOT(*inputrec))
+        {
+            /* Calculate total (local) dipole moment in a temporary common array.
+             * This makes it possible to sum them over nodes faster.
+             */
+            calc_mu(start, homenr,
+                    x, mdatoms->chargeA, mdatoms->chargeB, mdatoms->nChargePerturbed,
+                    mu, mu+DIM);
+        }
+    }
+
+    if (fr->ePBC != epbcNONE)
+    {
+        /* Compute shift vectors every step,
+         * because of pressure coupling or box deformation!
+         */
+        if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
+        {
+            calc_shifts(box, fr->shift_vec);
+        }
+
+        if (bCalcCGCM)
+        {
+            put_charge_groups_in_box(fplog, cg0, cg1, fr->ePBC, box,
+                                     &(top->cgs), x, fr->cg_cm);
+            inc_nrnb(nrnb, eNR_CGCM, homenr);
+            inc_nrnb(nrnb, eNR_RESETX, cg1-cg0);
+        }
+        else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph)
+        {
+            unshift_self(graph, box, x);
+        }
+    }
+    else if (bCalcCGCM)
+    {
+        calc_cgcm(fplog, cg0, cg1, &(top->cgs), x, fr->cg_cm);
+        inc_nrnb(nrnb, eNR_CGCM, homenr);
+    }
+
+    if (bCalcCGCM)
+    {
+        if (PAR(cr))
+        {
+            move_cgcm(fplog, cr, fr->cg_cm);
+        }
+        if (gmx_debug_at)
+        {
+            pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
+        }
+    }
+
+#ifdef GMX_MPI
+    if (!(cr->duty & DUTY_PME))
+    {
+        /* Send particle coordinates to the pme nodes.
+         * Since this is only implemented for domain decomposition
+         * and domain decomposition does not use the graph,
+         * we do not need to worry about shifting.
+         */
+
+        wallcycle_start(wcycle, ewcPP_PMESENDX);
+
+        /* With two walls the z box vector is scaled by wall_ewald_zfac
+         * in a copy of the box before it is sent.
+         */
+        bBS = (inputrec->nwall == 2);
+        if (bBS)
+        {
+            copy_mat(box, boxs);
+            svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
+        }
+
+        gmx_pme_send_x(cr, bBS ? boxs : box, x,
+                       mdatoms->nChargePerturbed, lambda[efptCOUL],
+                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+
+        wallcycle_stop(wcycle, ewcPP_PMESENDX);
+    }
+#endif /* GMX_MPI */
+
+    /* Communicate coordinates and sum dipole if necessary */
+    if (PAR(cr))
+    {
+        wallcycle_start(wcycle, ewcMOVEX);
+        if (DOMAINDECOMP(cr))
+        {
+            dd_move_x(cr->dd, box, x);
+        }
+        else
+        {
+            move_x(fplog, cr, GMX_LEFT, GMX_RIGHT, x, nrnb);
+        }
+        wallcycle_stop(wcycle, ewcMOVEX);
+    }
+
+    /* update adress weight beforehand */
+    if (bStateChanged && bDoAdressWF)
+    {
+        /* need pbc for adress weight calculation with pbc_dx */
+        set_pbc(&pbc, inputrec->ePBC, box);
+        if (fr->adress_site == eAdressSITEcog)
+        {
+            update_adress_weights_cog(top->idef.iparams, top->idef.il, x, fr, mdatoms,
+                                      inputrec->ePBC == epbcNONE ? NULL : &pbc);
+        }
+        else if (fr->adress_site == eAdressSITEcom)
+        {
+            update_adress_weights_com(fplog, cg0, cg1, &(top->cgs), x, fr, mdatoms,
+                                      inputrec->ePBC == epbcNONE ? NULL : &pbc);
+        }
+        else if (fr->adress_site == eAdressSITEatomatom)
+        {
+            update_adress_weights_atom_per_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+                                                inputrec->ePBC == epbcNONE ? NULL : &pbc);
+        }
+        else
+        {
+            update_adress_weights_atom(cg0, cg1, &(top->cgs), x, fr, mdatoms,
+                                       inputrec->ePBC == epbcNONE ? NULL : &pbc);
+        }
+    }
+
+    if (NEED_MUTOT(*inputrec))
+    {
+
+        if (bStateChanged)
+        {
+            if (PAR(cr))
+            {
+                gmx_sumd(2*DIM, mu, cr);
+            }
+            /* Unpack the flat mu array into the two dipole vectors */
+            for (i = 0; i < 2; i++)
+            {
+                for (j = 0; j < DIM; j++)
+                {
+                    fr->mu_tot[i][j] = mu[i*DIM + j];
+                }
+            }
+        }
+        if (fr->efep == efepNO)
+        {
+            copy_rvec(fr->mu_tot[0], mu_tot);
+        }
+        else
+        {
+            /* Interpolate linearly between the two dipoles with lambda[efptCOUL] */
+            for (j = 0; j < DIM; j++)
+            {
+                mu_tot[j] =
+                    (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
+            }
+        }
+    }
+
+    /* Reset energies */
+    reset_enerdata(&(inputrec->opts), fr, bNS, enerd, MASTER(cr));
+    clear_rvecs(SHIFTS, fr->fshift);
+
+    if (bNS)
+    {
+        wallcycle_start(wcycle, ewcNS);
+
+        if (graph && bStateChanged)
+        {
+            /* Calculate intramolecular shift vectors to make molecules whole */
+            mk_mshift(fplog, graph, fr->ePBC, box, x);
+        }
+
+        /* Do the actual neighbour searching and if twin range electrostatics
+         * also do the calculation of long range forces and energies.
+         */
+        for (i = 0; i < efptNR; i++)
+        {
+            dvdlambda[i] = 0;
+        }
+        ns(fplog, fr, x, box,
+           groups, &(inputrec->opts), top, mdatoms,
+           cr, nrnb, lambda, dvdlambda, &enerd->grpp, bFillGrid,
+           bDoLongRangeNS);
+        if (bSepDVDL)
+        {
+            /* dvdlambda is an array here, so it cannot be passed to the
+             * scalar conversion in sepdvdlformat. Print the VdW + Coulomb
+             * sum, i.e. the components accumulated into enerd below.
+             */
+            fprintf(fplog, sepdvdlformat, "LR non-bonded", 0.0,
+                    dvdlambda[efptVDW] + dvdlambda[efptCOUL]);
+        }
+        enerd->dvdl_lin[efptVDW]  += dvdlambda[efptVDW];
+        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
+
+        wallcycle_stop(wcycle, ewcNS);
+    }
+
+    if (inputrec->implicit_solvent && bNS)
+    {
+        make_gb_nblist(cr, inputrec->gb_algorithm, inputrec->rlist,
+                       x, box, fr, &top->idef, graph, fr->born);
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        if (!(cr->duty & DUTY_PME))
+        {
+            wallcycle_start(wcycle, ewcPPDURINGPME);
+            dd_force_flop_start(cr->dd, nrnb);
+        }
+    }
+
+    if (inputrec->bRot)
+    {
+        /* Enforced rotation has its own cycle counter that starts after the collective
+         * coordinates have been communicated. It is added to ddCyclF to allow
+         * for proper load-balancing */
+        wallcycle_start(wcycle, ewcROT);
+        do_rotation(cr, inputrec, box, x, t, step, wcycle, bNS);
+        wallcycle_stop(wcycle, ewcROT);
+    }
+
+    /* Start the force cycle counter.
+     * This counter is stopped below, right after do_force_lowlevel returns
+     * and the long-range forces have been added.
+     * No parallel communication should occur while this counter is running,
+     * since that will interfere with the dynamic load balancing.
+     */
+    wallcycle_start(wcycle, ewcFORCE);
+
+    if (bDoForces)
+    {
+        /* Reset forces for which the virial is calculated separately:
+         * PME/Ewald forces if necessary */
+        if (fr->bF_NoVirSum)
+        {
+            if (flags & GMX_FORCE_VIRIAL)
+            {
+                fr->f_novirsum = fr->f_novirsum_alloc;
+                if (fr->bDomDec)
+                {
+                    clear_rvecs(fr->f_novirsum_n, fr->f_novirsum);
+                }
+                else
+                {
+                    clear_rvecs(homenr, fr->f_novirsum+start);
+                }
+            }
+            else
+            {
+                /* We are not calculating the pressure so we do not need
+                 * a separate array for forces that do not contribute
+                 * to the pressure.
+                 */
+                fr->f_novirsum = f;
+            }
+        }
+
+        /* Clear the short- and long-range forces */
+        clear_rvecs(fr->natoms_force_constr, f);
+        if (bSepLRF && do_per_step(step, inputrec->nstcalclr))
+        {
+            clear_rvecs(fr->natoms_force_constr, fr->f_twin);
+        }
+
+        clear_rvec(fr->vir_diag_posres);
+    }
+    if (inputrec->ePull == epullCONSTRAINT)
+    {
+        clear_pull_forces(inputrec->pull);
+    }
+
+    /* update QMMMrec, if necessary */
+    if (fr->bQMMM)
+    {
+        update_QMMMrec(cr, fr, x, mdatoms, box, top);
+    }
+
+    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+    {
+        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+                       f, enerd, lambda, fr);
+    }
+
+    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+    {
+        /* Flat-bottomed position restraints always require full pbc;
+         * set it up here unless the AdResS weight update already did.
+         */
+        if (!(bStateChanged && bDoAdressWF))
+        {
+            set_pbc(&pbc, inputrec->ePBC, box);
+        }
+        v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
+                     top->idef.iparams_fbposres,
+                     (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+                     inputrec->ePBC == epbcNONE ? NULL : &pbc,
+                     fr->rc_scaling, fr->ePBC, fr->posres_com);
+        enerd->term[F_FBPOSRES] += v;
+        inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
+    }
+
+    /* Compute the bonded and non-bonded energies and optionally forces */
+    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+                      cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
+                      x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
+                      &(top->atomtypes), bBornRadii, box,
+                      inputrec->fepvals, lambda,
+                      graph, &(top->excls), fr->mu_tot,
+                      flags,
+                      &cycles_pme);
+
+    if (bSepLRF)
+    {
+        if (do_per_step(step, inputrec->nstcalclr))
+        {
+            /* Add the long range forces to the short range forces */
+            for (i = 0; i < fr->natoms_force_constr; i++)
+            {
+                rvec_add(fr->f_twin[i], f[i], f[i]);
+            }
+        }
+    }
+
+    cycles_force = wallcycle_stop(wcycle, ewcFORCE);
+
+    if (ed)
+    {
+        do_flood(cr, inputrec, x, f, ed, box, step, bNS);
+    }
+
+    if (DOMAINDECOMP(cr))
+    {
+        dd_force_flop_stop(cr->dd, nrnb);
+        if (wcycle)
+        {
+            dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+        }
+    }
+
+    if (bDoForces)
+    {
+        if (IR_ELEC_FIELD(*inputrec))
+        {
+            /* Compute forces due to electric field */
+            calc_f_el(MASTER(cr) ? field : NULL,
+                      start, homenr, mdatoms->chargeA, x, fr->f_novirsum,
+                      inputrec->ex, inputrec->et, t);
+        }
+
+        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
+        {
+            /* Compute thermodynamic force in hybrid AdResS region */
+            /* NOTE(review): pbc is only set above when the state changed
+             * with AdResS on, or when flat-bottomed restraints are present;
+             * confirm it cannot be read uninitialized here.
+             */
+            adress_thermo_force(start, homenr, &(top->cgs), x, fr->f_novirsum, fr, mdatoms,
+                                inputrec->ePBC == epbcNONE ? NULL : &pbc);
+        }
+
+        /* Communicate the forces */
+        if (PAR(cr))
+        {
+            wallcycle_start(wcycle, ewcMOVEF);
+            if (DOMAINDECOMP(cr))
+            {
+                dd_move_f(cr->dd, f, fr->fshift);
+                /* Do we need to communicate the separate force array
+                 * for terms that do not contribute to the single sum virial?
+                 * Position restraints and electric fields do not introduce
+                 * inter-cg forces, only full electrostatics methods do.
+                 * When we do not calculate the virial, fr->f_novirsum = f,
+                 * so we have already communicated these forces.
+                 */
+                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+                    (flags & GMX_FORCE_VIRIAL))
+                {
+                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
+                }
+                if (bSepLRF)
+                {
+                    /* We should not update the shift forces here,
+                     * since f_twin is already included in f.
+                     */
+                    dd_move_f(cr->dd, fr->f_twin, NULL);
+                }
+            }
+            else
+            {
+                pd_move_f(cr, f, nrnb);
+                if (bSepLRF)
+                {
+                    pd_move_f(cr, fr->f_twin, nrnb);
+                }
+            }
+            wallcycle_stop(wcycle, ewcMOVEF);
+        }
+
+        /* If we have NoVirSum forces, but we do not calculate the virial,
+         * we sum fr->f_novirsum=f later.
+         */
+        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
+        {
+            wallcycle_start(wcycle, ewcVSITESPREAD);
+            spread_vsite_f(fplog, vsite, x, f, fr->fshift, FALSE, NULL, nrnb,
+                           &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+            wallcycle_stop(wcycle, ewcVSITESPREAD);
+
+            if (bSepLRF)
+            {
+                wallcycle_start(wcycle, ewcVSITESPREAD);
+                spread_vsite_f(fplog, vsite, x, fr->f_twin, NULL, FALSE, NULL,
+                               nrnb,
+                               &top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr);
+                wallcycle_stop(wcycle, ewcVSITESPREAD);
+            }
+        }
+
+        if (flags & GMX_FORCE_VIRIAL)
+        {
+            /* Calculation of the virial must be done after vsites! */
+            calc_virial(fplog, mdatoms->start, mdatoms->homenr, x, f,
+                        vir_force, graph, box, nrnb, fr, inputrec->ePBC);
+        }
+    }
+
+    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
+    {
+        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
+                               f, vir_force, mdatoms, enerd, lambda, t);
+    }
+
+    /* Add the forces from enforced rotation potentials (if any) */
+    if (inputrec->bRot)
+    {
+        wallcycle_start(wcycle, ewcROTadd);
+        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr, step, t);
+        wallcycle_stop(wcycle, ewcROTadd);
+    }
+
+    if (PAR(cr) && !(cr->duty & DUTY_PME))
+    {
+        /* In case of node-splitting, the PP nodes receive the long-range
+         * forces, virial and energy from the PME nodes here.
+         */
+        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+    }
+
+    if (bDoForces)
+    {
+        post_process_forces(fplog, cr, step, nrnb, wcycle,
+                            top, box, x, f, vir_force, mdatoms, graph, fr, vsite,
+                            flags);
+    }
+
+    /* Sum the potential energy terms from group contributions */
+    sum_epot(&(inputrec->opts), &(enerd->grpp), enerd->term);
+}
+
+/* Public force entry point.
+ *
+ * Clears GMX_FORCE_NONBONDED from flags when fr->bNonbonded is unset and
+ * then forwards all arguments to the routine that implements the cut-off
+ * scheme selected in inputrec->cutoff_scheme. Any other scheme value is
+ * reported through gmx_incons.
+ */
+void do_force(FILE *fplog, t_commrec *cr,
+              t_inputrec *inputrec,
+              gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+              gmx_localtop_t *top,
+              gmx_mtop_t *mtop,
+              gmx_groups_t *groups,
+              matrix box, rvec x[], history_t *hist,
+              rvec f[],
+              tensor vir_force,
+              t_mdatoms *mdatoms,
+              gmx_enerdata_t *enerd, t_fcdata *fcd,
+              real *lambda, t_graph *graph,
+              t_forcerec *fr,
+              gmx_vsite_t *vsite, rvec mu_tot,
+              double t, FILE *field, gmx_edsam_t ed,
+              gmx_bool bBornRadii,
+              int flags)
+{
+    /* Without non-bonded interactions the corresponding flag is masked out */
+    if (!fr->bNonbonded)
+    {
+        flags &= ~GMX_FORCE_NONBONDED;
+    }
+
+    if (inputrec->cutoff_scheme == ecutsVERLET)
+    {
+        /* The Verlet routine additionally takes the interaction constants */
+        do_force_cutsVERLET(fplog, cr, inputrec,
+                            step, nrnb, wcycle,
+                            top, mtop,
+                            groups,
+                            box, x, hist,
+                            f, vir_force,
+                            mdatoms,
+                            enerd, fcd,
+                            lambda, graph,
+                            fr, fr->ic,
+                            vsite, mu_tot,
+                            t, field, ed,
+                            bBornRadii,
+                            flags);
+    }
+    else if (inputrec->cutoff_scheme == ecutsGROUP)
+    {
+        do_force_cutsGROUP(fplog, cr, inputrec,
+                           step, nrnb, wcycle,
+                           top, mtop,
+                           groups,
+                           box, x, hist,
+                           f, vir_force,
+                           mdatoms,
+                           enerd, fcd,
+                           lambda, graph,
+                           fr, vsite, mu_tot,
+                           t, field, ed,
+                           bBornRadii,
+                           flags);
+    }
+    else
+    {
+        gmx_incons("Invalid cut-off scheme passed!");
+    }
+}
+
+
+/* Apply the constraints once before the first MD step.
+ *
+ * The positions in state->x are constrained in place. For velocity-Verlet
+ * integrators (EI_VV) the velocities are constrained as well. For the other
+ * integrators with state velocities, the positions one time step back are
+ * built from the reversed velocities, constrained against state->x as
+ * reference with state->v passed along, and the velocities are then
+ * flipped back.
+ */
+void do_constrain_first(FILE *fplog, gmx_constr_t constr,
+                        t_inputrec *ir, t_mdatoms *md,
+                        t_state *state, rvec *f,
+                        t_graph *graph, t_commrec *cr, t_nrnb *nrnb,
+                        t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir)
+{
+    int             start    = md->start;
+    int             end      = md->homenr + start;
+    gmx_large_int_t step     = ir->init_step;
+    real            dt       = ir->delta_t;
+    real            dvdl_dum = 0;
+    rvec           *savex;
+    int             atom, dim;
+    char            buf[STEPSTRSIZE];
+
+    /* Scratch coordinates, only filled for the t0-dt constraining below */
+    snew(savex, state->natoms);
+
+    if (debug)
+    {
+        fprintf(debug, "vcm: start=%d, homenr=%d, end=%d\n",
+                start, md->homenr, end);
+    }
+
+    /* Do a first constrain to reset particles... */
+    if (fplog)
+    {
+        fprintf(fplog, "\nConstraining the starting coordinates (step %s)\n",
+                gmx_step_str(step, buf));
+    }
+
+    /* constrain the current position */
+    constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+              ir, NULL, cr, step, 0, md,
+              state->x, state->x, NULL,
+              fr->bMolPBC, state->box,
+              state->lambda[efptBONDED], &dvdl_dum,
+              NULL, NULL, nrnb, econqCoord,
+              ir->epc == epcMTTK, state->veta, state->veta);
+
+    if (EI_VV(ir->eI))
+    {
+        /* constrain the initial velocity, and save it */
+        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
+        /* might not yet treat veta correctly */
+        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+                  ir, NULL, cr, step, 0, md,
+                  state->x, state->v, state->v,
+                  fr->bMolPBC, state->box,
+                  state->lambda[efptBONDED], &dvdl_dum,
+                  NULL, NULL, nrnb, econqVeloc,
+                  ir->epc == epcMTTK, state->veta, state->veta);
+    }
+
+    /* constrain the initial velocities at t-dt/2 */
+    if (EI_STATE_VELOCITY(ir->eI) && ir->eI != eiVV)
+    {
+        for (atom = start; atom < end; atom++)
+        {
+            for (dim = 0; dim < DIM; dim++)
+            {
+                /* Reverse the velocity */
+                state->v[atom][dim] = -state->v[atom][dim];
+                /* Store the position at t-dt in savex */
+                savex[atom][dim] = state->x[atom][dim] + dt*state->v[atom][dim];
+            }
+        }
+
+        /* Shake the positions at t=-dt with the positions at t=0
+         * as reference coordinates.
+         */
+        if (fplog)
+        {
+            fprintf(fplog, "\nConstraining the coordinates at t0-dt (step %s)\n",
+                    gmx_step_str(step, buf));
+        }
+        dvdl_dum = 0;
+        constrain(NULL, TRUE, FALSE, constr, &(top->idef),
+                  ir, NULL, cr, step, -1, md,
+                  state->x, savex, NULL,
+                  fr->bMolPBC, state->box,
+                  state->lambda[efptBONDED], &dvdl_dum,
+                  state->v, NULL, nrnb, econqCoord,
+                  ir->epc == epcMTTK, state->veta, state->veta);
+
+        /* Re-reverse the velocities */
+        for (atom = start; atom < end; atom++)
+        {
+            for (dim = 0; dim < DIM; dim++)
+            {
+                state->v[atom][dim] = -state->v[atom][dim];
+            }
+        }
+    }
+
+    sfree(savex);
+}
+
+/* Precompute the long-range dispersion correction factors stored in fr:
+ * enershiftsix/twelve, enerdiffsix/twelve and virdiffsix/twelve.
+ *
+ * All six values are reset to zero first and stay zero when
+ * eDispCorr == edispcNO. Index 0 of the local eners/virs arrays holds the
+ * r^-6 (dispersion) term, index 1 the r^-12 (repulsion) term.
+ * For switched/shifted VdW the correction is integrated numerically over
+ * the tabulated potential; for plain cut-off and user tables the analytical
+ * tail beyond rvdw is used. Any other vdw-type is a fatal error.
+ * fplog is only used for the user-table warning and may be NULL.
+ */
+void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
+{
+ double eners[2], virs[2], enersum, virsum, y0, f, g, h;
+ double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+ double invscale, invscale2, invscale3;
+ int ri0, ri1, ri, i, offstart, offset;
+ real scale, *vdwtab, tabfactor, tmp;
+
+ fr->enershiftsix = 0;
+ fr->enershifttwelve = 0;
+ fr->enerdiffsix = 0;
+ fr->enerdifftwelve = 0;
+ fr->virdiffsix = 0;
+ fr->virdifftwelve = 0;
+
+ if (eDispCorr != edispcNO)
+ {
+ for (i = 0; i < 2; i++)
+ {
+ eners[i] = 0;
+ virs[i] = 0;
+ }
+ if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
+ {
+ if (fr->rvdw_switch == 0)
+ {
+ gmx_fatal(FARGS,
+ "With dispersion correction rvdw-switch can not be zero "
+ "for vdw-type = %s", evdw_names[fr->vdwtype]);
+ }
+
+ /* NOTE(review): the scale is read from table_elec_vdw while the data
+ * comes from table_vdw - confirm both tables share the same spacing.
+ */
+ scale = fr->nblists[0].table_elec_vdw.scale;
+ vdwtab = fr->nblists[0].table_vdw.data;
+
+ /* Round the cut-offs to exact table values for precision */
+ ri0 = floor(fr->rvdw_switch*scale);
+ ri1 = ceil(fr->rvdw*scale);
+ r0 = ri0/scale;
+ r1 = ri1/scale;
+ /* Here rc3/rc9 are powers of the rounded switch distance r0 */
+ rc3 = r0*r0*r0;
+ rc9 = rc3*rc3*rc3;
+
+ if (fr->vdwtype == evdwSHIFT)
+ {
+ /* Determine the constant energy shift below rvdw_switch.
+ * Table has a scale factor since we have scaled it down to compensate
+ * for scaling-up c6/c12 with the derivative factors to save flops in analytical kernels.
+ */
+ fr->enershiftsix = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
+ fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
+ }
+ /* Add the constant part from 0 to rvdw_switch.
+ * This integration from 0 to rvdw_switch overcounts the number
+ * of interactions by 1, as it also counts the self interaction.
+ * We will correct for this later.
+ */
+ eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
+ eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
+
+ invscale = 1.0/(scale);
+ invscale2 = invscale*invscale;
+ invscale3 = invscale*invscale2;
+
+ /* The following summation is derived from the cubic spline definition,
+ Numerical Recipes in C, second edition, p. 113-116. Exact
+ for the cubic spline. We first calculate the negative of
+ the energy from rvdw to rvdw_switch, assuming that g(r)=1,
+ and then add the more standard, abrupt cutoff correction to
+ that result, yielding the long-range correction for a
+ switched function. We perform both the pressure and energy
+ loops at the same time for simplicity, as the computational
+ cost is low. */
+
+ /* i == 0: dispersion part of the table, i == 1: repulsion part */
+ for (i = 0; i < 2; i++)
+ {
+ enersum = 0.0; virsum = 0.0;
+ if (i == 0)
+ {
+ offstart = 0;
+ /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
+ * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
+ * up (to save flops in kernels), we need to correct for this.
+ */
+ tabfactor = 6.0;
+ }
+ else
+ {
+ offstart = 4;
+ tabfactor = 12.0;
+ }
+ /* Accumulate the contribution of each table interval [ri, ri+1) */
+ for (ri = ri0; ri < ri1; ri++)
+ {
+ r = ri*invscale;
+ ea = invscale3;
+ eb = 2.0*invscale2*r;
+ ec = invscale*r*r;
+
+ pa = invscale3;
+ pb = 3.0*invscale2*r;
+ pc = 3.0*invscale*r*r;
+ pd = r*r*r;
+
+ /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
+ offset = 8*ri + offstart;
+ /* The four consecutive table entries used as spline coefficients */
+ y0 = vdwtab[offset];
+ f = vdwtab[offset+1];
+ g = vdwtab[offset+2];
+ h = vdwtab[offset+3];
+
+ enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
+ virsum += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
+ }
+
+ enersum *= 4.0*M_PI*tabfactor;
+ virsum *= 4.0*M_PI*tabfactor;
+ eners[i] -= enersum;
+ virs[i] -= virsum;
+ }
+
+ /* now add the correction for rvdw_switch to infinity */
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(9.0*rc9);
+ virs[0] += 8.0*M_PI/rc3;
+ virs[1] += -16.0*M_PI/(3.0*rc9);
+ }
+ else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
+ {
+ if (fr->vdwtype == evdwUSER && fplog)
+ {
+ fprintf(fplog,
+ "WARNING: using dispersion correction with user tables\n");
+ }
+ /* Here rc3/rc9 are powers of the plain cut-off rvdw */
+ rc3 = fr->rvdw*fr->rvdw*fr->rvdw;
+ rc9 = rc3*rc3*rc3;
+ /* Contribution beyond the cut-off */
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(9.0*rc9);
+ if (fr->vdw_modifier == eintmodPOTSHIFT)
+ {
+ /* Contribution within the cut-off */
+ eners[0] += -4.0*M_PI/(3.0*rc3);
+ eners[1] += 4.0*M_PI/(3.0*rc9);
+ }
+ /* Contribution beyond the cut-off */
+ virs[0] += 8.0*M_PI/rc3;
+ virs[1] += -16.0*M_PI/(3.0*rc9);
+ }
+ else
+ {
+ gmx_fatal(FARGS,
+ "Dispersion correction is not implemented for vdw-type = %s",
+ evdw_names[fr->vdwtype]);
+ }
+ fr->enerdiffsix = eners[0];
+ fr->enerdifftwelve = eners[1];
+ /* The 0.5 is due to the Gromacs definition of the virial */
+ fr->virdiffsix = 0.5*virs[0];
+ fr->virdifftwelve = 0.5*virs[1];
+ }
+}
+
+/* Evaluate the long-range dispersion correction for the current box.
+ *
+ * The outputs *prescorr, *enercorr, *dvdlcorr and the virial/pres tensors
+ * are always zeroed first; nothing else happens when ir->eDispCorr is
+ * edispcNO. Otherwise the precomputed factors in fr (enerdiff*, enershift*,
+ * virdiff*) are combined with the particle density and the average C6/C12
+ * (lambda-interpolated with free energy) to give the energy correction,
+ * optionally the pressure/virial correction, and dV/dlambda.
+ */
+void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
+                   gmx_large_int_t step, int natoms,
+                   matrix box, real lambda, tensor pres, tensor virial,
+                   real *prescorr, real *enercorr, real *dvdlcorr)
+{
+    gmx_bool bAllTerms, bPressure;
+    real     dvdl, inv_volume, density, npairs, c6_avg, c12_avg, ediff;
+    real     vir_corr = 0, pres_corr = 0;
+    int      d;
+
+    *prescorr = 0;
+    *enercorr = 0;
+    *dvdlcorr = 0;
+
+    clear_mat(virial);
+    clear_mat(pres);
+
+    if (ir->eDispCorr == edispcNO)
+    {
+        return;
+    }
+
+    /* "All" variants also correct the repulsion (C12) term */
+    bAllTerms = (ir->eDispCorr == edispcAllEner ||
+                 ir->eDispCorr == edispcAllEnerPres);
+    bPressure = (ir->eDispCorr == edispcEnerPres ||
+                 ir->eDispCorr == edispcAllEnerPres);
+
+    inv_volume = 1/det(box);
+    if (fr->n_tpi)
+    {
+        /* Only correct for the interactions with the inserted molecule */
+        density = (natoms - fr->n_tpi)*inv_volume;
+        npairs  = fr->n_tpi;
+    }
+    else
+    {
+        density = natoms*inv_volume;
+        npairs  = 0.5*natoms;
+    }
+
+    if (ir->efep == efepNO)
+    {
+        c6_avg  = fr->avcsix[0];
+        c12_avg = fr->avctwelve[0];
+    }
+    else
+    {
+        c6_avg  = (1 - lambda)*fr->avcsix[0] + lambda*fr->avcsix[1];
+        c12_avg = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
+    }
+
+    /* Dispersion (C6) part */
+    dvdl       = 0.0;
+    ediff      = npairs*(density*fr->enerdiffsix - fr->enershiftsix);
+    *enercorr += c6_avg*ediff;
+    if (ir->efep != efepNO)
+    {
+        dvdl += (fr->avcsix[1] - fr->avcsix[0])*ediff;
+    }
+
+    /* Repulsion (C12) part */
+    if (bAllTerms)
+    {
+        ediff      = npairs*(density*fr->enerdifftwelve - fr->enershifttwelve);
+        *enercorr += c12_avg*ediff;
+        if (fr->efep != efepNO)
+        {
+            dvdl += (fr->avctwelve[1] - fr->avctwelve[0])*ediff;
+        }
+    }
+
+    if (bPressure)
+    {
+        vir_corr = npairs*density*c6_avg*fr->virdiffsix/3.0;
+        if (ir->eDispCorr == edispcAllEnerPres)
+        {
+            vir_corr += npairs*density*c12_avg*fr->virdifftwelve/3.0;
+        }
+        /* The factor 2 is because of the Gromacs virial definition */
+        pres_corr = -2.0*inv_volume*vir_corr*PRESFAC;
+
+        /* The correction is isotropic: only the diagonal is touched */
+        for (d = 0; d < DIM; d++)
+        {
+            virial[d][d] += vir_corr;
+            pres[d][d]   += pres_corr;
+        }
+        *prescorr += pres_corr;
+    }
+
+    /* Can't currently control when it prints, for now, just print when debugging */
+    if (debug)
+    {
+        if (bAllTerms)
+        {
+            fprintf(debug, "Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
+                    c6_avg, c12_avg);
+        }
+        if (bPressure)
+        {
+            fprintf(debug,
+                    "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
+                    *enercorr, pres_corr, vir_corr);
+        }
+        else
+        {
+            fprintf(debug, "Long Range LJ corr.: Epot %10g\n", *enercorr);
+        }
+    }
+
+    if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
+    {
+        fprintf(fplog, sepdvdlformat, "Dispersion correction",
+                *enercorr, dvdl);
+    }
+    if (fr->efep != efepNO)
+    {
+        *dvdlcorr += dvdl;
+    }
+}
+
+/* Compute the shift vectors for the box and, when a graph is given, make
+ * the molecules in x whole. Progress messages go to fplog when it is set.
+ */
+void do_pbc_first(FILE *fplog, matrix box, t_forcerec *fr,
+                  t_graph *graph, rvec x[])
+{
+    static const char *pass_title[2] = { "do_pbc_first 1", "do_pbc_first 2" };
+    int                pass;
+
+    if (fplog)
+    {
+        fprintf(fplog, "Removing pbc first time\n");
+    }
+
+    calc_shifts(box, fr->shift_vec);
+
+    if (graph)
+    {
+        /* Two mk_mshift passes with one shift_self in between.
+         * By doing an extra mk_mshift the molecules that are broken
+         * because they were e.g. imported from another software
+         * will be made whole again. Such are the healing powers
+         * of GROMACS.
+         */
+        for (pass = 0; pass < 2; pass++)
+        {
+            mk_mshift(fplog, graph, fr->ePBC, box, x);
+            if (gmx_debug_at)
+            {
+                p_graph(debug, pass_title[pass], graph);
+            }
+            if (pass == 0)
+            {
+                shift_self(graph, box, x);
+            }
+        }
+    }
+
+    if (fplog)
+    {
+        fprintf(fplog, "Done rmpbc\n");
+    }
+}
+
+/* Make every molecule in x whole, one molecule block of mtop at a time.
+ *
+ * Blocks whose molecules consist of a single atom are skipped; when bFirst
+ * is FALSE, blocks whose molecule type has a single charge group are skipped
+ * as well. For the remaining blocks a graph is built once per molecule type
+ * and applied to each molecule in turn.
+ */
+static void low_do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+                            gmx_mtop_t *mtop, rvec x[],
+                            gmx_bool bFirst)
+{
+    t_graph *graph;
+    int      mb, mol;
+    int      first_atom = 0; /* index in x of the molecule being processed */
+
+    if (bFirst && fplog)
+    {
+        fprintf(fplog, "Removing pbc first time\n");
+    }
+
+    snew(graph, 1);
+    for (mb = 0; mb < mtop->nmolblock; mb++)
+    {
+        gmx_molblock_t *molb = &mtop->molblock[mb];
+
+        if (molb->natoms_mol == 1 ||
+            (!bFirst && mtop->moltype[molb->type].cgs.nr == 1))
+        {
+            /* Just one atom or charge group in the molecule, no PBC required */
+            first_atom += molb->nmol*molb->natoms_mol;
+            continue;
+        }
+
+        /* Pass NULL iso fplog to avoid graph prints for each molecule type */
+        mk_graph_ilist(NULL, mtop->moltype[molb->type].ilist,
+                       0, molb->natoms_mol, FALSE, FALSE, graph);
+
+        for (mol = 0; mol < molb->nmol; mol++)
+        {
+            rvec *xmol = x + first_atom;
+
+            mk_mshift(fplog, graph, ePBC, box, xmol);
+            shift_self(graph, box, xmol);
+            /* The molecule is whole now.
+             * We don't need the second mk_mshift call as in do_pbc_first,
+             * since we no longer need this graph.
+             */
+
+            first_atom += molb->natoms_mol;
+        }
+        done_graph(graph);
+    }
+    sfree(graph);
+}
+
+/* Make molecules whole via low_do_pbc_mtop in "first time" mode (bFirst = TRUE). */
+void do_pbc_first_mtop(FILE *fplog, int ePBC, matrix box,
+                       gmx_mtop_t *mtop, rvec x[])
+{
+    const gmx_bool bFirstTime = TRUE;
+
+    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, bFirstTime);
+}
+
+/* Make molecules whole via low_do_pbc_mtop in regular mode (bFirst = FALSE). */
+void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
+                 gmx_mtop_t *mtop, rvec x[])
+{
+    const gmx_bool bFirstTime = FALSE;
+
+    low_do_pbc_mtop(fplog, ePBC, box, mtop, x, bFirstTime);
+}
+
+/* Collect and print the end-of-run accounting.
+ *
+ * Sums the wallcycle counters, reduces the flop counters (and, with real
+ * MPI, the process time) over the simulation, prints the flop table on the
+ * simulation master, prints domain or particle decomposition statistics,
+ * and finally prints the cycle and performance summaries on the simulation
+ * master (to fplog when set, to stderr when bWriteStat is set).
+ */
+void finish_run(FILE *fplog, t_commrec *cr, const char *confout,
+                t_inputrec *inputrec,
+                t_nrnb nrnb[], gmx_wallcycle_t wcycle,
+                gmx_runtime_t *runtime,
+                wallclock_gpu_t *gputimes,
+                int omp_nth_pp,
+                gmx_bool bWriteStat)
+{
+    const gmx_bool bMultiRank = (cr->nnodes > 1);
+    t_nrnb        *nrnb_tot   = NULL;
+    real           delta_t;
+    double         nbfs, mflop;
+
+    wallcycle_sum(cr, wcycle);
+
+    /* With several ranks the counters are summed into a temporary;
+     * otherwise the caller's counters are used directly.
+     */
+    if (bMultiRank)
+    {
+        snew(nrnb_tot, 1);
+#ifdef GMX_MPI
+        MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM,
+                      cr->mpi_comm_mysim);
+#endif
+    }
+    else
+    {
+        nrnb_tot = nrnb;
+    }
+
+#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+    if (bMultiRank)
+    {
+        /* reduce the process time over all MPI processes in the current simulation */
+        double proctime_sum;
+        MPI_Allreduce(&runtime->proctime, &proctime_sum, 1, MPI_DOUBLE, MPI_SUM,
+                      cr->mpi_comm_mysim);
+        runtime->proctime = proctime_sum;
+    }
+#endif
+
+    if (SIMMASTER(cr))
+    {
+        print_flop(fplog, nrnb_tot, &nbfs, &mflop);
+    }
+    if (bMultiRank)
+    {
+        sfree(nrnb_tot);
+    }
+
+    if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr))
+    {
+        print_dd_statistics(cr, inputrec, fplog);
+    }
+
+#ifdef GMX_MPI
+    if (PARTDECOMP(cr))
+    {
+        if (!MASTER(cr))
+        {
+            /* Ship the local counters to the master for the load report */
+            MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
+                     cr->mpi_comm_mysim);
+        }
+        else
+        {
+            t_nrnb    *nrnb_all;
+            int        rank;
+            MPI_Status status;
+
+            snew(nrnb_all, cr->nnodes);
+            nrnb_all[0] = *nrnb;
+            for (rank = 1; rank < cr->nnodes; rank++)
+            {
+                MPI_Recv(nrnb_all[rank].n, eNRNB, MPI_DOUBLE, rank, 0,
+                         cr->mpi_comm_mysim, &status);
+            }
+            pr_load(fplog, cr, nrnb_all);
+            sfree(nrnb_all);
+        }
+    }
+#endif
+
+    if (SIMMASTER(cr))
+    {
+        wallcycle_print(fplog, cr->nnodes, cr->npmenodes, runtime->realtime,
+                        wcycle, gputimes);
+
+        /* Only dynamical integrators have a meaningful time step */
+        delta_t = EI_DYNAMICS(inputrec->eI) ? inputrec->delta_t : 0;
+
+        if (fplog)
+        {
+            print_perf(fplog, runtime->proctime, runtime->realtime,
+                       runtime->nsteps_done, delta_t, nbfs, mflop);
+        }
+        if (bWriteStat)
+        {
+            print_perf(stderr, runtime->proctime, runtime->realtime,
+                       runtime->nsteps_done, delta_t, nbfs, mflop);
+        }
+    }
+}
+
+/* Fill the lambda vector (and lam0 when non-NULL) at start-up.
+ *
+ * Without free-energy perturbation and without simulated tempering all
+ * efptNR components are set to zero and the function returns without
+ * touching *fep_state or writing to the log. Otherwise *fep_state is taken
+ * from the input, every component is set either to init_lambda (when it is
+ * >= 0) or to the per-component value for that state, the reference
+ * temperatures are rescaled for simulated tempering, and the resulting
+ * vector is written to fplog when it is set.
+ */
+extern void initialize_lambdas(FILE *fplog, t_inputrec *ir, int *fep_state, real *lambda, double *lam0)
+{
+    /* this function works, but could probably use a logic rewrite to keep all the different
+       types of efep straight. */
+
+    t_lambda *fep = ir->fepvals;
+    int       comp, grp;
+
+    if ((ir->efep == efepNO) && (ir->bSimTemp == FALSE))
+    {
+        for (comp = 0; comp < efptNR; comp++)
+        {
+            lambda[comp] = 0.0;
+            if (lam0)
+            {
+                lam0[comp] = 0.0;
+            }
+        }
+        return;
+    }
+
+    /* this might overwrite the checkpoint if checkpoint is set -- a kludge
+     * is in for now to prevent this.
+     */
+    *fep_state = fep->init_fep_state;
+
+    for (comp = 0; comp < efptNR; comp++)
+    {
+        /* overwrite lambda state with init_lambda for now for backwards compatibility */
+        if (fep->init_lambda >= 0) /* if it's -1, it was never initialized */
+        {
+            lambda[comp] = fep->init_lambda;
+        }
+        else
+        {
+            lambda[comp] = fep->all_lambda[comp][*fep_state];
+        }
+        if (lam0)
+        {
+            lam0[comp] = lambda[comp];
+        }
+    }
+
+    if (ir->bSimTemp)
+    {
+        /* need to rescale control temperatures to match current state */
+        for (grp = 0; grp < ir->opts.ngtc; grp++)
+        {
+            if (ir->opts.ref_t[grp] > 0)
+            {
+                ir->opts.ref_t[grp] = ir->simtempvals->temperatures[*fep_state];
+            }
+        }
+    }
+
+    /* Send to the log the information on the current lambdas */
+    if (fplog != NULL)
+    {
+        fprintf(fplog, "Initial vector of lambda components:[ ");
+        for (comp = 0; comp < efptNR; comp++)
+        {
+            fprintf(fplog, "%10.4f ", lambda[comp]);
+        }
+        fprintf(fplog, "]\n");
+    }
+}
+
+
+/* One-time set-up before the MD loop.
+ *
+ * Sets *t and *t0 to ir->init_t, detects simulated annealing (*bSimAnn) and
+ * updates the annealing target temperature if needed, initializes the
+ * lambdas, creates the update and center-of-mass-motion data when upd / vcm
+ * are non-NULL, prints literature references, resets the flop counters,
+ * opens the output files and energy bin unless nfile == -1, and clears
+ * force_vir, shake_vir and mu_tot.
+ */
+void init_md(FILE *fplog,
+             t_commrec *cr, t_inputrec *ir, const output_env_t oenv,
+             double *t, double *t0,
+             real *lambda, int *fep_state, double *lam0,
+             t_nrnb *nrnb, gmx_mtop_t *mtop,
+             gmx_update_t *upd,
+             int nfile, const t_filenm fnm[],
+             gmx_mdoutf_t **outf, t_mdebin **mdebin,
+             tensor force_vir, tensor shake_vir, rvec mu_tot,
+             gmx_bool *bSimAnn, t_vcm **vcm, t_state *state, unsigned long Flags)
+{
+    gmx_bool bAnyAnnealing = FALSE;
+    int      grp;
+
+    /* Initial values */
+    *t0 = ir->init_t;
+    *t  = *t0;
+
+    /* Annealing is active as soon as one temperature group uses it */
+    for (grp = 0; grp < ir->opts.ngtc; grp++)
+    {
+        if (ir->opts.annealing[grp] != eannNO)
+        {
+            bAnyAnnealing = TRUE;
+            break;
+        }
+    }
+    *bSimAnn = bAnyAnnealing;
+    if (*bSimAnn)
+    {
+        update_annealing_target_temp(&(ir->opts), ir->init_t);
+    }
+
+    /* Initialize lambda variables */
+    initialize_lambdas(fplog, ir, fep_state, lambda, lam0);
+
+    if (upd)
+    {
+        *upd = init_update(fplog, ir);
+    }
+
+    if (vcm != NULL)
+    {
+        *vcm = init_vcm(fplog, &mtop->groups, ir);
+    }
+
+    /* Thermostat references are skipped when appending to existing files */
+    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
+    {
+        switch (ir->etc)
+        {
+            case etcBERENDSEN:
+                please_cite(fplog, "Berendsen84a");
+                break;
+            case etcVRESCALE:
+                please_cite(fplog, "Bussi2007a");
+                break;
+            default:
+                break;
+        }
+    }
+
+    init_nrnb(nrnb);
+
+    if (nfile != -1)
+    {
+        *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
+
+        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
+                              mtop, ir, (*outf)->fp_dhdl);
+    }
+
+    if (ir->bAdress)
+    {
+        please_cite(fplog, "Fritsch12");
+        please_cite(fplog, "Junghans10");
+    }
+
+    /* Initiate variables */
+    clear_mat(force_vir);
+    clear_mat(shake_vir);
+    clear_rvec(mu_tot);
+
+    debug_gmx();
+}
--- /dev/null
- #pragma omp parallel num_threads(ngr)
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * GROwing Monsters And Cloning Shrimps
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+#include <stdio.h>
+#include <math.h>
+
+#include "types/commrec.h"
+#include "sysstuff.h"
+#include "smalloc.h"
+#include "typedefs.h"
+#include "nrnb.h"
+#include "physics.h"
+#include "macros.h"
+#include "vec.h"
+#include "main.h"
+#include "confio.h"
+#include "update.h"
+#include "gmx_random.h"
+#include "futil.h"
+#include "mshift.h"
+#include "tgroup.h"
+#include "force.h"
+#include "names.h"
+#include "txtdump.h"
+#include "mdrun.h"
+#include "constr.h"
+#include "edsam.h"
+#include "pull.h"
+#include "disre.h"
+#include "orires.h"
+#include "gmx_wallcycle.h"
+#include "gmx_omp_nthreads.h"
+#include "gmx_omp.h"
+
+/* For debugging, start at v(-dt/2) for velocity Verlet -- uncomment the next line */
+/*#define STARTFROMDT2*/
+
+typedef struct { /* Per temperature-coupling-group constants for the SD integrators, filled in by init_stochd() */
+ double gdt; /* delta_t/tau_t, or 0 when tau_t <= 0 (no friction and noise) */
+ double eph; /* exp(+gdt/2) */
+ double emh; /* exp(-gdt/2) */
+ double em; /* exp(-gdt) */
+ double b; /* gdt*(eph^2 - 1) - 4*(eph - 1)^2; series expansion is used for gdt < 0.05 */
+ double c; /* gdt - 3 + 4*emh - em; series expansion is used for gdt < 0.05 */
+ double d; /* 2 - eph - emh; series expansion is used for gdt < 0.05 */
+} gmx_sd_const_t;
+
+typedef struct { /* Per temperature-coupling-group noise amplitudes for SD; the atom mass is applied later via sqrt(invmass) */
+ real V; /* velocity noise: sqrt(kT*(1 - em)) for SD2, sqrt(kT*(1 - em*em)) for SD1 */
+ real X; /* position noise for SD2: sqrt(kT*tau_t^2*c) */
+ real Yv; /* SD2: sqrt(kT*b/c) */
+ real Yx; /* SD2: sqrt(kT*tau_t^2*b/(1 - em)) */
+} gmx_sd_sigma_t;
+
+typedef struct { /* State shared by the stochastic integrators and stochastic thermostats */
+ /* The random state for ngaussrand threads.
+ * Normal thermostats need just 1 random number generator,
+ * but SD and BD with OpenMP parallelization need 1 for each thread.
+ */
+ int ngaussrand; /* number of entries in gaussrand */
+ gmx_rng_t *gaussrand; /* generator 0 is seeded from ir->ld_seed, the others from generator 0 */
+ /* BD stuff */
+ real *bd_rf; /* per T-coupling group random-force prefactor, see do_update_bd_Tconsts() */
+ /* SD stuff */
+ gmx_sd_const_t *sdc; /* per T-coupling group constants */
+ gmx_sd_sigma_t *sdsig; /* per T-coupling group noise amplitudes */
+ rvec *sd_V; /* SD2 work array: random velocity part of the first half step, reused in the second half */
+ int sd_V_nalloc; /* allocated number of elements of sd_V */
+ /* andersen temperature control stuff */
+ gmx_bool *randomize_group; /* per T-coupling group: TRUE when tau_t > 0 and ref_t > 0 */
+ real *boltzfac; /* per T-coupling group: BOLTZ*ref_t, only set for randomized groups */
+} gmx_stochd_t;
+
+typedef struct gmx_update
+{ /* Integrator working data, created by init_update() */
+ gmx_stochd_t *sd; /* only assigned by init_update() for BD, SD, v-rescale or Andersen coupling */
+ /* xprime for constraint algorithms */
+ rvec *xp;
+ int xp_nalloc; /* allocation size belonging to xp */
+
+ /* variable size arrays for andersen */
+ gmx_bool *randatom;
+ int *randatom_list;
+ gmx_bool randatom_list_init; /* set to FALSE by init_update() */
+
+ /* Variables for the deform algorithm */
+ gmx_large_int_t deformref_step; /* reference step, set by set_deform_reference_box() */
+ matrix deformref_box; /* box at the reference step, set by set_deform_reference_box() */
+} t_gmx_update;
+
+
+static void do_update_md(int start, int nrend, double dt,
+ t_grp_tcstat *tcstat,
+ double nh_vxi[],
+ gmx_bool bNEMD, t_grp_acc *gstat, rvec accel[],
+ ivec nFreeze[],
+ real invmass[],
+ unsigned short ptype[], unsigned short cFREEZE[],
+ unsigned short cACC[], unsigned short cTC[],
+ rvec x[], rvec xprime[], rvec v[],
+ rvec f[], matrix M,
+ gmx_bool bNH, gmx_bool bPR)
+{ /* Updates the velocities v in place and writes the new positions to
+ * xprime for atoms start..nrend-1 using time step dt.
+ * One of three code paths is taken:
+ * 1) bNH or bPR: extended-ensemble update using nh_vxi[] and M;
+ * 2) freeze groups present or bNEMD: scaling with tcstat[].lambda,
+ * per-dimension freezing and group acceleration;
+ * 3) otherwise: plain update scaled with tcstat[].lambda.
+ * cFREEZE, cACC and cTC may be NULL; group index 0 is then used.
+ * Virtual sites and shells (and frozen dimensions in paths 1 and 2)
+ * get v = 0 and xprime = x.
+ */
+ double imass, w_dt;
+ int gf = 0, ga = 0, gt = 0;
+ rvec vrel;
+ real vn, vv, va, vb, vnrel;
+ real lg, vxi = 0, u;
+ int n, d;
+
+ if (bNH || bPR)
+ {
+ /* Update with coupling to extended ensembles, used for
+ * Nose-Hoover and Parrinello-Rahman coupling
+ * Nose-Hoover uses the reversible leap-frog integrator from
+ * Holian et al. Phys Rev E 52(3) : 2338, 1995
+ */
+ for (n = start; n < nrend; n++)
+ {
+ imass = invmass[n];
+ if (cFREEZE)
+ {
+ gf = cFREEZE[n];
+ }
+ if (cACC)
+ {
+ ga = cACC[n];
+ }
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+ lg = tcstat[gt].lambda;
+ if (bNH)
+ {
+ vxi = nh_vxi[gt];
+ }
+ rvec_sub(v[n], gstat[ga].u, vrel); /* velocity relative to the mean velocity of the acceleration group */
+
+ for (d = 0; d < DIM; d++)
+ {
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ {
+ vnrel = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
+ - iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
+ /* do not scale the mean velocities u */
+ vn = gstat[ga].u[d] + accel[ga][d]*dt + vnrel;
+ v[n][d] = vn;
+ xprime[n][d] = x[n][d]+vn*dt;
+ }
+ else
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+ }
+ else if (cFREEZE != NULL ||
+ nFreeze[0][XX] || nFreeze[0][YY] || nFreeze[0][ZZ] ||
+ bNEMD)
+ {
+ /* Update with Berendsen/v-rescale coupling and freeze or NEMD */
+ for (n = start; n < nrend; n++)
+ {
+ w_dt = invmass[n]*dt;
+ if (cFREEZE)
+ {
+ gf = cFREEZE[n];
+ }
+ if (cACC)
+ {
+ ga = cACC[n];
+ }
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+ lg = tcstat[gt].lambda;
+
+ for (d = 0; d < DIM; d++)
+ {
+ vn = v[n][d];
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ {
+ vv = lg*vn + f[n][d]*w_dt;
+
+ /* do not scale the mean velocities u */
+ u = gstat[ga].u[d];
+ va = vv + accel[ga][d]*dt;
+ vb = va + (1.0-lg)*u; /* adds back the part of u that the lambda scaling removed */
+ v[n][d] = vb;
+ xprime[n][d] = x[n][d]+vb*dt;
+ }
+ else
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+ }
+ else
+ {
+ /* Plain update with Berendsen/v-rescale coupling */
+ for (n = start; n < nrend; n++)
+ {
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+ {
+ w_dt = invmass[n]*dt;
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+ lg = tcstat[gt].lambda;
+
+ for (d = 0; d < DIM; d++)
+ {
+ vn = lg*v[n][d] + f[n][d]*w_dt;
+ v[n][d] = vn;
+ xprime[n][d] = x[n][d] + vn*dt;
+ }
+ }
+ else
+ {
+ for (d = 0; d < DIM; d++)
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+ }
+}
+
+/* Velocity half-step of the velocity Verlet integrator.
+ *
+ * Updates v in place for atoms start..nrend-1 using the forces f, the
+ * inverse masses invmass and the constant group accelerations accel.
+ * With bExtended the velocities are additionally scaled with factors
+ * derived from veta and alpha; otherwise both factors are 1.
+ * cFREEZE and cACC may be NULL, in which case group index 0 is used.
+ * Virtual sites, shells and frozen dimensions get v = 0.
+ * The parameters tcstat and gstat are not used by this routine.
+ */
+static void do_update_vv_vel(int start, int nrend, double dt,
+                             t_grp_tcstat *tcstat, t_grp_acc *gstat,
+                             rvec accel[], ivec nFreeze[], real invmass[],
+                             unsigned short ptype[], unsigned short cFREEZE[],
+                             unsigned short cACC[], rvec v[], rvec f[],
+                             gmx_bool bExtended, real veta, real alpha)
+{
+    double w_dt;
+    int    gf = 0, ga = 0;
+    int    n, d;
+    double g, mv1, mv2;
+
+    if (bExtended)
+    {
+        g   = 0.25*dt*veta*alpha;
+        mv1 = exp(-g);
+        mv2 = series_sinhx(g);
+    }
+    else
+    {
+        /* No extended-ensemble scaling */
+        mv1 = 1.0;
+        mv2 = 1.0;
+    }
+    for (n = start; n < nrend; n++)
+    {
+        w_dt = invmass[n]*dt;
+        if (cFREEZE)
+        {
+            gf = cFREEZE[n];
+        }
+        if (cACC)
+        {
+            ga = cACC[n];
+        }
+
+        for (d = 0; d < DIM; d++)
+        {
+            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+            {
+                /* mv1 is applied twice to the old velocity and once to the force term */
+                v[n][d] = mv1*(mv1*v[n][d] + 0.5*(w_dt*mv2*f[n][d]))+0.5*accel[ga][d]*dt;
+            }
+            else
+            {
+                v[n][d] = 0.0;
+            }
+        }
+    }
+} /* do_update_vv_vel */
+
+/* Position step of the velocity Verlet integrator.
+ *
+ * Writes the new positions to xprime for atoms start..nrend-1 from the
+ * positions x and the velocities v. With bExtended the update is scaled
+ * with factors derived from veta; otherwise both factors are 1.
+ * cFREEZE may be NULL, in which case freeze group 0 is used.
+ * Virtual sites, shells and frozen dimensions get xprime = x.
+ * The parameters tcstat, gstat, accel, invmass, f and alpha are not used
+ * by this routine.
+ */
+static void do_update_vv_pos(int start, int nrend, double dt,
+                             t_grp_tcstat *tcstat, t_grp_acc *gstat,
+                             rvec accel[], ivec nFreeze[], real invmass[],
+                             unsigned short ptype[], unsigned short cFREEZE[],
+                             rvec x[], rvec xprime[], rvec v[],
+                             rvec f[], gmx_bool bExtended, real veta, real alpha)
+{
+    int    gf = 0;
+    int    n, d;
+    double g, mr1, mr2;
+
+    /* Would it make more sense if Parrinello-Rahman was put here? */
+    if (bExtended)
+    {
+        g   = 0.5*dt*veta;
+        mr1 = exp(g);
+        mr2 = series_sinhx(g);
+    }
+    else
+    {
+        /* No extended-ensemble scaling */
+        mr1 = 1.0;
+        mr2 = 1.0;
+    }
+
+    for (n = start; n < nrend; n++)
+    {
+        if (cFREEZE)
+        {
+            gf = cFREEZE[n];
+        }
+
+        for (d = 0; d < DIM; d++)
+        {
+            if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+            {
+                xprime[n][d] = mr1*(mr1*x[n][d]+mr2*dt*v[n][d]);
+            }
+            else
+            {
+                xprime[n][d] = x[n][d];
+            }
+        }
+    }
+} /* do_update_vv_pos */
+
+static void do_update_visc(int start, int nrend, double dt,
+ t_grp_tcstat *tcstat,
+ double nh_vxi[],
+ real invmass[],
+ unsigned short ptype[], unsigned short cTC[],
+ rvec x[], rvec xprime[], rvec v[],
+ rvec f[], matrix M, matrix box, real
+ cos_accel, real vcos,
+ gmx_bool bNH, gmx_bool bPR)
+{ /* Update of v and xprime for atoms start..nrend-1 with a cosine-shaped
+ * acceleration profile: the x-component of the velocity gets a
+ * contribution proportional to cos(2*pi*z/box[ZZ][ZZ]) with amplitude
+ * cos_accel, and the current profile amplitude vcos is excluded from
+ * the temperature scaling. Freeze and acceleration groups are not
+ * handled here. cTC may be NULL, in which case T-coupling group 0 is used.
+ */
+ double imass, w_dt;
+ int gt = 0;
+ real vn, vc;
+ real lg, vxi = 0, vv;
+ real fac, cosz;
+ rvec vrel;
+ int n, d;
+
+ fac = 2*M_PI/(box[ZZ][ZZ]); /* wave number of the cosine profile along z */
+
+ if (bNH || bPR)
+ {
+ /* Update with coupling to extended ensembles, used for
+ * Nose-Hoover and Parrinello-Rahman coupling
+ */
+ for (n = start; n < nrend; n++)
+ {
+ imass = invmass[n];
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+ lg = tcstat[gt].lambda;
+ cosz = cos(fac*x[n][ZZ]);
+
+ copy_rvec(v[n], vrel);
+
+ vc = cosz*vcos;
+ vrel[XX] -= vc; /* velocity relative to the cosine profile */
+ if (bNH)
+ {
+ vxi = nh_vxi[gt];
+ }
+ for (d = 0; d < DIM; d++)
+ {
+ vn = v[n][d];
+
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+ {
+ vn = (lg*vrel[d] + dt*(imass*f[n][d] - 0.5*vxi*vrel[d]
+ - iprod(M[d], vrel)))/(1 + 0.5*vxi*dt);
+ if (d == XX)
+ {
+ vn += vc + dt*cosz*cos_accel;
+ }
+ v[n][d] = vn;
+ xprime[n][d] = x[n][d]+vn*dt;
+ }
+ else
+ {
+ xprime[n][d] = x[n][d]; /* NOTE(review): unlike the Berendsen branch below, v[n][d] is not set to 0 here -- confirm this is intended */
+ }
+ }
+ }
+ }
+ else
+ {
+ /* Classic version of update, used with berendsen coupling */
+ for (n = start; n < nrend; n++)
+ {
+ w_dt = invmass[n]*dt;
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+ lg = tcstat[gt].lambda;
+ cosz = cos(fac*x[n][ZZ]);
+
+ for (d = 0; d < DIM; d++)
+ {
+ vn = v[n][d];
+
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell))
+ {
+ if (d == XX)
+ {
+ vc = cosz*vcos;
+ /* Do not scale the cosine velocity profile */
+ vv = vc + lg*(vn - vc + f[n][d]*w_dt);
+ /* Add the cosine acceleration profile */
+ vv += dt*cosz*cos_accel;
+ }
+ else
+ {
+ vv = lg*(vn + f[n][d]*w_dt);
+ }
+ v[n][d] = vv;
+ xprime[n][d] = x[n][d]+vv*dt;
+ }
+ else
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+ }
+}
+
+/* Allocates and initializes sd->gaussrand[i] for i=1, i<sd->ngaussrand,
+ * Using seeds generated from sd->gaussrand[0].
+ * The seeds are drawn serially before the parallel region, one draw from
+ * generator 0 per additional generator. Each additional generator is then
+ * created inside the OpenMP region by the thread with the matching index.
+ * gmx_incons() is called when sd->ngaussrand differs from
+ * gmx_omp_nthreads_get(emntUpdate).
+ */
+static void init_multiple_gaussrand(gmx_stochd_t *sd)
+{
+ int ngr, i;
+ unsigned int *seed;
+
+ ngr = sd->ngaussrand;
+ snew(seed, ngr);
+
+ for (i = 1; i < ngr; i++) /* seed[0] is never used; generator 0 is the seed source */
+ {
+ seed[i] = gmx_rng_uniform_uint32(sd->gaussrand[0]);
+ }
+
- /* Initialize on each thread to have thread-local memory alloced */
++ if (ngr != gmx_omp_nthreads_get(emntUpdate))
++ {
++ gmx_incons("The number of Gaussian number generators should be equal to gmx_omp_nthreads_get(emntUpdate)");
++ }
++
++#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
+ {
+ int th;
+
+ th = gmx_omp_get_thread_num();
+ if (th > 0)
+ {
- int ngtc, real tau_t[], real ref_t[],
++ /* Initialize on each thread to get memory allocated thread-local */
+ sd->gaussrand[th] = gmx_rng_init(seed[th]);
+ }
+ }
+
+ sfree(seed);
+}
+
+/* Allocates and initializes the stochastic-dynamics data.
+ *
+ * Creates nthreads random number generators for BD and SD, and a single
+ * one otherwise; generator 0 is seeded with ir->ld_seed. Depending on the
+ * integrator and thermostat it also sets up, per temperature-coupling group:
+ *  - BD:       the (uninitialized-here) random force array bd_rf;
+ *  - SD:       the constants in sdc[] and the array sdsig[];
+ *  - Andersen: randomize_group[] and boltzfac[].
+ * The parameter fplog is not used by this routine.
+ * Returns the newly allocated structure.
+ */
+static gmx_stochd_t *init_stochd(FILE *fplog, t_inputrec *ir, int nthreads)
+{
+    gmx_stochd_t   *sd;
+    gmx_sd_const_t *sdc;
+    int             ngtc, n;
+    real            y;
+
+    snew(sd, 1);
+
+    /* Initiate random number generator for langevin type dynamics,
+     * for BD, SD or velocity rescaling temperature coupling.
+     */
+    if (ir->eI == eiBD || EI_SD(ir->eI))
+    {
+        sd->ngaussrand = nthreads;
+    }
+    else
+    {
+        sd->ngaussrand = 1;
+    }
+    snew(sd->gaussrand, sd->ngaussrand);
+
+    /* Initialize the first random generator */
+    sd->gaussrand[0] = gmx_rng_init(ir->ld_seed);
+
+    if (sd->ngaussrand > 1)
+    {
+        /* Initialize the rest of the random number generators,
+         * using the first one to generate seeds.
+         */
+        init_multiple_gaussrand(sd);
+    }
+
+    ngtc = ir->opts.ngtc;
+
+    if (ir->eI == eiBD)
+    {
+        snew(sd->bd_rf, ngtc);
+    }
+    else if (EI_SD(ir->eI))
+    {
+        snew(sd->sdc, ngtc);
+        snew(sd->sdsig, ngtc);
+
+        sdc = sd->sdc;
+        for (n = 0; n < ngtc; n++)
+        {
+            if (ir->opts.tau_t[n] > 0)
+            {
+                sdc[n].gdt = ir->delta_t/ir->opts.tau_t[n];
+                sdc[n].eph = exp(sdc[n].gdt/2);
+                sdc[n].emh = exp(-sdc[n].gdt/2);
+                sdc[n].em  = exp(-sdc[n].gdt);
+            }
+            else
+            {
+                /* No friction and noise on this group */
+                sdc[n].gdt = 0;
+                sdc[n].eph = 1;
+                sdc[n].emh = 1;
+                sdc[n].em  = 1;
+            }
+            if (sdc[n].gdt >= 0.05)
+            {
+                sdc[n].b = sdc[n].gdt*(sdc[n].eph*sdc[n].eph - 1)
+                    - 4*(sdc[n].eph - 1)*(sdc[n].eph - 1);
+                sdc[n].c = sdc[n].gdt - 3 + 4*sdc[n].emh - sdc[n].em;
+                sdc[n].d = 2 - sdc[n].eph - sdc[n].emh;
+            }
+            else
+            {
+                y = sdc[n].gdt/2;
+                /* Seventh order expansions for small y */
+                sdc[n].b = y*y*y*y*(1/3.0+y*(1/3.0+y*(17/90.0+y*7/9.0)));
+                sdc[n].c = y*y*y*(2/3.0+y*(-1/2.0+y*(7/30.0+y*(-1/12.0+y*31/1260.0))));
+                sdc[n].d = y*y*(-1+y*y*(-1/12.0-y*y/360.0));
+            }
+            if (debug)
+            {
+                fprintf(debug, "SD const tc-grp %d: b %g c %g d %g\n",
+                        n, sdc[n].b, sdc[n].c, sdc[n].d);
+            }
+        }
+    }
+    else if (ETC_ANDERSEN(ir->etc))
+    {
+        t_grpopts *opts;
+        real       reft;
+
+        opts = &ir->opts;
+
+        snew(sd->randomize_group, ngtc);
+        snew(sd->boltzfac, ngtc);
+
+        /* for now, assume that all groups, if randomized, are randomized at the same rate, i.e. tau_t is the same. */
+        /* since constraint groups don't necessarily match up with temperature groups! This is checked in readir.c */
+
+        for (n = 0; n < ngtc; n++)
+        {
+            reft = max(0.0, opts->ref_t[n]);
+            if ((opts->tau_t[n] > 0) && (reft > 0))  /* tau_t or ref_t = 0 means that no randomization is done */
+            {
+                sd->randomize_group[n] = TRUE;
+                sd->boltzfac[n]        = BOLTZ*opts->ref_t[n];
+            }
+            else
+            {
+                sd->randomize_group[n] = FALSE;
+            }
+        }
+    }
+    return sd;
+}
+
+/* Stores the state of random number generator 0 in state->ld_rng and
+ * state->ld_rngi. Additional generators, if present, are not saved.
+ */
+void get_stochd_state(gmx_update_t upd, t_state *state)
+{
+    gmx_stochd_t *sd = upd->sd;
+
+    /* Saving only the first generator is sufficient to avoid repetition */
+    gmx_rng_get_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi);
+}
+
+/* Restores random number generator 0 from state->ld_rng and
+ * state->ld_rngi[0]. When more than one generator is in use, the others
+ * are destroyed and recreated with fresh seeds drawn from generator 0.
+ */
+void set_stochd_state(gmx_update_t upd, t_state *state)
+{
+    gmx_stochd_t *sd = upd->sd;
+    int           gen;
+
+    gmx_rng_set_state(sd->gaussrand[0], state->ld_rng, state->ld_rngi[0]);
+
+    if (sd->ngaussrand <= 1)
+    {
+        return;
+    }
+
+    /* This point is only reached with SD or BD using OpenMP.
+     * The previous state of the extra generators is not recovered;
+     * reseeding them from generator 0 avoids repeating random numbers,
+     * which is what matters most. Exactly restoring all states with all
+     * MPI+OpenMP setups is difficult and gains little for an integrator
+     * that is random to begin with.
+     */
+    for (gen = 1; gen < sd->ngaussrand; gen++)
+    {
+        gmx_rng_destroy(sd->gaussrand[gen]);
+    }
+
+    init_multiple_gaussrand(sd);
+}
+
+/* Allocates the update data structure and returns it.
+ * The stochastic data is only created for BD, SD, v-rescale or Andersen
+ * coupling, using gmx_omp_nthreads_get(emntUpdate) threads.
+ */
+gmx_update_t init_update(FILE *fplog, t_inputrec *ir)
+{
+    t_gmx_update *update;
+    gmx_bool      bNeedStochd;
+
+    snew(update, 1);
+
+    bNeedStochd = (ir->eI == eiBD || EI_SD(ir->eI) ||
+                   ir->etc == etcVRESCALE || ETC_ANDERSEN(ir->etc));
+    if (bNeedStochd)
+    {
+        update->sd = init_stochd(fplog, ir, gmx_omp_nthreads_get(emntUpdate));
+    }
+
+    update->xp        = NULL;
+    update->xp_nalloc = 0;
+
+    update->randatom      = NULL;
+    update->randatom_list = NULL;
+    /* The Andersen list has not been cleared yet at this point */
+    update->randatom_list_init = FALSE;
+
+    return update;
+}
+
+static void do_update_sd1(gmx_stochd_t *sd,
+ gmx_rng_t gaussrand,
+ int start, int nrend, double dt,
+ rvec accel[], ivec nFreeze[],
+ real invmass[], unsigned short ptype[],
+ unsigned short cFREEZE[], unsigned short cACC[],
+ unsigned short cTC[],
+ rvec x[], rvec xprime[], rvec v[], rvec f[],
+ rvec sd_X[],
+ int ngtc, real tau_t[], real ref_t[])
+{ /* Single-step SD update: updates v in place and writes the new
+ * positions to xprime for atoms start..nrend-1, drawing one Gaussian
+ * random number from gaussrand per updated degree of freedom.
+ * cFREEZE, cACC and cTC may be NULL; group index 0 is then used.
+ * Virtual sites, shells and frozen dimensions get v = 0 and xprime = x.
+ * The parameter sd_X is not used by this routine.
+ */
+ gmx_sd_const_t *sdc;
+ gmx_sd_sigma_t *sig;
+ real kT;
+ int gf = 0, ga = 0, gt = 0;
+ real ism, sd_V;
+ int n, d;
+
+ sdc = sd->sdc;
+ sig = sd->sdsig;
+
+ for (n = 0; n < ngtc; n++) /* NOTE(review): sig[] lives in the shared sd; every call rewrites it -- confirm this is benign when called from several threads */
+ {
+ kT = BOLTZ*ref_t[n];
+ /* The mass is accounted for later, since this differs per atom */
+ sig[n].V = sqrt(kT*(1 - sdc[n].em*sdc[n].em));
+ }
+
+ for (n = start; n < nrend; n++)
+ {
+ ism = sqrt(invmass[n]); /* per-atom mass factor for the noise amplitude */
+ if (cFREEZE)
+ {
+ gf = cFREEZE[n];
+ }
+ if (cACC)
+ {
+ ga = cACC[n];
+ }
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+
+ for (d = 0; d < DIM; d++)
+ {
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ {
+ sd_V = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+
+ v[n][d] = v[n][d]*sdc[gt].em
+ + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
+ + sd_V;
+
+ xprime[n][d] = x[n][d] + v[n][d]*dt;
+ }
+ else
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+}
+
+/* Makes sure the SD2 work array sd->sd_V can hold at least nrend
+ * elements, growing it with over-allocation when it is too small.
+ */
+static void check_sd2_work_data_allocation(gmx_stochd_t *sd, int nrend)
+{
+    if (nrend <= sd->sd_V_nalloc)
+    {
+        /* Already large enough */
+        return;
+    }
+
+    sd->sd_V_nalloc = over_alloc_dd(nrend);
+    srenew(sd->sd_V, sd->sd_V_nalloc);
+}
+
++/* Computes the SD2 noise amplitudes sd->sdsig[] for all ngtc
++ * temperature-coupling groups from ref_t, tau_t and the constants in
++ * sd->sdc[]. Kept apart from do_update_sd2() because it runs single
++ * threaded.
++ */
++static void do_update_sd2_Tconsts(gmx_stochd_t *sd,
++                                  int           ngtc,
++                                  const real    tau_t[],
++                                  const real    ref_t[])
++{
++    int g;
++
++    for (g = 0; g < ngtc; g++)
++    {
++        const gmx_sd_const_t *cst = &sd->sdc[g];
++        gmx_sd_sigma_t       *amp = &sd->sdsig[g];
++        real                  kT  = BOLTZ*ref_t[g];
++
++        /* The atom mass is applied later, since it differs per atom */
++        amp->V  = sqrt(kT*(1-cst->em));
++        amp->X  = sqrt(kT*sqr(tau_t[g])*cst->c);
++        amp->Yv = sqrt(kT*cst->b/cst->c);
++        amp->Yx = sqrt(kT*sqr(tau_t[g])*cst->b/(1-cst->em));
++    }
++}
++
+static void do_update_sd2(gmx_stochd_t *sd,
+ gmx_rng_t gaussrand,
+ gmx_bool bInitStep,
+ int start, int nrend,
+ rvec accel[], ivec nFreeze[],
+ real invmass[], unsigned short ptype[],
+ unsigned short cFREEZE[], unsigned short cACC[],
+ unsigned short cTC[],
+ rvec x[], rvec xprime[], rvec v[], rvec f[],
+ rvec sd_X[],
- if (bFirstHalf)
- {
- for (n = 0; n < ngtc; n++)
- {
- kT = BOLTZ*ref_t[n];
- /* The mass is encounted for later, since this differs per atom */
- sig[n].V = sqrt(kT*(1-sdc[n].em));
- sig[n].X = sqrt(kT*sqr(tau_t[n])*sdc[n].c);
- sig[n].Yv = sqrt(kT*sdc[n].b/sdc[n].c);
- sig[n].Yx = sqrt(kT*sqr(tau_t[n])*sdc[n].b/(1-sdc[n].em));
- }
- }
-
++ const real tau_t[],
+ gmx_bool bFirstHalf)
+{ /* Two-stage SD update for atoms start..nrend-1.
+ * bFirstHalf TRUE: updates v in place, stores the random velocity part
+ * in sd->sd_V and writes the unconstrained positions to xprime; with
+ * bInitStep also draws the initial sd_X.
+ * bFirstHalf FALSE: recomputes v from xprime - x, draws a new sd_X and
+ * adds the random displacement to xprime.
+ * The noise amplitudes in sd->sdsig are read here, not computed.
+ * cFREEZE, cACC and cTC may be NULL; group index 0 is then used.
+ */
+ gmx_sd_const_t *sdc;
+ gmx_sd_sigma_t *sig;
+ /* The random part of the velocity update, generated in the first
+ * half of the update, needs to be remembered for the second half.
+ */
+ rvec *sd_V;
+ real kT;
+ int gf = 0, ga = 0, gt = 0;
+ real vn = 0, Vmh, Xmh;
+ real ism;
+ int n, d;
+
+ sdc = sd->sdc;
+ sig = sd->sdsig;
+ sd_V = sd->sd_V;
+
- int ngtc, real tau_t[], real ref_t[],
+ for (n = start; n < nrend; n++)
+ {
+ ism = sqrt(invmass[n]); /* per-atom mass factor for the noise amplitudes */
+ if (cFREEZE)
+ {
+ gf = cFREEZE[n];
+ }
+ if (cACC)
+ {
+ ga = cACC[n];
+ }
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+
+ for (d = 0; d < DIM; d++)
+ {
+ if (bFirstHalf)
+ {
+ vn = v[n][d];
+ }
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ {
+ if (bFirstHalf)
+ {
+ if (bInitStep)
+ {
+ sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+ }
+ Vmh = sd_X[n][d]*sdc[gt].d/(tau_t[gt]*sdc[gt].c)
+ + ism*sig[gt].Yv*gmx_rng_gaussian_table(gaussrand);
+ sd_V[n][d] = ism*sig[gt].V*gmx_rng_gaussian_table(gaussrand);
+
+ v[n][d] = vn*sdc[gt].em
+ + (invmass[n]*f[n][d] + accel[ga][d])*tau_t[gt]*(1 - sdc[gt].em)
+ + sd_V[n][d] - sdc[gt].em*Vmh;
+
+ xprime[n][d] = x[n][d] + v[n][d]*tau_t[gt]*(sdc[gt].eph - sdc[gt].emh);
+ }
+ else
+ {
+
+ /* Correct the velocities for the constraints.
+ * This operation introduces some inaccuracy,
+ * since the velocity is determined from differences in coordinates.
+ */
+ v[n][d] =
+ (xprime[n][d] - x[n][d])/(tau_t[gt]*(sdc[gt].eph - sdc[gt].emh));
+
+ Xmh = sd_V[n][d]*tau_t[gt]*sdc[gt].d/(sdc[gt].em-1)
+ + ism*sig[gt].Yx*gmx_rng_gaussian_table(gaussrand);
+ sd_X[n][d] = ism*sig[gt].X*gmx_rng_gaussian_table(gaussrand);
+
+ xprime[n][d] += sd_X[n][d] - Xmh;
+
+ }
+ }
+ else
+ {
+ if (bFirstHalf) /* in the second half, non-updated degrees of freedom are left untouched */
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+ }
+}
+
++/* Fills rf[] with the BD random-force prefactor for each of the ngtc
++ * temperature-coupling groups. With a non-zero friction coefficient the
++ * prefactor is sqrt(2*BOLTZ*ref_t/(friction_coefficient*dt)), otherwise
++ * sqrt(2*BOLTZ*ref_t). Kept apart from do_update_bd() because it runs
++ * single threaded.
++ */
++static void do_update_bd_Tconsts(double dt, real friction_coefficient,
++                                 int ngtc, const real ref_t[],
++                                 real *rf)
++{
++    int g;
++
++    for (g = 0; g < ngtc; g++)
++    {
++        if (friction_coefficient != 0)
++        {
++            rf[g] = sqrt(2.0*BOLTZ*ref_t[g]/(friction_coefficient*dt));
++        }
++        else
++        {
++            rf[g] = sqrt(2.0*BOLTZ*ref_t[g]);
++        }
++    }
++}
++
+static void do_update_bd(int start, int nrend, double dt,
+ ivec nFreeze[],
+ real invmass[], unsigned short ptype[],
+ unsigned short cFREEZE[], unsigned short cTC[],
+ rvec x[], rvec xprime[], rvec v[],
+ rvec f[], real friction_coefficient,
- for (n = 0; n < ngtc; n++)
- {
- rf[n] = sqrt(2.0*BOLTZ*ref_t[n]/(friction_coefficient*dt));
- }
- }
- else
- {
- for (n = 0; n < ngtc; n++)
- {
- rf[n] = sqrt(2.0*BOLTZ*ref_t[n]);
- }
+ real *rf, gmx_rng_t gaussrand)
+{ /* Brownian dynamics update for atoms start..nrend-1: sets v and writes
+ * xprime = x + v*dt, drawing one Gaussian random number from gaussrand
+ * per updated degree of freedom. rf[] holds the per T-coupling-group
+ * random-force prefactors and is only read here.
+ * cFREEZE and cTC may be NULL; group index 0 is then used.
+ * Virtual sites, shells and frozen dimensions get v = 0 and xprime = x.
+ */
+ /* note -- these appear to be full step velocities . . . */
+ int gf = 0, gt = 0;
+ real vn;
+ real invfr = 0;
+ int n, d;
+
+ if (friction_coefficient != 0)
+ {
+ invfr = 1.0/friction_coefficient;
- inputrec->opts.ngtc, inputrec->opts.tau_t,
- inputrec->opts.ref_t, FALSE);
+ }
++
+ for (n = start; (n < nrend); n++)
+ {
+ if (cFREEZE)
+ {
+ gf = cFREEZE[n];
+ }
+ if (cTC)
+ {
+ gt = cTC[n];
+ }
+ for (d = 0; (d < DIM); d++)
+ {
+ if ((ptype[n] != eptVSite) && (ptype[n] != eptShell) && !nFreeze[gf][d])
+ {
+ if (friction_coefficient != 0)
+ {
+ vn = invfr*f[n][d] + rf[gt]*gmx_rng_gaussian_table(gaussrand);
+ }
+ else
+ {
+ /* NOTE: invmass = 2/(mass*friction_constant*dt) */
+ vn = 0.5*invmass[n]*f[n][d]*dt
+ + sqrt(0.5*invmass[n])*rf[gt]*gmx_rng_gaussian_table(gaussrand);
+ }
+
+ v[n][d] = vn;
+ xprime[n][d] = x[n][d]+vn*dt;
+ }
+ else
+ {
+ v[n][d] = 0.0;
+ xprime[n][d] = x[n][d];
+ }
+ }
+ }
+}
+
+/* Debug helper: when compiled with DEBUG and fp is not NULL, prints the
+ * title followed by the x, xp, v and f vectors of natoms atoms to fp.
+ * Without DEBUG this routine does nothing.
+ */
+static void dump_it_all(FILE *fp, const char *title,
+                        int natoms, rvec x[], rvec xp[], rvec v[], rvec f[])
+{
+#ifdef DEBUG
+    if (!fp)
+    {
+        return;
+    }
+
+    fprintf(fp, "%s\n", title);
+    pr_rvecs(fp, 0, "x", x, natoms);
+    pr_rvecs(fp, 0, "xp", xp, natoms);
+    pr_rvecs(fp, 0, "v", v, natoms);
+    pr_rvecs(fp, 0, "f", f, natoms);
+#endif
+}
+
+static void calc_ke_part_normal(rvec v[], t_grpopts *opts, t_mdatoms *md,
+ gmx_ekindata_t *ekind, t_nrnb *nrnb, gmx_bool bEkinAveVel,
+ gmx_bool bSaveEkinOld)
+{ /* Computes the kinetic energy tensor per temperature-coupling group
+ * and the mass-perturbation term dekindl from the velocities v of the
+ * home atoms. The atom range is split over the update threads; each
+ * thread sums into its own work arrays, which are reduced serially
+ * afterwards.
+ */
+ int g;
+ t_grp_tcstat *tcstat = ekind->tcstat;
+ t_grp_acc *grpstat = ekind->grpstat;
+ int nthread, thread;
+
+ /* three main: VV with AveVel, vv with AveEkin, leap with AveEkin. Leap with AveVel is also
+ an option, but not supported now. Additionally, if we are doing iterations.
+ bEkinAveVel: If TRUE, we sum into ekin, if FALSE, into ekinh.
+ bSaveEkinOld: If TRUE (in the case of iteration = bIterate is TRUE), we don't copy over the ekinh_old.
+ If FALSE, we overwrite it.
+ */
+
+ /* group velocities are calculated in update_ekindata and
+ * accumulated in acumulate_groups.
+ * Now the partial global and groups ekin.
+ */
+ for (g = 0; (g < opts->ngtc); g++)
+ {
+
+ if (!bSaveEkinOld)
+ {
+ copy_mat(tcstat[g].ekinh, tcstat[g].ekinh_old);
+ }
+ if (bEkinAveVel)
+ {
+ clear_mat(tcstat[g].ekinf);
+ }
+ else
+ {
+ clear_mat(tcstat[g].ekinh);
+ }
+ if (bEkinAveVel)
+ {
+ tcstat[g].ekinscalef_nhc = 1.0; /* need to clear this -- logic is complicated! */
+ }
+ }
+ ekind->dekindl_old = ekind->dekindl;
+
+ nthread = gmx_omp_nthreads_get(emntUpdate);
+
+#pragma omp parallel for num_threads(nthread) schedule(static)
+ for (thread = 0; thread < nthread; thread++)
+ {
+ int start_t, end_t, n;
+ int ga, gt;
+ rvec v_corrt;
+ real hm;
+ int d, m;
+ matrix *ekin_sum;
+ real *dekindl_sum;
+
+ start_t = md->start + ((thread+0)*md->homenr)/nthread; /* contiguous, non-overlapping atom range for this thread */
+ end_t = md->start + ((thread+1)*md->homenr)/nthread;
+
+ ekin_sum = ekind->ekin_work[thread]; /* per-thread accumulators */
+ dekindl_sum = ekind->dekindl_work[thread];
+
+ for (gt = 0; gt < opts->ngtc; gt++)
+ {
+ clear_mat(ekin_sum[gt]);
+ }
+ *dekindl_sum = 0.0;
+
+ ga = 0;
+ gt = 0;
+ for (n = start_t; n < end_t; n++)
+ {
+ if (md->cACC)
+ {
+ ga = md->cACC[n];
+ }
+ if (md->cTC)
+ {
+ gt = md->cTC[n];
+ }
+ hm = 0.5*md->massT[n];
+
+ for (d = 0; (d < DIM); d++)
+ {
+ v_corrt[d] = v[n][d] - grpstat[ga].u[d]; /* subtract the mean velocity of the acceleration group */
+ }
+ for (d = 0; (d < DIM); d++)
+ {
+ for (m = 0; (m < DIM); m++)
+ {
+ /* if we're computing a full step velocity, v_corrt[d] has v(t). Otherwise, v(t+dt/2) */
+ ekin_sum[gt][m][d] += hm*v_corrt[m]*v_corrt[d];
+ }
+ }
+ if (md->nMassPerturbed && md->bPerturbed[n])
+ {
+ *dekindl_sum +=
+ 0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt, v_corrt);
+ }
+ }
+ }
+
+ ekind->dekindl = 0;
+ for (thread = 0; thread < nthread; thread++) /* serial reduction of the per-thread sums */
+ {
+ for (g = 0; g < opts->ngtc; g++)
+ {
+ if (bEkinAveVel)
+ {
+ m_add(tcstat[g].ekinf, ekind->ekin_work[thread][g],
+ tcstat[g].ekinf);
+ }
+ else
+ {
+ m_add(tcstat[g].ekinh, ekind->ekin_work[thread][g],
+ tcstat[g].ekinh);
+ }
+ }
+
+ ekind->dekindl += *ekind->dekindl_work[thread];
+ }
+
+ inc_nrnb(nrnb, eNR_EKIN, md->homenr);
+}
+
+static void calc_ke_part_visc(matrix box, rvec x[], rvec v[],
+ t_grpopts *opts, t_mdatoms *md,
+ gmx_ekindata_t *ekind,
+ t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
+{ /* Kinetic energy calculation for runs with a cosine acceleration
+ * profile: the profile velocity cos(2*pi*z/box[ZZ][ZZ])*vcos is
+ * subtracted from v[XX] before summing, and the new profile amplitude
+ * sum is stored in ekind->cosacc.mvcos. Runs single threaded.
+ * NOTE(review): bSaveEkinOld is not used here; ekinh is always copied
+ * to ekinh_old and cleared, and ekinf is never cleared even when
+ * bEkinAveVel is TRUE -- confirm against calc_ke_part_normal().
+ */
+ int start = md->start, homenr = md->homenr;
+ int g, d, n, m, gt = 0;
+ rvec v_corrt;
+ real hm;
+ t_grp_tcstat *tcstat = ekind->tcstat;
+ t_cos_acc *cosacc = &(ekind->cosacc);
+ real dekindl;
+ real fac, cosz;
+ double mvcos;
+
+ for (g = 0; g < opts->ngtc; g++)
+ {
+ copy_mat(ekind->tcstat[g].ekinh, ekind->tcstat[g].ekinh_old);
+ clear_mat(ekind->tcstat[g].ekinh);
+ }
+ ekind->dekindl_old = ekind->dekindl;
+
+ fac = 2*M_PI/box[ZZ][ZZ]; /* wave number of the cosine profile along z */
+ mvcos = 0;
+ dekindl = 0;
+ for (n = start; n < start+homenr; n++)
+ {
+ if (md->cTC)
+ {
+ gt = md->cTC[n];
+ }
+ hm = 0.5*md->massT[n];
+
+ /* Note that the times of x and v differ by half a step */
+ /* MRS -- would have to be changed for VV */
+ cosz = cos(fac*x[n][ZZ]);
+ /* Calculate the amplitude of the new velocity profile */
+ mvcos += 2*cosz*md->massT[n]*v[n][XX];
+
+ copy_rvec(v[n], v_corrt);
+ /* Subtract the profile for the kinetic energy */
+ v_corrt[XX] -= cosz*cosacc->vcos;
+ for (d = 0; (d < DIM); d++)
+ {
+ for (m = 0; (m < DIM); m++)
+ {
+ /* if we're computing a full step velocity, v_corrt[d] has v(t). Otherwise, v(t+dt/2) */
+ if (bEkinAveVel)
+ {
+ tcstat[gt].ekinf[m][d] += hm*v_corrt[m]*v_corrt[d];
+ }
+ else
+ {
+ tcstat[gt].ekinh[m][d] += hm*v_corrt[m]*v_corrt[d];
+ }
+ }
+ }
+ if (md->nPerturbed && md->bPerturbed[n]) /* NOTE(review): calc_ke_part_normal() tests md->nMassPerturbed here -- confirm which is intended */
+ {
+ dekindl += 0.5*(md->massB[n] - md->massA[n])*iprod(v_corrt, v_corrt);
+ }
+ }
+ ekind->dekindl = dekindl;
+ cosacc->mvcos = mvcos;
+
+ inc_nrnb(nrnb, eNR_EKIN, homenr);
+}
+
+/* Computes the kinetic energy contributions for the home atoms.
+ * Dispatches to the cosine-acceleration variant when
+ * ekind->cosacc.cos_accel is non-zero, and to the normal variant otherwise.
+ */
+void calc_ke_part(t_state *state, t_grpopts *opts, t_mdatoms *md,
+                  gmx_ekindata_t *ekind, t_nrnb *nrnb, gmx_bool bEkinAveVel, gmx_bool bSaveEkinOld)
+{
+    gmx_bool bCosAccel = (ekind->cosacc.cos_accel != 0);
+
+    if (bCosAccel)
+    {
+        calc_ke_part_visc(state->box, state->x, state->v, opts, md, ekind, nrnb, bEkinAveVel, bSaveEkinOld);
+    }
+    else
+    {
+        calc_ke_part_normal(state->v, opts, md, ekind, nrnb, bEkinAveVel, bSaveEkinOld);
+    }
+}
+
+/* Allocates the per temperature-coupling-group arrays of ekinstate for
+ * ir->opts.ngtc groups and resets dekindl and mvcos to zero.
+ */
+extern void init_ekinstate(ekinstate_t *ekinstate, const t_inputrec *ir)
+{
+    int ngroups = ir->opts.ngtc;
+
+    ekinstate->ekin_n = ngroups;
+
+    /* Kinetic energy tensors */
+    snew(ekinstate->ekinh, ngroups);
+    snew(ekinstate->ekinf, ngroups);
+    snew(ekinstate->ekinh_old, ngroups);
+
+    /* Scaling factors */
+    snew(ekinstate->ekinscalef_nhc, ngroups);
+    snew(ekinstate->ekinscaleh_nhc, ngroups);
+    snew(ekinstate->vscale_nhc, ngroups);
+
+    ekinstate->dekindl = 0;
+    ekinstate->mvcos   = 0;
+}
+
+/* Copies the kinetic energy data from ekind into ekinstate:
+ * the per-group tensors and scaling factors for ekinstate->ekin_n groups,
+ * the total tensor, dekindl and the cosine-acceleration mvcos.
+ */
+void update_ekinstate(ekinstate_t *ekinstate, gmx_ekindata_t *ekind)
+{
+    t_grp_tcstat *tc;
+    int           g;
+
+    for (g = 0; g < ekinstate->ekin_n; g++)
+    {
+        tc = &ekind->tcstat[g];
+
+        copy_mat(tc->ekinh, ekinstate->ekinh[g]);
+        copy_mat(tc->ekinf, ekinstate->ekinf[g]);
+        copy_mat(tc->ekinh_old, ekinstate->ekinh_old[g]);
+        ekinstate->ekinscalef_nhc[g] = tc->ekinscalef_nhc;
+        ekinstate->ekinscaleh_nhc[g] = tc->ekinscaleh_nhc;
+        ekinstate->vscale_nhc[g]     = tc->vscale_nhc;
+    }
+
+    copy_mat(ekind->ekin, ekinstate->ekin_total);
+    ekinstate->dekindl = ekind->dekindl;
+    ekinstate->mvcos   = ekind->cosacc.mvcos;
+}
+
+/* Restores the kinetic energy data in ekind from ekinstate.
+ * The master rank copies the data; in a parallel run the group count
+ * and all restored quantities are then broadcast with gmx_bcast().
+ */
+void restore_ekinstate_from_state(t_commrec *cr,
+                                  gmx_ekindata_t *ekind, ekinstate_t *ekinstate)
+{
+    t_grp_tcstat *tc;
+    int           g, ngroups;
+
+    if (MASTER(cr))
+    {
+        for (g = 0; g < ekinstate->ekin_n; g++)
+        {
+            tc = &ekind->tcstat[g];
+
+            copy_mat(ekinstate->ekinh[g], tc->ekinh);
+            copy_mat(ekinstate->ekinf[g], tc->ekinf);
+            copy_mat(ekinstate->ekinh_old[g], tc->ekinh_old);
+            tc->ekinscalef_nhc = ekinstate->ekinscalef_nhc[g];
+            tc->ekinscaleh_nhc = ekinstate->ekinscaleh_nhc[g];
+            tc->vscale_nhc     = ekinstate->vscale_nhc[g];
+        }
+
+        copy_mat(ekinstate->ekin_total, ekind->ekin);
+
+        ekind->dekindl      = ekinstate->dekindl;
+        ekind->cosacc.mvcos = ekinstate->mvcos;
+        ngroups             = ekinstate->ekin_n;
+    }
+
+    if (!PAR(cr))
+    {
+        return;
+    }
+
+    /* Non-master ranks receive the group count first */
+    gmx_bcast(sizeof(ngroups), &ngroups, cr);
+    for (g = 0; g < ngroups; g++)
+    {
+        tc = &ekind->tcstat[g];
+
+        gmx_bcast(DIM*DIM*sizeof(tc->ekinh[0][0]),
+                  tc->ekinh[0], cr);
+        gmx_bcast(DIM*DIM*sizeof(tc->ekinf[0][0]),
+                  tc->ekinf[0], cr);
+        gmx_bcast(DIM*DIM*sizeof(tc->ekinh_old[0][0]),
+                  tc->ekinh_old[0], cr);
+
+        gmx_bcast(sizeof(tc->ekinscalef_nhc),
+                  &(tc->ekinscalef_nhc), cr);
+        gmx_bcast(sizeof(tc->ekinscaleh_nhc),
+                  &(tc->ekinscaleh_nhc), cr);
+        gmx_bcast(sizeof(tc->vscale_nhc),
+                  &(tc->vscale_nhc), cr);
+    }
+    gmx_bcast(DIM*DIM*sizeof(ekind->ekin[0][0]),
+              ekind->ekin[0], cr);
+
+    gmx_bcast(sizeof(ekind->dekindl), &ekind->dekindl, cr);
+    gmx_bcast(sizeof(ekind->cosacc.mvcos), &ekind->cosacc.mvcos, cr);
+}
+
+/* Records box and step as the reference from which deform() computes
+ * the deformed box.
+ */
+void set_deform_reference_box(gmx_update_t upd, gmx_large_int_t step, matrix box)
+{
+    copy_mat(box, upd->deformref_box);
+    upd->deformref_step = step;
+}
+
+/* Applies the box deformation for the given step.
+ *
+ * The new box is the reference box plus the elapsed time since the
+ * reference step times the deformation rate ir->deform, for every box
+ * element with a non-zero rate. The coordinates x of the home atoms
+ * start..start+homenr-1 are scaled along with the box, and box is
+ * overwritten with the new box.
+ * scale_tot may be NULL; when it is not, the scaling matrix of this step
+ * is accumulated into *scale_tot.
+ */
+static void deform(gmx_update_t upd,
+                   int start, int homenr, rvec x[], matrix box, matrix *scale_tot,
+                   const t_inputrec *ir, gmx_large_int_t step)
+{
+    matrix bnew, invbox, mu;
+    real   elapsed_time;
+    int    i, j;
+
+    elapsed_time = (step + 1 - upd->deformref_step)*ir->delta_t;
+    copy_mat(box, bnew);
+    for (i = 0; i < DIM; i++)
+    {
+        for (j = 0; j < DIM; j++)
+        {
+            if (ir->deform[i][j] != 0)
+            {
+                bnew[i][j] =
+                    upd->deformref_box[i][j] + elapsed_time*ir->deform[i][j];
+            }
+        }
+    }
+    /* We correct the off-diagonal elements,
+     * which can grow indefinitely during shearing,
+     * so the shifts do not get messed up.
+     */
+    for (i = 1; i < DIM; i++)
+    {
+        for (j = i-1; j >= 0; j--)
+        {
+            while (bnew[i][j] - box[i][j] > 0.5*bnew[j][j])
+            {
+                rvec_dec(bnew[i], bnew[j]);
+            }
+            while (bnew[i][j] - box[i][j] < -0.5*bnew[j][j])
+            {
+                rvec_inc(bnew[i], bnew[j]);
+            }
+        }
+    }
+    /* mu is the matrix that maps the old box onto the new one */
+    m_inv_ur0(box, invbox);
+    copy_mat(bnew, box);
+    mmul_ur0(box, invbox, mu);
+
+    for (i = start; i < start+homenr; i++)
+    {
+        x[i][XX] = mu[XX][XX]*x[i][XX]+mu[YY][XX]*x[i][YY]+mu[ZZ][XX]*x[i][ZZ];
+        x[i][YY] = mu[YY][YY]*x[i][YY]+mu[ZZ][YY]*x[i][ZZ];
+        x[i][ZZ] = mu[ZZ][ZZ]*x[i][ZZ];
+    }
+    /* Test the pointer itself: *scale_tot is an array lvalue, so using it
+     * as a condition only worked through array-to-pointer decay.
+     */
+    if (scale_tot != NULL)
+    {
+        /* The transposes of the scaling matrices are stored,
+         * so we need to do matrix multiplication in the inverse order.
+         */
+        mmul_ur0(*scale_tot, mu, *scale_tot);
+    }
+}
+
+/* Combines the short- and long-range forces for multiple time stepping.
+ *
+ * On entry f holds the sum of the short-range and long-range forces and
+ * f_lr the long-range forces alone. On exit f_lr[i] = f[i] +
+ * (nstcalclr-1)*f_lr[i] for atoms start..nrend-1. When constraints are
+ * present (except for SHAKE without pressure coupling) the long-range
+ * forces are constrained first.
+ */
+static void combine_forces(int nstcalclr,
+                           gmx_constr_t constr,
+                           t_inputrec *ir, t_mdatoms *md, t_idef *idef,
+                           t_commrec *cr,
+                           gmx_large_int_t step,
+                           t_state *state, gmx_bool bMolPBC,
+                           int start, int nrend,
+                           rvec f[], rvec f_lr[],
+                           t_nrnb *nrnb)
+{
+    int      atom, dim, nExtraLR;
+    gmx_bool bConstrainLR;
+
+    bConstrainLR = (constr != NULL &&
+                    !(ir->eConstrAlg == econtSHAKE && ir->epc == epcNO));
+
+    if (bConstrainLR)
+    {
+        /* The LR forces have to be constrained separately: the SR and LR
+         * forces enter the update algorithm with different pre-factors,
+         * so the constraint force for the coordinate constraining can
+         * not be determined. Only the additional LR part is constrained.
+         */
+        /* MRS -- need to make sure this works with trotter integration -- the constraint calls may not be right.*/
+        constrain(NULL, FALSE, FALSE, constr, idef, ir, NULL, cr, step, 0, md,
+                  state->x, f_lr, f_lr, bMolPBC, state->box, state->lambda[efptBONDED], NULL,
+                  NULL, NULL, nrnb, econqForce, ir->epc == epcMTTK, state->veta, state->veta);
+    }
+
+    /* f already contains the LR force once, so add it nstcalclr-1 more
+     * times and store the result in f_lr.
+     */
+    nExtraLR = nstcalclr - 1;
+    for (atom = start; atom < nrend; atom++)
+    {
+        for (dim = 0; dim < DIM; dim++)
+        {
+            f_lr[atom][dim] = f[atom][dim] + nExtraLR*f_lr[atom][dim];
+        }
+    }
+}
+
+/* Performs the temperature coupling for this step, when it is due.
+ *
+ * Coupling is skipped here when no thermostat is selected or when a
+ * Trotter decomposition scheme handles it elsewhere. On a coupling step
+ * the selected thermostat routine is called with the coupling interval
+ * nsttcouple*delta_t, and for velocity Verlet integrators the velocities
+ * are rescaled in place. On all other steps the scaling factor lambda of
+ * every temperature-coupling group is set to 1.
+ * The parameters fplog and wcycle are not used by this routine.
+ */
+void update_tcouple(FILE             *fplog,
+                    gmx_large_int_t   step,
+                    t_inputrec       *inputrec,
+                    t_state          *state,
+                    gmx_ekindata_t   *ekind,
+                    gmx_wallcycle_t   wcycle,
+                    gmx_update_t      upd,
+                    t_extmass        *MassQ,
+                    t_mdatoms        *md)
+
+{
+    gmx_bool bTCouple = FALSE;
+    real     dttc;
+    int      i, offset;
+
+    /* if using vv with trotter decomposition methods, we do this elsewhere in the code */
+    if (inputrec->etc != etcNO &&
+        !(IR_NVT_TROTTER(inputrec) || IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec)))
+    {
+        /* We should only couple after a step where energies were determined (for leapfrog versions)
+           or the step energies are determined, for velocity verlet versions */
+        offset = EI_VV(inputrec->eI) ? 0 : 1;
+
+        bTCouple = (inputrec->nsttcouple == 1 ||
+                    do_per_step(step+inputrec->nsttcouple-offset,
+                                inputrec->nsttcouple));
+    }
+
+    if (!bTCouple)
+    {
+        /* Set the T scaling lambda to 1 to have no scaling */
+        for (i = 0; (i < inputrec->opts.ngtc); i++)
+        {
+            ekind->tcstat[i].lambda = 1.0;
+        }
+        return;
+    }
+
+    dttc = inputrec->nsttcouple*inputrec->delta_t;
+
+    switch (inputrec->etc)
+    {
+        case etcNO:
+            break;
+        case etcBERENDSEN:
+            berendsen_tcoupl(inputrec, ekind, dttc);
+            break;
+        case etcNOSEHOOVER:
+            nosehoover_tcoupl(&(inputrec->opts), ekind, dttc,
+                              state->nosehoover_xi, state->nosehoover_vxi, MassQ);
+            break;
+        case etcVRESCALE:
+            vrescale_tcoupl(inputrec, ekind, dttc,
+                            state->therm_integral, upd->sd->gaussrand[0]);
+            break;
+        default:
+            /* Other thermostat types need no action here */
+            break;
+    }
+    /* rescale in place here */
+    if (EI_VV(inputrec->eI))
+    {
+        rescale_velocities(ekind, md, md->start, md->start+md->homenr, state->v);
+    }
+}
+
+/* Regular (non-Trotter) pressure coupling for one MD step.
+ *
+ * pcoupl_mu is always reset to the identity matrix and M to zero, so the
+ * caller gets "no scaling" whenever no coupling is carried out.
+ * Coupling is carried out when a barostat is selected, no Trotter pressure
+ * scheme is active (that case is handled in coupling.c) and either
+ * nstpcouple == 1 or do_per_step(step + nstpcouple - 1, nstpcouple) holds.
+ * Berendsen coupling is skipped on the initial step (bInitStep);
+ * Parrinello-Rahman receives bInitStep and decides for itself.
+ *
+ * wcycle and upd are not referenced by this routine.
+ */
+void update_pcouple(FILE             *fplog,
+                    gmx_large_int_t   step,
+                    t_inputrec       *inputrec,
+                    t_state          *state,
+                    matrix            pcoupl_mu,
+                    matrix            M,
+                    gmx_wallcycle_t   wcycle,
+                    gmx_update_t      upd,
+                    gmx_bool          bInitStep)
+{
+    gmx_bool bCoupleNow = FALSE;
+    real     dt_couple;
+    int      d;
+
+    /* With Trotter pressure coupling this is done in coupling.c, so the flag stays false */
+    if (inputrec->epc != epcNO && (!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))))
+    {
+        /* We should only couple after a step where energies were determined */
+        bCoupleNow = (inputrec->nstpcouple == 1 ||
+                      do_per_step(step+inputrec->nstpcouple-1,
+                                  inputrec->nstpcouple));
+    }
+
+    /* Defaults handed back to the caller: identity scaling, zero M */
+    clear_mat(pcoupl_mu);
+    for (d = 0; d < DIM; d++)
+    {
+        pcoupl_mu[d][d] = 1.0;
+    }
+    clear_mat(M);
+
+    if (!bCoupleNow)
+    {
+        return;
+    }
+
+    dt_couple = inputrec->nstpcouple*inputrec->delta_t;
+
+    /* We can always pcoupl, even if we did not sum the energies
+     * the previous step, since state->pres_prev is only updated
+     * when the energies have been summed.
+     */
+    if (inputrec->epc == epcBERENDSEN)
+    {
+        if (!bInitStep)
+        {
+            berendsen_pcoupl(fplog, step, inputrec, dt_couple, state->pres_prev, state->box,
+                             pcoupl_mu);
+        }
+    }
+    else if (inputrec->epc == epcPARRINELLORAHMAN)
+    {
+        parrinellorahman_pcoupl(fplog, step, inputrec, dt_couple, state->pres_prev,
+                                state->box, state->box_rel, state->boxv,
+                                M, pcoupl_mu, bInitStep);
+    }
+    /* every other barostat value: nothing to do here */
+}
+
+/* Return the scratch coordinate buffer upd->xp, growing it first when the
+ * state has been allocated for more atoms (state->nalloc) than the buffer
+ * currently holds (upd->xp_nalloc). The buffer is never shrunk.
+ */
+static rvec *get_xprime(const t_state *state, gmx_update_t upd)
+{
+    if (state->nalloc <= upd->xp_nalloc)
+    {
+        /* Already large enough */
+        return upd->xp;
+    }
+
+    upd->xp_nalloc = state->nalloc;
+    srenew(upd->xp, upd->xp_nalloc);
+
+    return upd->xp;
+}
+
+/* Apply the constraints that follow an integrator update and finish the
+ * position update of this step.
+ *
+ * What the body does, in order:
+ *  1. Constraining is enabled when constr is non-NULL, except in the first
+ *     half (bFirstHalf) of a non-velocity-Verlet integrator.
+ *  2. When enabled, vir_part is cleared and constrain() is called: on the
+ *     velocities (econqVeloc) in the first half of velocity Verlet, otherwise
+ *     on the coordinates in the xprime buffer (econqCoord). With bCalcVir the
+ *     constraint virial is added to vir_part; for eiSD2 it is first scaled by
+ *     the eph factor of the first SD constant set.
+ *  3. For eiSD2 outside the first half, the second part of the SD update is
+ *     run in an OpenMP loop over nth threads, each on its own contiguous
+ *     atom range, followed by one more coordinate constraint call if
+ *     constraining is enabled.
+ *  4. Outside the first half, the updated coordinates in upd->xp are moved
+ *     into state->x: via unshift_x() when a graph with nodes is present,
+ *     otherwise by a plain copy over the home atoms.
+ *
+ * The parameters vir and bInitStep, and the locals bExtended, dt_1, n, g, d
+ * and vbuf, are not used in the body.
+ *
+ * NOTE(review): the do_update_sd2() call in step 3 ends in lines carrying
+ * '-' and '++' prefixes; they look like unresolved combined-diff markers -
+ * confirm the intended argument list before applying this patch.
+ */
+void update_constraints(FILE *fplog,
+ gmx_large_int_t step,
+ real *dvdlambda, /* the contribution to be added to the bonded interactions */
+ t_inputrec *inputrec, /* input record and box stuff */
+ gmx_ekindata_t *ekind,
+ t_mdatoms *md,
+ t_state *state,
+ gmx_bool bMolPBC,
+ t_graph *graph,
+ rvec force[], /* forces on home particles */
+ t_idef *idef,
+ tensor vir_part,
+ tensor vir, /* tensors for virial and ekin, needed for computing */
+ t_commrec *cr,
+ t_nrnb *nrnb,
+ gmx_wallcycle_t wcycle,
+ gmx_update_t upd,
+ gmx_constr_t constr,
+ gmx_bool bInitStep,
+ gmx_bool bFirstHalf,
+ gmx_bool bCalcVir,
+ real vetanew)
+{
+ gmx_bool bExtended, bLastStep, bLog = FALSE, bEner = FALSE, bDoConstr = FALSE;
+ double dt;
+ real dt_1;
+ int start, homenr, nrend, i, n, m, g, d;
+ tensor vir_con;
+ rvec *vbuf, *xprime = NULL;
+ int nth, th;
+
+ /* Constrain only when a constraint object was passed in ... */
+ if (constr)
+ {
+ bDoConstr = TRUE;
+ }
+ /* ... and never in the first half of a non-velocity-Verlet integrator */
+ if (bFirstHalf && !EI_VV(inputrec->eI))
+ {
+ bDoConstr = FALSE;
+ }
+
+ /* for now, SD update is here -- though it really seems like it
+ should be reformulated as a velocity verlet method, since it has two parts */
+
+ start = md->start;
+ homenr = md->homenr;
+ nrend = start+homenr;
+
+ dt = inputrec->delta_t;
+ dt_1 = 1.0/dt;
+
+ /*
+ * Steps (7C, 8C)
+ * APPLY CONSTRAINTS:
+ * BLOCK SHAKE
+
+ * When doing PR pressure coupling we have to constrain the
+ * bonds in each iteration. If we are only using Nose-Hoover tcoupling
+ * it is enough to do this once though, since the relative velocities
+ * after this will be normal to the bond vector
+ */
+
+ if (bDoConstr)
+ {
+ /* clear out constraints before applying */
+ clear_mat(vir_part);
+
+ xprime = get_xprime(state, upd);
+
+ bLastStep = (step == inputrec->init_step+inputrec->nsteps);
+ bLog = (do_per_step(step, inputrec->nstlog) || bLastStep || (step < 0));
+ bEner = (do_per_step(step, inputrec->nstenergy) || bLastStep);
+ /* Constrain the coordinates xprime */
+ wallcycle_start(wcycle, ewcCONSTR);
+ if (EI_VV(inputrec->eI) && bFirstHalf)
+ {
+ /* First half of velocity Verlet: constrain the velocities in place */
+ constrain(NULL, bLog, bEner, constr, idef,
+ inputrec, ekind, cr, step, 1, md,
+ state->x, state->v, state->v,
+ bMolPBC, state->box,
+ state->lambda[efptBONDED], dvdlambda,
+ NULL, bCalcVir ? &vir_con : NULL, nrnb, econqVeloc,
+ inputrec->epc == epcMTTK, state->veta, vetanew);
+ }
+ else
+ {
+ /* Otherwise: constrain the new coordinates held in xprime */
+ constrain(NULL, bLog, bEner, constr, idef,
+ inputrec, ekind, cr, step, 1, md,
+ state->x, xprime, NULL,
+ bMolPBC, state->box,
+ state->lambda[efptBONDED], dvdlambda,
+ state->v, bCalcVir ? &vir_con : NULL, nrnb, econqCoord,
+ inputrec->epc == epcMTTK, state->veta, state->veta);
+ }
+ wallcycle_stop(wcycle, ewcCONSTR);
+
+ where();
+
+ dump_it_all(fplog, "After Shake",
+ state->natoms, state->x, xprime, state->v, force);
+
+ if (bCalcVir)
+ {
+ if (inputrec->eI == eiSD2)
+ {
+ /* A correction factor eph is needed for the SD constraint force */
+ /* Here we can, unfortunately, not have proper corrections
+ * for different friction constants, so we use the first one.
+ */
+ for (i = 0; i < DIM; i++)
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ vir_part[i][m] += upd->sd->sdc[0].eph*vir_con[i][m];
+ }
+ }
+ }
+ else
+ {
+ m_add(vir_part, vir_con, vir_part);
+ }
+ if (debug)
+ {
+ pr_rvecs(debug, 0, "constraint virial", vir_part, DIM);
+ }
+ }
+ }
+
+ where();
+ if ((inputrec->eI == eiSD2) && !(bFirstHalf))
+ {
+ xprime = get_xprime(state, upd);
+
+ nth = gmx_omp_nthreads_get(emntUpdate);
+
+#pragma omp parallel for num_threads(nth) schedule(static)
+ for (th = 0; th < nth; th++)
+ {
+ int start_th, end_th;
+
+ /* Thread th handles the contiguous atom range [start_th, end_th) */
+ start_th = start + ((nrend-start)* th )/nth;
+ end_th = start + ((nrend-start)*(th+1))/nth;
+
+ /* The second part of the SD integration */
+ do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+ FALSE, start_th, end_th,
+ inputrec->opts.acc, inputrec->opts.nFreeze,
+ md->invmass, md->ptype,
+ md->cFREEZE, md->cACC, md->cTC,
+ state->x, xprime, state->v, force, state->sd_X,
- if (EI_RANDOM(inputrec->eI))
++ inputrec->opts.tau_t,
++ FALSE);
+ }
+ inc_nrnb(nrnb, eNR_UPDATE, homenr);
+
+ if (bDoConstr)
+ {
+ /* Constrain the coordinates xprime */
+ wallcycle_start(wcycle, ewcCONSTR);
+ constrain(NULL, bLog, bEner, constr, idef,
+ inputrec, NULL, cr, step, 1, md,
+ state->x, xprime, NULL,
+ bMolPBC, state->box,
+ state->lambda[efptBONDED], dvdlambda,
+ NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
+ wallcycle_stop(wcycle, ewcCONSTR);
+ }
+ }
+
+ /* We must always unshift after updating coordinates; if we did not shake
+ x was shifted in do_force */
+
+ if (!(bFirstHalf)) /* in the first half of vv, no shift. */
+ {
+ if (graph && (graph->nnodes > 0))
+ {
+ unshift_x(graph, state->box, state->x, upd->xp);
+ /* The flop count is doubled for triclinic boxes */
+ if (TRICLINIC(state->box))
+ {
+ inc_nrnb(nrnb, eNR_SHIFTX, 2*graph->nnodes);
+ }
+ else
+ {
+ inc_nrnb(nrnb, eNR_SHIFTX, graph->nnodes);
+ }
+ }
+ else
+ {
+ /* No graph to unshift with: copy the new home-atom coordinates into state->x */
+#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntUpdate)) schedule(static)
+ for (i = start; i < nrend; i++)
+ {
+ copy_rvec(upd->xp[i], state->x[i]);
+ }
+ }
+
+ dump_it_all(fplog, "After unshift",
+ state->natoms, state->x, upd->xp, state->v, force);
+ }
+/* ############# END the update of velocities and positions ######### */
+}
+
+/* Update the simulation box (and where needed the coordinates) at the end
+ * of a step, according to the selected pressure-coupling scheme:
+ *   - Berendsen:         berendsen_pscale() is called with pcoupl_mu;
+ *   - Parrinello-Rahman: the lower triangle of the box is advanced by
+ *                        dt*boxv, the box shape is preserved and the home
+ *                        atom coordinates are multiplied by pcoupl_mu;
+ *   - MTTK, isotropic:   the box is scaled by exp(veta*dt) and boxv is set
+ *                        to veta times the new box.
+ * Afterwards pcoupl_mu is accumulated into *scale_tot (unless a Trotter
+ * pressure scheme is active or scale_tot is NULL) and box deformation is
+ * applied when requested in the input record.
+ *
+ * graph, wcycle, bInitStep and bFirstHalf are not referenced by this routine.
+ */
+void update_box(FILE             *fplog,
+                gmx_large_int_t   step,
+                t_inputrec       *inputrec,  /* input record and box stuff */
+                t_mdatoms        *md,
+                t_state          *state,
+                t_graph          *graph,
+                rvec              force[],   /* forces on home particles */
+                matrix           *scale_tot,
+                matrix            pcoupl_mu,
+                t_nrnb           *nrnb,
+                gmx_wallcycle_t   wcycle,
+                gmx_update_t      upd,
+                gmx_bool          bInitStep,
+                gmx_bool          bFirstHalf)
+{
+    const int    first_atom = md->start;
+    const int    nhome      = md->homenr;
+    const double dt         = inputrec->delta_t;
+    int          row, col, atom;
+
+    where();
+
+    /* now update boxes */
+    if (inputrec->epc == epcBERENDSEN)
+    {
+        berendsen_pscale(inputrec, pcoupl_mu, state->box, state->box_rel,
+                         first_atom, nhome, state->x, md->cFREEZE, nrnb);
+    }
+    else if (inputrec->epc == epcPARRINELLORAHMAN)
+    {
+        /* The box velocities were updated in do_pr_pcoupl in the update
+         * iteration, but we don't change the box vectors until we get here
+         * since we need to be able to shift/unshift above.
+         */
+        for (row = 0; row < DIM; row++)
+        {
+            for (col = 0; col <= row; col++)
+            {
+                state->box[row][col] += dt*state->boxv[row][col];
+            }
+        }
+        preserve_box_shape(inputrec, state->box_rel, state->box);
+
+        /* Scale the coordinates */
+        for (atom = first_atom; atom < first_atom+nhome; atom++)
+        {
+            tmvmul_ur0(pcoupl_mu, state->x[atom], state->x[atom]);
+        }
+    }
+    else if (inputrec->epc == epcMTTK)
+    {
+        if (inputrec->epct == epctISOTROPIC)
+        {
+            /* DIM * eta = ln V, so DIM*eta_new = DIM*eta_old + DIM*dt*veta =>
+             * ln V_new = ln V_old + 3*dt*veta => V_new = V_old*exp(3*dt*veta) =>
+             * the side length scales as exp(veta*dt).
+             */
+            msmul(state->box, exp(state->veta*dt), state->box);
+
+            /* Relate veta to boxv: veta = d(eta)/dT = (1/DIM)*1/V dV/dT.
+             * Assuming isotropic scaling with box length scaling factor L,
+             * V = L^DIM (det(M)), so dV/dt = DIM L^(DIM-1) dL/dt det(M) and
+             * veta = (1/L) dL/dt. The determinant of B is L^DIM det(M) and
+             * the determinant of dB/dt is (dL/dT)^DIM det(M), so veta is
+             * (det(dB/dT)/det(B))^(1/3). Since M = B_new*(vol_new)^(1/3),
+             * dB/dT_new = (veta_new)*B(new).
+             */
+            msmul(state->box, state->veta, state->boxv);
+        }
+        /* other MTTK coupling types: the box is left untouched */
+    }
+    /* epcNO and any other value: the box is left untouched */
+
+    if ((!(IR_NPT_TROTTER(inputrec) || IR_NPH_TROTTER(inputrec))) && scale_tot)
+    {
+        /* The transposes of the scaling matrices are stored,
+         * therefore we need to reverse the order in the multiplication.
+         */
+        mmul_ur0(*scale_tot, pcoupl_mu, *scale_tot);
+    }
+
+    if (DEFORM(*inputrec))
+    {
+        deform(upd, first_atom, nhome, state->x, state->box, scale_tot, inputrec, step);
+    }
+    where();
+    dump_it_all(fplog, "After update",
+                state->natoms, state->x, upd->xp, state->v, force);
+}
+
+/* Run the integrator update for the home atoms of this rank.
+ *
+ * What the body does, in order:
+ *  1. Requesting a velocity part (etrtVELOCITY1/etrtVELOCITY2) with a
+ *     non-velocity-Verlet integrator is reported through gmx_incons().
+ *  2. The xprime scratch buffer is obtained from get_xprime() and the
+ *     distance/orientation restraint histories are updated when the
+ *     corresponding state flags are set.
+ *  3. With bDoLR, nstcalclr > 1 and a non-VV integrator, combine_forces()
+ *     is called and f_lr is used as the force; otherwise f is used.
+ *  4. An OpenMP loop over nth threads gives each thread a contiguous atom
+ *     range and dispatches on inputrec->eI to the matching do_update_*
+ *     routine; for velocity Verlet, UpdatePart selects the velocity or the
+ *     position update. An unknown integrator ends in gmx_fatal().
+ *
+ * The update routines receive both state->x and xprime; presumably the new
+ * positions end up in xprime (update_constraints() later moves upd->xp into
+ * state->x) - TODO confirm against the do_update_* implementations.
+ *
+ * NOTE(review): the set-up code between "Before update" and the parallel
+ * loop, and the eiSD2 call, contain lines prefixed with '-' and '++'. They
+ * look like unresolved combined-diff markers; confirm which side is intended
+ * before applying this patch.
+ */
+void update_coords(FILE *fplog,
+ gmx_large_int_t step,
+ t_inputrec *inputrec, /* input record and box stuff */
+ t_mdatoms *md,
+ t_state *state,
+ gmx_bool bMolPBC,
+ rvec *f, /* forces on home particles */
+ gmx_bool bDoLR,
+ rvec *f_lr,
+ t_fcdata *fcd,
+ gmx_ekindata_t *ekind,
+ matrix M,
+ gmx_wallcycle_t wcycle,
+ gmx_update_t upd,
+ gmx_bool bInitStep,
+ int UpdatePart,
+ t_commrec *cr, /* these shouldn't be here -- need to think about it */
+ t_nrnb *nrnb,
+ gmx_constr_t constr,
+ t_idef *idef)
+{
+ gmx_bool bNH, bPR, bLastStep, bLog = FALSE, bEner = FALSE;
+ double dt, alpha;
+ real *imass, *imassin;
+ rvec *force;
+ real dt_1;
+ int start, homenr, nrend, i, j, d, n, m, g;
+ int blen0, blen1, iatom, jatom, nshake, nsettle, nconstr, nexpand;
+ int *icom = NULL;
+ tensor vir_con;
+ rvec *vcom, *xcom, *vall, *xall, *xin, *vin, *forcein, *fall, *xpall, *xprimein, *xprime;
+ int nth, th;
+
+ /* Running the velocity half does nothing except for velocity verlet */
+ if ((UpdatePart == etrtVELOCITY1 || UpdatePart == etrtVELOCITY2) &&
+ !EI_VV(inputrec->eI))
+ {
+ gmx_incons("update_coords called for velocity without VV integrator");
+ }
+
+ start = md->start;
+ homenr = md->homenr;
+ nrend = start+homenr;
+
+ xprime = get_xprime(state, upd);
+
+ dt = inputrec->delta_t;
+ dt_1 = 1.0/dt;
+
+ /* We need to update the NMR restraint history when time averaging is used */
+ if (state->flags & (1<<estDISRE_RM3TAV))
+ {
+ update_disres_history(fcd, &state->hist);
+ }
+ if (state->flags & (1<<estORIRE_DTAV))
+ {
+ update_orires_history(fcd, &state->hist);
+ }
+
+
+ bNH = inputrec->etc == etcNOSEHOOVER;
+ bPR = ((inputrec->epc == epcPARRINELLORAHMAN) || (inputrec->epc == epcMTTK));
+
+ if (bDoLR && inputrec->nstcalclr > 1 && !EI_VV(inputrec->eI)) /* get this working with VV? */
+ {
+ /* Store the total force + nstcalclr-1 times the LR force
+ * in forces_lr, so it can be used in a normal update algorithm
+ * to produce twin time stepping.
+ */
+ /* is this correct in the new construction? MRS */
+ combine_forces(inputrec->nstcalclr, constr, inputrec, md, idef, cr,
+ step, state, bMolPBC,
+ start, nrend, f, f_lr, nrnb);
+ force = f_lr;
+ }
+ else
+ {
+ force = f;
+ }
+
+ /* ############# START The update of velocities and positions ######### */
+ where();
+ dump_it_all(fplog, "Before update",
+ state->natoms, state->x, xprime, state->v, force);
+
- /* We still need to take care of generating random seeds properly
- * when multi-threading.
- */
- nth = 1;
++ if (inputrec->eI == eiSD2)
+ {
- else
++ check_sd2_work_data_allocation(upd->sd, nrend);
++
++ do_update_sd2_Tconsts(upd->sd,
++ inputrec->opts.ngtc,
++ inputrec->opts.tau_t,
++ inputrec->opts.ref_t);
+ }
- nth = gmx_omp_nthreads_get(emntUpdate);
++ if (inputrec->eI == eiBD)
+ {
- if (inputrec->eI == eiSD2)
- {
- check_sd2_work_data_allocation(upd->sd, nrend);
- }
++ do_update_bd_Tconsts(dt, inputrec->bd_fric,
++ inputrec->opts.ngtc, inputrec->opts.ref_t,
++ upd->sd->bd_rf);
+ }
+
- inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
++ nth = gmx_omp_nthreads_get(emntUpdate);
+
+#pragma omp parallel for num_threads(nth) schedule(static) private(alpha)
+ for (th = 0; th < nth; th++)
+ {
+ int start_th, end_th;
+
+ /* Thread th handles the contiguous atom range [start_th, end_th) */
+ start_th = start + ((nrend-start)* th )/nth;
+ end_th = start + ((nrend-start)*(th+1))/nth;
+
+ switch (inputrec->eI)
+ {
+ case (eiMD):
+ /* A zero cosine acceleration selects the plain MD update, anything else the viscosity variant */
+ if (ekind->cosacc.cos_accel == 0)
+ {
+ do_update_md(start_th, end_th, dt,
+ ekind->tcstat, state->nosehoover_vxi,
+ ekind->bNEMD, ekind->grpstat, inputrec->opts.acc,
+ inputrec->opts.nFreeze,
+ md->invmass, md->ptype,
+ md->cFREEZE, md->cACC, md->cTC,
+ state->x, xprime, state->v, force, M,
+ bNH, bPR);
+ }
+ else
+ {
+ do_update_visc(start_th, end_th, dt,
+ ekind->tcstat, state->nosehoover_vxi,
+ md->invmass, md->ptype,
+ md->cTC, state->x, xprime, state->v, force, M,
+ state->box,
+ ekind->cosacc.cos_accel,
+ ekind->cosacc.vcos,
+ bNH, bPR);
+ }
+ break;
+ case (eiSD1):
+ /* Each thread passes its own entry of upd->sd->gaussrand */
+ do_update_sd1(upd->sd, upd->sd->gaussrand[th],
+ start_th, end_th, dt,
+ inputrec->opts.acc, inputrec->opts.nFreeze,
+ md->invmass, md->ptype,
+ md->cFREEZE, md->cACC, md->cTC,
+ state->x, xprime, state->v, force, state->sd_X,
+ inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t);
+ break;
+ case (eiSD2):
+ /* The SD update is done in 2 parts, because an extra constraint step
+ * is needed
+ */
+ do_update_sd2(upd->sd, upd->sd->gaussrand[th],
+ bInitStep, start_th, end_th,
+ inputrec->opts.acc, inputrec->opts.nFreeze,
+ md->invmass, md->ptype,
+ md->cFREEZE, md->cACC, md->cTC,
+ state->x, xprime, state->v, force, state->sd_X,
- inputrec->opts.ngtc, inputrec->opts.tau_t, inputrec->opts.ref_t,
++ inputrec->opts.tau_t,
+ TRUE);
+ break;
+ case (eiBD):
+ do_update_bd(start_th, end_th, dt,
+ inputrec->opts.nFreeze, md->invmass, md->ptype,
+ md->cFREEZE, md->cTC,
+ state->x, xprime, state->v, force,
+ inputrec->bd_fric,
+ upd->sd->bd_rf, upd->sd->gaussrand[th]);
+ break;
+ case (eiVV):
+ case (eiVVAK):
+ alpha = 1.0 + DIM/((double)inputrec->opts.nrdf[0]); /* assuming barostat coupled to group 0. */
+ switch (UpdatePart)
+ {
+ case etrtVELOCITY1:
+ case etrtVELOCITY2:
+ do_update_vv_vel(start_th, end_th, dt,
+ ekind->tcstat, ekind->grpstat,
+ inputrec->opts.acc, inputrec->opts.nFreeze,
+ md->invmass, md->ptype,
+ md->cFREEZE, md->cACC,
+ state->v, force,
+ (bNH || bPR), state->veta, alpha);
+ break;
+ case etrtPOSITION:
+ do_update_vv_pos(start_th, end_th, dt,
+ ekind->tcstat, ekind->grpstat,
+ inputrec->opts.acc, inputrec->opts.nFreeze,
+ md->invmass, md->ptype, md->cFREEZE,
+ state->x, xprime, state->v, force,
+ (bNH || bPR), state->veta, alpha);
+ break;
+ }
+ break;
+ default:
+ gmx_fatal(FARGS, "Don't know how to update coordinates");
+ break;
+ }
+ }
+
+}
+
+
+/* Debugging aid, not meant to be called from production code: print the
+ * centre-of-mass correction to the kinetic energy tensor.
+ *
+ * The kinetic energy should be calculated according to
+ *     Ekin  = 1/2 m (v-vcm)^2
+ * but the correction is not always applied, since vcm may not be known in
+ * time, and
+ *     Ekin' = 1/2 m v^2
+ * is computed instead. This can be corrected afterwards with
+ *     Ekin  = Ekin' + 1/2 m ( -2 v vcm + vcm^2)
+ * or in shorthand
+ *     Ekin  = Ekin' - m v vcm + 1/2 m vcm^2
+ *
+ * The correction tensor (dekin), ekin, their traces, vcm and the summed
+ * momentum of atoms [start, end) are written to log.
+ * Note that vcm is divided by tmass in place, so the caller's vector is
+ * modified.
+ */
+void correct_ekin(FILE *log, int start, int end, rvec v[], rvec vcm, real mass[],
+                  real tmass, tensor ekin)
+{
+    int    atom, row, col;
+    real   m_atom, m_local;
+    rvec   half_vcm, momentum;
+    tensor dekin;
+
+    /* Local particles: accumulate mass and momentum (processor dependent part) */
+    clear_rvec(momentum);
+    m_local = 0;
+    for (atom = start; atom < end; atom++)
+    {
+        m_atom   = mass[atom];
+        m_local += m_atom;
+        for (row = 0; row < DIM; row++)
+        {
+            momentum[row] += m_atom*v[atom][row];
+        }
+    }
+
+    /* Shortcut: scale vcm by 1/tmass (in place) and keep half of it around */
+    svmul(1/tmass, vcm, vcm);
+    svmul(0.5, vcm, half_vcm);
+
+    clear_mat(dekin);
+    for (row = 0; row < DIM; row++)
+    {
+        for (col = 0; col < DIM; col++)
+        {
+            dekin[row][col] += vcm[col]*(m_local*half_vcm[row]-momentum[row]);
+        }
+    }
+
+    pr_rvecs(log, 0, "dekin", dekin, DIM);
+    pr_rvecs(log, 0, " ekin", ekin, DIM);
+    fprintf(log, "dekin = %g, ekin = %g vcm = (%8.4f %8.4f %8.4f)\n",
+            trace(dekin), trace(ekin), vcm[XX], vcm[YY], vcm[ZZ]);
+    fprintf(log, "mv = (%8.4f %8.4f %8.4f)\n",
+            momentum[XX], momentum[YY], momentum[ZZ]);
+}
+
+/* Andersen thermostat driver: decide whether velocities are randomized on
+ * this step and, if so, call andersen_tcoupl().
+ *
+ * rate is delta_t / tau_t[0]. With ir->etc == etcANDERSEN (fixed
+ * probability per particle) the randomization routine is entered on every
+ * call; otherwise (massive Andersen) it is entered every tau_t/delta_t
+ * steps.
+ *
+ * The step interval is rounded to the nearest integer. Plain truncation of
+ * 1.0/rate turns a floating-point result such as 49.99999 into 49, which
+ * silently shortens the randomization period by one step.
+ *
+ * Returns TRUE when andersen_tcoupl() was called, FALSE otherwise.
+ *
+ * NOTE(review): randatom/randatom_list are only zeroed the first time
+ * through; if state->nalloc grows later, the newly reallocated tail is not
+ * initialized here - confirm that andersen_tcoupl() does not depend on it.
+ */
+extern gmx_bool update_randomize_velocities(t_inputrec *ir, gmx_large_int_t step, t_mdatoms *md, t_state *state, gmx_update_t upd, t_idef *idef, gmx_constr_t constr)
+{
+
+    int  i;
+    real rate = (ir->delta_t)/ir->opts.tau_t[0];
+    /* proceed with andersen if 1) it's fixed probability per
+       particle andersen or 2) it's massive andersen and it's tau_t/dt */
+    if ((ir->etc == etcANDERSEN) || do_per_step(step, (int)(1.0/rate + 0.5)))
+    {
+        /* Keep the bookkeeping arrays as large as the state allocation */
+        srenew(upd->randatom, state->nalloc);
+        srenew(upd->randatom_list, state->nalloc);
+        if (upd->randatom_list_init == FALSE)
+        {
+            for (i = 0; i < state->nalloc; i++)
+            {
+                upd->randatom[i]      = FALSE;
+                upd->randatom_list[i] = 0;
+            }
+            upd->randatom_list_init = TRUE;
+        }
+        /* The topology (idef) is only handed over for per-particle Andersen;
+         * constraint block data is only handed over when constraints exist.
+         */
+        andersen_tcoupl(ir, md, state, upd->sd->gaussrand[0], rate,
+                        (ir->etc == etcANDERSEN) ? idef : NULL,
+                        constr ? get_nblocks(constr) : 0,
+                        constr ? get_sblock(constr) : NULL,
+                        upd->randatom, upd->randatom_list,
+                        upd->sd->randomize_group, upd->sd->boltzfac);
+        return TRUE;
+    }
+    return FALSE;
+}
--- /dev/null
- /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "vec.h"
+#include "statutil.h"
+#include "vcm.h"
+#include "mdebin.h"
+#include "nrnb.h"
+#include "calcmu.h"
+#include "index.h"
+#include "vsite.h"
+#include "update.h"
+#include "ns.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "mdrun.h"
+#include "md_support.h"
+#include "md_logging.h"
+#include "confio.h"
+#include "network.h"
+#include "pull.h"
+#include "xvgr.h"
+#include "physics.h"
+#include "names.h"
+#include "xmdrun.h"
+#include "ionize.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "domdec_network.h"
+#include "partdec.h"
+#include "topsort.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "shellfc.h"
+#include "compute_io.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "string2.h"
+#include "pme_loadbal.h"
+#include "bondf.h"
+#include "membed.h"
+#include "types/nlistheuristics.h"
+#include "types/iteratedconstraints.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/* Restart all performance bookkeeping at the current step.
+ *
+ * A warning naming the step is printed, then the CUDA non-bonded timings
+ * (when cu_nbv is set), the wall-cycle counters, the domain decomposition
+ * statistics (when domain decomposition is in use) and the flop counters
+ * are reset. The steps done so far (*step_rel) are folded into
+ * ir->init_step and taken off ir->nsteps, *step_rel restarts at zero, and
+ * the run timer is started again.
+ */
+static void reset_all_counters(FILE *fplog, t_commrec *cr,
+                               gmx_large_int_t step,
+                               gmx_large_int_t *step_rel, t_inputrec *ir,
+                               gmx_wallcycle_t wcycle, t_nrnb *nrnb,
+                               gmx_runtime_t *runtime,
+                               nbnxn_cuda_ptr_t cu_nbv)
+{
+    char step_str[STEPSTRSIZE];
+
+    /* Tell the user that the counters related to performance over the run are reset */
+    md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
+                  gmx_step_str(step, step_str));
+
+    /* GPU timings, only when a CUDA non-bonded object was passed in */
+    if (cu_nbv)
+    {
+        nbnxn_cuda_reset_timings(cu_nbv);
+    }
+
+    /* The run counter is stopped here and started again further down */
+    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_reset_all(wcycle);
+
+    if (DOMAINDECOMP(cr))
+    {
+        reset_dd_statistics_counters(cr->dd);
+    }
+
+    init_nrnb(nrnb);
+
+    /* Make the current step the new starting point of the run */
+    ir->init_step += *step_rel;
+    ir->nsteps    -= *step_rel;
+    *step_rel      = 0;
+
+    wallcycle_start(wcycle, ewcRUN);
+    runtime_start(runtime);
+    print_date_and_time(fplog, cr->nodeid, "Restarted time", runtime);
+}
+
+double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
+ const output_env_t oenv, gmx_bool bVerbose, gmx_bool bCompact,
+ int nstglobalcomm,
+ gmx_vsite_t *vsite, gmx_constr_t constr,
+ int stepout, t_inputrec *ir,
+ gmx_mtop_t *top_global,
+ t_fcdata *fcd,
+ t_state *state_global,
+ t_mdatoms *mdatoms,
+ t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+ gmx_edsam_t ed, t_forcerec *fr,
+ int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
+ real cpt_period, real max_hours,
+ const char *deviceOptions,
+ unsigned long Flags,
+ gmx_runtime_t *runtime)
+{
+ gmx_mdoutf_t *outf;
+ gmx_large_int_t step, step_rel;
+ double run_time;
+ double t, t0, lam0[efptNR];
+ gmx_bool bGStatEveryStep, bGStat, bCalcVir, bCalcEner;
+ gmx_bool bNS, bNStList, bSimAnn, bStopCM, bRerunMD, bNotLastFrame = FALSE,
+ bFirstStep, bStateFromCP, bStateFromTPX, bInitStep, bLastStep,
+ bBornRadii, bStartingFromCpt;
+ gmx_bool bDoDHDL = FALSE, bDoFEP = FALSE, bDoExpanded = FALSE;
+ gmx_bool do_ene, do_log, do_verbose, bRerunWarnNoV = TRUE,
+ bForceUpdate = FALSE, bCPT;
+ int mdof_flags;
+ gmx_bool bMasterState;
+ int force_flags, cglo_flags;
+ tensor force_vir, shake_vir, total_vir, tmp_vir, pres;
+ int i, m;
+ t_trxstatus *status;
+ rvec mu_tot;
+ t_vcm *vcm;
+ t_state *bufstate = NULL;
+ matrix *scale_tot, pcoupl_mu, M, ebox;
+ gmx_nlheur_t nlh;
+ t_trxframe rerun_fr;
+ gmx_repl_ex_t repl_ex = NULL;
+ int nchkpt = 1;
+ gmx_localtop_t *top;
+ t_mdebin *mdebin = NULL;
+ df_history_t df_history;
+ t_state *state = NULL;
+ rvec *f_global = NULL;
+ int n_xtc = -1;
+ rvec *x_xtc = NULL;
+ gmx_enerdata_t *enerd;
+ rvec *f = NULL;
+ gmx_global_stat_t gstat;
+ gmx_update_t upd = NULL;
+ t_graph *graph = NULL;
+ globsig_t gs;
+ gmx_rng_t mcrng = NULL;
+ gmx_bool bFFscan;
+ gmx_groups_t *groups;
+ gmx_ekindata_t *ekind, *ekind_save;
+ gmx_shellfc_t shellfc;
+ int count, nconverged = 0;
+ real timestep = 0;
+ double tcount = 0;
+ gmx_bool bIonize = FALSE;
+ gmx_bool bTCR = FALSE, bConverged = TRUE, bOK, bSumEkinhOld, bExchanged;
+ gmx_bool bAppend;
+ gmx_bool bResetCountersHalfMaxH = FALSE;
+ gmx_bool bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
+ gmx_bool bUpdateDoLR;
+ real mu_aver = 0, dvdl_constr;
+ int a0, a1, gnx = 0, ii;
+ atom_id *grpindex = NULL;
+ char *grpname;
+ t_coupl_rec *tcr = NULL;
+ rvec *xcopy = NULL, *vcopy = NULL, *cbuf = NULL;
+ matrix boxcopy = {{0}}, lastbox;
+ tensor tmpvir;
+ real fom, oldfom, veta_save, pcurr, scalevir, tracevir;
+ real vetanew = 0;
+ int lamnew = 0;
+ /* for FEP */
+ int nstfep;
+ real rate;
+ double cycles;
+ real saved_conserved_quantity = 0;
+ real last_ekin = 0;
+ int iter_i;
+ t_extmass MassQ;
+ int **trotter_seq;
+ char sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
+ int handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
+ gmx_iterate_t iterate;
+ gmx_large_int_t multisim_nsteps = -1; /* number of steps to do before first multisim
+ simulation stops. If equal to zero, don't
+ communicate any more between multisims.*/
+ /* PME load balancing data for GPU kernels */
+ pme_load_balancing_t pme_loadbal = NULL;
+ double cycles_pmes;
+ gmx_bool bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+
+#ifdef GMX_FAHCORE
+ /* Temporary addition for FAHCORE checkpointing */
+ int chkpt_ret;
+#endif
+
+ /* Check for special mdrun options */
+ bRerunMD = (Flags & MD_RERUN);
+ bIonize = (Flags & MD_IONIZE);
+ bFFscan = (Flags & MD_FFSCAN);
+ bAppend = (Flags & MD_APPENDFILES);
+ if (Flags & MD_RESETCOUNTERSHALFWAY)
+ {
+ if (ir->nsteps > 0)
+ {
+ /* Signal to reset the counters half the simulation steps. */
+ wcycle_set_reset_counters(wcycle, ir->nsteps/2);
+ }
+ /* Signal to reset the counters halfway the simulation time. */
+ bResetCountersHalfMaxH = (max_hours > 0);
+ }
+
+ /* md-vv uses averaged full step velocities for T-control
+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+ bVV = EI_VV(ir->eI);
+ if (bVV) /* to store the initial velocities while computing virial */
+ {
+ snew(cbuf, top_global->natoms);
+ }
+ /* all the iteratative cases - only if there are constraints */
+ bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+ gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
+ false in this step. The correct value, true or false,
+ is set at each step, as it depends on the frequency of temperature
+ and pressure control.*/
+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+
+ if (bRerunMD)
+ {
+ /* Since we don't know if the frames read are related in any way,
+ * rebuild the neighborlist at every step.
+ */
+ ir->nstlist = 1;
+ ir->nstcalcenergy = 1;
+ nstglobalcomm = 1;
+ }
+
+ check_ir_old_tpx_versions(cr, fplog, ir, top_global);
+
+ nstglobalcomm = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
+ bGStatEveryStep = (nstglobalcomm == 1);
+
+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+ {
+ fprintf(fplog,
+ "To reduce the energy communication with nstlist = -1\n"
+ "the neighbor list validity should not be checked at every step,\n"
+ "this means that exact integration is not guaranteed.\n"
+ "The neighbor list validity is checked after:\n"
+ " <n.list life time> - 2*std.dev.(n.list life time) steps.\n"
+ "In most cases this will result in exact integration.\n"
+ "This reduces the energy communication by a factor of 2 to 3.\n"
+ "If you want less energy communication, set nstlist > 3.\n\n");
+ }
+
+ if (bRerunMD || bFFscan)
+ {
+ ir->nstxtcout = 0;
+ }
+ groups = &top_global->groups;
+
+ /* Initial values */
+ init_md(fplog, cr, ir, oenv, &t, &t0, state_global->lambda,
+ &(state_global->fep_state), lam0,
+ nrnb, top_global, &upd,
+ nfile, fnm, &outf, &mdebin,
+ force_vir, shake_vir, mu_tot, &bSimAnn, &vcm, state_global, Flags);
+
+ clear_mat(total_vir);
+ clear_mat(pres);
+ /* Energy terms and groups */
+ snew(enerd, 1);
+ init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
+ enerd);
+ if (DOMAINDECOMP(cr))
+ {
+ f = NULL;
+ }
+ else
+ {
+ snew(f, top_global->natoms);
+ }
+
+ /* lambda Monte carlo random number generator */
+ if (ir->bExpanded)
+ {
+ mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+ }
+ /* copy the state into df_history */
+ copy_df_history(&df_history, &state_global->dfhist);
+
+ /* Kinetic energy data */
+ snew(ekind, 1);
+ init_ekindata(fplog, top_global, &(ir->opts), ekind);
+ /* needed for iteration of constraints */
+ snew(ekind_save, 1);
+ init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
+ /* Copy the cos acceleration to the groups struct */
+ ekind->cosacc.cos_accel = ir->cos_accel;
+
+ gstat = global_stat_init(ir);
+ debug_gmx();
+
+ /* Check for polarizable models and flexible constraints */
+ shellfc = init_shell_flexcon(fplog,
+ top_global, n_flexible_constraints(constr),
+ (ir->bContinuation ||
+ (DOMAINDECOMP(cr) && !MASTER(cr))) ?
+ NULL : state_global->x);
+
+ if (DEFORM(*ir))
+ {
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+ set_deform_reference_box(upd,
+ deform_init_init_step_tpx,
+ deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+ }
+
+ {
+ double io = compute_io(ir, top_global->natoms, groups, mdebin->ebin->nener, 1);
+ if ((io > 2000) && MASTER(cr))
+ {
+ fprintf(stderr,
+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+ io);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ top = dd_init_local_top(top_global);
+
+ snew(state, 1);
+ dd_init_local_state(cr->dd, state_global, state);
+
+ if (DDMASTER(cr->dd) && ir->nstfout)
+ {
+ snew(f_global, state_global->natoms);
+ }
+ }
+ else
+ {
+ if (PAR(cr))
+ {
+ /* Initialize the particle decomposition and split the topology */
+ top = split_system(fplog, top_global, ir, cr);
+
+ pd_cg_range(cr, &fr->cg0, &fr->hcg);
+ pd_at_range(cr, &a0, &a1);
+ }
+ else
+ {
+ top = gmx_mtop_generate_local_top(top_global, ir);
+
+ a0 = 0;
+ a1 = top_global->natoms;
+ }
+
+ forcerec_set_excl_load(fr, top, cr);
+
+ state = partdec_init_local_state(cr, state_global);
+ f_global = f;
+
+ atoms2md(top_global, ir, 0, NULL, a0, a1-a0, mdatoms);
+
+ if (vsite)
+ {
+ set_vsite_top(vsite, top, mdatoms, cr);
+ }
+
+ if (ir->ePBC != epbcNONE && !fr->bMolPBC)
+ {
+ graph = mk_graph(fplog, &(top->idef), 0, top_global->natoms, FALSE, FALSE);
+ }
+
+ if (shellfc)
+ {
+ make_local_shells(cr, mdatoms, shellfc);
+ }
+
+ init_bonded_thread_force_reduction(fr, &top->idef);
+
+ if (ir->pull && PAR(cr))
+ {
+ dd_make_local_pull_groups(NULL, ir->pull, mdatoms);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* Distribute the charge groups over the nodes from the master node */
+ dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
+ state_global, top_global, ir,
+ state, &f, mdatoms, top, fr,
+ vsite, shellfc, constr,
+ nrnb, wcycle, FALSE);
+
+ }
+
+ update_mdatoms(mdatoms, state->lambda[efptMASS]);
+
+ if (opt2bSet("-cpi", nfile, fnm))
+ {
+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr);
+ }
+ else
+ {
+ bStateFromCP = FALSE;
+ }
+
+ if (MASTER(cr))
+ {
+ if (bStateFromCP)
+ {
+ /* Update mdebin with energy history if appending to output files */
+ if (Flags & MD_APPENDFILES)
+ {
+ restore_energyhistory_from_state(mdebin, &state_global->enerhist);
+ }
+ else
+ {
+ /* We might have read an energy history from checkpoint,
+ * free the allocated memory and reset the counts.
+ */
+ done_energyhistory(&state_global->enerhist);
+ init_energyhistory(&state_global->enerhist);
+ }
+ }
+ /* Set the initial energy history in state by updating once */
+ update_energyhistory(&state_global->enerhist, mdebin);
+ }
+
+ if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+ {
+ /* Set the random state if we read a checkpoint file */
+ set_stochd_state(upd, state);
+ }
+
+ if (state->flags & (1<<estMC_RNG))
+ {
+ set_mc_state(mcrng, state);
+ }
+
+ /* Initialize constraints */
+ if (constr)
+ {
+ if (!DOMAINDECOMP(cr))
+ {
+ set_constraints(constr, top, ir, mdatoms, cr);
+ }
+ }
+
+ /* Check whether we have to do GCT stuff */
+ bTCR = ftp2bSet(efGCT, nfile, fnm);
+ if (bTCR)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "Will do General Coupling Theory!\n");
+ }
+ gnx = top_global->mols.nr;
+ snew(grpindex, gnx);
+ for (i = 0; (i < gnx); i++)
+ {
+ grpindex[i] = i;
+ }
+ }
+
+ if (repl_ex_nst > 0)
+ {
+ /* We need to be sure replica exchange can only occur
+ * when the energies are current */
+ check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
+ "repl_ex_nst", &repl_ex_nst);
+ /* This check needs to happen before inter-simulation
+ * signals are initialized, too */
+ }
+ if (repl_ex_nst > 0 && MASTER(cr))
+ {
+ repl_ex = init_replica_exchange(fplog, cr->ms, state_global, ir,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed);
+ }
+
+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
+ * With perturbed charges with soft-core we should not change the cut-off.
+ */
+ if ((Flags & MD_TUNEPME) &&
+ EEL_PME(fr->eeltype) &&
+ ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
+ !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) &&
+ !bRerunMD)
+ {
+ pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
+ cycles_pmes = 0;
+ if (cr->duty & DUTY_PME)
+ {
+ /* Start tuning right away, as we can't measure the load */
+ bPMETuneRunning = TRUE;
+ }
+ else
+ {
+ /* Separate PME nodes, we can measure the PP/PME load balance */
+ bPMETuneTry = TRUE;
+ }
+ }
+
+ if (!ir->bContinuation && !bRerunMD)
+ {
+ if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+ {
+ /* Set the velocities of frozen particles to zero */
+ for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
+ {
+ for (m = 0; m < DIM; m++)
+ {
+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+ {
+ state->v[i][m] = 0;
+ }
+ }
+ }
+ }
+
+ if (constr)
+ {
+ /* Constrain the initial coordinates and velocities */
+ do_constrain_first(fplog, constr, ir, mdatoms, state, f,
+ graph, cr, nrnb, fr, top, shake_vir);
+ }
+ if (vsite)
+ {
+ /* Construct the virtual sites for the initial configuration */
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, NULL,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ }
+ }
+
+ debug_gmx();
+
- if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
++ /* set free energy calculation frequency as the
++ greatest common divisor of nstdhdl, nstexpanded, and repl_ex_nst */
+ nstfep = ir->fepvals->nstdhdl;
- nstfep = ir->expandedvals->nstexpanded;
++ if (ir->bExpanded)
+ {
- if (repl_ex_nst > 0 && nstfep > repl_ex_nst)
++ nstfep = gmx_greatest_common_divisor(ir->fepvals->nstdhdl,nstfep);
+ }
- nstfep = repl_ex_nst;
++ if (repl_ex_nst > 0)
+ {
++ nstfep = gmx_greatest_common_divisor(repl_ex_nst,nstfep);
+ }
+
+ /* I'm assuming we need global communication the first time! MRS */
+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM : 0)
+ | (bVV ? CGLO_PRESSURE : 0)
+ | (bVV ? CGLO_CONSTRAINT : 0)
+ | (bRerunMD ? CGLO_RERUNMD : 0)
+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN : 0));
+
+ bSumEkinhOld = FALSE;
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld, cglo_flags);
+ if (ir->eI == eiVVAK)
+ {
+ /* a second call to get the half step temperature initialized as well */
+ /* we do the same call as above, but turn the pressure off -- internally to
+ compute_globals, this is recognized as a velocity verlet half-step
+ kinetic energy calculation. This minimized excess variables, but
+ perhaps loses some logic?*/
+
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ NULL, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags &~(CGLO_STOPCM | CGLO_PRESSURE));
+ }
+
+ /* Calculate the initial half step temperature, and save the ekinh_old */
+ if (!(Flags & MD_STARTFROMCPT))
+ {
+ for (i = 0; (i < ir->opts.ngtc); i++)
+ {
+ copy_mat(ekind->tcstat[i].ekinh, ekind->tcstat[i].ekinh_old);
+ }
+ }
+ if (ir->eI != eiVV)
+ {
+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+ and there is no previous step */
+ }
+
+ /* if using an iterative algorithm, we need to create a working directory for the state. */
+ if (bIterativeCase)
+ {
+ bufstate = init_bufstate(state);
+ }
+ if (bFFscan)
+ {
+ snew(xcopy, state->natoms);
+ snew(vcopy, state->natoms);
+ copy_rvecn(state->x, xcopy, 0, state->natoms);
+ copy_rvecn(state->v, vcopy, 0, state->natoms);
+ copy_mat(state->box, boxcopy);
+ }
+
+ /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+ temperature control */
+ trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
+
+ if (MASTER(cr))
+ {
+ if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+ {
+ fprintf(fplog,
+ "RMS relative constraint deviation after constraining: %.2e\n",
+ constr_rmsd(constr, FALSE));
+ }
+ if (EI_STATE_VELOCITY(ir->eI))
+ {
+ fprintf(fplog, "Initial temperature: %g K\n", enerd->term[F_TEMP]);
+ }
+ if (bRerunMD)
+ {
+ fprintf(stderr, "starting md rerun '%s', reading coordinates from"
+ " input trajectory '%s'\n\n",
+ *(top_global->name), opt2fn("-rerun", nfile, fnm));
+ if (bVerbose)
+ {
+ fprintf(stderr, "Calculated time to finish depends on nsteps from "
+ "run input file,\nwhich may not correspond to the time "
+ "needed to process input trajectory.\n\n");
+ }
+ }
+ else
+ {
+ char tbuf[20];
+ fprintf(stderr, "starting mdrun '%s'\n",
+ *(top_global->name));
+ if (ir->nsteps >= 0)
+ {
+ sprintf(tbuf, "%8.1f", (ir->init_step+ir->nsteps)*ir->delta_t);
+ }
+ else
+ {
+ sprintf(tbuf, "%s", "infinite");
+ }
+ if (ir->init_step > 0)
+ {
+ fprintf(stderr, "%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+ gmx_step_str(ir->init_step+ir->nsteps, sbuf), tbuf,
+ gmx_step_str(ir->init_step, sbuf2),
+ ir->init_step*ir->delta_t);
+ }
+ else
+ {
+ fprintf(stderr, "%s steps, %s ps.\n",
+ gmx_step_str(ir->nsteps, sbuf), tbuf);
+ }
+ }
+ fprintf(fplog, "\n");
+ }
+
+ /* Set and write start time */
+ runtime_start(runtime);
+ print_date_and_time(fplog, cr->nodeid, "Started mdrun", runtime);
+ wallcycle_start(wcycle, ewcRUN);
+ if (fplog)
+ {
+ fprintf(fplog, "\n");
+ }
+
+ /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */
+#ifdef GMX_FAHCORE
+ chkpt_ret = fcCheckPointParallel( cr->nodeid,
+ NULL, 0);
+ if (chkpt_ret == 0)
+ {
+ gmx_fatal( 3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0 );
+ }
+#endif
+
+ debug_gmx();
+ /***********************************************************
+ *
+ * Loop over MD steps
+ *
+ ************************************************************/
+
+ /* if rerunMD then read coordinates and velocities from input trajectory */
+ if (bRerunMD)
+ {
+ if (getenv("GMX_FORCE_UPDATE"))
+ {
+ bForceUpdate = TRUE;
+ }
+
+ rerun_fr.natoms = 0;
+ if (MASTER(cr))
+ {
+ bNotLastFrame = read_first_frame(oenv, &status,
+ opt2fn("-rerun", nfile, fnm),
+ &rerun_fr, TRX_NEED_X | TRX_READ_V);
+ if (rerun_fr.natoms != top_global->natoms)
+ {
+ gmx_fatal(FARGS,
+ "Number of atoms in trajectory (%d) does not match the "
+ "run input file (%d)\n",
+ rerun_fr.natoms, top_global->natoms);
+ }
+ if (ir->ePBC != epbcNONE)
+ {
+ if (!rerun_fr.bBox)
+ {
+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f does not contain a box, while pbc is used", rerun_fr.step, rerun_fr.time);
+ }
+ if (max_cutoff2(ir->ePBC, rerun_fr.box) < sqr(fr->rlistlong))
+ {
+ gmx_fatal(FARGS, "Rerun trajectory frame step %d time %f has too small box dimensions", rerun_fr.step, rerun_fr.time);
+ }
+ }
+ }
+
+ if (PAR(cr))
+ {
+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+ }
+
+ if (ir->ePBC != epbcNONE)
+ {
+ /* Set the shift vectors.
+ * Necessary here when we have a static box different from the tpr box.
+ */
+ calc_shifts(rerun_fr.box, fr->shift_vec);
+ }
+ }
+
+ /* loop over MD steps or if rerunMD to end of input trajectory */
+ bFirstStep = TRUE;
+ /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ bStateFromTPX = !bStateFromCP;
+ bInitStep = bFirstStep && (bStateFromTPX || bVV);
+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+ bLastStep = FALSE;
+ bSumEkinhOld = FALSE;
+ bExchanged = FALSE;
+
+ init_global_signals(&gs, cr, ir, repl_ex_nst);
+
+ step = ir->init_step;
+ step_rel = 0;
+
+ if (ir->nstlist == -1)
+ {
+ init_nlistheuristics(&nlh, bGStatEveryStep, step);
+ }
+
+ if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
+ {
+ /* check how many steps are left in other sims */
+ multisim_nsteps = get_multisim_nsteps(cr, ir->nsteps);
+ }
+
+
+ /* and stop now if we should */
+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+ while (!bLastStep || (bRerunMD && bNotLastFrame))
+ {
+
+ wallcycle_start(wcycle, ewcSTEP);
+
+ if (bRerunMD)
+ {
+ if (rerun_fr.bStep)
+ {
+ step = rerun_fr.step;
+ step_rel = step - ir->init_step;
+ }
+ if (rerun_fr.bTime)
+ {
+ t = rerun_fr.time;
+ }
+ else
+ {
+ t = step;
+ }
+ }
+ else
+ {
+ bLastStep = (step_rel == ir->nsteps);
+ t = t0 + step*ir->delta_t;
+ }
+
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value,
+ requiring different logic. */
+
+ set_current_lambdas(step, ir->fepvals, bRerunMD, &rerun_fr, state_global, state, lam0);
+ bDoDHDL = do_per_step(step, ir->fepvals->nstdhdl);
+ bDoFEP = (do_per_step(step, nstfep) && (ir->efep != efepNO));
+ bDoExpanded = (do_per_step(step, ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+ }
+
+ if (bSimAnn)
+ {
+ update_annealing_target_temp(&(ir->opts), t);
+ }
+
+ if (bRerunMD)
+ {
+ if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.x[i], state_global->x[i]);
+ }
+ if (rerun_fr.bV)
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.v[i], state_global->v[i]);
+ }
+ }
+ else
+ {
+ for (i = 0; i < state_global->natoms; i++)
+ {
+ clear_rvec(state_global->v[i]);
+ }
+ if (bRerunWarnNoV)
+ {
+ fprintf(stderr, "\nWARNING: Some frames do not contain velocities.\n"
+ " Ekin, temperature and pressure are incorrect,\n"
+ " the virial will be incorrect when constraints are present.\n"
+ "\n");
+ bRerunWarnNoV = FALSE;
+ }
+ }
+ }
+ copy_mat(rerun_fr.box, state_global->box);
+ copy_mat(state_global->box, state->box);
+
+ if (vsite && (Flags & MD_RERUN_VSITE))
+ {
+ if (DOMAINDECOMP(cr))
+ {
+ gmx_fatal(FARGS, "Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+ }
+ if (graph)
+ {
+ /* Following is necessary because the graph may get out of sync
+ * with the coordinates if we only have every N'th coordinate set
+ */
+ mk_mshift(fplog, graph, fr->ePBC, state->box, state->x);
+ shift_self(graph, state->box, state->x);
+ }
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+ if (graph)
+ {
+ unshift_self(graph, state->box, state->x);
+ }
+ }
+ }
+
+ /* Stop Center of Mass motion */
+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step, ir->nstcomm));
+
+ /* Copy back starting coordinates in case we're doing a forcefield scan */
+ if (bFFscan)
+ {
+ for (ii = 0; (ii < state->natoms); ii++)
+ {
+ copy_rvec(xcopy[ii], state->x[ii]);
+ copy_rvec(vcopy[ii], state->v[ii]);
+ }
+ copy_mat(boxcopy, state->box);
+ }
+
+ if (bRerunMD)
+ {
+ /* for rerun MD always do Neighbour Searching */
+ bNS = (bFirstStep || ir->nstlist != 0);
+ bNStList = bNS;
+ }
+ else
+ {
+ /* Determine whether or not to do Neighbour Searching and LR */
+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
+
+ bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
+ (ir->nstlist == -1 && nlh.nabnsb > 0));
+
+ if (bNS && ir->nstlist == -1)
+ {
+ set_nlistheuristics(&nlh, bFirstStep || bExchanged || bDoFEP, step);
+ }
+ }
+
+ /* check whether we should stop because another simulation has
+ stopped. */
+ if (MULTISIM(cr))
+ {
+ if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) &&
+ (multisim_nsteps != ir->nsteps) )
+ {
+ if (bNS)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr,
+ "Stopping simulation %d because another one has finished\n",
+ cr->ms->sim);
+ }
+ bLastStep = TRUE;
+ gs.sig[eglsCHKPT] = 1;
+ }
+ }
+ }
+
+ /* < 0 means stop at next step, > 0 means stop at next NS step */
+ if ( (gs.set[eglsSTOPCOND] < 0) ||
+ ( (gs.set[eglsSTOPCOND] > 0) && (bNStList || ir->nstlist == 0) ) )
+ {
+ bLastStep = TRUE;
+ }
+
+ /* Determine whether or not to update the Born radii if doing GB */
+ bBornRadii = bFirstStep;
+ if (ir->implicit_solvent && (step % ir->nstgbradii == 0))
+ {
+ bBornRadii = TRUE;
+ }
+
+ do_log = do_per_step(step, ir->nstlog) || bFirstStep || bLastStep;
+ do_verbose = bVerbose &&
+ (step % stepout == 0 || bFirstStep || bLastStep);
+
+ if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+ {
+ if (bRerunMD)
+ {
+ bMasterState = TRUE;
+ }
+ else
+ {
+ bMasterState = FALSE;
+ /* Correct the new box if it is too skewed */
+ if (DYNAMIC_BOX(*ir))
+ {
+ if (correct_box(fplog, step, state->box, graph))
+ {
+ bMasterState = TRUE;
+ }
+ }
+ if (DOMAINDECOMP(cr) && bMasterState)
+ {
+ dd_collect_state(cr->dd, state, state_global);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* Repartition the domain decomposition */
+ wallcycle_start(wcycle, ewcDOMDEC);
+ dd_partition_system(fplog, step, cr,
+ bMasterState, nstglobalcomm,
+ state_global, top_global, ir,
+ state, &f, mdatoms, top, fr,
+ vsite, shellfc, constr,
+ nrnb, wcycle,
+ do_verbose && !bPMETuneRunning);
+ wallcycle_stop(wcycle, ewcDOMDEC);
+ /* If using an iterative integrator, reallocate space to match the decomposition */
+ }
+ }
+
+ if (MASTER(cr) && do_log && !bFFscan)
+ {
+ print_ebin_header(fplog, step, t, state->lambda[efptFEP]); /* can we improve the information printed here? */
+ }
+
+ if (ir->efep != efepNO)
+ {
+ update_mdatoms(mdatoms, state->lambda[efptMASS]);
+ }
+
+ if ((bRerunMD && rerun_fr.bV) || bExchanged)
+ {
+
+ /* We need the kinetic energy at minus the half step for determining
+ * the full step kinetic energy and possibly for T-coupling.*/
+ /* This may not be quite working correctly yet . . . . */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ clear_mat(force_vir);
+
+ /* Ionize the atoms if necessary */
+ if (bIonize)
+ {
+ ionize(fplog, oenv, mdatoms, top_global, t, ir, state->x, state->v,
+ mdatoms->start, mdatoms->start+mdatoms->homenr, state->box, cr);
+ }
+
+ /* Update force field in ffscan program */
+ if (bFFscan)
+ {
+ if (update_forcefield(fplog,
+ nfile, fnm, fr,
+ mdatoms->nr, state->x, state->box))
+ {
+ gmx_finalize_par();
+
+ exit(0);
+ }
+ }
+
+ /* We write a checkpoint at this MD step when:
+ * either at an NS step when we signalled through gs,
+ * or at the last step (but not when we do not want confout),
+ * but never at the first step or with rerun.
+ */
+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+ (bLastStep && (Flags & MD_CONFOUT))) &&
+ step > ir->init_step && !bRerunMD);
+ if (bCPT)
+ {
+ gs.set[eglsCHKPT] = 0;
+ }
+
+ /* Determine the energy and pressure:
+ * at nstcalcenergy steps and at energy output steps (set below).
+ */
+ if (EI_VV(ir->eI) && (!bInitStep))
+ {
+ /* for vv, the first half of the integration actually corresponds
+ to the previous step. bCalcEner is only required to be evaluated on the 'next' step,
+ but the virial needs to be calculated on both the current step and the 'next' step. Future
+ reorganization may be able to get rid of one of the bCalcVir=TRUE steps. */
+
+ bCalcEner = do_per_step(step-1, ir->nstcalcenergy);
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && (do_per_step(step, ir->nstpcouple) || do_per_step(step-1, ir->nstpcouple)));
+ }
+ else
+ {
+ bCalcEner = do_per_step(step, ir->nstcalcenergy);
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && do_per_step(step, ir->nstpcouple));
+ }
+
+ /* Do we need global communication ? */
+ bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+
+ do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
+
+ if (do_ene || do_log)
+ {
+ bCalcVir = TRUE;
+ bCalcEner = TRUE;
+ bGStat = TRUE;
+ }
+
+ /* these CGLO_ options remain the same throughout the iteration */
+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+ (bGStat ? CGLO_GSTAT : 0)
+ );
+
+ force_flags = (GMX_FORCE_STATECHANGED |
+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+ GMX_FORCE_ALLFORCES |
+ GMX_FORCE_SEPLRF |
+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ (bDoFEP ? GMX_FORCE_DHDL : 0)
+ );
+
+ if (fr->bTwinRange)
+ {
+ if (do_per_step(step, ir->nstcalclr))
+ {
+ force_flags |= GMX_FORCE_DO_LR;
+ }
+ }
+
+ if (shellfc)
+ {
+ /* Now is the time to relax the shells */
+ count = relax_shell_flexcon(fplog, cr, bVerbose, bFFscan ? step+1 : step,
+ ir, bNS, force_flags,
+ bStopCM, top, top_global,
+ constr, enerd, fcd,
+ state, f, force_vir, mdatoms,
+ nrnb, wcycle, graph, groups,
+ shellfc, fr, bBornRadii, t, mu_tot,
+ state->natoms, &bConverged, vsite,
+ outf->fp_field);
+ tcount += count;
+
+ if (bConverged)
+ {
+ nconverged++;
+ }
+ }
+ else
+ {
+ /* The coordinates (x) are shifted (to get whole molecules)
+ * in do_force.
+ * This is parallelized as well, and does communication too.
+ * Check comments in sim_util.c
+ */
+ do_force(fplog, cr, ir, step, nrnb, wcycle, top, top_global, groups,
+ state->box, state->x, &state->hist,
+ f, force_vir, mdatoms, enerd, fcd,
+ state->lambda, graph,
+ fr, vsite, mu_tot, t, outf->fp_field, ed, bBornRadii,
+ (bNS ? GMX_FORCE_NS : 0) | force_flags);
+ }
+
+ if (bTCR)
+ {
+ mu_aver = calc_mu_aver(cr, state->x, mdatoms->chargeA,
+ mu_tot, &top_global->mols, mdatoms, gnx, grpindex);
+ }
+
+ if (bTCR && bFirstStep)
+ {
+ tcr = init_coupling(fplog, nfile, fnm, cr, fr, mdatoms, &(top->idef));
+ fprintf(fplog, "Done init_coupling\n");
+ fflush(fplog);
+ }
+
+ if (bVV && !bStartingFromCpt && !bRerunMD)
+ /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+ {
+ if (ir->eI == eiVV && bInitStep)
+ {
+ /* if using velocity verlet with full time step Ekin,
+ * take the first half step only to compute the
+ * virial for the first step. From there,
+ * revert back to the initial coordinates
+ * so that the input is actually the initial step.
+ */
+ copy_rvecn(state->v, cbuf, 0, state->natoms); /* should make this better for parallelizing? */
+ }
+ else
+ {
+ /* this is for NHC in the Ekin(t+dt/2) version of vv */
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ1);
+ }
+
+ /* If we are using twin-range interactions where the long-range component
+ * is only evaluated every nstcalclr>1 steps, we should do a special update
+ * step to combine the long-range forces on these steps.
+ * For nstcalclr=1 this is not done, since the forces would have been added
+ * directly to the short-range forces already.
+ */
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC,
+ f, bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtVELOCITY1,
+ cr, nrnb, constr, &top->idef);
+
+ if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
+ {
+ gmx_iterate_init(&iterate, TRUE);
+ }
+ /* for iterations, we save these vectors, as we will be self-consistently iterating
+ the calculations */
+
+ /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
+
+ /* save the state */
+ if (iterate.bIterationActive)
+ {
+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+ }
+
+ bFirstIterate = TRUE;
+ while (bFirstIterate || iterate.bIterationActive)
+ {
+ if (iterate.bIterationActive)
+ {
+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+ if (bFirstIterate && bTrotter)
+ {
+ /* The first time through, we need a decent first estimate
+ of veta(t+dt) to compute the constraints. Do
+ this by computing the box volume part of the
+ trotter integration at this time. Nothing else
+ should be changed by this routine here. If
+ !(first time), we start with the previous value
+ of veta. */
+
+ veta_save = state->veta;
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
+ vetanew = state->veta;
+ state->veta = veta_save;
+ }
+ }
+
+ bOK = TRUE;
+ if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */
+ {
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, shake_vir, NULL,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, TRUE, bCalcVir, vetanew);
+
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ }
+
+ }
+ else if (graph)
+ {
+ /* Need to unshift here if a do_force has been
+ called in the previous step */
+ unshift_self(graph, state->box, state->x);
+ }
+
+ /* if VV, compute the pressure and constraints */
+ /* For VV2, we strictly only need this if using pressure
+ * control, but we really would like to have accurate pressures
+ * printed out.
+ * Think about ways around this in the future?
+ * For now, keep this choice in comments.
+ */
+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+ bPres = TRUE;
+ bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+ if (bCalcEner && ir->eI == eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/
+ {
+ bSumEkinhOld = TRUE;
+ }
+ /* for vv, the first half of the integration actually corresponds to the previous step.
+ So we need information from the last step in the first half of the integration */
+ if (bGStat || do_per_step(step-1, nstglobalcomm))
+ {
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags
+ | CGLO_ENERGY
+ | (bTemp ? CGLO_TEMPERATURE : 0)
+ | (bPres ? CGLO_PRESSURE : 0)
+ | (bPres ? CGLO_CONSTRAINT : 0)
+ | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_SCALEEKIN
+ );
+ /* explanation of above:
+ a) We compute Ekin at the full time step
+ if 1) we are using the AveVel Ekin, and it's not the
+ initial step, or 2) if we are using AveEkin, but need the full
+ time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+ EkinAveVel because it's needed for the pressure */
+ }
+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+ if (!bInitStep)
+ {
+ if (bTrotter)
+ {
+ m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
+ }
+ else
+ {
+ if (bExchanged)
+ {
+
+ /* We need the kinetic energy at minus the half step for determining
+ * the full step kinetic energy and possibly for T-coupling.*/
+ /* This may not be quite working correctly yet . . . . */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+ constr, NULL, FALSE, state->box,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ }
+ }
+
+ if (iterate.bIterationActive &&
+ done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ state->veta, &vetanew))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (bTrotter && !bInitStep)
+ {
+ copy_mat(shake_vir, state->svir_prev);
+ copy_mat(force_vir, state->fvir_prev);
+ if (IR_NVT_TROTTER(ir) && ir->eI == eiVV)
+ {
+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts), ekind, NULL, (ir->eI == eiVV), FALSE, FALSE);
+ enerd->term[F_EKIN] = trace(ekind->ekin);
+ }
+ }
+ /* if it's the initial step, we performed this first step just to get the constraint virial */
+ if (bInitStep && ir->eI == eiVV)
+ {
+ copy_rvecn(cbuf, state->v, 0, state->natoms);
+ }
+ }
+
+ /* MRS -- now done iterating -- compute the conserved quantity */
+ if (bVV)
+ {
+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
+ if (ir->eI == eiVV)
+ {
+ last_ekin = enerd->term[F_EKIN];
+ }
+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+ {
+ saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ }
+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */
+ if (!bRerunMD)
+ {
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ }
+
+ /* ######## END FIRST UPDATE STEP ############## */
+ /* ######## If doing VV, we now have v(dt) ###### */
+ if (bDoExpanded)
+ {
+ /* perform extended ensemble sampling in lambda - we don't
+ actually move to the new state before outputting
+ statistics, but if performing simulated tempering, we
+ do update the velocities and the tau_t. */
+
+ lamnew = ExpandedEnsembleDynamics(fplog, ir, enerd, state, &MassQ, &df_history, step, mcrng, state->v, mdatoms);
+ }
+ /* ################## START TRAJECTORY OUTPUT ################# */
+
+ /* Now we have the energies and forces corresponding to the
+ * coordinates at time t. We must output all of this before
+ * the update.
+ * for RerunMD t is read from input trajectory
+ */
+ mdof_flags = 0;
+ if (do_per_step(step, ir->nstxout))
+ {
+ mdof_flags |= MDOF_X;
+ }
+ if (do_per_step(step, ir->nstvout))
+ {
+ mdof_flags |= MDOF_V;
+ }
+ if (do_per_step(step, ir->nstfout))
+ {
+ mdof_flags |= MDOF_F;
+ }
+ if (do_per_step(step, ir->nstxtcout))
+ {
+ mdof_flags |= MDOF_XTC;
+ }
+ if (bCPT)
+ {
+ mdof_flags |= MDOF_CPT;
+ }
+ ;
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+ if (bLastStep)
+ {
+ /* Enforce writing positions and velocities at end of run */
+ mdof_flags |= (MDOF_X | MDOF_V);
+ }
+#endif
+#ifdef GMX_FAHCORE
+ if (MASTER(cr))
+ {
+ fcReportProgress( ir->nsteps, step );
+ }
+
+ /* sync bCPT and fc record-keeping */
+ if (bCPT && MASTER(cr))
+ {
+ fcRequestCheckPoint();
+ }
+#endif
+
+ if (mdof_flags != 0)
+ {
+ wallcycle_start(wcycle, ewcTRAJ);
+ if (bCPT)
+ {
+ if (state->flags & (1<<estLD_RNG))
+ {
+ get_stochd_state(upd, state);
+ }
+ if (state->flags & (1<<estMC_RNG))
+ {
+ get_mc_state(mcrng, state);
+ }
+ if (MASTER(cr))
+ {
+ if (bSumEkinhOld)
+ {
+ state_global->ekinstate.bUpToDate = FALSE;
+ }
+ else
+ {
+ update_ekinstate(&state_global->ekinstate, ekind);
+ state_global->ekinstate.bUpToDate = TRUE;
+ }
+ update_energyhistory(&state_global->enerhist, mdebin);
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+ structured so this isn't necessary.
+ Note this reassignment is only necessary
+ for single threads.*/
+ copy_df_history(&state_global->dfhist, &df_history);
+ }
+ }
+ }
+ write_traj(fplog, cr, outf, mdof_flags, top_global,
+ step, t, state, state_global, f, f_global, &n_xtc, &x_xtc);
+ if (bCPT)
+ {
+ nchkpt++;
+ bCPT = FALSE;
+ }
+ debug_gmx();
+ if (bLastStep && step_rel == ir->nsteps &&
+ (Flags & MD_CONFOUT) && MASTER(cr) &&
+ !bRerunMD && !bFFscan)
+ {
+ /* x and v have been collected in write_traj,
+ * because a checkpoint file will always be written
+ * at the last step.
+ */
+ fprintf(stderr, "\nWriting final coordinates.\n");
+ if (fr->bMolPBC)
+ {
+ /* Make molecules whole only for confout writing */
+ do_pbc_mtop(fplog, ir->ePBC, state->box, top_global, state_global->x);
+ }
+ write_sto_conf_mtop(ftp2fn(efSTO, nfile, fnm),
+ *top_global->name, top_global,
+ state_global->x, state_global->v,
+ ir->ePBC, state->box);
+ debug_gmx();
+ }
+ wallcycle_stop(wcycle, ewcTRAJ);
+ }
+
+ /* kludge -- virial is lost with restart for NPT control. Must restart */
+ if (bStartingFromCpt && bVV)
+ {
+ copy_mat(state->svir_prev, shake_vir);
+ copy_mat(state->fvir_prev, force_vir);
+ }
+ /* ################## END TRAJECTORY OUTPUT ################ */
+
+ /* Determine the wallclock run time up till now */
+ run_time = gmx_gettime() - (double)runtime->real;
+
+ /* Check whether everything is still all right */
+ if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#ifdef GMX_THREAD_MPI
+ && MASTER(cr)
+#endif
+ )
+ {
+ /* this just makes gs.sig compatible with the hack
+ of sending signals around by MPI_Reduce together with
+ other floats */
+ if (gmx_get_stop_condition() == gmx_stop_cond_next_ns)
+ {
+ gs.sig[eglsSTOPCOND] = 1;
+ }
+ if (gmx_get_stop_condition() == gmx_stop_cond_next)
+ {
+ gs.sig[eglsSTOPCOND] = -1;
+ }
+ /* < 0 means stop at next step, > 0 means stop at next NS step */
+ if (fplog)
+ {
+ fprintf(fplog,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ fflush(fplog);
+ }
+ fprintf(stderr,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND] == 1 ? "NS " : "");
+ fflush(stderr);
+ handled_stop_condition = (int)gmx_get_stop_condition();
+ }
+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+ (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+ {
+ /* Signal to terminate the run */
+ gs.sig[eglsSTOPCOND] = 1;
+ if (fplog)
+ {
+ fprintf(fplog, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ }
+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n", gmx_step_str(step, sbuf), max_hours*0.99);
+ }
+
+ if (bResetCountersHalfMaxH && MASTER(cr) &&
+ run_time > max_hours*60.0*60.0*0.495)
+ {
+ gs.sig[eglsRESETCOUNTERS] = 1;
+ }
+
+ if (ir->nstlist == -1 && !bRerunMD)
+ {
+ /* When bGStatEveryStep=FALSE, global_stat is only called
+ * when we check the atom displacements, not at NS steps.
+ * This means that also the bonded interaction count check is not
+ * performed immediately after NS. Therefore a few MD steps could
+ * be performed with missing interactions.
+ * But wrong energies are never written to file,
+ * since energies are only written after global_stat
+ * has been called.
+ */
+ if (step >= nlh.step_nscheck)
+ {
+ nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
+ nlh.scale_tot, state->x);
+ }
+ else
+ {
+ /* This is not necessarily true,
+ * but step_nscheck is determined quite conservatively.
+ */
+ nlh.nabnsb = 0;
+ }
+ }
+
+ /* In parallel we only have to check for checkpointing in steps
+ * where we do global communication,
+ * otherwise the other nodes don't know.
+ */
+ if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+ cpt_period >= 0 &&
+ (cpt_period == 0 ||
+ run_time >= nchkpt*cpt_period*60.0)) &&
+ gs.set[eglsCHKPT] == 0)
+ {
+ gs.sig[eglsCHKPT] = 1;
+ }
+
+ /* at the start of step, randomize or scale the velocities (trotter done elsewhere) */
+ if (EI_VV(ir->eI))
+ {
+ if (!bInitStep)
+ {
+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ }
+ if (ETC_ANDERSEN(ir->etc)) /* keep this outside of update_tcouple because of the extra info required to pass */
+ {
+ gmx_bool bIfRandomize;
+ bIfRandomize = update_randomize_velocities(ir, step, mdatoms, state, upd, &top->idef, constr);
+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ if (constr && bIfRandomize)
+ {
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, tmp_vir, NULL,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, TRUE, bCalcVir, vetanew);
+ }
+ }
+ }
+
+ if (bIterativeCase && do_per_step(step, ir->nstpcouple))
+ {
+ gmx_iterate_init(&iterate, TRUE);
+ /* for iterations, we save these vectors, as we will be redoing the calculations */
+ copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+ }
+
+ bFirstIterate = TRUE;
+ while (bFirstIterate || iterate.bIterationActive)
+ {
+ /* We now restore these vectors to redo the calculation with improved extended variables */
+ if (iterate.bIterationActive)
+ {
+ copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+ }
+
+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+ so scroll down for that logic */
+
+ /* ######### START SECOND UPDATE STEP ################# */
+ /* Box is changed in update() when we do pressure coupling,
+ * but we should still use the old box for energy corrections and when
+ * writing it to the energy file, so it matches the trajectory files for
+ * the same timestep above. Make a copy in a separate array.
+ */
+ copy_mat(state->box, lastbox);
+
+ bOK = TRUE;
+ dvdl_constr = 0;
+
+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+ {
+ wallcycle_start(wcycle, ewcUPDATE);
+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ if (bTrotter)
+ {
+ if (iterate.bIterationActive)
+ {
+ if (bFirstIterate)
+ {
+ scalevir = 1;
+ }
+ else
+ {
+ /* we use a new value of scalevir to converge the iterations faster */
+ scalevir = tracevir/trace(shake_vir);
+ }
+ msmul(shake_vir, scalevir, shake_vir);
+ m_add(force_vir, shake_vir, total_vir);
+ clear_mat(shake_vir);
+ }
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+ /* We can only do Berendsen coupling after we have summed
+ * the kinetic energy or virial. Since this happens
+ * in global_state after update, we should only do it at
+ * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ */
+ }
+ else
+ {
+ update_tcouple(fplog, step, ir, state, ekind, wcycle, upd, &MassQ, mdatoms);
+ update_pcouple(fplog, step, ir, state, pcoupl_mu, M, wcycle,
+ upd, bInitStep);
+ }
+
+ if (bVV)
+ {
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ /* velocity half-step update */
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, FALSE, etrtVELOCITY2,
+ cr, nrnb, constr, &top->idef);
+ }
+
+ /* Above, initialize just copies ekinh into ekin,
+ * it doesn't copy position (for VV),
+ * and entire integrator for MD.
+ */
+
+ if (ir->eI == eiVVAK)
+ {
+ copy_rvecn(state->x, cbuf, 0, state->natoms);
+ }
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
+ fr->bMolPBC, graph, f,
+ &top->idef, shake_vir, force_vir,
+ cr, nrnb, wcycle, upd, constr,
+ bInitStep, FALSE, bCalcVir, state->veta);
+
+ if (ir->eI == eiVVAK)
+ {
+ /* erase F_EKIN and F_TEMP here? */
+ /* just compute the kinetic energy at the half step to perform a trotter step */
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr, NULL, FALSE, lastbox,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags | CGLO_TEMPERATURE
+ );
+ wallcycle_start(wcycle, ewcUPDATE);
+ trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+ /* now we know the scaling, we can compute the positions again */
+ copy_rvecn(cbuf, state->x, 0, state->natoms);
+
+ bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+
+ update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+ bUpdateDoLR, fr->f_twin, fcd,
+ ekind, M, wcycle, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+ wallcycle_stop(wcycle, ewcUPDATE);
+
+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+ /* are the small terms in the shake_vir here due
+ * to numerical errors, or are they important
+ * physically? I'm thinking they are just errors, but not completely sure.
+ * For now, will call without actually constraining, constr=NULL*/
+ update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+ state, fr->bMolPBC, graph, f,
+ &top->idef, tmp_vir, force_vir,
+ cr, nrnb, wcycle, upd, NULL,
+ bInitStep, FALSE, bCalcVir,
+ state->veta);
+ }
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ }
+
+ if (fr->bSepDVDL && fplog && do_log)
+ {
+ fprintf(fplog, sepdvdlformat, "Constraint dV/dl", 0.0, dvdl_constr);
+ }
+ if (bVV)
+ {
+ /* this factor of 2 correction is necessary
+ because half of the constraint force is removed
+ in the vv step, so we have to double it. See
+ the Redmine issue #1255. It is not yet clear
+ if the factor of 2 is exact, or just a very
+ good approximation, and this will be
+ investigated. The next step is to see if this
+ can be done adding a dhdl contribution from the
+ rattle step, but this is somewhat more
+ complicated with the current code. Will be
+ investigated, hopefully for 4.6.3. However,
+ this current solution is much better than
+ having it completely wrong.
+ */
+ enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
+ }
+ else
+ {
+ enerd->term[F_DVDL_CONSTR] += dvdl_constr;
+ }
+ }
+ else if (graph)
+ {
+ /* Need to unshift here */
+ unshift_self(graph, state->box, state->x);
+ }
+
+ if (vsite != NULL)
+ {
+ wallcycle_start(wcycle, ewcVSITECONSTR);
+ if (graph != NULL)
+ {
+ shift_self(graph, state->box, state->x);
+ }
+ construct_vsites(fplog, vsite, state->x, nrnb, ir->delta_t, state->v,
+ top->idef.iparams, top->idef.il,
+ fr->ePBC, fr->bMolPBC, graph, cr, state->box);
+
+ if (graph != NULL)
+ {
+ unshift_self(graph, state->box, state->x);
+ }
+ wallcycle_stop(wcycle, ewcVSITECONSTR);
+ }
+
+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
+ /* With Leap-Frog we can skip compute_globals at
+ * non-communication steps, but we need to calculate
+ * the kinetic energy one step before communication.
+ */
+ if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+ {
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ gs.sig[eglsNABNSB] = nlh.nabnsb;
+ }
+ compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+ wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+ constr,
+ bFirstIterate ? &gs : NULL,
+ (step_rel % gs.nstms == 0) &&
+ (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+ lastbox,
+ top_global, &pcurr, top_global->natoms, &bSumEkinhOld,
+ cglo_flags
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+ | (iterate.bIterationActive ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_CONSTRAINT
+ );
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ nlh.nabnsb = gs.set[eglsNABNSB];
+ gs.set[eglsNABNSB] = 0;
+ }
+ }
+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+ /* ############# END CALC EKIN AND PRESSURE ################# */
+
+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ the virial that should probably be addressed eventually. state->veta has better properties,
+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+ if (iterate.bIterationActive &&
+ done_iterating(cr, fplog, step, &iterate, bFirstIterate,
+ trace(shake_vir), &tracevir))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (!bVV || bRerunMD)
+ {
+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+ sum_dhdl(enerd, state->lambda, ir->fepvals);
+ }
+ update_box(fplog, step, ir, mdatoms, state, graph, f,
+ ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, wcycle, upd, bInitStep, FALSE);
+
+ /* ################# END UPDATE STEP 2 ################# */
+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */
+
+ /* The coordinates (x) were unshifted in update */
+ if (bFFscan && (shellfc == NULL || bConverged))
+ {
+ if (print_forcefield(fplog, enerd->term, mdatoms->homenr,
+ f, NULL, xcopy,
+ &(top_global->mols), mdatoms->massT, pres))
+ {
+ gmx_finalize_par();
+
+ fprintf(stderr, "\n");
+ exit(0);
+ }
+ }
+ if (!bGStat)
+ {
+ /* We will not sum ekinh_old,
+ * so signal that we still have to do it.
+ */
+ bSumEkinhOld = TRUE;
+ }
+
+ if (bTCR)
+ {
+ /* Only do GCT when the relaxation of shells (minimization) has converged,
+ * otherwise we might be coupling to bogus energies.
+ * In parallel we must always do this, because the other sims might
+ * update the FF.
+ */
+
+ /* Since this is called with the new coordinates state->x, I assume
+ * we want the new box state->box too. / EL 20040121
+ */
+ do_coupling(fplog, oenv, nfile, fnm, tcr, t, step, enerd->term, fr,
+ ir, MASTER(cr),
+ mdatoms, &(top->idef), mu_aver,
+ top_global->mols.nr, cr,
+ state->box, total_vir, pres,
+ mu_tot, state->x, f, bConverged);
+ debug_gmx();
+ }
+
+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */
+
+ /* use the directly determined last velocity, not actually the averaged half steps */
+ if (bTrotter && ir->eI == eiVV)
+ {
+ enerd->term[F_EKIN] = last_ekin;
+ }
+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+ if (bVV)
+ {
+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+ }
+ else
+ {
+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir, state, &MassQ);
+ }
+ /* Check for excessively large energies */
+ if (bIonize)
+ {
+#ifdef GMX_DOUBLE
+ real etot_max = 1e200;
+#else
+ real etot_max = 1e30;
+#endif
+ if (fabs(enerd->term[F_ETOT]) > etot_max)
+ {
+ fprintf(stderr, "Energy too large (%g), giving up\n",
+ enerd->term[F_ETOT]);
+ }
+ }
+ /* ######### END PREPARING EDR OUTPUT ########### */
+
+ /* Time for performance */
+ if (((step % stepout) == 0) || bLastStep)
+ {
+ runtime_upd_proc(runtime);
+ }
+
+ /* Output stuff */
+ if (MASTER(cr))
+ {
+ gmx_bool do_dr, do_or;
+
+ if (fplog && do_log && bDoExpanded)
+ {
+ /* only needed if doing expanded ensemble */
+ PrintFreeEnergyInfoToFile(fplog, ir->fepvals, ir->expandedvals, ir->bSimTemp ? ir->simtempvals : NULL,
+ &df_history, state->fep_state, ir->nstlog, step);
+ }
+ if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+ {
+ if (bCalcEner)
+ {
+ upd_mdebin(mdebin, bDoDHDL, TRUE,
+ t, mdatoms->tmass, enerd, state,
+ ir->fepvals, ir->expandedvals, lastbox,
+ shake_vir, force_vir, total_vir, pres,
+ ekind, mu_tot, constr);
+ }
+ else
+ {
+ upd_mdebin_step(mdebin);
+ }
+
+ do_dr = do_per_step(step, ir->nstdisreout);
+ do_or = do_per_step(step, ir->nstorireout);
+
+ print_ebin(outf->fp_ene, do_ene, do_dr, do_or, do_log ? fplog : NULL,
+ step, t,
+ eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
+ }
+ if (ir->ePull != epullNO)
+ {
+ pull_print_output(ir->pull, step, t);
+ }
+
+ if (do_per_step(step, ir->nstlog))
+ {
+ if (fflush(fplog) != 0)
+ {
+ gmx_fatal(FARGS, "Cannot flush logfile - maybe you are out of disk space?");
+ }
+ }
+ }
+ if (bDoExpanded)
+ {
+ /* Have to do this part after outputting the logfile and the edr file */
+ state->fep_state = lamnew;
+ for (i = 0; i < efptNR; i++)
+ {
+ state_global->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+ }
+ }
+ /* Remaining runtime */
+ if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+ {
+ if (shellfc)
+ {
+ fprintf(stderr, "\n");
+ }
+ print_time(stderr, runtime, step, ir, cr);
+ }
+
+ /* Replica exchange */
+ bExchanged = FALSE;
+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+ do_per_step(step, repl_ex_nst))
+ {
+ bExchanged = replica_exchange(fplog, cr, repl_ex,
+ state_global, enerd,
+ state, step, t);
+
+ if (bExchanged && DOMAINDECOMP(cr))
+ {
+ dd_partition_system(fplog, step, cr, TRUE, 1,
+ state_global, top_global, ir,
+ state, &f, mdatoms, top, fr,
+ vsite, shellfc, constr,
+ nrnb, wcycle, FALSE);
+ }
+ }
+
+ bFirstStep = FALSE;
+ bInitStep = FALSE;
+ bStartingFromCpt = FALSE;
+
+ /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+ /* With all integrators, except VV, we need to retain the pressure
+ * at the current step for coupling at the next step.
+ */
+ if ((state->flags & (1<<estPRES_PREV)) &&
+ (bGStatEveryStep ||
+ (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+ {
+ /* Store the pressure in t_state for pressure coupling
+ * at the next MD step.
+ */
+ copy_mat(pres, state->pres_prev);
+ }
+
+ /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */
+
+ if ( (membed != NULL) && (!bLastStep) )
+ {
+ rescale_membed(step_rel, membed, state_global->x);
+ }
+
+ if (bRerunMD)
+ {
+ if (MASTER(cr))
+ {
+ /* read next frame from input trajectory */
+ bNotLastFrame = read_next_frame(oenv, status, &rerun_fr);
+ }
+
+ if (PAR(cr))
+ {
+ rerun_parallel_comm(cr, &rerun_fr, &bNotLastFrame);
+ }
+ }
+
+ if (!bRerunMD || !rerun_fr.bStep)
+ {
+ /* increase the MD step number */
+ step++;
+ step_rel++;
+ }
+
+ cycles = wallcycle_stop(wcycle, ewcSTEP);
+ if (DOMAINDECOMP(cr) && wcycle)
+ {
+ dd_cycles_add(cr->dd, cycles, ddCyclStep);
+ }
+
+ if (bPMETuneRunning || bPMETuneTry)
+ {
+ /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+ /* Count the total cycles over the last steps */
+ cycles_pmes += cycles;
+
+ /* We can only switch cut-off at NS steps */
+ if (step % ir->nstlist == 0)
+ {
+ /* PME grid + cut-off optimization with GPUs or PME nodes */
+ if (bPMETuneTry)
+ {
+ if (DDMASTER(cr->dd))
+ {
+ /* PME node load is too high, start tuning */
+ bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+ }
+ dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
+
+ if (bPMETuneRunning || step_rel > ir->nstlist*50)
+ {
+ bPMETuneTry = FALSE;
+ }
+ }
+ if (bPMETuneRunning)
+ {
+ /* init_step might not be a multiple of nstlist,
+ * but the first cycle is always skipped anyhow.
+ */
+ bPMETuneRunning =
+ pme_load_balance(pme_loadbal, cr,
+ (bVerbose && MASTER(cr)) ? stderr : NULL,
+ fplog,
+ ir, state, cycles_pmes,
+ fr->ic, fr->nbv, &fr->pmedata,
+ step);
+
+ /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
+ fr->ewaldcoeff = fr->ic->ewaldcoeff;
+ fr->rlist = fr->ic->rlist;
+ fr->rlistlong = fr->ic->rlistlong;
+ fr->rcoulomb = fr->ic->rcoulomb;
+ fr->rvdw = fr->ic->rvdw;
+ }
+ cycles_pmes = 0;
+ }
+ }
+
+ if (step_rel == wcycle_get_reset_counters(wcycle) ||
+ gs.set[eglsRESETCOUNTERS] != 0)
+ {
+ /* Reset all the counters related to performance over the run */
+ reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, runtime,
+ fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+ wcycle_set_reset_counters(wcycle, -1);
+ if (!(cr->duty & DUTY_PME))
+ {
+ /* Tell our PME node to reset its counters */
+ gmx_pme_send_resetcounters(cr, step);
+ }
+ /* Correct max_hours for the elapsed time */
+ max_hours -= run_time/(60.0*60.0);
+ bResetCountersHalfMaxH = FALSE;
+ gs.set[eglsRESETCOUNTERS] = 0;
+ }
+
+ }
+ /* End of main MD loop */
+ debug_gmx();
+
+ /* Stop the time */
+ runtime_end(runtime);
+
+ if (bRerunMD && MASTER(cr))
+ {
+ close_trj(status);
+ }
+
+ if (!(cr->duty & DUTY_PME))
+ {
+ /* Tell the PME only node to finish */
+ gmx_pme_send_finish(cr);
+ }
+
+ if (MASTER(cr))
+ {
+ if (ir->nstcalcenergy > 0 && !bRerunMD)
+ {
+ print_ebin(outf->fp_ene, FALSE, FALSE, FALSE, fplog, step, t,
+ eprAVER, FALSE, mdebin, fcd, groups, &(ir->opts));
+ }
+ }
+
+ done_mdoutf(outf);
+
+ debug_gmx();
+
+ if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+ {
+ fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+ fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
+ }
+
+ if (pme_loadbal != NULL)
+ {
+ pme_loadbal_done(pme_loadbal, cr, fplog,
+ fr->nbv != NULL && fr->nbv->bUseGPU);
+ }
+
+ if (shellfc && fplog)
+ {
+ fprintf(fplog, "Fraction of iterations that converged: %.2f %%\n",
+ (nconverged*100.0)/step_rel);
+ fprintf(fplog, "Average number of force evaluations per MD step: %.2f\n\n",
+ tcount/step_rel);
+ }
+
+ if (repl_ex_nst > 0 && MASTER(cr))
+ {
+ print_replica_exchange_statistics(fplog, repl_ex);
+ }
+
+ runtime->nsteps_done = step_rel;
+
+ return 0;
+}
--- /dev/null
- static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <signal.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <assert.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "statutil.h"
+#include "force.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "md_support.h"
+#include "network.h"
+#include "pull.h"
+#include "pull_rotation.h"
+#include "names.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "tpxio.h"
+#include "txtdump.h"
+#include "gmx_detect_hardware.h"
+#include "gmx_omp_nthreads.h"
+#include "pull_rotation.h"
+#include "calc_verletbuf.h"
+#include "../mdlib/nbnxn_search.h"
+#include "../mdlib/nbnxn_consts.h"
+#include "gmx_fatal_collective.h"
+#include "membed.h"
+#include "macros.h"
+#include "gmx_omp.h"
+#include "gmx_thread_affinity.h"
+
+#include "gromacs/utility/gmxmpi.h"
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+/* Wrapper around a pointer to one integrator entry point (such as do_md). */
+typedef struct {
+ gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* Table with eiNR integrator entry points.
+ * The array should match the eI array in include/types/enums.h;
+ * several entries share one function (do_md and do_tpi appear more than once).
+ */
+const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
+
+/* File-scope variables with external linkage.
+ * NOTE(review): judging by the names these hold the initial step and box
+ * read from the tpx file for box deformation; the code that reads and writes
+ * them is not visible here -- confirm before relying on this.
+ */
+gmx_large_int_t deform_init_init_step_tpx;
+matrix deform_init_box_tpx;
+#ifdef GMX_THREAD_MPI
+/* Statically initialized thread-MPI mutex; presumably guards the deform_init_*
+ * variables above when several thread-MPI threads run mdrunner() -- TODO confirm.
+ */
+tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
+#endif
+
+
+#ifdef GMX_THREAD_MPI
+/* Bundle of all mdrunner() arguments, so that they can be handed to a
+ * thread-MPI thread start function through a single void pointer.
+ * It is filled field by field in mdrunner_start_threads() and unpacked
+ * again in mdrunner_start_fn(); the fields mirror the mdrunner() parameters
+ * of the same name.
+ */
+struct mdrunner_arglist
+{
+ gmx_hw_opt_t *hw_opt;
+ FILE *fplog;
+ t_commrec *cr;
+ int nfile;
+ const t_filenm *fnm;
+ output_env_t oenv;
+ gmx_bool bVerbose;
+ gmx_bool bCompact;
+ int nstglobalcomm;
+ ivec ddxyz;
+ int dd_node_order;
+ real rdd;
+ real rconstr;
+ const char *dddlb_opt;
+ real dlb_scale;
+ const char *ddcsx;
+ const char *ddcsy;
+ const char *ddcsz;
+ const char *nbpu_opt;
+ gmx_large_int_t nsteps_cmdline;
+ int nstepout;
+ int resetstep;
+ int nmultisim;
+ int repl_ex_nst;
+ int repl_ex_nex;
+ int repl_ex_seed;
+ real pforce;
+ real cpt_period;
+ real max_hours;
+ const char *deviceOptions;
+ unsigned long Flags;
+ int ret; /* return value of mdrunner(), stored by mdrunner_start_fn() */
+};
+
+
+/* Thread entry point used when spawning thread-MPI threads.
+ * arg points to a struct mdrunner_arglist. The list is first copied by value
+ * so that this thread works on its own copy (pointed-to items are shared, not
+ * copied), then a thread-local commrec and file-name table are created and
+ * mdrunner() is called with the unpacked arguments. Only the master gets the
+ * log file handle. The mdrunner() return value is written back into the
+ * caller's argument list (field ret).
+ */
+static void mdrunner_start_fn(void *arg)
+{
+    struct mdrunner_arglist *shared = (struct mdrunner_arglist*)arg;
+    struct mdrunner_arglist  local;
+    t_filenm                *fnm_copy;
+    t_commrec               *cr_thread; /* we need a local version of this */
+    FILE                    *log;
+
+    /* Take a private snapshot of the argument list */
+    local = *shared;
+
+    fnm_copy  = dup_tfn(local.nfile, local.fnm);
+    cr_thread = init_par_threads(local.cr);
+
+    /* Non-master threads run without a log file */
+    log = MASTER(cr_thread) ? local.fplog : NULL;
+
+    shared->ret = mdrunner(local.hw_opt, log, cr_thread, local.nfile, fnm_copy,
+                           local.oenv,
+                           local.bVerbose, local.bCompact, local.nstglobalcomm,
+                           local.ddxyz, local.dd_node_order, local.rdd,
+                           local.rconstr, local.dddlb_opt, local.dlb_scale,
+                           local.ddcsx, local.ddcsy, local.ddcsz,
+                           local.nbpu_opt,
+                           local.nsteps_cmdline, local.nstepout, local.resetstep,
+                           local.nmultisim, local.repl_ex_nst, local.repl_ex_nex,
+                           local.repl_ex_seed, local.pforce,
+                           local.cpt_period, local.max_hours,
+                           local.deviceOptions, local.Flags);
+}
+
+/* Start the thread-MPI threads (including the main thread) for a
+ * thread-parallel run; called by mdrunner().
+ * When fewer than two thread-MPI threads are requested nothing is started and
+ * cr is returned unchanged. Otherwise every mdrunner() argument is packed
+ * into a heap-allocated mdrunner_arglist, tMPI is initialized so that the new
+ * threads start in mdrunner_start_fn() with that list, and a new commrec from
+ * init_par_threads() is returned to the main thread.
+ * Returns NULL when tMPI_Init_fn() does not report TMPI_SUCCESS.
+ * All options besides nthreads are the same as for mdrunner().
+ */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+                                         FILE *fplog, t_commrec *cr, int nfile,
+                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+                                         gmx_bool bCompact, int nstglobalcomm,
+                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+                                         const char *dddlb_opt, real dlb_scale,
+                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
+                                         const char *nbpu_opt,
+                                         gmx_large_int_t nsteps_cmdline,
+                                         int nstepout, int resetstep,
+                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
+                                         real pforce, real cpt_period, real max_hours,
+                                         const char *deviceOptions, unsigned long Flags)
+{
+    struct mdrunner_arglist *mda;
+
+    /* Without at least two thread-MPI threads there is no need for tMPI */
+    if (hw_opt->nthreads_tmpi < 2)
+    {
+        return cr;
+    }
+
+    /* The argument list and the duplicated file-name table below are never
+     * freed: a few small, one-time, almost unavoidable memory leaks.
+     */
+    snew(mda, 1);
+
+    /* Pack everything that has to travel through the void pointer */
+
+    /* hardware options, log file and communication record */
+    mda->hw_opt = hw_opt;
+    mda->fplog  = fplog;
+    mda->cr     = cr;
+
+    /* file names (duplicated) and output environment */
+    mda->nfile = nfile;
+    mda->fnm   = dup_tfn(nfile, fnm);
+    mda->oenv  = oenv;
+
+    /* output and global communication settings */
+    mda->bVerbose      = bVerbose;
+    mda->bCompact      = bCompact;
+    mda->nstglobalcomm = nstglobalcomm;
+
+    /* domain decomposition settings */
+    mda->ddxyz[XX]     = ddxyz[XX];
+    mda->ddxyz[YY]     = ddxyz[YY];
+    mda->ddxyz[ZZ]     = ddxyz[ZZ];
+    mda->dd_node_order = dd_node_order;
+    mda->rdd           = rdd;
+    mda->rconstr       = rconstr;
+    mda->dddlb_opt     = dddlb_opt;
+    mda->dlb_scale     = dlb_scale;
+    mda->ddcsx         = ddcsx;
+    mda->ddcsy         = ddcsy;
+    mda->ddcsz         = ddcsz;
+
+    /* remaining run-control options */
+    mda->nbpu_opt       = nbpu_opt;
+    mda->nsteps_cmdline = nsteps_cmdline;
+    mda->nstepout       = nstepout;
+    mda->resetstep      = resetstep;
+    mda->nmultisim      = nmultisim;
+    mda->repl_ex_nst    = repl_ex_nst;
+    mda->repl_ex_nex    = repl_ex_nex;
+    mda->repl_ex_seed   = repl_ex_seed;
+    mda->pforce         = pforce;
+    mda->cpt_period     = cpt_period;
+    mda->max_hours      = max_hours;
+    mda->deviceOptions  = deviceOptions;
+    mda->Flags          = Flags;
+
+    /* Spawn the new threads, which start in mdrunner_start_fn(), while
+     * the main thread returns; thread affinity is set later.
+     */
+    if (tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
+                     mdrunner_start_fn, (void*)(mda) ) != TMPI_SUCCESS)
+    {
+        return NULL;
+    }
+
+    /* Hand back a new comm_rec that reflects the new situation */
+    return init_par_threads(cr);
+}
+
+
+/* Decide how many thread-MPI threads to start out of nthreads_tot threads.
+ *
+ * hwinfo       detected hardware; only its cpuid_info is consulted here
+ * hw_opt       hardware options; only nthreads_omp is consulted here
+ * nthreads_tot total number of threads (#tMPI * #OpenMP) that may be started
+ * ngpu         number of GPUs that will be used, 0 when running without GPUs
+ *
+ * Returns the number of thread-MPI threads:
+ *  - with GPUs: one per GPU, limited to nthreads_tot when that is positive;
+ *  - with a user-set OpenMP thread count: nthreads_tot/nthreads_omp, at least 1;
+ *  - otherwise 1 (pure OpenMP) for small thread counts, else nthreads_tot
+ *    (no OpenMP), using CPU-specific thresholds for Intel family 6.
+ */
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t  *hw_opt,
+                                        int                  nthreads_tot,
+                                        int                  ngpu)
+{
+    int nthreads_tmpi;
+
+    /* There are no separate PME nodes here, as we ensured in
+     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+     * and a conditional ensures we would not have ended up here.
+     * Note that separate PME nodes might be switched on later.
+     */
+    if (ngpu > 0)
+    {
+        nthreads_tmpi = ngpu;
+        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+        {
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+    else if (hw_opt->nthreads_omp > 0)
+    {
+        /* Here we could oversubscribe, when we do, we issue a warning later */
+        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
+    }
+    else
+    {
+        /* TODO choose nthreads_omp based on hardware topology
+           when we have a hardware topology detection library */
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT, currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy bridge,
+         * not knowing the topology, is 16 threads.
+         */
+        const int nthreads_omp_always_faster             =  4;
+        const int nthreads_omp_always_faster_Nehalem     = 12;
+        const int nthreads_omp_always_faster_SandyBridge = 16;
+        const int first_model_Nehalem                    = 0x1A;
+        const int first_model_SandyBridge                = 0x2A;
+        gmx_bool  bIntel_Family6;
+
+        bIntel_Family6 =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
+
+        /* The CPU model number must be compared with the first model of
+         * each micro-architecture generation, not with the thread-count
+         * limits; only the thread count is compared with those limits.
+         */
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            (bIntel_Family6 &&
+             ((gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+              (gmx_cpuid_model(hwinfo->cpuid_info) >= first_model_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
+        {
+            /* Use pure OpenMP parallelization */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+
+    return nthreads_tmpi;
+}
+
+
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At the point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
- static void prepare_verlet_scheme(FILE *fplog,
- gmx_hw_info_t *hwinfo,
- t_commrec *cr,
- const char *nbpu_opt,
- t_inputrec *ir,
- const gmx_mtop_t *mtop,
- matrix box,
- gmx_bool *bUseGPU)
++static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+                            gmx_hw_opt_t        *hw_opt,
+                            t_inputrec          *inputrec, gmx_mtop_t *mtop,
+                            const t_commrec     *cr,
+                            FILE                *fplog)
+{
+    /* Returns the number of thread-MPI threads to start.
+     *
+     * hwinfo   detected hardware (available hardware threads, GPU info, cpuid)
+     * hw_opt   user hardware options; a positive nthreads_tmpi is returned as is
+     * inputrec run parameters (cut-off scheme, integrator, coulomb type)
+     * mtop     global topology, used for the atom count
+     * cr, fplog are only used to print a warning
+     */
+    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+    int      min_atoms_per_mpi_thread;
+    gmx_bool bCanUseGPU;
+
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        /* Trivial, return right away */
+        return hw_opt->nthreads_tmpi;
+    }
+
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
+    /* How many total (#tMPI*#OpenMP) threads can we start? */
+    if (hw_opt->nthreads_tot > 0)
+    {
+        nthreads_tot_max = hw_opt->nthreads_tot;
+    }
+    else
+    {
+        nthreads_tot_max = nthreads_hw;
+    }
+
+    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+    if (bCanUseGPU)
+    {
+        ngpu = hwinfo->gpu_info.ncuda_dev_use;
+    }
+    else
+    {
+        ngpu = 0;
+    }
+
+    nthreads_tmpi =
+        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+
+    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+    {
+        /* Steps are divided over the nodes iso splitting the atoms */
+        min_atoms_per_mpi_thread = 0;
+    }
+    else
+    {
+        if (bCanUseGPU)
+        {
+            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+        }
+        else
+        {
+            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+        }
+    }
+
+    /* Check if an algorithm does not support parallel simulation. */
+    if (nthreads_tmpi != 1 &&
+        ( inputrec->eI == eiLBFGS ||
+          inputrec->coulombtype == eelEWALD ) )
+    {
+        nthreads_tmpi = 1;
+
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        /* NOTE(review): hw_opt->nthreads_tmpi is <= 0 here because of the
+         * early return at the top, so this check cannot trigger; it is kept
+         * as a safeguard.
+         */
+        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+        {
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+        }
+    }
+    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+    {
+        /* the thread number was chosen automatically, but there are too many
+           threads (too few atoms per thread) */
+        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
+
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+        {
+            nthreads_new = nthreads_hw/2;
+        }
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
+        {
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            /* Round down to a multiple of fac */
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
+        }
+
+        nthreads_tmpi = nthreads_new;
+
+        fprintf(stderr, "\n");
+        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
+        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+    }
+
+    return nthreads_tmpi;
+}
+#endif /* GMX_THREAD_MPI */
+
+
+/* Name of the environment variable that can be used to set nstlist */
+static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
+/* Try to increase nstlist when using a GPU with nstlist less than this */
+static const int NSTLIST_GPU_ENOUGH = 20;
+/* Increase nstlist until the non-bonded cost increases by more than this factor */
+static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
+/* Don't increase nstlist beyond a non-bonded cost increase of this factor */
+static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
+
+/* Try to increase nstlist when running on a GPU.
+ *
+ * Two modes, selected by the GMX_NSTLIST environment variable:
+ *  - not set: a hint is written to fp and the candidate values in nstl[]
+ *    are tried in order. A candidate is accepted when the resulting
+ *    pair-list radius fits in the box, fits the domain decomposition and
+ *    does not exceed rlist_max; trying stops once rlist_ok is reached,
+ *    the candidates run out, or a candidate is rejected (the previously
+ *    accepted value is then kept).
+ *  - set: the value from the environment is used as the only candidate;
+ *    if it does not fit, a warning is issued and nstlist is restored.
+ *
+ * fp    log file, may be NULL
+ * cr    communication record (used for MASTER() and domain decomposition)
+ * ir    input record; nstlist, rlist and rlistlong may be modified
+ * mtop  topology, passed on to the Verlet buffer estimate
+ * box   simulation box
+ *
+ * Calls gmx_fatal() when ir->verletbuf_drift == 0 and returns without
+ * changes when ir->verletbuf_drift < 0.
+ */
+static void increase_nstlist(FILE *fp, t_commrec *cr,
+                             t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
+{
+    char                  *env;
+    int                    nstlist_orig, nstlist_prev;
+    verletbuf_list_setup_t ls;
+    real                   rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev;
+    int                    i;
+    t_state                state_tmp;
+    gmx_bool               bBox, bDD, bCont;
+    const char            *nstl_fmt = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+    const char            *vbd_err  = "Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
+    const char            *box_err  = "Can not increase nstlist for GPU run because the box is too small";
+    const char            *dd_err   = "Can not increase nstlist for GPU run because of domain decomposition limitations";
+    char                   buf[STRLEN];
+
+    /* Alternative nstlist values to try, in increasing order */
+    const int nstl[] = { 20, 25, 40, 50 };
+#define NNSTL sizeof(nstl)/sizeof(nstl[0])
+
+    env = getenv(NSTLIST_ENVVAR);
+    if (env == NULL)
+    {
+        if (fp != NULL)
+        {
+            fprintf(fp, nstl_fmt, ir->nstlist);
+        }
+    }
+
+    if (ir->verletbuf_drift == 0)
+    {
+        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+    }
+
+    if (ir->verletbuf_drift < 0)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", vbd_err);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", vbd_err);
+        }
+
+        return;
+    }
+
+    nstlist_orig = ir->nstlist;
+    if (env != NULL)
+    {
+        sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n", buf);
+        }
+        /* The conversion result is not checked: a value that does not
+         * parse as an integer leaves ir->nstlist unchanged.
+         */
+        sscanf(env, "%d", &ir->nstlist);
+    }
+
+    verletbuf_get_list_setup(TRUE, &ls);
+
+    /* Allow rlist to make the list double the size of the cut-off sphere */
+    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
+    rlist_ok  = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
+    rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
+    if (debug)
+    {
+        fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
+                rlist_inc, rlist_max);
+    }
+
+    i            = 0;
+    nstlist_prev = nstlist_orig;
+    rlist_prev   = ir->rlist;
+    do
+    {
+        if (env == NULL)
+        {
+            ir->nstlist = nstl[i];
+        }
+
+        /* Set the pair-list buffer size in ir */
+        calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
+                                NULL, &rlist_new);
+
+        /* Does rlist fit in the box? */
+        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bDD  = TRUE;
+        if (bBox && DOMAINDECOMP(cr))
+        {
+            /* Check if rlist fits in the domain decomposition */
+            if (inputrec2nboundeddim(ir) < DIM)
+            {
+                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+            }
+            copy_mat(box, state_tmp.box);
+            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
+        }
+
+        /* With a user-supplied nstlist there is a single candidate only */
+        bCont = FALSE;
+
+        if (env == NULL)
+        {
+            if (bBox && bDD && rlist_new <= rlist_max)
+            {
+                /* Increase nstlist */
+                nstlist_prev = ir->nstlist;
+                rlist_prev   = rlist_new;
+                bCont        = (i+1 < NNSTL && rlist_new < rlist_ok);
+            }
+            else
+            {
+                /* Stick with the previous nstlist */
+                ir->nstlist = nstlist_prev;
+                rlist_new   = rlist_prev;
+                bBox        = TRUE;
+                bDD         = TRUE;
+            }
+        }
+
+        i++;
+    }
+    while (bCont);
+
+    if (!bBox || !bDD)
+    {
+        /* Only reachable with nstlist taken from the environment, as the
+         * automatic path resets bBox and bDD when it falls back.
+         */
+        gmx_warning(!bBox ? box_err : dd_err);
+        if (fp != NULL)
+        {
+            /* Report the same reason in the log file as in the warning */
+            fprintf(fp, "\n%s\n", !bBox ? box_err : dd_err);
+        }
+        ir->nstlist = nstlist_orig;
+    }
+    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+    {
+        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
+                nstlist_orig, ir->nstlist,
+                ir->rlist, rlist_new);
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "%s\n\n", buf);
+        }
+        if (fp != NULL)
+        {
+            fprintf(fp, "%s\n\n", buf);
+        }
+        ir->rlist     = rlist_new;
+        ir->rlistlong = rlist_new;
+    }
+}
+
- /* Detect hardware, gather information. With tMPI only thread 0 does it
- * and after threads are started broadcasts hwinfo around. */
- snew(hwinfo, 1);
- gmx_detect_hardware(fplog, hwinfo, cr,
- bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
++static void prepare_verlet_scheme(FILE *fplog,
++ const gmx_hw_info_t *hwinfo,
++ t_commrec *cr,
++ const char *nbpu_opt,
++ t_inputrec *ir,
++ const gmx_mtop_t *mtop,
++ matrix box,
++ gmx_bool *bUseGPU)
+{
+ /* Here we only check for GPU usage on the MPI master process,
+ * as here we don't know how many GPUs we will use yet.
+ * We check for a GPU on all processes later.
+ */
+ *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
+
+ if (ir->verletbuf_drift > 0)
+ {
+ /* Update the Verlet buffer size for the current run setup */
+ verletbuf_list_setup_t ls;
+ real rlist_new;
+
+ /* Here we assume CPU acceleration is on. But as currently
+ * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+ * and 4x2 gives a larger buffer than 4x4, this is ok.
+ */
+ verletbuf_get_list_setup(*bUseGPU, &ls);
+
+ calc_verlet_buffer_size(mtop, det(box), ir,
+ ir->verletbuf_drift, &ls,
+ NULL, &rlist_new);
+ if (rlist_new != ir->rlist)
+ {
+ if (fplog != NULL)
+ {
+ fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+ ir->rlist, rlist_new,
+ ls.cluster_size_i, ls.cluster_size_j);
+ }
+ ir->rlist = rlist_new;
+ ir->rlistlong = rlist_new;
+ }
+ }
+
+ /* With GPU or emulation we should check nstlist for performance */
+ if ((EI_DYNAMICS(ir->eI) &&
+ *bUseGPU &&
+ ir->nstlist < NSTLIST_GPU_ENOUGH) ||
+ getenv(NSTLIST_ENVVAR) != NULL)
+ {
+ /* Choose a better nstlist */
+ increase_nstlist(fplog, cr, ir, mtop, box);
+ }
+}
+
+/* Convert an input record set up for the group cut-off scheme so that it
+ * can be run with the Verlet cut-off scheme.
+ *
+ * The conversion is announced through md_print_warn(). Setups that are
+ * rejected with gmx_fatal(): unequal VdW and Coulomb cut-offs, user
+ * potentials, fewer than 3 bounded dimensions, free-energy calculations
+ * and implicit solvent. Switched/shifted interactions are replaced and
+ * the charge groups are removed from mtop at the end.
+ *
+ * box_vol is the box volume handed to the Verlet buffer estimate.
+ */
+static void convert_to_verlet_scheme(FILE *fplog,
+                                     t_inputrec *ir,
+                                     gmx_mtop_t *mtop, real box_vol)
+{
+    gmx_bool bVdwSwitched, bCoulombSwitched;
+
+    md_print_warn(NULL, fplog, "%s\n",
+                  "Converting input file with group cut-off scheme to the Verlet cut-off scheme");
+
+    ir->cutoff_scheme   = ecutsVERLET;
+    ir->verletbuf_drift = 0.005;
+
+    if (ir->rcoulomb != ir->rvdw)
+    {
+        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+    }
+
+    /* Record the modifier state before any interaction type is replaced */
+    bVdwSwitched     = EVDW_SWITCHED(ir->vdwtype);
+    bCoulombSwitched = EEL_SWITCHED(ir->coulombtype);
+
+    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+    {
+        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
+    }
+    else if (bVdwSwitched || bCoulombSwitched)
+    {
+        md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
+
+        if (bVdwSwitched)
+        {
+            ir->vdwtype = evdwCUT;
+        }
+
+        if (bCoulombSwitched)
+        {
+            if (!EEL_FULL(ir->coulombtype))
+            {
+                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
+                ir->coulombtype = eelRF;
+                ir->epsilon_rf  = 0.0;
+            }
+            else
+            {
+                /* Among the full electrostatics methods only PME can be switched */
+                ir->coulombtype = eelPME;
+            }
+        }
+
+        /* Use a small target energy drift. This is meant for testing
+         * only; for production runs the user should consider this and
+         * set the mdp options explicitly.
+         */
+        ir->verletbuf_drift = 1e-4;
+    }
+
+    if (inputrec2nboundeddim(ir) != 3)
+    {
+        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+    }
+
+    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+    {
+        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+    }
+
+    if (!EI_DYNAMICS(ir->eI) || (EI_MD(ir->eI) && ir->etc == etcNO))
+    {
+        /* No buffer estimate here: mark the drift as unused and take a
+         * pair-list radius 5% beyond the longest cut-off.
+         */
+        ir->verletbuf_drift = -1;
+        ir->rlist           = 1.05*max(ir->rvdw, ir->rcoulomb);
+    }
+    else
+    {
+        verletbuf_list_setup_t list_setup;
+
+        verletbuf_get_list_setup(FALSE, &list_setup);
+        calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &list_setup,
+                                NULL, &ir->rlist);
+    }
+
+    gmx_mtop_remove_chargegroups(mtop);
+}
+
+/* Check the thread-count options in hw_opt for consistency and fill in
+ * the values that follow from the others.
+ *
+ * hw_opt         hardware/thread options from the command line; the
+ *                fields nthreads_tmpi, nthreads_omp and nthreads_omp_pme
+ *                may be updated
+ * cutoff_scheme  cut-off scheme of the run; with ecutsGROUP the OpenMP
+ *                thread count is forced to 1
+ * bIsSimMaster   passed on to gmx_omp_nthreads_read_env()
+ *
+ * Inconsistent or unsupported combinations end in gmx_fatal().
+ */
+static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
+                                    int           cutoff_scheme,
+                                    gmx_bool      bIsSimMaster)
+{
+    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+
+#ifndef GMX_THREAD_MPI
+    if (hw_opt->nthreads_tot > 0)
+    {
+        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+    }
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+    }
+#endif
+
+    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+    {
+        /* We have the same number of OpenMP threads for PP and PME processes,
+         * thus we can perform several consistency checks.
+         */
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+        }
+
+        if (hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+        }
+
+        /* Derive the OpenMP thread count from the total and thread-MPI counts */
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp <= 0)
+        {
+            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+        }
+    }
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+    }
+#endif
+
+    if (cutoff_scheme == ecutsGROUP)
+    {
+        /* We only have OpenMP support for PME only nodes */
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+                      ecutscheme_names[cutoff_scheme],
+                      ecutscheme_names[ecutsVERLET]);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+    {
+        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+    }
+
+    if (hw_opt->nthreads_tot == 1)
+    {
+        hw_opt->nthreads_tmpi = 1;
+
+        if (hw_opt->nthreads_omp > 1)
+        {
+            /* Report the OpenMP thread count that was requested; the
+             * thread-MPI count has just been set to 1 above.
+             */
+            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    /* Without a separate PME setting, PME uses the same OpenMP thread count */
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+                hw_opt->nthreads_tot,
+                hw_opt->nthreads_tmpi,
+                hw_opt->nthreads_omp,
+                hw_opt->nthreads_omp_pme,
+                hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
+
+    }
+}
+
+
+/* Replace ir->nsteps by the step count given on the command line.
+ *
+ * A value larger than -2 (-2 being the "not set" default) is taken over
+ * and reported through md_print_warn(); for dynamical integrators the
+ * report also gives the corresponding simulated time in ps.
+ */
+static void override_nsteps_cmdline(FILE            *fplog,
+                                    gmx_large_int_t  nsteps_cmdline,
+                                    t_inputrec      *ir,
+                                    const t_commrec *cr)
+{
+    char step_str[STEPSTRSIZE];
+    char message[STRLEN];
+
+    assert(ir);
+    assert(cr);
+
+    if (nsteps_cmdline <= -2)
+    {
+        /* Nothing was requested: keep the value from the input record */
+        return;
+    }
+
+    ir->nsteps = nsteps_cmdline;
+
+    if (!EI_DYNAMICS(ir->eI))
+    {
+        sprintf(message, "Overriding nsteps with value passed on the command line: %s steps",
+                gmx_step_str(nsteps_cmdline, step_str));
+    }
+    else
+    {
+        sprintf(message, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
+                gmx_step_str(nsteps_cmdline, step_str),
+                nsteps_cmdline*ir->delta_t);
+    }
+
+    md_print_warn(cr, fplog, "%s\n", message);
+}
+
+/* Data structure filled in by the SIMMASTER in mdrunner() after it has read
+ * the tpx file. In builds without thread-MPI it is broadcast to all nodes
+ * with gmx_bcast_sim(), so that they know these settings before the full
+ * input record has been distributed.
+ */
+typedef struct {
+ int cutoff_scheme; /* The cutoff scheme, copied from t_inputrec */
+ gmx_bool bUseGPU; /* Use GPU or GPU emulation */
+} master_inf_t;
+
+/* Driver for a complete mdrun simulation.
+ *
+ * Outline of what the function does, in order:
+ *  - detects the hardware on this process (gmx_detect_hardware);
+ *  - on the simulation master: reads the tpx file, optionally converts a
+ *    group-scheme input to the Verlet scheme and prepares the Verlet setup;
+ *  - checks the thread options and, with thread-MPI, starts the threads
+ *    (after which cr refers to the new communication record);
+ *  - broadcasts the input to the other nodes/threads, handles checkpoint
+ *    reading and log-file opening, and sets up domain decomposition when
+ *    it applies;
+ *  - initializes the force record, mdatoms, vsites, PME and constraints;
+ *  - calls the integrator selected by inputrec->eI on PP nodes, or
+ *    gmx_pmeonly() on PME-only nodes;
+ *  - finishes the run, releases GPU and hardware-info resources and, with
+ *    thread-MPI, lets the master wait for the other threads.
+ *
+ * Returns the value of gmx_get_stop_condition() cast to int.
+ */
+int mdrunner(gmx_hw_opt_t *hw_opt,
+ FILE *fplog, t_commrec *cr, int nfile,
+ const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ gmx_bool bCompact, int nstglobalcomm,
+ ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+ const char *dddlb_opt, real dlb_scale,
+ const char *ddcsx, const char *ddcsy, const char *ddcsz,
+ const char *nbpu_opt,
+ gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
+ int nmultisim, int repl_ex_nst, int repl_ex_nex,
+ int repl_ex_seed, real pforce, real cpt_period, real max_hours,
+ const char *deviceOptions, unsigned long Flags)
+{
+ gmx_bool bForceUseGPU, bTryUseGPU;
+ double nodetime = 0, realtime;
+ t_inputrec *inputrec;
+ t_state *state = NULL;
+ matrix box;
+ gmx_ddbox_t ddbox = {0};
+ int npme_major, npme_minor;
+ real tmpr1, tmpr2;
+ t_nrnb *nrnb;
+ gmx_mtop_t *mtop = NULL;
+ t_mdatoms *mdatoms = NULL;
+ t_forcerec *fr = NULL;
+ t_fcdata *fcd = NULL;
+ real ewaldcoeff = 0;
+ gmx_pme_t *pmedata = NULL;
+ gmx_vsite_t *vsite = NULL;
+ gmx_constr_t constr;
+ int i, m, nChargePerturbed = -1, status, nalloc;
+ char *gro;
+ gmx_wallcycle_t wcycle;
+ gmx_bool bReadRNG, bReadEkin;
+ int list;
+ gmx_runtime_t runtime;
+ int rc;
+ gmx_large_int_t reset_counters;
+ gmx_edsam_t ed = NULL;
+ t_commrec *cr_old = cr;
+ int nthreads_pme = 1;
+ int nthreads_pp = 1;
+ gmx_membed_t membed = NULL;
+ gmx_hw_info_t *hwinfo = NULL;
+ master_inf_t minf = {-1, FALSE};
+
+ /* CAUTION: threads may be started later on in this function, so
+ cr doesn't reflect the final parallel state right now */
+ snew(inputrec, 1);
+ snew(mtop, 1);
+
+ /* When appending, drop the log handle here; the log file is opened
+ * again with gmx_log_open() further down, after checkpoint handling. */
+ if (Flags & MD_APPENDFILES)
+ {
+ fplog = NULL;
+ }
+
+ /* An nbpu_opt starting with "gpu" forces GPU use; one starting with
+ * "auto" (or a forced GPU) makes the hardware detection try a GPU. */
+ bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+ bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+
++ /* Detect hardware, gather information. This is an operation that is
++ * global for this process (MPI rank). */
++ hwinfo = gmx_detect_hardware(fplog, cr,
++ bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
++
++
+ snew(state, 1);
+ if (SIMMASTER(cr))
+ {
+ /* Read (nearly) all data required for the simulation */
+ read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
+
+ if (inputrec->cutoff_scheme != ecutsVERLET &&
+ ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+ {
+ convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
+ }
+
- #if defined GMX_THREAD_MPI
- /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
- * to the other threads -- slightly uncool, but works fine, just need to
- * make sure that the data doesn't get freed twice. */
- if (cr->nnodes > 1)
- {
- if (!SIMMASTER(cr))
- {
- snew(hwinfo, 1);
- }
- gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
- }
- #else
- if (PAR(cr) && !SIMMASTER(cr))
- {
- /* now we have inputrec on all nodes, can run the detection */
- /* TODO: perhaps it's better to propagate within a node instead? */
- snew(hwinfo, 1);
- gmx_detect_hardware(fplog, hwinfo, cr,
- bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
- }
-
- /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
- gmx_check_thread_affinity_set(fplog, cr,
- hw_opt, hwinfo->nthreads_hw_avail, FALSE);
- #endif
-
+
+ minf.cutoff_scheme = inputrec->cutoff_scheme;
+ minf.bUseGPU = FALSE;
+
+ if (inputrec->cutoff_scheme == ecutsVERLET)
+ {
+ prepare_verlet_scheme(fplog, hwinfo, cr, nbpu_opt,
+ inputrec, mtop, state->box,
+ &minf.bUseGPU);
+ }
+ else if (hwinfo->bCanUseGPU)
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+ " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+ " (for quick performance testing you can use the -testverlet option)\n");
+
+ if (bForceUseGPU)
+ {
+ gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
+ }
+ }
+ }
+#ifndef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_bcast_sim(sizeof(minf), &minf, cr);
+ }
+#endif
+ if (minf.bUseGPU && cr->npmenodes == -1)
+ {
+ /* Don't automatically use PME-only nodes with GPUs */
+ cr->npmenodes = 0;
+ }
+
+ /* Check for externally set OpenMP affinity and turn off internal
+ * pinning if any is found. We need to do this check early to tell
+ * thread-MPI whether it should do pinning when spawning threads.
+ * TODO: the above no longer holds, we should move these checks down
+ */
+ gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
+
+#ifdef GMX_THREAD_MPI
+ /* With thread-MPI inputrec is only set here on the master thread */
+ if (SIMMASTER(cr))
+#endif
+ {
+ check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr));
+
+#ifdef GMX_THREAD_MPI
+ /* Early check for externally set process affinity. Can't do over all
+ * MPI processes because hwinfo is not available everywhere, but with
+ * thread-MPI it's needed as pinning might get turned off which needs
+ * to be known before starting thread-MPI. */
+ gmx_check_thread_affinity_set(fplog,
+ NULL,
+ hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+#endif
+
+#ifdef GMX_THREAD_MPI
+ if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+ {
+ gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
+ }
+#endif
+
+ if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+ cr->npmenodes <= 0)
+ {
+ gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
+ }
+ }
+
+#ifdef GMX_THREAD_MPI
+ if (SIMMASTER(cr))
+ {
+ /* NOW the threads will be started: */
+ hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+ hw_opt,
+ inputrec, mtop,
+ cr, fplog);
+ if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+ {
+ hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ }
+
+ if (hw_opt->nthreads_tmpi > 1)
+ {
+ /* now start the threads. */
+ cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
+ oenv, bVerbose, bCompact, nstglobalcomm,
+ ddxyz, dd_node_order, rdd, rconstr,
+ dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+ nbpu_opt,
+ nsteps_cmdline, nstepout, resetstep, nmultisim,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+ cpt_period, max_hours, deviceOptions,
+ Flags);
+ /* the main thread continues here with a new cr. We don't deallocate
+ the old cr because other threads may still be reading it. */
+ if (cr == NULL)
+ {
+ gmx_comm("Failed to spawn threads");
+ }
+ }
+ }
+#endif
+ /* END OF CAUTION: cr is now reliable */
+
+ /* g_membed initialisation *
+ * Because we change the mtop, init_membed is called before the init_parallel *
+ * (in case we ever want to make it run in parallel) */
+ if (opt2bSet("-membed", nfile, fnm))
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "Initializing membed");
+ }
+ membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
+ }
+
+ if (PAR(cr))
+ {
+ /* now broadcast everything to the non-master nodes/threads: */
+ init_parallel(fplog, cr, inputrec, mtop);
+
+ /* This check needs to happen after get_nthreads_mpi() */
+ if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
+ {
+ gmx_fatal_collective(FARGS, cr, NULL,
+ "The Verlet cut-off scheme is not supported with particle decomposition.\n"
+ "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
+ }
+ }
+ if (fplog != NULL)
+ {
+ pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+ }
+
- gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
+ /* now make sure the state is initialized and propagated */
+ set_state_entries(state, inputrec, cr->nnodes);
+
+ /* A parallel command line option consistency check that we can
+ only do after any threads have started. */
+ if (!PAR(cr) &&
+ (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+ {
+ gmx_fatal(FARGS,
+ "The -dd or -npme option request a parallel simulation, "
+#ifndef GMX_MPI
+ "but %s was compiled without threads or MPI enabled"
+#else
+#ifdef GMX_THREAD_MPI
+ "but the number of threads (option -nt) is 1"
+#else
+ "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
+#endif
+#endif
+ , ShortProgram()
+ );
+ }
+
+ if ((Flags & MD_RERUN) &&
+ (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ {
+ gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+ }
+
+ if (can_use_allvsall(inputrec, mtop, TRUE, cr, fplog) && PAR(cr))
+ {
+ /* Simple neighbour searching and (also?) all-vs-all loops
+ * do not work with domain decomposition. */
+ Flags |= MD_PARTDEC;
+ }
+
+ if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
+ {
+ if (cr->npmenodes > 0)
+ {
+ if (!EEL_PME(inputrec->coulombtype))
+ {
+ gmx_fatal_collective(FARGS, cr, NULL,
+ "PME nodes are requested, but the system does not use PME electrostatics");
+ }
+ if (Flags & MD_PARTDEC)
+ {
+ gmx_fatal_collective(FARGS, cr, NULL,
+ "PME nodes are requested, but particle decomposition does not support separate PME nodes");
+ }
+ }
+
+ cr->npmenodes = 0;
+ }
+
+#ifdef GMX_FAHCORE
+ fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
+#endif
+
+ /* NMR restraints must be initialized before load_checkpoint,
+ * since with time averaging the history is added to t_state.
+ * For proper consistency check we therefore need to extend
+ * t_state here.
+ * So the PME-only nodes (if present) will also initialize
+ * the distance restraints.
+ */
+ snew(fcd, 1);
+
+ /* This needs to be called before read_checkpoint to extend the state */
+ init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0);
+
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ {
+ if (PAR(cr) && !(Flags & MD_PARTDEC))
+ {
+ gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
+ }
+ /* Orientation restraints */
+ if (MASTER(cr))
+ {
+ init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
+ state);
+ }
+ }
+
+ if (DEFORM(*inputrec))
+ {
+ /* Store the deform reference box before reading the checkpoint */
+ if (SIMMASTER(cr))
+ {
+ copy_mat(state->box, box);
+ }
+ if (PAR(cr))
+ {
+ gmx_bcast(sizeof(box), box, cr);
+ }
+ /* Because we do not have the update struct available yet
+ * in which the reference values should be stored,
+ * we store them temporarily in static variables.
+ * This should be thread safe, since they are only written once
+ * and with identical values.
+ */
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+ deform_init_init_step_tpx = inputrec->init_step;
+ copy_mat(box, deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+ }
+
+ if (opt2bSet("-cpi", nfile, fnm))
+ {
+ /* Check if checkpoint file exists before doing continuation.
+ * This way we can use identical input options for the first and subsequent runs...
+ */
+ if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
+ {
+ load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+ cr, Flags & MD_PARTDEC, ddxyz,
+ inputrec, state, &bReadRNG, &bReadEkin,
+ (Flags & MD_APPENDFILES),
+ (Flags & MD_APPENDFILESSET));
+
+ if (bReadRNG)
+ {
+ Flags |= MD_READ_RNG;
+ }
+ if (bReadEkin)
+ {
+ Flags |= MD_READ_EKIN;
+ }
+ }
+ }
+
+ if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+#ifdef GMX_THREAD_MPI
+ /* With thread MPI only the master node/thread exists in mdrun.c,
+ * therefore non-master nodes need to open the "seppot" log file here.
+ */
+ || (!MASTER(cr) && (Flags & MD_SEPPOT))
+#endif
+ )
+ {
+ gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
+ Flags, &fplog);
+ }
+
+ /* override nsteps with value from cmdline */
+ override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+
+ if (SIMMASTER(cr))
+ {
+ copy_mat(state->box, box);
+ }
+
+ if (PAR(cr))
+ {
+ gmx_bcast(sizeof(box), box, cr);
+ }
+
+ /* Essential dynamics */
+ if (opt2bSet("-ei", nfile, fnm))
+ {
+ /* Open input and output files, allocate space for ED data structure */
+ ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
+ }
+
+ if (PAR(cr) && !((Flags & MD_PARTDEC) ||
+ EI_TPI(inputrec->eI) ||
+ inputrec->eI == eiNM))
+ {
+ cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+ dddlb_opt, dlb_scale,
+ ddcsx, ddcsy, ddcsz,
+ mtop, inputrec,
+ box, state->x,
+ &ddbox, &npme_major, &npme_minor);
+
+ make_dd_communicators(fplog, cr, dd_node_order);
+
+ /* Set overallocation to avoid frequent reallocation of arrays */
+ set_over_alloc_dd(TRUE);
+ }
+ else
+ {
+ /* PME, if used, is done on all nodes with 1D decomposition */
+ cr->npmenodes = 0;
+ cr->duty = (DUTY_PP | DUTY_PME);
+ npme_major = 1;
+ npme_minor = 1;
+ if (!EI_TPI(inputrec->eI))
+ {
+ npme_major = cr->nnodes;
+ }
+
+ if (inputrec->ePBC == epbcSCREW)
+ {
+ gmx_fatal(FARGS,
+ "pbc=%s is only implemented with domain decomposition",
+ epbc_names[inputrec->ePBC]);
+ }
+ }
+
+ if (PAR(cr))
+ {
+ /* After possible communicator splitting in make_dd_communicators.
+ * we can set up the intra/inter node communication.
+ */
+ gmx_setup_nodecomm(fplog, cr);
+ }
+
+ /* Initialize per-physical-node MPI process/thread ID and counters. */
+ gmx_init_intranode_counters(cr);
+
+#ifdef GMX_MPI
+ md_print_info(cr, fplog, "Using %d MPI %s\n",
+ cr->nnodes,
+#ifdef GMX_THREAD_MPI
+ cr->nnodes == 1 ? "thread" : "threads"
+#else
+ cr->nnodes == 1 ? "process" : "processes"
+#endif
+ );
+ fflush(stderr);
+#endif
+
+ gmx_omp_nthreads_init(fplog, cr,
+ hwinfo->nthreads_hw_avail,
+ hw_opt->nthreads_omp,
+ hw_opt->nthreads_omp_pme,
+ (cr->duty & DUTY_PP) == 0,
+ inputrec->cutoff_scheme == ecutsVERLET);
+
- #ifdef GMX_THREAD_MPI
- if (PAR(cr) && SIMMASTER(cr))
- #endif
- {
- gmx_hardware_info_free(hwinfo);
- }
++ /* check consistency and decide on the number of gpus to use. */
++ gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
++ minf.bUseGPU);
+
+ /* getting number of PP/PME threads
+ PME: env variable should be read only on one node to make sure it is
+ identical everywhere;
+ */
+ /* TODO nthreads_pp is only used for pinning threads.
+ * This is a temporary solution until we have a hw topology library.
+ */
+ nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
+ nthreads_pme = gmx_omp_nthreads_get(emntPME);
+
+ wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+
+ if (PAR(cr))
+ {
+ /* Master synchronizes its value of reset_counters with all nodes
+ * including PME only nodes */
+ reset_counters = wcycle_get_reset_counters(wcycle);
+ gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+ wcycle_set_reset_counters(wcycle, reset_counters);
+ }
+
+ snew(nrnb, 1);
+ if (cr->duty & DUTY_PP)
+ {
+ /* For domain decomposition we allocate dynamically
+ * in dd_partition_system.
+ */
+ if (DOMAINDECOMP(cr))
+ {
+ bcast_state_setup(cr, state);
+ }
+ else
+ {
+ if (PAR(cr))
+ {
+ bcast_state(cr, state, TRUE);
+ }
+ }
+
+ /* Initiate forcerecord */
+ fr = mk_forcerec();
+ fr->hwinfo = hwinfo;
+ init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box, FALSE,
+ opt2fn("-table", nfile, fnm),
+ opt2fn("-tabletf", nfile, fnm),
+ opt2fn("-tablep", nfile, fnm),
+ opt2fn("-tableb", nfile, fnm),
+ nbpu_opt,
+ FALSE, pforce);
+
+ /* version for PCA_NOT_READ_NODE (see md.c) */
+ /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+ "nofile","nofile","nofile","nofile",FALSE,pforce);
+ */
+ fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+
+ /* Initialize QM-MM */
+ if (fr->bQMMM)
+ {
+ init_QMMMrec(cr, box, mtop, inputrec, fr);
+ }
+
+ /* Initialize the mdatoms structure.
+ * mdatoms is not filled with atom data,
+ * as this can not be done now with domain decomposition.
+ */
+ mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
+
+ if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET)
+ {
+ gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now.");
+ }
+
+ /* Initialize the virtual site communication */
+ vsite = init_vsite(mtop, cr, FALSE);
+
+ calc_shifts(box, fr->shift_vec);
+
+ /* With periodic molecules the charge groups should be whole at start up
+ * and the virtual sites should not be far from their proper positions.
+ */
+ if (!inputrec->bContinuation && MASTER(cr) &&
+ !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ {
+ /* Make molecules whole at start of run */
+ if (fr->ePBC != epbcNONE)
+ {
+ do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
+ }
+ if (vsite)
+ {
+ /* Correct initial vsite positions are required
+ * for the initial distribution in the domain decomposition
+ * and for the initial shell prediction.
+ */
+ construct_vsites_mtop(fplog, vsite, mtop, state->x);
+ }
+ }
+
+ if (EEL_PME(fr->eeltype))
+ {
+ ewaldcoeff = fr->ewaldcoeff;
+ pmedata = &fr->pmedata;
+ }
+ else
+ {
+ pmedata = NULL;
+ }
+ }
+ else
+ {
+ /* This is a PME only node */
+
+ /* We don't need the state */
+ done_state(state);
+
+ ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
+ snew(pmedata, 1);
+ }
+
+ if (hw_opt->thread_affinity != threadaffOFF)
+ {
+ /* Before setting affinity, check whether the affinity has changed
+ * - which indicates that probably the OpenMP library has changed it
+ * since we first checked).
+ */
+ gmx_check_thread_affinity_set(fplog, cr,
+ hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+ /* Set the CPU affinity */
+ gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
+ }
+
+ /* Initiate PME if necessary,
+ * either on all nodes or on dedicated PME nodes only. */
+ if (EEL_PME(inputrec->coulombtype))
+ {
+ if (mdatoms)
+ {
+ nChargePerturbed = mdatoms->nChargePerturbed;
+ }
+ if (cr->npmenodes > 0)
+ {
+ /* The PME only nodes need to know nChargePerturbed */
+ gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+ }
+
+ if (cr->duty & DUTY_PME)
+ {
+ status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
+ mtop ? mtop->natoms : 0, nChargePerturbed,
+ (Flags & MD_REPRODUCIBLE), nthreads_pme);
+ if (status != 0)
+ {
+ gmx_fatal(FARGS, "Error %d initializing PME", status);
+ }
+ }
+ }
+
+
+ if (integrator[inputrec->eI].func == do_md)
+ {
+ /* Turn on signal handling on all nodes */
+ /*
+ * (A user signal from the PME nodes (if any)
+ * is communicated to the PP nodes.
+ */
+ signal_handler_install();
+ }
+
+ if (cr->duty & DUTY_PP)
+ {
+ if (inputrec->ePull != epullNO)
+ {
+ /* Initialize pull code */
+ init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
+ EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
+ }
+
+ if (inputrec->bRot)
+ {
+ /* Initialize enforced rotation code */
+ init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+ bVerbose, Flags);
+ }
+
+ constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
+
+ if (DOMAINDECOMP(cr))
+ {
+ dd_init_bondeds(fplog, cr->dd, mtop, vsite, constr, inputrec,
+ Flags & MD_DDBONDCHECK, fr->cginfo_mb);
+
+ set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, fr, &ddbox);
+
+ setup_dd_grid(fplog, cr->dd);
+ }
+
+ /* Now do whatever the user wants us to do (how flexible...) */
+ integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
+ oenv, bVerbose, bCompact,
+ nstglobalcomm,
+ vsite, constr,
+ nstepout, inputrec, mtop,
+ fcd, state,
+ mdatoms, nrnb, wcycle, ed, fr,
+ repl_ex_nst, repl_ex_nex, repl_ex_seed,
+ membed,
+ cpt_period, max_hours,
+ deviceOptions,
+ Flags,
+ &runtime);
+
+ if (inputrec->ePull != epullNO)
+ {
+ finish_pull(fplog, inputrec->pull);
+ }
+
+ if (inputrec->bRot)
+ {
+ finish_rot(inputrec->rot);
+ }
+
+ }
+ else
+ {
+ /* do PME only */
+ gmx_pmeonly(*pmedata, cr, nrnb, wcycle, ewaldcoeff, FALSE, inputrec);
+ }
+
+ if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
+ {
+ /* Some timing stats */
+ if (SIMMASTER(cr))
+ {
+ if (runtime.proc == 0)
+ {
+ runtime.proc = runtime.real;
+ }
+ }
+ else
+ {
+ runtime.real = 0;
+ }
+ }
+
+ wallcycle_stop(wcycle, ewcRUN);
+
+ /* Finish up, write some stuff
+ * if rerunMD, don't write last frame again
+ */
+ finish_run(fplog, cr, ftp2fn(efSTO, nfile, fnm),
+ inputrec, nrnb, wcycle, &runtime,
+ fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+ nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+ nthreads_pp,
+ EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+
+ if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
+ {
+ char gpu_err_str[STRLEN];
+
+ /* free GPU memory and uninitialize GPU (by destroying the context) */
+ nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+ if (!free_gpu(gpu_err_str))
+ {
+ gmx_warning("On node %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ }
+ }
+
+ if (opt2bSet("-membed", nfile, fnm))
+ {
+ sfree(membed);
+ }
+
++ gmx_hardware_info_free(hwinfo);
+
+ /* Does what it says */
+ print_date_and_time(fplog, cr->nodeid, "Finished mdrun", &runtime);
+
+ /* Close logfile already here if we were appending to it */
+ if (MASTER(cr) && (Flags & MD_APPENDFILES))
+ {
+ gmx_log_close(fplog);
+ }
+
+ /* The return value is the current stop condition, cast to int */
+ rc = (int)gmx_get_stop_condition();
+
+#ifdef GMX_THREAD_MPI
+ /* we need to join all threads. The sub-threads join when they
+ exit this function, but the master thread needs to be told to
+ wait for that. */
+ if (PAR(cr) && MASTER(cr))
+ {
+ tMPI_Finalize();
+ }
+#endif
+
+ return rc;
+}