how-to/visualize.rst
install-guide/index.rst
release-notes/index.rst
+ release-notes/2020/major/highlights.rst
+ release-notes/2020/major/features.rst
+ release-notes/2020/major/performance.rst
+ release-notes/2020/major/tools.rst
+ release-notes/2020/major/bugs-fixed.rst
+ release-notes/2020/major/removed-functionality.rst
+ release-notes/2020/major/deprecated-functionality.rst
+ release-notes/2020/major/portability.rst
+ release-notes/2020/major/miscellaneous.rst
+ release-notes/2019/2019.4.rst
+ release-notes/2019/2019.3.rst
release-notes/2019/2019.2.rst
release-notes/2019/2019.1.rst
release-notes/2019/major/highlights.rst
# Support for --enable-avx2 was only added in FFTW 3.3.5, but
# configuring with it is at worst a warning, even on an earlier
# version.
+ # On platforms capable of AVX512 where we are building with AVX2,
+ # enabling AVX512 risks clock-throttling the entire mdrun if
+ # fftw happens to pick up an AVX512 kernel (which is not unlikely
+ # as fftw tuning is known to produce highly varying results).
+ set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2)
+elseif(${GMX_SIMD_ACTIVE} MATCHES "^(AVX_512)")
# MSVC, GCC < 4.9, Clang < 3.9 do not support AVX-512, so
- # we should not enable it.
+ # we should not enable it there. FFTW does not support clang with
+ # AVX-512, so we should not enable that either.
-if(MSVC OR (CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9.0) OR
- (CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 3.9.0) OR
- (CMAKE_C_COMPILER_ID MATCHES "Clang" AND ${GMX_SIMD_ACTIVE} MATCHES "^(AVX_512)"))
- set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2)
-else()
- set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2;--enable-avx512)
-endif()
+ if(MSVC OR (CMAKE_COMPILER_IS_GNUCC AND CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9.0) OR
- (CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 3.9.0))
++ (CMAKE_C_COMPILER_ID MATCHES "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 3.9.0) OR
++ (CMAKE_C_COMPILER_ID MATCHES "Clang" AND ${GMX_SIMD_ACTIVE} MATCHES "^(AVX_512)"))
+ set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2)
+ else()
+ set(_fftw_simd_support_level --enable-sse2;--enable-avx;--enable-avx2;--enable-avx512)
+ endif()
elseif(${GMX_SIMD_ACTIVE} MATCHES "^(VSX)")
set(_fftw_simd_support_level --enable-vsx)
endif()
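
A hypothetical illustration of how the selected support level is consumed (the real wiring lives elsewhere in the build system that compiles GROMACS' own FFTW; the target name below is an assumption):

# E.g. on a gcc build where GMX_SIMD_ACTIVE matches AVX2, the logic above yields
#   _fftw_simd_support_level = --enable-sse2;--enable-avx;--enable-avx2
# and these flags are then handed to FFTW's configure step, roughly:
#
#   ExternalProject_Add(fftwBuild
#                       CONFIGURE_COMMAND <SOURCE_DIR>/configure
#                                         --enable-float ${_fftw_simd_support_level})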
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "read_params.h"
+
+#include "gromacs/awh/awh.h"
+#include "gromacs/fileio/readinp.h"
+#include "gromacs/fileio/warninp.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdtypes/awh_params.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/pull_params.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/random/seed.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "biasparams.h"
+#include "biassharing.h"
+
+namespace gmx
+{
+
+const char *eawhtarget_names[eawhtargetNR+1] = {
+ "constant", "cutoff", "boltzmann", "local-boltzmann", nullptr
+};
+
+const char *eawhgrowth_names[eawhgrowthNR+1] = {
+ "exp-linear", "linear", nullptr
+};
+
+const char *eawhpotential_names[eawhpotentialNR+1] = {
+ "convolved", "umbrella", nullptr
+};
+
+const char *eawhcoordprovider_names[eawhcoordproviderNR+1] = {
+ "pull", nullptr
+};
+
+/*! \brief
+ * Read parameters of an AWH bias dimension.
+ *
+ * \param[in,out] inp Input file entries.
+ * \param[in] prefix Prefix for dimension parameters.
+ * \param[in,out] dimParams AWH dimensional parameters.
+ * \param[in] pull_params Pull parameters.
+ * \param[in,out] wi Struct for bookkeeping warnings.
+ * \param[in] bComment True if comments should be printed.
+ */
+static void readDimParams(std::vector<t_inpfile> *inp, const std::string &prefix,
+ AwhDimParams *dimParams, const pull_params_t *pull_params,
+ warninp_t wi, bool bComment)
+{
+ std::string opt;
+ if (bComment)
+ {
+ printStringNoNewline(inp, "The provider of the reaction coordinate, currently only pull is supported");
+ }
+
+ opt = prefix + "-coord-provider";
+ dimParams->eCoordProvider = get_eeenum(inp, opt, eawhcoordprovider_names, wi);
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "The coordinate index for this dimension");
+ }
+ opt = prefix + "-coord-index";
+ int coordIndexInput;
+ coordIndexInput = get_eint(inp, opt, 1, wi);
+ if (coordIndexInput < 1)
+ {
+ gmx_fatal(FARGS, "Failed to read a valid coordinate index for %s. "
+ "Note that the pull coordinate indexing starts at 1.", opt.c_str());
+ }
+
+ /* The pull coordinate indices start at 1 in the input file, at 0 internally */
+ dimParams->coordIndex = coordIndexInput - 1;
+
+ /* The pull settings need to be consistent with the AWH settings */
+ if (!(pull_params->coord[dimParams->coordIndex].eType == epullEXTERNAL) )
+ {
+ gmx_fatal(FARGS, "AWH biasing can only be applied to pull type %s",
+ EPULLTYPE(epullEXTERNAL));
+ }
+
+ if (dimParams->coordIndex >= pull_params->ncoord)
+ {
+ gmx_fatal(FARGS, "The given AWH coordinate index (%d) is larger than the number of pull coordinates (%d)",
+ coordIndexInput, pull_params->ncoord);
+ }
+ if (pull_params->coord[dimParams->coordIndex].rate != 0)
+ {
+ auto message = formatString("Setting pull-coord%d-rate (%g) is incompatible with AWH biasing this coordinate",
+ coordIndexInput, pull_params->coord[dimParams->coordIndex].rate);
+ warning_error(wi, message);
+ }
+
+ /* Grid params for each axis */
+ int eGeom = pull_params->coord[dimParams->coordIndex].eGeom;
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Start and end values for each coordinate dimension");
+ }
+
+ opt = prefix + "-start";
+ dimParams->origin = get_ereal(inp, opt, 0., wi);
+
+ opt = prefix + "-end";
+ dimParams->end = get_ereal(inp, opt, 0., wi);
+
+ if (gmx_within_tol(dimParams->end - dimParams->origin, 0, GMX_REAL_EPS))
+ {
+ auto message = formatString("The given interval length given by %s-start (%g) and %s-end (%g) is zero. "
+ "This will result in only one point along this axis in the coordinate value grid.",
+ prefix.c_str(), dimParams->origin, prefix.c_str(), dimParams->end);
+ warning(wi, message);
+ }
+ /* Check that the requested interval is in allowed range */
+ if (eGeom == epullgDIST)
+ {
+ if (dimParams->origin < 0 || dimParams->end < 0)
+ {
+ gmx_fatal(FARGS, "%s-start (%g) or %s-end (%g) set to a negative value. With pull geometry distance coordinate values are non-negative. "
+ "Perhaps you want to use geometry %s instead?",
+ prefix.c_str(), dimParams->origin, prefix.c_str(), dimParams->end, EPULLGEOM(epullgDIR));
+ }
+ }
+ else if (eGeom == epullgANGLE || eGeom == epullgANGLEAXIS)
+ {
+ if (dimParams->origin < 0 || dimParams->end > 180)
+ {
+ gmx_fatal(FARGS, "%s-start (%g) and %s-end (%g) are outside of the allowed range 0 to 180 deg for pull geometries %s and %s ",
+ prefix.c_str(), dimParams->origin, prefix.c_str(), dimParams->end, EPULLGEOM(epullgANGLE), EPULLGEOM(epullgANGLEAXIS));
+ }
+ }
+ else if (eGeom == epullgDIHEDRAL)
+ {
+ if (dimParams->origin < -180 || dimParams->end > 180)
+ {
+ gmx_fatal(FARGS, "%s-start (%g) and %s-end (%g) are outside of the allowed range -180 to 180 deg for pull geometry %s. ",
+ prefix.c_str(), dimParams->origin, prefix.c_str(), dimParams->end, EPULLGEOM(epullgDIHEDRAL));
+ }
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "The force constant for this coordinate (kJ/mol/nm^2 or kJ/mol/rad^2)");
+ }
+ opt = prefix + "-force-constant";
+ dimParams->forceConstant = get_ereal(inp, opt, 0, wi);
+ if (dimParams->forceConstant <= 0)
+ {
+ warning_error(wi, "The force AWH bias force constant should be > 0");
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Estimated diffusion constant (nm^2/ps or rad^2/ps)");
+ }
+ opt = prefix + "-diffusion";
+ dimParams->diffusion = get_ereal(inp, opt, 0, wi);
+
+ if (dimParams->diffusion <= 0)
+ {
+ const double diffusion_default = 1e-5;
+ auto message = formatString
+ ("%s not explicitly set by user. You can choose to use a default "
+ "value (%g nm^2/ps or rad^2/ps) but this may very well be "
+ "non-optimal for your system!", opt.c_str(), diffusion_default);
+ warning(wi, message);
+ dimParams->diffusion = diffusion_default;
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Diameter that needs to be sampled around a point before it is considered covered.");
+ }
+ opt = prefix + "-cover-diameter";
+ dimParams->coverDiameter = get_ereal(inp, opt, 0, wi);
+
+ if (dimParams->coverDiameter < 0)
+ {
+ gmx_fatal(FARGS, "%s (%g) cannot be negative.",
+ opt.c_str(), dimParams->coverDiameter);
+ }
+}
+
+/*! \brief
+ * Check consistency of input at the AWH bias level.
+ *
+ * \param[in] awhBiasParams AWH bias parameters.
+ * \param[in,out] wi Struct for bookkeeping warnings.
+ */
+static void checkInputConsistencyAwhBias(const AwhBiasParams &awhBiasParams,
+ warninp_t wi)
+{
+ /* Covering diameter and sharing warning. */
+ for (int d = 0; d < awhBiasParams.ndim; d++)
+ {
+ double coverDiameter = awhBiasParams.dimParams[d].coverDiameter;
+ if (awhBiasParams.shareGroup <= 0 && coverDiameter > 0)
+ {
+ warning(wi, "The covering diameter is only relevant to set for bias sharing simulations.");
+ }
+ }
+}
+
+/*! \brief
+ * Read parameters of an AWH bias.
+ *
+ * \param[in,out] inp Input file entries.
+ * \param[in,out] awhBiasParams AWH bias parameters.
+ * \param[in] prefix Prefix for bias parameters.
+ * \param[in] ir Input parameter struct.
+ * \param[in,out] wi Struct for bookkeeping warnings.
+ * \param[in] bComment True if comments should be printed.
+ */
+static void read_bias_params(std::vector<t_inpfile> *inp, AwhBiasParams *awhBiasParams, const std::string &prefix,
+ const t_inputrec *ir, warninp_t wi, bool bComment)
+{
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Estimated initial PMF error (kJ/mol)");
+ }
+
+ std::string opt = prefix + "-error-init";
+ /* We allow using a default value here without warning (but warn the user if the diffusion constant is not set). */
+ awhBiasParams->errorInitial = get_ereal(inp, opt, 10, wi);
+ if (awhBiasParams->errorInitial <= 0)
+ {
+ gmx_fatal(FARGS, "%s needs to be > 0.", opt.c_str());
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Growth rate of the reference histogram determining the bias update size: exp-linear or linear");
+ }
+ opt = prefix + "-growth";
+ awhBiasParams->eGrowth = get_eeenum(inp, opt, eawhgrowth_names, wi);
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Start the simulation by equilibrating histogram towards the target distribution: no or yes");
+ }
+ opt = prefix + "-equilibrate-histogram";
+ awhBiasParams->equilibrateHistogram = (get_eeenum(inp, opt, yesno_names, wi) != 0);
+ if (awhBiasParams->equilibrateHistogram && awhBiasParams->eGrowth != eawhgrowthEXP_LINEAR)
+ {
+ auto message = formatString("Option %s will only have an effect for histogram growth type '%s'.",
+ opt.c_str(), EAWHGROWTH(eawhgrowthEXP_LINEAR));
+ warning(wi, message);
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Target distribution type: constant, cutoff, boltzmann or local-boltzmann");
+ }
+ opt = prefix + "-target";
+ awhBiasParams->eTarget = get_eeenum(inp, opt, eawhtarget_names, wi);
+
+ if ((awhBiasParams->eTarget == eawhtargetLOCALBOLTZMANN) &&
+ (awhBiasParams->eGrowth == eawhgrowthEXP_LINEAR))
+ {
+ auto message = formatString("Target type '%s' combined with histogram growth type '%s' is not "
+ "expected to give stable bias updates. You probably want to use growth type "
+ "'%s' instead.",
+ EAWHTARGET(eawhtargetLOCALBOLTZMANN), EAWHGROWTH(eawhgrowthEXP_LINEAR),
+ EAWHGROWTH(eawhgrowthLINEAR));
+ warning(wi, message);
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Boltzmann beta scaling factor for target distribution types 'boltzmann' and 'boltzmann-local'");
+ }
+ opt = prefix + "-target-beta-scaling";
+ awhBiasParams->targetBetaScaling = get_ereal(inp, opt, 0, wi);
+
+ switch (awhBiasParams->eTarget)
+ {
+ case eawhtargetBOLTZMANN:
+ case eawhtargetLOCALBOLTZMANN:
+ if (awhBiasParams->targetBetaScaling < 0 || awhBiasParams->targetBetaScaling > 1)
+ {
+ gmx_fatal(FARGS, "%s = %g is not useful for target type %s.",
+ opt.c_str(), awhBiasParams->targetBetaScaling, EAWHTARGET(awhBiasParams->eTarget));
+ }
+ break;
+ default:
+ if (awhBiasParams->targetBetaScaling != 0)
+ {
+ gmx_fatal(FARGS, "Value for %s (%g) set explicitly but will not be used for target type %s.",
+ opt.c_str(), awhBiasParams->targetBetaScaling, EAWHTARGET(awhBiasParams->eTarget));
+ }
+ break;
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Free energy cutoff value for target distribution type 'cutoff'");
+ }
+ opt = prefix + "-target-cutoff";
+ awhBiasParams->targetCutoff = get_ereal(inp, opt, 0, wi);
+
+ switch (awhBiasParams->eTarget)
+ {
+ case eawhtargetCUTOFF:
+ if (awhBiasParams->targetCutoff <= 0)
+ {
+ gmx_fatal(FARGS, "%s = %g is not useful for target type %s.",
+ opt.c_str(), awhBiasParams->targetCutoff, EAWHTARGET(awhBiasParams->eTarget));
+ }
+ break;
+ default:
+ if (awhBiasParams->targetCutoff != 0)
+ {
+ gmx_fatal(FARGS, "Value for %s (%g) set explicitly but will not be used for target type %s.",
+ opt.c_str(), awhBiasParams->targetCutoff, EAWHTARGET(awhBiasParams->eTarget));
+ }
+ break;
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Initialize PMF and target with user data: no or yes");
+ }
+ opt = prefix + "-user-data";
+ awhBiasParams->bUserData = get_eeenum(inp, opt, yesno_names, wi);
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Group index to share the bias with, 0 means not shared");
+ }
+ opt = prefix + "-share-group";
+ awhBiasParams->shareGroup = get_eint(inp, opt, 0, wi);
+ if (awhBiasParams->shareGroup < 0)
+ {
+ warning_error(wi, "AWH bias share-group should be >= 0");
+ }
+
+ if (bComment)
+ {
+ printStringNoNewline(inp, "Dimensionality of the coordinate");
+ }
+ opt = prefix + "-ndim";
+ awhBiasParams->ndim = get_eint(inp, opt, 0, wi);
+
+ if (awhBiasParams->ndim <= 0 ||
+ awhBiasParams->ndim > c_biasMaxNumDim)
+ {
+ gmx_fatal(FARGS, "%s (%d) needs to be > 0 and at most %d\n", opt.c_str(), awhBiasParams->ndim, c_biasMaxNumDim);
+ }
+ if (awhBiasParams->ndim > 2)
+ {
+ warning_note(wi, "For awh-dim > 2 the estimate based on the diffusion and the initial error is currently only a rough guideline."
+ " You should verify its usefulness for your system before production runs!");
+ }
+ snew(awhBiasParams->dimParams, awhBiasParams->ndim);
+ for (int d = 0; d < awhBiasParams->ndim; d++)
+ {
+ bComment = bComment && d == 0;
+ std::string prefixdim = prefix + formatString("-dim%d", d + 1);
+ readDimParams(inp, prefixdim, &awhBiasParams->dimParams[d], ir->pull, wi, bComment);
+ }
+
+ /* Check consistencies here that cannot be checked at read time at a lower level. */
+ checkInputConsistencyAwhBias(*awhBiasParams, wi);
+}
+
+/*! \brief
+ * Check consistency of input at the AWH level.
+ *
+ * \param[in] awhParams AWH parameters.
+ * \param[in,out] wi Struct for bookkeeping warnings.
+ */
+static void checkInputConsistencyAwh(const AwhParams &awhParams,
+ warninp_t wi)
+{
+ /* Each pull coord can map to at most 1 AWH coord.
+ * Check that we have a shared bias when requesting multisim sharing.
+ */
+ bool haveSharedBias = false;
+ for (int k1 = 0; k1 < awhParams.numBias; k1++)
+ {
+ const AwhBiasParams &awhBiasParams1 = awhParams.awhBiasParams[k1];
+
+ if (awhBiasParams1.shareGroup > 0)
+ {
+ haveSharedBias = true;
+ }
+
+ /* k1 is the reference AWH, k2 is the AWH we compare with (can be equal to k1) */
+ for (int k2 = k1; k2 < awhParams.numBias; k2++)
+ {
+ for (int d1 = 0; d1 < awhBiasParams1.ndim; d1++)
+ {
+ const AwhBiasParams &awhBiasParams2 = awhParams.awhBiasParams[k2];
+
+ /* d1 is the reference dimension of the reference AWH. d2 is the dim index of the AWH to compare with. */
+ for (int d2 = 0; d2 < awhBiasParams2.ndim; d2++)
+ {
+ /* Give an error if (d1, k1) is different from (d2, k2) but the pull coordinate is the same */
+ if ( (d1 != d2 || k1 != k2) && (awhBiasParams1.dimParams[d1].coordIndex == awhBiasParams2.dimParams[d2].coordIndex) )
+ {
+ char errormsg[STRLEN];
+ sprintf(errormsg, "One pull coordinate (%d) cannot be mapped to two separate AWH dimensions (awh%d-dim%d and awh%d-dim%d). "
+ "If this is really what you want to do you will have to duplicate this pull coordinate.",
+ awhBiasParams1.dimParams[d1].coordIndex + 1, k1 + 1, d1 + 1, k2 + 1, d2 + 1);
+ gmx_fatal(FARGS, "%s", errormsg);
+ }
+ }
+ }
+ }
+ }
+
+ if (awhParams.shareBiasMultisim && !haveSharedBias)
+ {
+ warning(wi, "Sharing of biases over multiple simulations is requested, but no bias is marked as shared (share-group > 0)");
+ }
+
+ /* mdrun does not support this (yet), but will check again */
+ if (haveBiasSharingWithinSimulation(awhParams))
+ {
+ warning(wi, "You have shared biases within a single simulation, but mdrun does not support this (yet)");
+ }
+}
+
+AwhParams *readAndCheckAwhParams(std::vector<t_inpfile> *inp, const t_inputrec *ir, warninp_t wi)
+{
+ AwhParams *awhParams;
+ snew(awhParams, 1);
+ std::string opt;
+
+ /* Parameters common for all biases */
+
+ printStringNoNewline(inp, "The way to apply the biasing potential: convolved or umbrella");
+ opt = "awh-potential";
+ awhParams->ePotential = get_eeenum(inp, opt, eawhpotential_names, wi);
+
+ printStringNoNewline(inp, "The random seed used for sampling the umbrella center in the case of umbrella type potential");
+ opt = "awh-seed";
+ awhParams->seed = get_eint(inp, opt, -1, wi);
+ if (awhParams->seed == -1)
+ {
+ awhParams->seed = static_cast<int>(gmx::makeRandomSeed());
+ fprintf(stderr, "Setting the AWH bias MC random seed to %" PRId64 "\n", awhParams->seed);
+ }
+
+ printStringNoNewline(inp, "Data output interval in number of steps");
+ opt = "awh-nstout";
+ awhParams->nstOut = get_eint(inp, opt, 100000, wi);
+ if (awhParams->nstOut <= 0)
+ {
+ auto message = formatString("Not writing AWH output with AWH (%s = %d) does not make sense",
+ opt.c_str(), awhParams->nstOut);
+ warning_error(wi, message);
+ }
+ /* This restriction can be removed by changing a flag of print_ebin() */
+ if (ir->nstenergy == 0 || awhParams->nstOut % ir->nstenergy != 0)
+ {
+ auto message = formatString("%s (%d) should be a multiple of nstenergy (%d)",
+ opt.c_str(), awhParams->nstOut, ir->nstenergy);
+ warning_error(wi, message);
+ }
+
+ printStringNoNewline(inp, "Coordinate sampling interval in number of steps");
+ opt = "awh-nstsample";
+ awhParams->nstSampleCoord = get_eint(inp, opt, 10, wi);
+
+ printStringNoNewline(inp, "Free energy and bias update interval in number of samples");
+ opt = "awh-nsamples-update";
+ awhParams->numSamplesUpdateFreeEnergy = get_eint(inp, opt, 10, wi);
+ if (awhParams->numSamplesUpdateFreeEnergy <= 0)
+ {
+ warning_error(wi, opt + " needs to be an integer > 0");
+ }
+
+ printStringNoNewline(inp, "When true, biases with share-group>0 are shared between multiple simulations");
+ opt = "awh-share-multisim";
+ awhParams->shareBiasMultisim = (get_eeenum(inp, opt, yesno_names, wi) != 0);
+
+ printStringNoNewline(inp, "The number of independent AWH biases");
+ opt = "awh-nbias";
+ awhParams->numBias = get_eint(inp, opt, 1, wi);
+ if (awhParams->numBias <= 0)
+ {
+ gmx_fatal(FARGS, "%s needs to be an integer > 0", opt.c_str());
+ }
+
+ /* Read the parameters specific to each AWH bias */
+ snew(awhParams->awhBiasParams, awhParams->numBias);
+
+ for (int k = 0; k < awhParams->numBias; k++)
+ {
+ bool bComment = (k == 0);
+ std::string prefixawh = formatString("awh%d", k + 1);
+ read_bias_params(inp, &awhParams->awhBiasParams[k], prefixawh, ir, wi, bComment);
+ }
+
+ /* Do a final consistency check before returning */
+ checkInputConsistencyAwh(*awhParams, wi);
+
+ if (ir->init_step != 0)
+ {
+ warning_error(wi, "With AWH init-step should be 0");
+ }
+
+ return awhParams;
+}
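
For orientation, the options parsed above and in read_bias_params/readDimParams correspond to mdp input of roughly the following shape; the option names come from the code, but the values are purely illustrative:

awh-potential            = convolved
awh-seed                 = -1         ; -1 means generate a seed
awh-nstout               = 100000     ; must be a multiple of nstenergy
awh-nstsample            = 10
awh-nsamples-update      = 10
awh-share-multisim       = no
awh-nbias                = 1
awh1-error-init          = 10.0       ; kJ/mol
awh1-growth              = exp-linear
awh1-target              = constant
awh1-ndim                = 1
awh1-dim1-coord-provider = pull
awh1-dim1-coord-index    = 1          ; 1-based pull coordinate index
awh1-dim1-start          = 0.5
awh1-dim1-end            = 2.0        ; nm, for a distance-type coordinate
awh1-dim1-force-constant = 4000       ; kJ/mol/nm^2
awh1-dim1-diffusion      = 1e-4       ; nm^2/ps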
+
+/*! \brief
+ * Gets the period of a pull coordinate.
+ *
- * \param[in] pull_params Pull parameters.
- * \param[in] coord_ind Pull coordinate index.
- * \param[in] box Box vectors.
++ * \param[in] pullCoordParams The parameters for the pull coordinate.
++ * \param[in] pbc The PBC setup.
++ * \param[in] intervalLength The length of the AWH interval for this pull coordinate.
+ * \returns the period (or 0 if not periodic).
+ */
- static double get_pull_coord_period(const pull_params_t *pull_params,
-                                     int coord_ind,
-                                     const matrix box)
- double period;
- t_pull_coord *pcrd_params = &pull_params->coord[coord_ind];
-
- if (pcrd_params->eGeom == epullgDIRPBC)
- {
- /* For direction periodic, we need the pull vector to be one of the box vectors
- (or more generally I guess it could be an integer combination of boxvectors).
- This boxvector should to be orthogonal to the (periodic) plane spanned by the other two box vectors.
- Here we assume that the pull vector is either x, y or z.
- * E.g. for pull vec = (1, 0, 0) the box vector tensor should look like:
- * | x 0 0 |
- * | 0 a c |
- * | 0 b d |
- *
- The period is then given by the box length x.
-
- Note: we make these checks here for AWH and not in pull because we allow pull to be more general.
- */
- int m_pullvec = -1, count_nonzeros = 0;
-
- /* Check that pull vec has only one component and which component it is. This component gives the relevant box vector */
- for (int m = 0; m < DIM; m++)
- {
- if (pcrd_params->vec[m] != 0)
- {
- m_pullvec = m;
- count_nonzeros++;
- }
- }
- if (count_nonzeros != 1)
- {
- gmx_fatal(FARGS, "For AWH biasing pull coordinate %d with pull geometry %s, the pull vector needs to be parallel to "
- "a box vector that is parallel to either the x, y or z axis and is orthogonal to the other box vectors.",
- coord_ind + 1, EPULLGEOM(epullgDIRPBC));
- }
++static double get_pull_coord_period(const t_pull_coord &pullCoordParams,
++ const t_pbc &pbc,
++ const real intervalLength)
+{
- /* Check that there is a box vec parallel to pull vec and that this boxvec is orthogonal to the other box vectors */
- for (int m = 0; m < DIM; m++)
++ double period = 0;
+
- for (int n = 0; n < DIM; n++)
++ if (pullCoordParams.eGeom == epullgDIR)
++ {
++ const real margin = 0.001;
++ // Make dims periodic when the interval covers > 95%
++ const real periodicFraction = 0.95;
++
++ // Check if the pull direction is along a box vector
++ for (int dim = 0; dim < pbc.ndim_ePBC; dim++)
+ {
- if ((n != m) && (n == m_pullvec || m == m_pullvec) && box[m][n] > 0)
++ const real boxLength = norm(pbc.box[dim]);
++ const real innerProduct = iprod(pullCoordParams.vec, pbc.box[dim]);
++ if (innerProduct >= (1 - margin)*boxLength &&
++ innerProduct <= (1 + margin)*boxLength)
+ {
- gmx_fatal(FARGS, "For AWH biasing pull coordinate %d with pull geometry %s, there needs to be a box vector parallel to the pull vector that is "
- "orthogonal to the other box vectors.",
- coord_ind + 1, EPULLGEOM(epullgDIRPBC));
++ GMX_RELEASE_ASSERT(intervalLength < (1 + margin)*boxLength,
++ "We have checked before that interval <= period");
++ if (intervalLength > periodicFraction*boxLength)
+ {
-
- /* If this box vector only has one component as we assumed the norm should be equal to the absolute value of that component */
- period = static_cast<double>(norm(box[m_pullvec]));
++ period = boxLength;
+ }
+ }
+ }
- else if (pcrd_params->eGeom == epullgDIHEDRAL)
+ }
- else
- {
- period = 0;
- }
++ else if (pullCoordParams.eGeom == epullgDIHEDRAL)
+ {
+ /* The dihedral angle is periodic in -180 to 180 deg */
+ period = 360;
+ }
+
+ return period;
+}
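
A hypothetical sketch of the rules above (not part of the change itself): pulling with geometry 'direction' along the first box vector of a 10 nm rectangular box, the dimension is only treated as periodic once the sampled interval covers more than 95% of the box length:

t_pull_coord pullCoordParams = {};
pullCoordParams.eGeom   = epullgDIR;
pullCoordParams.vec[XX] = 1.0; // pull vector parallel to the x box vector

matrix box = { { 10, 0, 0 }, { 0, 10, 0 }, { 0, 0, 10 } };
t_pbc  pbc;
set_pbc(&pbc, epbcXYZ, box);

assert(get_pull_coord_period(pullCoordParams, pbc, 9.8) == 10.0); // covers 98% -> periodic
assert(get_pull_coord_period(pullCoordParams, pbc, 5.0) == 0.0);  // covers 50% -> not periodic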
+
+/*! \brief
+ * Checks if the given interval is defined in the correct periodic interval.
+ *
+ * \param[in] origin Start value of interval.
+ * \param[in] end End value of interval.
+ * \param[in] period Period (or 0 if not periodic).
+ * \returns true if the end point values are in the correct periodic interval.
+ */
+static bool intervalIsInPeriodicInterval(double origin, double end, double period)
+{
+ return (period == 0) || (std::fabs(origin) <= 0.5*period && std::fabs(end) <= 0.5*period);
+}
+
+/*! \brief
+ * Checks if a value is within an interval.
+ *
+ * \param[in] origin Start value of interval.
+ * \param[in] end End value of interval.
+ * \param[in] period Period (or 0 if not periodic).
+ * \param[in] value Value to check.
+ * \returns true if the value is within the interval.
+ */
+static bool valueIsInInterval(double origin, double end, double period, double value)
+{
+ bool bIn_interval;
+
+ if (period > 0)
+ {
+ if (origin < end)
+ {
+ /* The interval closes within the periodic interval */
+ bIn_interval = (value >= origin) && (value <= end);
+ }
+ else
+ {
+ /* The interval wraps around the periodic boundary */
+ bIn_interval = ((value >= origin) && (value <= 0.5*period)) || ((value >= -0.5*period) && (value <= end));
+ }
+ }
+ else
+ {
+ bIn_interval = (value >= origin) && (value <= end);
+ }
+
+ return bIn_interval;
+}
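
A hypothetical illustration of how these two helpers behave for a dihedral (period 360 deg), including an interval that wraps the periodic boundary:

// An interval from 170 to -170 deg wraps the +/-180 deg boundary.
assert( valueIsInInterval(170, -170, 360, 175));  // inside, just past +170
assert( valueIsInInterval(170, -170, 360, -175)); // inside, wrapped around
assert(!valueIsInInterval(170, -170, 360, 0));    // outside the wrapped interval

// End points must lie within the reference interval [-period/2, period/2].
assert( intervalIsInPeriodicInterval(-170, 170, 360));
assert(!intervalIsInPeriodicInterval(10, 350, 360)); // 350 > +180, so rejected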
+
+/*! \brief
+ * Check if the starting configuration is consistent with the given interval.
+ *
+ * \param[in] awhParams AWH parameters.
+ * \param[in,out] wi Struct for bookkeeping warnings.
+ */
+static void checkInputConsistencyInterval(const AwhParams *awhParams, warninp_t wi)
+{
+ for (int k = 0; k < awhParams->numBias; k++)
+ {
+ AwhBiasParams *awhBiasParams = &awhParams->awhBiasParams[k];
+ for (int d = 0; d < awhBiasParams->ndim; d++)
+ {
+ AwhDimParams *dimParams = &awhBiasParams->dimParams[d];
+ int coordIndex = dimParams->coordIndex;
+ double origin = dimParams->origin, end = dimParams->end, period = dimParams->period;
+ double coordValueInit = dimParams->coordValueInit;
+
+ if ((period == 0) && (origin > end))
+ {
+ gmx_fatal(FARGS, "For the non-periodic pull coordinates awh%d-dim%d-start (%f) cannot be larger than awh%d-dim%d-end (%f)",
+ k + 1, d + 1, origin, k + 1, d + 1, end);
+ }
+
+ /* Currently we assume symmetric periodic intervals, meaning we use [-period/2, period/2] as the reference interval.
+ Make sure the AWH interval is within this reference interval.
+
+ Note: we could fairly simply allow using a more general interval (e.g. [x, x + period]) but it complicates
+ things slightly and I don't see that there is a great need for it. It would also mean that the interval would
+ depend on AWH input. Also, for dihedral angles you would always want the reference interval to be -180, +180,
+ independent of AWH parameters.
+ */
+ if (!intervalIsInPeriodicInterval(origin, end, period))
+ {
+ gmx_fatal(FARGS, "When using AWH with periodic pull coordinate geometries awh%d-dim%d-start (%.8g) and "
+ "awh%d-dim%d-end (%.8g) should cover at most one period (%.8g) and take values in between "
+ "minus half a period and plus half a period, i.e. in the interval [%.8g, %.8g].",
+ k + 1, d + 1, origin, k + 1, d + 1, end,
+ period, -0.5*period, 0.5*period);
+
+ }
+
+ /* Warn if the pull initial coordinate value is not in the grid */
+ if (!valueIsInInterval(origin, end, period, coordValueInit))
+ {
+ auto message = formatString
+ ("The initial coordinate value (%.8g) for pull coordinate index %d falls outside "
+ "of the sampling nterval awh%d-dim%d-start (%.8g) to awh%d-dim%d-end (%.8g). "
+ "This can lead to large initial forces pulling the coordinate towards the sampling interval.",
+ coordValueInit, coordIndex + 1, k + 1, d + 1, origin, k + 1, d + 1, end);
+ warning(wi, message);
+ }
+ }
+ }
+}
+
+void setStateDependentAwhParams(AwhParams *awhParams,
+ const pull_params_t *pull_params, pull_t *pull_work,
- const matrix box, int ePBC,
++ const matrix box, int ePBC, const tensor &compressibility,
+ const t_grpopts *inputrecGroupOptions, warninp_t wi)
+{
+ /* The temperature is not really state dependent but is not known
+ * when read_awhParams is called (in get_ir).
+ * It is known first after do_index has been called in grompp.cpp.
+ */
+ if (inputrecGroupOptions->ref_t == nullptr ||
+ inputrecGroupOptions->ref_t[0] <= 0)
+ {
+ gmx_fatal(FARGS, "AWH biasing is only supported for temperatures > 0");
+ }
+ for (int i = 1; i < inputrecGroupOptions->ngtc; i++)
+ {
+ if (inputrecGroupOptions->ref_t[i] != inputrecGroupOptions->ref_t[0])
+ {
+ gmx_fatal(FARGS, "AWH biasing is currently only supported for identical temperatures for all temperature coupling groups");
+ }
+ }
+
+ t_pbc pbc;
+ set_pbc(&pbc, ePBC, box);
+
+ for (int k = 0; k < awhParams->numBias; k++)
+ {
+ AwhBiasParams *awhBiasParams = &awhParams->awhBiasParams[k];
+ for (int d = 0; d < awhBiasParams->ndim; d++)
+ {
- AwhDimParams *dimParams = &awhBiasParams->dimParams[d];
- /* The periodiciy of the AWH grid in certain cases depends on the simulation box */
- dimParams->period = get_pull_coord_period(pull_params, dimParams->coordIndex, box);
++ AwhDimParams *dimParams = &awhBiasParams->dimParams[d];
++ const t_pull_coord &pullCoordParams = pull_params->coord[dimParams->coordIndex];
+
- t_pull_coord *pullCoord = &pull_params->coord[dimParams->coordIndex];
- dimParams->coordValueInit *= pull_conversion_factor_internal2userinput(pullCoord);
++ if (pullCoordParams.eGeom == epullgDIRPBC)
++ {
++ gmx_fatal(FARGS, "AWH does not support pull geometry '%s'. "
++ "If the maximum distance between the groups is always less than half the box size, "
++ "you can use geometry '%s' instead.",
++ EPULLGEOM(epullgDIRPBC),
++ EPULLGEOM(epullgDIR));
++
++ }
++
++ dimParams->period = get_pull_coord_period(pullCoordParams, pbc, dimParams->end - dimParams->origin);
++ // We would like to check for scaling, but we don't have the full inputrec available here
++ if (dimParams->period > 0 && !(pullCoordParams.eGeom == epullgANGLE ||
++ pullCoordParams.eGeom == epullgDIHEDRAL))
++ {
++ bool coordIsScaled = false;
++ for (int d2 = 0; d2 < DIM; d2++)
++ {
++ if (pullCoordParams.vec[d2] != 0 && norm2(compressibility[d2]) != 0)
++ {
++ coordIsScaled = true;
++ }
++ }
++ if (coordIsScaled)
++ {
++ std::string mesg = gmx::formatString("AWH dimension %d of bias %d is periodic with pull geometry '%s', "
++ "while you should are applying pressure scaling to the corresponding box vector, this is not supported.",
++ d + 1, k + 1, EPULLGEOM(pullCoordParams.eGeom));
++ warning(wi, mesg.c_str());
++ }
++ }
+
+ /* The initial coordinate value, converted to external user units. */
+ dimParams->coordValueInit =
+ get_pull_coord_value(pull_work, dimParams->coordIndex, &pbc);
+
++ dimParams->coordValueInit *= pull_conversion_factor_internal2userinput(&pullCoordParams);
+ }
+ }
+ checkInputConsistencyInterval(awhParams, wi);
+
+ /* Register AWH as external potential with pull to check consistency. */
+ Awh::registerAwhWithPull(*awhParams, pull_work);
+}
+
+} // namespace gmx
std::unique_ptr<gmx::UpdateGroupsCog> updateGroupsCog;
/* Are there charge groups? */
- bool haveInterDomainBondeds; /**< Are there inter-domain bonded interactions? */
- bool haveInterDomainMultiBodyBondeds; /**< Are there inter-domain multi-body interactions? */
- /**< True when there are charge groups */
- gmx_bool bCGs = false;
-
- /**< Are there inter-cg bonded interactions? */
- gmx_bool bInterCGBondeds = false;
- /**< Are there inter-cg multi-body interactions? */
- gmx_bool bInterCGMultiBody = false;
++ bool haveInterDomainBondeds = false; /**< Are there inter-domain bonded interactions? */
++ bool haveInterDomainMultiBodyBondeds = false; /**< Are there inter-domain multi-body interactions? */
/* Data for the optional bonded interaction atom communication range */
- gmx_bool bBondComm; /**< Only communicate atoms beyond the non-bonded cut-off when they are involved in bonded interactions with non-local atoms */
- t_blocka *cglink; /**< Links between cg's through bonded interactions */
- char *bLocalCG; /**< Local cg availability, TODO: remove when group scheme is removed */
+ /**< Only communicate atoms beyond the non-bonded cut-off when they are involved in bonded interactions with non-local atoms */
+ gmx_bool bBondComm = false;
+ /**< Links between cg's through bonded interactions */
+ t_blocka *cglink = nullptr;
+ /**< Local cg availability, TODO: remove when group scheme is removed */
+ char *bLocalCG = nullptr;
/* The DLB state, possible values are defined above */
DlbState dlbState;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME OpenCL force gathering kernel.
+ * When including this and other PME OpenCL kernel files, plenty of common
+ * constants/macros are expected to be defined (such as "order" which is PME interpolation order).
+ * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
+ *
+ * This file's kernels specifically expect the following definitions:
+ *
+ * - atomsPerBlock which expresses how many atoms are processed by a single work group
+ * - order which is a PME interpolation order
+ * - overwriteForces must evaluate to either true or false to specify whether the kernel
+ * overwrites or reduces into the forces buffer
+ * - wrapX and wrapY must evaluate to either true or false to specify whether the grid overlap
+ * in dimension X/Y is to be used
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ */
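
A rough host-side sketch of how such definitions might be supplied at build time (the real strings are composed in pme_gpu_program_impl_ocl.cpp; the values below are assumptions for illustration):

const std::string buildOptions =
    " -Dorder=4"               // PME interpolation order
    " -DatomsPerBlock=8"       // atoms per work group (illustrative)
    " -DwrapX=true -DwrapY=true"
    " -DoverwriteForces=true"; // overwrite rather than reduce into gm_forces
// ... then e.g. clBuildProgram(program, 1, &deviceId, buildOptions.c_str(), nullptr, nullptr);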
+
+#include "pme_gpu_types.h"
+#include "pme_gpu_utils.clh"
+
+#ifndef COMPILE_GATHER_HELPERS_ONCE
+#define COMPILE_GATHER_HELPERS_ONCE
+
+/*! \brief
+ * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
+ */
+inline float read_grid_size(const float *realGridSizeFP,
+ const int dimIndex)
+{
+ switch (dimIndex)
+ {
+ case XX: return realGridSizeFP[XX];
+ case YY: return realGridSizeFP[YY];
+ case ZZ: return realGridSizeFP[ZZ];
+ }
+ assert(false);
+ return 0.0f;
+}
+
+/*! \brief Reduce the partial force contributions.
+ *
+ * FIXME: this reduction should be simplified and improved, it does 3x16 force component
+ * reduction per 16 threads so no extra shared mem should be needed for intermediates
+ * or passing results back.
+ *
+ * \param[out] sm_forces Local memory array with the output forces (rvec).
+ * \param[in] atomIndexLocal Local atom index
+ * \param[in] splineIndex Spline index
+ * \param[in] lineIndex Line index (same as threadLocalId)
+ * \param[in] realGridSizeFP Local grid size constant
+ * \param[in] fx Input force partial component X
+ * \param[in] fy Input force partial component Y
+ * \param[in] fz Input force partial component Z
+ * \param[in,out] sm_forceReduction Reduction working buffer
+ * \param[in] sm_forceTemp Convenience pointers into \p sm_forceReduction
+ */
+inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
+ const int atomIndexLocal,
+ const int splineIndex,
+ const int lineIndex,
+ const float *realGridSizeFP,
+ float fx,
+ float fy,
+ float fz,
+ __local float * __restrict__ sm_forceReduction,
+ __local float ** __restrict__ sm_forceTemp
+ )
+
+{
+ // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514
+
+ /* Number of data components and threads for a single atom */
+#define atomDataSize threadsPerAtom
+ // We use blockSize local memory elements to read fx, or fy, or fz, and then reduce them to fit into smemPerDim elements
+ // These are defines rather than consts because they determine the local memory array size.
+#define blockSize (atomsPerBlock * atomDataSize)
+#define smemPerDim warp_size
+#define smemReserved (DIM * smemPerDim)
+
+ const int numWarps = blockSize / smemPerDim;
+ const int minStride = max(1, atomDataSize / numWarps);
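+ // E.g. with order 4 (atomDataSize = 16 threads per atom), atomsPerBlock = 8 and
+ // warp_size = 32 (illustrative values): blockSize = 128, numWarps = 4 and
+ // minStride = 4, so each atom's 16 partial sums reduce to 4 per dimension below.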
+
+#pragma unroll DIM
+ for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
+ {
+ int elementIndex = smemReserved + lineIndex;
+ // Store input force contributions
+ sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
++
++#if !defined(_AMD_SOURCE_)
- /* This barrier was not needed in CUDA. Different OpenCL compilers might have different ideas
++ /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
++ * Different OpenCL compilers might have different ideas
+ * about #pragma unroll, though. OpenCL 2 has __attribute__((opencl_unroll_hint)).
+ * #2519
+ */
+ barrier(CLK_LOCAL_MEM_FENCE);
++#endif
+
+ // Reduce to fit into smemPerDim (warp size)
+#pragma unroll
+ for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
+ {
+ if (splineIndex < redStride)
+ {
+ sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ // Last iteration - packing everything to be nearby, storing convenience pointer
+ sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
+ int redStride = minStride;
+ if (splineIndex < redStride)
+ {
+ const int packedIndex = atomIndexLocal * redStride + splineIndex;
+ sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ assert ((blockSize / warp_size) >= DIM);
+
+ const int warpIndex = lineIndex / warp_size;
+ const int dimIndex = warpIndex;
+
+ // First 3 warps can now process 1 dimension each
+ if (dimIndex < DIM)
+ {
+ int sourceIndex = lineIndex % warp_size;
+#pragma unroll
+ for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
+ {
+ if (!(splineIndex & redStride))
+ {
+ sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
+ }
+ }
+
+ const float n = read_grid_size(realGridSizeFP, dimIndex);
+
+ const int atomIndex = sourceIndex / minStride;
+ if (sourceIndex == minStride * atomIndex)
+ {
+ sm_forces[atomIndex * DIM + dimIndex] = (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
+ }
+ }
+}
+
+#endif //COMPILE_GATHER_HELPERS_ONCE
+
+/*! \brief
+ * An OpenCL kernel which gathers the atom forces from the grid.
+ * The grid is assumed to be wrapped in dimension Z.
+ * Please see the file description for additional defines which this kernel expects.
+ *
+ * \param[in] kernelParams All the PME GPU data.
+ * \param[in] gm_coefficients Atom charges/coefficients.
+ * \param[in] gm_grid Global 3D grid.
+ * \param[in] gm_theta Atom spline parameter values
+ * \param[in] gm_dtheta Atom spline parameter derivatives
+ * \param[in] gm_gridlineIndices Atom gridline indices (ivec)
+ * \param[in,out] gm_forces Atom forces (rvec)
+ */
+__attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
+__kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
+ __global const float * __restrict__ gm_coefficients,
+ __global const float * __restrict__ gm_grid,
+ __global const float * __restrict__ gm_theta,
+ __global const float * __restrict__ gm_dtheta,
+ __global const int * __restrict__ gm_gridlineIndices,
+ __global float * __restrict__ gm_forces
+ )
+{
+ /* These are the atom indices - for the shared and global memory */
+ const int atomIndexLocal = get_local_id(ZZ);
+ const int atomIndexOffset = get_group_id(XX) * atomsPerBlock;
+ const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+
+ /* Some sizes which are defines and not consts because they go into the array size */
+ #define blockSize (atomsPerBlock * atomDataSize)
+ assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
+ #define smemPerDim warp_size
+ #define smemReserved (DIM * smemPerDim)
+ #define totalSharedMemory (smemReserved + blockSize)
+ #define gridlineIndicesSize (atomsPerBlock * DIM)
+ #define splineParamsSize (atomsPerBlock * DIM * order)
+
+ __local int sm_gridlineIndices[gridlineIndicesSize];
+ __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */
+
+ /* Spline Y/Z coordinates */
+ const int ithy = get_local_id(YY);
+ const int ithz = get_local_id(XX);
+
+ const int threadLocalId = (get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0) + get_local_id(0);
+
+ /* These are the spline contribution indices in shared memory */
+ const int splineIndex = (get_local_id(1) * get_local_size(0) + get_local_id(0)); /* Relative to the current particle, 0..15 for order 4 */
+ const int lineIndex = threadLocalId; /* And to all the block's particles */
+
+ /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
+ const int localGridlineIndicesIndex = threadLocalId;
+ const int globalGridlineIndicesIndex = get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
+ const int globalCheckIndices = pme_gpu_check_atom_data_index(globalGridlineIndicesIndex, kernelParams.atoms.nAtoms * DIM);
+ if ((localGridlineIndicesIndex < gridlineIndicesSize) & globalCheckIndices)
+ {
+ sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
+ assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+ }
+ /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
+ const int localSplineParamsIndex = threadLocalId;
+ const int globalSplineParamsIndex = get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
+ const int globalCheckSplineParams = pme_gpu_check_atom_data_index(globalSplineParamsIndex, kernelParams.atoms.nAtoms * DIM * order);
+ if ((localSplineParamsIndex < splineParamsSize) && globalCheckSplineParams)
+ {
+ sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
+ sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
+ assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
+ assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ float fx = 0.0f;
+ float fy = 0.0f;
+ float fz = 0.0f;
+
+ const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
+ const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
+
+ if (chargeCheck & globalCheck)
+ {
+ const int nx = kernelParams.grid.realGridSize[XX];
+ const int ny = kernelParams.grid.realGridSize[YY];
+ const int nz = kernelParams.grid.realGridSize[ZZ];
+ const int pny = kernelParams.grid.realGridSizePadded[YY];
+ const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
+
+ const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+ const int warpIndex = atomIndexLocal / atomsPerWarp;
+
+ const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
+ const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy);
+ const float2 tdy = sm_splineParams[splineIndexY];
+ const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz);
+ const float2 tdz = sm_splineParams[splineIndexZ];
+
+ const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+ int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+ if (wrapY & (iy >= ny))
+ {
+ iy -= ny;
+ }
+ int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+ if (iz >= nz)
+ {
+ iz -= nz;
+ }
+ const int constOffset = iy * pnz + iz;
+
+#pragma unroll order
+ for (int ithx = 0; (ithx < order); ithx++)
+ {
+ int ix = ixBase + ithx;
+ if (wrapX & (ix >= nx))
+ {
+ ix -= nx;
+ }
+ const int gridIndexGlobal = ix * pny * pnz + constOffset;
+ assert(gridIndexGlobal >= 0);
+ const float gridValue = gm_grid[gridIndexGlobal];
+ assert(isfinite(gridValue));
+ const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
+ const float2 tdx = sm_splineParams[splineIndexX];
+ const float fxy1 = tdz.x * gridValue;
+ const float fz1 = tdz.y * gridValue;
+ fx += tdx.y * tdy.x * fxy1;
+ fy += tdx.x * tdy.y * fxy1;
+ fz += tdx.x * tdy.x * fz1;
+ }
+ }
+
+ // Reduction of partial force contributions
+ __local float sm_forces[atomsPerBlock * DIM];
+
+ __local float sm_forceReduction[totalSharedMemory];
+ __local float *sm_forceTemp[DIM];
+
+ reduce_atom_forces(sm_forces,
+ atomIndexLocal, splineIndex, lineIndex,
+ kernelParams.grid.realGridSizeFP,
+ fx, fy, fz,
+ sm_forceReduction,
+ sm_forceTemp);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* Calculating the final forces with no component branching, atomsPerBlock threads */
+ const int forceIndexLocal = threadLocalId;
+ const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+ const int calcIndexCheck = pme_gpu_check_atom_data_index(forceIndexGlobal, kernelParams.atoms.nAtoms);
+ if ((forceIndexLocal < atomsPerBlock) & calcIndexCheck)
+ {
+ const float3 atomForces = vload3(forceIndexLocal, sm_forces);
+ const float negCoefficient = -gm_coefficients[forceIndexGlobal];
+ float3 result;
+ result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
+ result.y = negCoefficient * (kernelParams.current.recipBox[XX][YY] * atomForces.x +
+ kernelParams.current.recipBox[YY][YY] * atomForces.y);
+ result.z = negCoefficient * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x +
+ kernelParams.current.recipBox[YY][ZZ] * atomForces.y +
+ kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
+ vstore3(result, forceIndexLocal, sm_forces);
+ }
+
+#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
+ /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+ * #2519
+ */
+ barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+ assert(atomsPerBlock <= warp_size);
+
+ /* Writing or adding the final forces component-wise, single warp */
+ const int blockForcesSize = atomsPerBlock * DIM;
+ const int numIter = (blockForcesSize + warp_size - 1) / warp_size;
+ const int iterThreads = blockForcesSize / numIter;
+ if (threadLocalId < iterThreads)
+ {
+#pragma unroll
+ for (int i = 0; i < numIter; i++)
+ {
+ const int outputIndexLocal = i * iterThreads + threadLocalId;
+ const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
+ const int globalOutputCheck = pme_gpu_check_atom_data_index(outputIndexGlobal, kernelParams.atoms.nAtoms * DIM);
+ if (globalOutputCheck)
+ {
+ const float outputForceComponent = sm_forces[outputIndexLocal];
+ if (overwriteForces)
+ {
+ gm_forces[outputIndexGlobal] = outputForceComponent;
+ }
+ else
+ {
+ gm_forces[outputIndexGlobal] += outputForceComponent;
+ }
+ }
+ }
+ }
+}
--- /dev/null
- if ((bGrasp || bCONECT) && (outftp != efPDB))
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "editconf.h"
+
+#include <cmath>
+#include <cstring>
+
+#include <algorithm>
+#include <string>
+
+#include "gromacs/commandline/pargs.h"
+#include "gromacs/commandline/viewit.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/pdbio.h"
+#include "gromacs/fileio/tpxio.h"
+#include "gromacs/fileio/trxio.h"
+#include "gromacs/gmxana/princ.h"
+#include "gromacs/gmxlib/conformation_utilities.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pbcutil/rmpbc.h"
+#include "gromacs/topology/atomprop.h"
+#include "gromacs/topology/index.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/arraysize.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/strdb.h"
+
+static real calc_mass(t_atoms *atoms, gmx_bool bGetMass, AtomProperties *aps)
+{
+ real tmass;
+ int i;
+
+ tmass = 0;
+ for (i = 0; (i < atoms->nr); i++)
+ {
+ if (bGetMass)
+ {
+ aps->setAtomProperty(epropMass,
+ std::string(*atoms->resinfo[atoms->atom[i].resind].name),
+ std::string(*atoms->atomname[i]), &(atoms->atom[i].m));
+ }
+ tmass += atoms->atom[i].m;
+ }
+
+ return tmass;
+}
+
+static real calc_geom(int isize, const int *index, rvec *x, rvec geom_center, rvec minval,
+ rvec maxval, gmx_bool bDiam)
+{
+ real diam2, d;
+ int ii, i, j;
+
+ clear_rvec(geom_center);
+ diam2 = 0;
+ if (isize == 0)
+ {
+ clear_rvec(minval);
+ clear_rvec(maxval);
+ }
+ else
+ {
+ if (index)
+ {
+ ii = index[0];
+ }
+ else
+ {
+ ii = 0;
+ }
+ for (j = 0; j < DIM; j++)
+ {
+ minval[j] = maxval[j] = x[ii][j];
+ }
+ for (i = 0; i < isize; i++)
+ {
+ if (index)
+ {
+ ii = index[i];
+ }
+ else
+ {
+ ii = i;
+ }
+ rvec_inc(geom_center, x[ii]);
+ for (j = 0; j < DIM; j++)
+ {
+ if (x[ii][j] < minval[j])
+ {
+ minval[j] = x[ii][j];
+ }
+ if (x[ii][j] > maxval[j])
+ {
+ maxval[j] = x[ii][j];
+ }
+ }
+ if (bDiam)
+ {
+ if (index)
+ {
+ for (j = i + 1; j < isize; j++)
+ {
+ d = distance2(x[ii], x[index[j]]);
+ diam2 = std::max(d, diam2);
+ }
+ }
+ else
+ {
+ for (j = i + 1; j < isize; j++)
+ {
+ d = distance2(x[i], x[j]);
+ diam2 = std::max(d, diam2);
+ }
+ }
+ }
+ }
+ svmul(1.0 / isize, geom_center, geom_center);
+ }
+
+ return std::sqrt(diam2);
+}
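
A hypothetical usage sketch (passing index = nullptr makes the function run over all isize atoms):

rvec x[3] = { { 0, 0, 0 }, { 1, 0, 0 }, { 0, 2, 0 } };
rvec center, minval, maxval;
real diam = calc_geom(3, nullptr, x, center, minval, maxval, TRUE);
// center = (1/3, 2/3, 0), minval = (0, 0, 0), maxval = (1, 2, 0) and
// diam = sqrt(5), the largest pairwise distance (between (1,0,0) and (0,2,0)).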
+
+static void center_conf(int natom, rvec *x, rvec center, rvec geom_cent)
+{
+ int i;
+ rvec shift;
+
+ rvec_sub(center, geom_cent, shift);
+
+ printf(" shift :%7.3f%7.3f%7.3f (nm)\n", shift[XX], shift[YY],
+ shift[ZZ]);
+
+ for (i = 0; (i < natom); i++)
+ {
+ rvec_inc(x[i], shift);
+ }
+}
+
+static void scale_conf(int natom, rvec x[], matrix box, const rvec scale)
+{
+ int i, j;
+
+ for (i = 0; i < natom; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ x[i][j] *= scale[j];
+ }
+ }
+ for (i = 0; i < DIM; i++)
+ {
+ for (j = 0; j < DIM; j++)
+ {
+ box[i][j] *= scale[j];
+ }
+ }
+}
+
+static void read_bfac(const char *fn, int *n_bfac, double **bfac_val, int **bfac_nr)
+{
+ int i;
+ char **bfac_lines;
+
+ *n_bfac = get_lines(fn, &bfac_lines);
+ snew(*bfac_val, *n_bfac);
+ snew(*bfac_nr, *n_bfac);
+ fprintf(stderr, "Reading %d B-factors from %s\n", *n_bfac, fn);
+ for (i = 0; (i < *n_bfac); i++)
+ {
+ sscanf(bfac_lines[i], "%d %lf", &(*bfac_nr)[i], &(*bfac_val)[i]);
+ }
+
+}
+
+static void set_pdb_conf_bfac(int natoms, int nres, t_atoms *atoms, int n_bfac,
+ double *bfac, int *bfac_nr, gmx_bool peratom)
+{
+ real bfac_min, bfac_max;
+ int i, n;
+ gmx_bool found;
+
+ if (n_bfac > atoms->nres)
+ {
+ peratom = TRUE;
+ }
+
+ bfac_max = -1e10;
+ bfac_min = 1e10;
+ for (i = 0; (i < n_bfac); i++)
+ {
+ /* if ((bfac_nr[i]-1<0) || (bfac_nr[i]-1>=atoms->nr))
+ gmx_fatal(FARGS,"Index of B-Factor %d is out of range: %d (%g)",
+ i+1,bfac_nr[i],bfac[i]); */
+ if (bfac[i] > bfac_max)
+ {
+ bfac_max = bfac[i];
+ }
+ if (bfac[i] < bfac_min)
+ {
+ bfac_min = bfac[i];
+ }
+ }
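+ /* Rescale so that all values fit the fixed-width (%6.2f) B-factor
+ * field written by the PDB output routines */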
+ while ((bfac_max > 99.99) || (bfac_min < -99.99))
+ {
+ fprintf(stderr,
+ "Range of values for B-factors too large (min %g, max %g) "
+ "will scale down a factor 10\n", bfac_min, bfac_max);
+ for (i = 0; (i < n_bfac); i++)
+ {
+ bfac[i] /= 10;
+ }
+ bfac_max /= 10;
+ bfac_min /= 10;
+ }
+ while ((std::abs(bfac_max) < 0.5) && (std::abs(bfac_min) < 0.5))
+ {
+ fprintf(stderr,
+ "Range of values for B-factors too small (min %g, max %g) "
+ "will scale up a factor 10\n", bfac_min, bfac_max);
+ for (i = 0; (i < n_bfac); i++)
+ {
+ bfac[i] *= 10;
+ }
+ bfac_max *= 10;
+ bfac_min *= 10;
+ }
+
+ for (i = 0; (i < natoms); i++)
+ {
+ atoms->pdbinfo[i].bfac = 0;
+ }
+
+ if (!peratom)
+ {
+ fprintf(stderr, "Will attach %d B-factors to %d residues\n", n_bfac,
+ nres);
+ for (i = 0; (i < n_bfac); i++)
+ {
+ found = FALSE;
+ for (n = 0; (n < natoms); n++)
+ {
+ if (bfac_nr[i] == atoms->resinfo[atoms->atom[n].resind].nr)
+ {
+ atoms->pdbinfo[n].bfac = bfac[i];
+ found = TRUE;
+ }
+ }
+ if (!found)
+ {
+ gmx_warning("Residue nr %d not found\n", bfac_nr[i]);
+ }
+ }
+ }
+ else
+ {
+ fprintf(stderr, "Will attach %d B-factors to %d atoms\n", n_bfac,
+ natoms);
+ for (i = 0; (i < n_bfac); i++)
+ {
+ atoms->pdbinfo[bfac_nr[i] - 1].bfac = bfac[i];
+ }
+ }
+}
+
+static void pdb_legend(FILE *out, int natoms, int nres, t_atoms *atoms, rvec x[])
+{
+ real bfac_min, bfac_max, xmin, ymin, zmin;
+ int i;
+ int space = ' ';
+
+ bfac_max = -1e10;
+ bfac_min = 1e10;
+ xmin = 1e10;
+ ymin = 1e10;
+ zmin = 1e10;
+ for (i = 0; (i < natoms); i++)
+ {
+ xmin = std::min(xmin, x[i][XX]);
+ ymin = std::min(ymin, x[i][YY]);
+ zmin = std::min(zmin, x[i][ZZ]);
+ bfac_min = std::min(bfac_min, atoms->pdbinfo[i].bfac);
+ bfac_max = std::max(bfac_max, atoms->pdbinfo[i].bfac);
+ }
+ fprintf(stderr, "B-factors range from %g to %g\n", bfac_min, bfac_max);
+ for (i = 1; (i < 12); i++)
+ {
+ fprintf(out,
+ "%-6s%5d %-4.4s%3.3s %c%4d%c %8.3f%8.3f%8.3f%6.2f%6.2f\n",
+ "ATOM ", natoms + 1 + i, "CA", "LEG", space, nres + 1, space,
+ (xmin + (i * 0.12)) * 10, ymin * 10, zmin * 10, 1.0, bfac_min
+ + ((i - 1.0) * (bfac_max - bfac_min) / 10));
+ }
+}
+
+static void visualize_images(const char *fn, int ePBC, matrix box)
+{
+ t_atoms atoms;
+ rvec *img;
+ char *c, *ala;
+ int nat, i;
+
+ nat = NTRICIMG + 1;
+ init_t_atoms(&atoms, nat, FALSE);
+ atoms.nr = nat;
+ snew(img, nat);
+ /* FIXME: Constness should not be cast away */
+ c = const_cast<char*>("C");
+ ala = const_cast<char*>("ALA");
+ for (i = 0; i < nat; i++)
+ {
+ atoms.atomname[i] = &c;
+ atoms.atom[i].resind = i;
+ atoms.resinfo[i].name = &ala;
+ atoms.resinfo[i].nr = i + 1;
+ atoms.resinfo[i].chainid = 'A' + i / NCUCVERT;
+ }
+ calc_triclinic_images(box, img + 1);
+
+ write_sto_conf(fn, "Images", &atoms, img, nullptr, ePBC, box);
+
+ done_atom(&atoms);
+ sfree(img);
+}
+
+static void visualize_box(FILE *out, int a0, int r0, matrix box, const rvec gridsize)
+{
+ int *edge;
+ rvec *vert, shift;
+ int nx, ny, nz, nbox, nat;
+ int i, j, x, y, z;
+ int rectedge[24] =
+ {
+ 0, 1, 1, 3, 3, 2, 0, 2, 0, 4, 1, 5, 3, 7, 2, 6, 4, 5, 5, 7, 7, 6, 6,
+ 4
+ };
+
+ a0++;
+ r0++;
+
+ nx = gmx::roundToInt(gridsize[XX]);
+ ny = gmx::roundToInt(gridsize[YY]);
+ nz = gmx::roundToInt(gridsize[ZZ]);
+ nbox = nx * ny * nz;
+ if (TRICLINIC(box))
+ {
+ nat = nbox * NCUCVERT;
+ snew(vert, nat);
+ calc_compact_unitcell_vertices(ecenterDEF, box, vert);
+ j = 0;
+ for (z = 0; z < nz; z++)
+ {
+ for (y = 0; y < ny; y++)
+ {
+ for (x = 0; x < nx; x++)
+ {
+ for (i = 0; i < DIM; i++)
+ {
+ shift[i] = x * box[0][i] + y * box[1][i] + z
+ * box[2][i];
+ }
+ for (i = 0; i < NCUCVERT; i++)
+ {
+ rvec_add(vert[i], shift, vert[j]);
+ j++;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < nat; i++)
+ {
+ gmx_fprintf_pdb_atomline(out, epdbATOM, a0 + i, "C", ' ', "BOX", 'K' + i / NCUCVERT, r0 + i, ' ',
+ 10*vert[i][XX], 10*vert[i][YY], 10*vert[i][ZZ], 1.0, 0.0, "");
+ }
+
+ edge = compact_unitcell_edges();
+ for (j = 0; j < nbox; j++)
+ {
+ for (i = 0; i < NCUCEDGE; i++)
+ {
+ fprintf(out, "CONECT%5d%5d\n", a0 + j * NCUCVERT + edge[2 * i],
+ a0 + j * NCUCVERT + edge[2 * i + 1]);
+ }
+ }
+
+ sfree(vert);
+ }
+ else
+ {
+ i = 0;
+ for (z = 0; z <= 1; z++)
+ {
+ for (y = 0; y <= 1; y++)
+ {
+ for (x = 0; x <= 1; x++)
+ {
+ gmx_fprintf_pdb_atomline(out, epdbATOM, a0 + i, "C", ' ', "BOX", 'K' + i/8, r0+i, ' ',
+ x * 10 * box[XX][XX], y * 10 * box[YY][YY], z * 10 * box[ZZ][ZZ], 1.0, 0.0, "");
+ i++;
+ }
+ }
+ }
+ for (i = 0; i < 24; i += 2)
+ {
+ fprintf(out, "CONECT%5d%5d\n", a0 + rectedge[i], a0 + rectedge[i + 1]);
+ }
+ }
+}
+
+static void calc_rotmatrix(rvec principal_axis, rvec targetvec, matrix rotmatrix)
+{
+ rvec rotvec;
+ real ux, uy, uz, costheta, sintheta;
+
+ costheta = cos_angle(principal_axis, targetvec);
+ sintheta = std::sqrt(1.0-costheta*costheta); /* sign is always positive since 0<theta<pi */
+
+ /* Determine rotation from cross product with target vector */
+ cprod(principal_axis, targetvec, rotvec);
+ unitv(rotvec, rotvec);
+ printf("Aligning %g %g %g to %g %g %g : xprod %g %g %g\n",
+ principal_axis[XX], principal_axis[YY], principal_axis[ZZ], targetvec[XX], targetvec[YY], targetvec[ZZ],
+ rotvec[XX], rotvec[YY], rotvec[ZZ]);
+
+ ux = rotvec[XX];
+ uy = rotvec[YY];
+ uz = rotvec[ZZ];
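+ /* The assignments below implement the axis-angle (Rodrigues) rotation
+ * matrix R = cos(theta)*I + (1 - cos(theta))*u*u^T + sin(theta)*[u]_x,
+ * where u = (ux, uy, uz) is the unit rotation axis computed above. */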
+ rotmatrix[0][0] = ux*ux + (1.0-ux*ux)*costheta;
+ rotmatrix[0][1] = ux*uy*(1-costheta)-uz*sintheta;
+ rotmatrix[0][2] = ux*uz*(1-costheta)+uy*sintheta;
+ rotmatrix[1][0] = ux*uy*(1-costheta)+uz*sintheta;
+ rotmatrix[1][1] = uy*uy + (1.0-uy*uy)*costheta;
+ rotmatrix[1][2] = uy*uz*(1-costheta)-ux*sintheta;
+ rotmatrix[2][0] = ux*uz*(1-costheta)-uy*sintheta;
+ rotmatrix[2][1] = uy*uz*(1-costheta)+ux*sintheta;
+ rotmatrix[2][2] = uz*uz + (1.0-uz*uz)*costheta;
+
+ printf("Rotation matrix: \n%g %g %g\n%g %g %g\n%g %g %g\n",
+ rotmatrix[0][0], rotmatrix[0][1], rotmatrix[0][2],
+ rotmatrix[1][0], rotmatrix[1][1], rotmatrix[1][2],
+ rotmatrix[2][0], rotmatrix[2][1], rotmatrix[2][2]);
+}
+
+static void renum_resnr(t_atoms *atoms, int isize, const int *index,
+ int resnr_start)
+{
+ int i, resind_prev, resind;
+
+ resind_prev = -1;
+ for (i = 0; i < isize; i++)
+ {
+ resind = atoms->atom[index == nullptr ? i : index[i]].resind;
+ if (resind != resind_prev)
+ {
+ atoms->resinfo[resind].nr = resnr_start;
+ resnr_start++;
+ }
+ resind_prev = resind;
+ }
+}
+
+int gmx_editconf(int argc, char *argv[])
+{
+ const char *desc[] =
+ {
+ "[THISMODULE] converts generic structure format to [REF].gro[ref], [TT].g96[tt]",
+ "or [REF].pdb[ref].",
+ "[PAR]",
+ "The box can be modified with options [TT]-box[tt], [TT]-d[tt] and",
+ "[TT]-angles[tt]. Both [TT]-box[tt] and [TT]-d[tt]",
+ "will center the system in the box, unless [TT]-noc[tt] is used.",
+ "The [TT]-center[tt] option can be used to shift the geometric center",
+ "of the system from the default of (x/2, y/2, z/2) implied by [TT]-c[tt]",
+ "to some other value.",
+ "[PAR]",
+ "Option [TT]-bt[tt] determines the box type: [TT]triclinic[tt] is a",
+ "triclinic box, [TT]cubic[tt] is a rectangular box with all sides equal",
+ "[TT]dodecahedron[tt] represents a rhombic dodecahedron and",
+ "[TT]octahedron[tt] is a truncated octahedron.",
+ "The last two are special cases of a triclinic box.",
+ "The length of the three box vectors of the truncated octahedron is the",
+ "shortest distance between two opposite hexagons.",
+ "Relative to a cubic box with some periodic image distance, the volume of a ",
+ "dodecahedron with this same periodic distance is 0.71 times that of the cube, ",
+ "and that of a truncated octahedron is 0.77 times.",
+ "[PAR]",
+ "Option [TT]-box[tt] requires only",
+ "one value for a cubic, rhombic dodecahedral, or truncated octahedral box.",
+ "[PAR]",
+ "With [TT]-d[tt] and a [TT]triclinic[tt] box the size of the system in the [IT]x[it]-, [IT]y[it]-,",
+ "and [IT]z[it]-directions is used. With [TT]-d[tt] and [TT]cubic[tt],",
+ "[TT]dodecahedron[tt] or [TT]octahedron[tt] boxes, the dimensions are set",
+ "to the diameter of the system (largest distance between atoms) plus twice",
+ "the specified distance.",
+ "[PAR]",
+ "Option [TT]-angles[tt] is only meaningful with option [TT]-box[tt] and",
+ "a triclinic box and cannot be used with option [TT]-d[tt].",
+ "[PAR]",
+ "When [TT]-n[tt] or [TT]-ndef[tt] is set, a group",
+ "can be selected for calculating the size and the geometric center,",
+ "otherwise the whole system is used.",
+ "[PAR]",
+ "[TT]-rotate[tt] rotates the coordinates and velocities.",
+ "[PAR]",
+ "[TT]-princ[tt] aligns the principal axes of the system along the",
+ "coordinate axes, with the longest axis aligned with the [IT]x[it]-axis. ",
+ "This may allow you to decrease the box volume,",
+ "but beware that molecules can rotate significantly in a nanosecond.",
+ "[PAR]",
+ "Scaling is applied before any of the other operations are",
+ "performed. Boxes and coordinates can be scaled to give a certain density (option",
+ "[TT]-density[tt]). Note that this may be inaccurate in case a [REF].gro[ref]",
+ "file is given as input. A special feature of the scaling option is that when the",
+ "factor -1 is given in one dimension, one obtains a mirror image,",
+ "mirrored in one of the planes. When one uses -1 in three dimensions, ",
+ "a point-mirror image is obtained.[PAR]",
+ "Groups are selected after all operations have been applied.[PAR]",
+ "Periodicity can be removed in a crude manner.",
+ "It is important that the box vectors at the bottom of your input file",
+ "are correct when the periodicity is to be removed.",
+ "[PAR]",
+ "When writing [REF].pdb[ref] files, B-factors can be",
+ "added with the [TT]-bf[tt] option. B-factors are read",
+ "from a file with with following format: first line states number of",
+ "entries in the file, next lines state an index",
+ "followed by a B-factor. The B-factors will be attached per residue",
+ "unless the number of B-factors is larger than the number of the residues or unless the",
+ "[TT]-atom[tt] option is set. Obviously, any type of numeric data can",
+ "be added instead of B-factors. [TT]-legend[tt] will produce",
+ "a row of CA atoms with B-factors ranging from the minimum to the",
+ "maximum value found, effectively making a legend for viewing.",
+ "[PAR]",
+ "With the option [TT]-mead[tt] a special [REF].pdb[ref] ([REF].pqr[ref])",
+ "file for the MEAD electrostatics",
+ "program (Poisson-Boltzmann solver) can be made. A further prerequisite",
+ "is that the input file is a run input file.",
+ "The B-factor field is then filled with the Van der Waals radius",
+ "of the atoms while the occupancy field will hold the charge.",
+ "[PAR]",
+ "The option [TT]-grasp[tt] is similar, but it puts the charges in the B-factor",
+ "and the radius in the occupancy.",
+ "[PAR]",
+ "Option [TT]-align[tt] allows alignment",
+ "of the principal axis of a specified group against the given vector, ",
+ "with an optional center of rotation specified by [TT]-aligncenter[tt].",
+ "[PAR]",
+ "Finally, with option [TT]-label[tt], [TT]editconf[tt] can add a chain identifier",
+ "to a [REF].pdb[ref] file, which can be useful for analysis with e.g. Rasmol.",
+ "[PAR]",
+ "To convert a truncated octrahedron file produced by a package which uses",
+ "a cubic box with the corners cut off (such as GROMOS), use::",
+ "",
+ " gmx editconf -f in -rotate 0 45 35.264 -bt o -box veclen -o out",
+ "",
+ "where [TT]veclen[tt] is the size of the cubic box times [SQRT]3[sqrt]/2."
+ };
+ const char *bugs[] =
+ {
+ "For complex molecules, the periodicity removal routine may break down, "
+ "in that case you can use [gmx-trjconv]."
+ };
+ static real dist = 0.0;
+ static gmx_bool bNDEF = FALSE, bRMPBC = FALSE, bCenter = FALSE, bReadVDW =
+ FALSE, bCONECT = FALSE;
+ static gmx_bool peratom = FALSE, bLegend = FALSE, bOrient = FALSE, bMead =
+ FALSE, bGrasp = FALSE, bSig56 = FALSE;
+ static rvec scale =
+ { 1, 1, 1 }, newbox =
+ { 0, 0, 0 }, newang =
+ { 90, 90, 90 };
+ static real rho = 1000.0, rvdw = 0.12;
+ static rvec center =
+ { 0, 0, 0 }, translation =
+ { 0, 0, 0 }, rotangles =
+ { 0, 0, 0 }, aligncenter =
+ { 0, 0, 0 }, targetvec =
+ { 0, 0, 0 };
+ static const char *btype[] =
+ { nullptr, "triclinic", "cubic", "dodecahedron", "octahedron", nullptr },
+ *label = "A";
+ static rvec visbox =
+ { 0, 0, 0 };
+ static int resnr_start = -1;
+ t_pargs
+ pa[] =
+ {
+ { "-ndef", FALSE, etBOOL,
+ { &bNDEF }, "Choose output from default index groups" },
+ { "-visbox", FALSE, etRVEC,
+ { visbox },
+ "HIDDENVisualize a grid of boxes, -1 visualizes the 14 box images" },
+ { "-bt", FALSE, etENUM,
+ { btype }, "Box type for [TT]-box[tt] and [TT]-d[tt]" },
+ { "-box", FALSE, etRVEC,
+ { newbox }, "Box vector lengths (a,b,c)" },
+ { "-angles", FALSE, etRVEC,
+ { newang }, "Angles between the box vectors (bc,ac,ab)" },
+ { "-d", FALSE, etREAL,
+ { &dist }, "Distance between the solute and the box" },
+ { "-c", FALSE, etBOOL,
+ { &bCenter },
+ "Center molecule in box (implied by [TT]-box[tt] and [TT]-d[tt])" },
+ { "-center", FALSE, etRVEC,
+ { center }, "Shift the geometrical center to (x,y,z)" },
+ { "-aligncenter", FALSE, etRVEC,
+ { aligncenter }, "Center of rotation for alignment" },
+ { "-align", FALSE, etRVEC,
+ { targetvec },
+ "Align to target vector" },
+ { "-translate", FALSE, etRVEC,
+ { translation }, "Translation" },
+ { "-rotate", FALSE, etRVEC,
+ { rotangles },
+ "Rotation around the X, Y and Z axes in degrees" },
+ { "-princ", FALSE, etBOOL,
+ { &bOrient },
+ "Orient molecule(s) along their principal axes" },
+ { "-scale", FALSE, etRVEC,
+ { scale }, "Scaling factor" },
+ { "-density", FALSE, etREAL,
+ { &rho },
+ "Density (g/L) of the output box achieved by scaling" },
+ { "-pbc", FALSE, etBOOL,
+ { &bRMPBC },
+ "Remove the periodicity (make molecule whole again)" },
+ { "-resnr", FALSE, etINT,
+ { &resnr_start },
+ " Renumber residues starting from resnr" },
+ { "-grasp", FALSE, etBOOL,
+ { &bGrasp },
+ "Store the charge of the atom in the B-factor field and the radius of the atom in the occupancy field" },
+ {
+ "-rvdw", FALSE, etREAL,
+ { &rvdw },
+ "Default Van der Waals radius (in nm) if one can not be found in the database or if no parameters are present in the topology file"
+ },
+ { "-sig56", FALSE, etBOOL,
+ { &bSig56 },
+ "Use rmin/2 (minimum in the Van der Waals potential) rather than [GRK]sigma[grk]/2 " },
+ {
+ "-vdwread", FALSE, etBOOL,
+ { &bReadVDW },
+ "Read the Van der Waals radii from the file [TT]vdwradii.dat[tt] rather than computing the radii based on the force field"
+ },
+ { "-atom", FALSE, etBOOL,
+ { &peratom }, "Force B-factor attachment per atom" },
+ { "-legend", FALSE, etBOOL,
+ { &bLegend }, "Make B-factor legend" },
+ { "-label", FALSE, etSTR,
+ { &label }, "Add chain label for all residues" },
+ {
+ "-conect", FALSE, etBOOL,
+ { &bCONECT },
+ "Add CONECT records to a [REF].pdb[ref] file when written. Can only be done when a topology is present"
+ }
+ };
+#define NPA asize(pa)
+
+ FILE *out;
+ const char *infile, *outfile;
+ int outftp, inftp, natom, i, j, n_bfac, itype, ntype;
+ double *bfac = nullptr, c6, c12;
+ int *bfac_nr = nullptr;
+ t_topology *top = nullptr;
+ char *grpname, *sgrpname, *agrpname;
+ int isize, ssize, numAlignmentAtoms;
+ int *index, *sindex, *aindex;
+ rvec *x, *v, gc, rmin, rmax, size;
+ int ePBC;
+ matrix box, rotmatrix, trans;
+ rvec princd, tmpvec;
+ gmx_bool bIndex, bSetSize, bSetAng, bDist, bSetCenter, bAlign;
+ gmx_bool bHaveV, bScale, bRho, bTranslate, bRotate, bCalcGeom, bCalcDiam;
+ real diam = 0, mass = 0, d, vdw;
+ gmx_conect conect;
+ gmx_output_env_t *oenv;
+ t_filenm fnm[] =
+ {
+ { efSTX, "-f", nullptr, ffREAD },
+ { efNDX, "-n", nullptr, ffOPTRD },
+ { efSTO, nullptr, nullptr, ffOPTWR },
+ { efPQR, "-mead", "mead", ffOPTWR },
+ { efDAT, "-bf", "bfact", ffOPTRD }
+ };
+#define NFILE asize(fnm)
+
+ if (!parse_common_args(&argc, argv, PCA_CAN_VIEW, NFILE, fnm, NPA, pa,
+ asize(desc), desc, asize(bugs), bugs, &oenv))
+ {
+ return 0;
+ }
+ fprintf(stdout, "Note that major changes are planned in future for "
+ "editconf, to improve usability and utility.\n");
+
+ bIndex = opt2bSet("-n", NFILE, fnm) || bNDEF;
+ bMead = opt2bSet("-mead", NFILE, fnm);
+ bSetSize = opt2parg_bSet("-box", NPA, pa);
+ bSetAng = opt2parg_bSet("-angles", NPA, pa);
+ bSetCenter = opt2parg_bSet("-center", NPA, pa);
+ bDist = opt2parg_bSet("-d", NPA, pa);
+ bAlign = opt2parg_bSet("-align", NPA, pa);
+ /* Only turn on centering automatically when -c/-noc was not set explicitly */
+ if ((bDist || bSetSize || bSetCenter) && !opt2parg_bSet("-c", NPA, pa))
+ {
+ bCenter = TRUE;
+ }
+ bScale = opt2parg_bSet("-scale", NPA, pa);
+ bRho = opt2parg_bSet("-density", NPA, pa);
+ bTranslate = opt2parg_bSet("-translate", NPA, pa);
+ bRotate = opt2parg_bSet("-rotate", NPA, pa);
+ if (bScale && bRho)
+ {
+ fprintf(stderr, "WARNING: setting -density overrides -scale\n");
+ }
+ bScale = bScale || bRho;
+ bCalcGeom = bCenter || bRotate || bOrient || bScale;
+
+ GMX_RELEASE_ASSERT(btype[0] != nullptr, "Option setting inconsistency; btype[0] is NULL");
+
+ bCalcDiam = (btype[0][0] == 'c' || btype[0][0] == 'd' || btype[0][0] == 'o');
+
+ infile = ftp2fn(efSTX, NFILE, fnm);
+ if (bMead)
+ {
+ outfile = ftp2fn(efPQR, NFILE, fnm);
+ }
+ else
+ {
+ outfile = ftp2fn(efSTO, NFILE, fnm);
+ }
+ outftp = fn2ftp(outfile);
+ inftp = fn2ftp(infile);
+
+ AtomProperties aps;
+
+ if (bMead && bGrasp)
+ {
+ printf("Incompatible options -mead and -grasp. Turning off -grasp\n");
+ bGrasp = FALSE;
+ }
- " when using the -grasp or -connect options\n");
++ if (bGrasp && (outftp != efPDB))
+ {
+ gmx_fatal(FARGS, "Output file should be a .pdb file"
- if ((bMead || bGrasp || bCONECT) && (fn2ftp(infile) != efTPR))
++ " when using the -grasp option\n");
+ }
- " when using the -mead or -connect options\n");
++ if ((bMead || bGrasp) && (fn2ftp(infile) != efTPR))
+ {
+ gmx_fatal(FARGS, "Input file should be a .tpr file"
++ " when using the -mead option\n");
+ }
+
+ t_symtab symtab;
+ char *name;
+ t_atoms atoms;
+ open_symtab(&symtab);
+ readConfAndAtoms(infile, &symtab, &name, &atoms, &ePBC, &x, &v, box);
+ natom = atoms.nr;
+ if (atoms.pdbinfo == nullptr)
+ {
+ snew(atoms.pdbinfo, atoms.nr);
+ }
+ atoms.havePdbInfo = TRUE;
+
+ printf("Read %d atoms\n", atoms.nr);
+
+ /* Get the element numbers if available in a pdb file */
+ if (fn2ftp(infile) == efPDB)
+ {
+ get_pdb_atomnumber(&atoms, &aps);
+ }
+
+ if (ePBC != epbcNONE)
+ {
+ real vol = det(box);
+ printf("Volume: %g nm^3, corresponds to roughly %d electrons\n",
+ vol, 100*(static_cast<int>(vol*4.5)));
+ }
+
+ if (bMead || bGrasp || bCONECT)
+ {
+ top = read_top(infile, nullptr);
+ }
+
+ if (bMead || bGrasp)
+ {
+ if (atoms.nr != top->atoms.nr)
+ {
+ gmx_fatal(FARGS, "Atom numbers don't match (%d vs. %d)", atoms.nr, top->atoms.nr);
+ }
+ snew(atoms.pdbinfo, top->atoms.nr);
+ ntype = top->idef.atnr;
+ for (i = 0; (i < atoms.nr); i++)
+ {
+ /* Determine the Van der Waals radius from the force field */
+ if (bReadVDW)
+ {
+ if (!aps.setAtomProperty(epropVDW,
+ *top->atoms.resinfo[top->atoms.atom[i].resind].name,
+ *top->atoms.atomname[i], &vdw))
+ {
+ vdw = rvdw;
+ }
+ }
+ else
+ {
+ itype = top->atoms.atom[i].type;
+ c12 = top->idef.iparams[itype*ntype+itype].lj.c12;
+ c6 = top->idef.iparams[itype*ntype+itype].lj.c6;
+ if ((c6 != 0) && (c12 != 0))
+ {
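+ /* For Lennard-Jones C6/C12 parameters, sigma^6 = c12/c6 and the
+ * potential minimum lies at rmin = 2^(1/6)*sigma, so rmin^6 =
+ * 2*c12/c6; vdw below is half of whichever length is requested. */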
+ real sig6;
+ if (bSig56)
+ {
+ sig6 = 2*c12/c6;
+ }
+ else
+ {
+ sig6 = c12/c6;
+ }
+ vdw = 0.5*gmx::sixthroot(sig6);
+ }
+ else
+ {
+ vdw = rvdw;
+ }
+ }
+ /* Factor of 10 for nm -> Angstroms */
+ vdw *= 10;
+
+ if (bMead)
+ {
+ atoms.pdbinfo[i].occup = top->atoms.atom[i].q;
+ atoms.pdbinfo[i].bfac = vdw;
+ }
+ else
+ {
+ atoms.pdbinfo[i].occup = vdw;
+ atoms.pdbinfo[i].bfac = top->atoms.atom[i].q;
+ }
+ }
+ }
+ bHaveV = FALSE;
+ for (i = 0; (i < natom) && !bHaveV; i++)
+ {
+ for (j = 0; (j < DIM) && !bHaveV; j++)
+ {
+ bHaveV = bHaveV || (v[i][j] != 0);
+ }
+ }
+ printf("%selocities found\n", bHaveV ? "V" : "No v");
+
+ if (visbox[0] > 0)
+ {
+ if (bIndex)
+ {
+ gmx_fatal(FARGS, "Sorry, can not visualize box with index groups");
+ }
+ if (outftp != efPDB)
+ {
+ gmx_fatal(FARGS, "Sorry, can only visualize box with a pdb file");
+ }
+ }
+ else if (visbox[0] == -1)
+ {
+ visualize_images("images.pdb", ePBC, box);
+ }
+
+ /* remove pbc */
+ if (bRMPBC)
+ {
+ rm_gropbc(&atoms, x, box);
+ }
+
+ if (bCalcGeom)
+ {
+ if (bIndex)
+ {
+ fprintf(stderr, "\nSelect a group for determining the system size:\n");
+ get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm),
+ 1, &ssize, &sindex, &sgrpname);
+ }
+ else
+ {
+ ssize = atoms.nr;
+ sindex = nullptr;
+ }
+ diam = calc_geom(ssize, sindex, x, gc, rmin, rmax, bCalcDiam);
+ rvec_sub(rmax, rmin, size);
+ printf(" system size :%7.3f%7.3f%7.3f (nm)\n",
+ size[XX], size[YY], size[ZZ]);
+ if (bCalcDiam)
+ {
+ printf(" diameter :%7.3f (nm)\n", diam);
+ }
+ printf(" center :%7.3f%7.3f%7.3f (nm)\n", gc[XX], gc[YY], gc[ZZ]);
+ printf(" box vectors :%7.3f%7.3f%7.3f (nm)\n",
+ norm(box[XX]), norm(box[YY]), norm(box[ZZ]));
+ printf(" box angles :%7.2f%7.2f%7.2f (degrees)\n",
+ norm2(box[ZZ]) == 0 ? 0 :
+ RAD2DEG*gmx_angle(box[YY], box[ZZ]),
+ norm2(box[ZZ]) == 0 ? 0 :
+ RAD2DEG*gmx_angle(box[XX], box[ZZ]),
+ norm2(box[YY]) == 0 ? 0 :
+ RAD2DEG*gmx_angle(box[XX], box[YY]));
+ printf(" box volume :%7.2f (nm^3)\n", det(box));
+ }
+
+ if (bRho || bOrient || bAlign)
+ {
+ mass = calc_mass(&atoms, !fn2bTPX(infile), &aps);
+ }
+
+ if (bOrient)
+ {
+ int *index;
+ char *grpnames;
+
+ /* Get a group for principal component analysis */
+ fprintf(stderr, "\nSelect group for the determining the orientation\n");
+ get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &isize, &index, &grpnames);
+
+ /* Orient the principal axes along the coordinate axes */
+ orient_princ(&atoms, isize, index, natom, x, bHaveV ? v : nullptr, nullptr);
+ sfree(index);
+ sfree(grpnames);
+ }
+
+ if (bScale)
+ {
+ /* scale coordinates and box */
+ if (bRho)
+ {
+ /* Compute scaling constant */
+ real vol, dens;
+
+ vol = det(box);
+ dens = (mass*AMU)/(vol*NANO*NANO*NANO);
+ fprintf(stderr, "Volume of input %g (nm^3)\n", vol);
+ fprintf(stderr, "Mass of input %g (a.m.u.)\n", mass);
+ fprintf(stderr, "Density of input %g (g/l)\n", dens);
+ if (vol == 0 || mass == 0)
+ {
+ gmx_fatal(FARGS, "Cannot scale density with "
+ "zero mass (%g) or volume (%g)\n", mass, vol);
+ }
+
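+ /* Scaling all lengths uniformly by s multiplies the volume by s^3
+ * and hence divides the density by s^3, so s = cbrt(dens/rho)
+ * yields the requested density rho. */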
+ scale[XX] = scale[YY] = scale[ZZ] = std::cbrt(dens/rho);
+ fprintf(stderr, "Scaling all box vectors by %g\n", scale[XX]);
+ }
+ scale_conf(atoms.nr, x, box, scale);
+ }
+
+ if (bAlign)
+ {
+ if (bIndex)
+ {
+ fprintf(stderr, "\nSelect a group that you want to align:\n");
+ get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm),
+ 1, &numAlignmentAtoms, &aindex, &agrpname);
+ }
+ else
+ {
+ numAlignmentAtoms = atoms.nr;
+ snew(aindex, numAlignmentAtoms);
+ for (i = 0; i < numAlignmentAtoms; i++)
+ {
+ aindex[i] = i;
+ }
+ }
+ printf("Aligning %d atoms (out of %d) to %g %g %g, center of rotation %g %g %g\n", numAlignmentAtoms, natom,
+ targetvec[XX], targetvec[YY], targetvec[ZZ],
+ aligncenter[XX], aligncenter[YY], aligncenter[ZZ]);
+ /*subtract out pivot point*/
+ for (i = 0; i < numAlignmentAtoms; i++)
+ {
+ rvec_dec(x[aindex[i]], aligncenter);
+ }
+ /*now determine transform and rotate*/
+ /*will this work?*/
+ principal_comp(numAlignmentAtoms, aindex, atoms.atom, x, trans, princd);
+
+ unitv(targetvec, targetvec);
+ printf("Using %g %g %g as principal axis\n", trans[0][2], trans[1][2], trans[2][2]);
+ tmpvec[XX] = trans[0][2]; tmpvec[YY] = trans[1][2]; tmpvec[ZZ] = trans[2][2];
+ calc_rotmatrix(tmpvec, targetvec, rotmatrix);
+ /* rotmatrix finished */
+
+ for (i = 0; i < numAlignmentAtoms; ++i)
+ {
+ mvmul(rotmatrix, x[aindex[i]], tmpvec);
+ copy_rvec(tmpvec, x[aindex[i]]);
+ }
+
+ /*add pivot point back*/
+ for (i = 0; i < numAlignmentAtoms; i++)
+ {
+ rvec_inc(x[aindex[i]], aligncenter);
+ }
+ if (!bIndex)
+ {
+ sfree(aindex);
+ }
+ }
+
+ if (bTranslate)
+ {
+ if (bIndex)
+ {
+ fprintf(stderr, "\nSelect a group that you want to translate:\n");
+ get_index(&atoms, ftp2fn_null(efNDX, NFILE, fnm),
+ 1, &ssize, &sindex, &sgrpname);
+ }
+ else
+ {
+ ssize = atoms.nr;
+ sindex = nullptr;
+ }
+ printf("Translating %d atoms (out of %d) by %g %g %g nm\n", ssize, natom,
+ translation[XX], translation[YY], translation[ZZ]);
+ if (sindex)
+ {
+ for (i = 0; i < ssize; i++)
+ {
+ rvec_inc(x[sindex[i]], translation);
+ }
+ }
+ else
+ {
+ for (i = 0; i < natom; i++)
+ {
+ rvec_inc(x[i], translation);
+ }
+ }
+ }
+ if (bRotate)
+ {
+ /* Rotate */
+ printf("Rotating %g, %g, %g degrees around the X, Y and Z axis respectively\n", rotangles[XX], rotangles[YY], rotangles[ZZ]);
+ for (i = 0; i < DIM; i++)
+ {
+ rotangles[i] *= DEG2RAD;
+ }
+ rotate_conf(natom, x, v, rotangles[XX], rotangles[YY], rotangles[ZZ]);
+ }
+
+ if (bCalcGeom)
+ {
+ /* recalc geometrical center and max and min coordinates and size */
+ calc_geom(ssize, sindex, x, gc, rmin, rmax, FALSE);
+ rvec_sub(rmax, rmin, size);
+ if (bScale || bOrient || bRotate)
+ {
+ printf("new system size : %6.3f %6.3f %6.3f\n",
+ size[XX], size[YY], size[ZZ]);
+ }
+ }
+
+ if ((btype[0] != nullptr) && (bSetSize || bDist || (btype[0][0] == 't' && bSetAng)))
+ {
+ ePBC = epbcXYZ;
+ if (!(bSetSize || bDist))
+ {
+ for (i = 0; i < DIM; i++)
+ {
+ newbox[i] = norm(box[i]);
+ }
+ }
+ clear_mat(box);
+ /* calculate new boxsize */
+ switch (btype[0][0])
+ {
+ case 't':
+ if (bDist)
+ {
+ for (i = 0; i < DIM; i++)
+ {
+ newbox[i] = size[i]+2*dist;
+ }
+ }
+ if (!bSetAng)
+ {
+ box[XX][XX] = newbox[XX];
+ box[YY][YY] = newbox[YY];
+ box[ZZ][ZZ] = newbox[ZZ];
+ }
+ else
+ {
+ matrix_convert(box, newbox, newang);
+ }
+ break;
+ case 'c':
+ case 'd':
+ case 'o':
+ if (bSetSize)
+ {
+ d = newbox[0];
+ }
+ else
+ {
+ d = diam+2*dist;
+ }
+ if (btype[0][0] == 'c')
+ {
+ for (i = 0; i < DIM; i++)
+ {
+ box[i][i] = d;
+ }
+ }
+ else if (btype[0][0] == 'd')
+ {
+ box[XX][XX] = d;
+ box[YY][YY] = d;
+ box[ZZ][XX] = d/2;
+ box[ZZ][YY] = d/2;
+ box[ZZ][ZZ] = d*std::sqrt(2.0)/2.0;
+ }
+ else
+ {
+ box[XX][XX] = d;
+ box[YY][XX] = d/3;
+ box[YY][YY] = d*std::sqrt(2.0)*2.0/3.0;
+ box[ZZ][XX] = -d/3;
+ box[ZZ][YY] = d*std::sqrt(2.0)/3.0;
+ box[ZZ][ZZ] = d*std::sqrt(6.0)/3.0;
+ }
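+ /* With these vectors det(box) is d^3*sqrt(2)/2 (~0.71 d^3) for the
+ * rhombic dodecahedron and d^3*4*sqrt(3)/9 (~0.77 d^3) for the
+ * truncated octahedron, matching the volume ratios quoted in the
+ * help text above. */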
+ break;
+ }
+ }
+
+ /* calculate new coords for geometrical center */
+ if (!bSetCenter)
+ {
+ calc_box_center(ecenterDEF, box, center);
+ }
+
+ /* center molecule on 'center' */
+ if (bCenter)
+ {
+ center_conf(natom, x, center, gc);
+ }
+
+ /* print some */
+ if (bCalcGeom)
+ {
+ calc_geom(ssize, sindex, x, gc, rmin, rmax, FALSE);
+ printf("new center :%7.3f%7.3f%7.3f (nm)\n", gc[XX], gc[YY], gc[ZZ]);
+ }
+ if (bOrient || bScale || bDist || bSetSize)
+ {
+ printf("new box vectors :%7.3f%7.3f%7.3f (nm)\n",
+ norm(box[XX]), norm(box[YY]), norm(box[ZZ]));
+ printf("new box angles :%7.2f%7.2f%7.2f (degrees)\n",
+ norm2(box[ZZ]) == 0 ? 0 :
+ RAD2DEG*gmx_angle(box[YY], box[ZZ]),
+ norm2(box[ZZ]) == 0 ? 0 :
+ RAD2DEG*gmx_angle(box[XX], box[ZZ]),
+ norm2(box[YY]) == 0 ? 0 :
+ RAD2DEG*gmx_angle(box[XX], box[YY]));
+ printf("new box volume :%7.2f (nm^3)\n", det(box));
+ }
+
+ if (check_box(epbcXYZ, box))
+ {
+ printf("\nWARNING: %s\n"
+ "See the GROMACS manual for a description of the requirements that\n"
+ "must be satisfied by descriptions of simulation cells.\n",
+ check_box(epbcXYZ, box));
+ }
+
+ if (bDist && btype[0][0] == 't')
+ {
+ if (TRICLINIC(box))
+ {
+ printf("\nWARNING: Your box is triclinic with non-orthogonal axes. In this case, the\n"
+ "distance from the solute to a box surface along the corresponding normal\n"
+ "vector might be somewhat smaller than your specified value %f.\n"
+ "You can check the actual value with g_mindist -pi\n", dist);
+ }
+ else if (!opt2parg_bSet("-bt", NPA, pa))
+ {
+ printf("\nWARNING: No boxtype specified - distance condition applied in each dimension.\n"
+ "If the molecule rotates the actual distance will be smaller. You might want\n"
+ "to use a cubic box instead, or why not try a dodecahedron today?\n");
+ }
+ }
+ if (bCONECT && (outftp == efPDB) && (inftp == efTPR))
+ {
+ conect = gmx_conect_generate(top);
+ }
+ else
+ {
+ conect = nullptr;
+ }
+
+ if (bIndex)
+ {
+ fprintf(stderr, "\nSelect a group for output:\n");
+ get_index(&atoms, opt2fn_null("-n", NFILE, fnm),
+ 1, &isize, &index, &grpname);
+
+ if (resnr_start >= 0)
+ {
+ renum_resnr(&atoms, isize, index, resnr_start);
+ }
+
+ if (opt2parg_bSet("-label", NPA, pa))
+ {
+ for (i = 0; (i < atoms.nr); i++)
+ {
+ atoms.resinfo[atoms.atom[i].resind].chainid = label[0];
+ }
+ }
+
+ if (opt2bSet("-bf", NFILE, fnm) || bLegend)
+ {
+ gmx_fatal(FARGS, "Sorry, cannot do bfactors with an index group.");
+ }
+
+ if (outftp == efPDB)
+ {
+ out = gmx_ffopen(outfile, "w");
+ write_pdbfile_indexed(out, name, &atoms, x, ePBC, box, ' ', 1, isize, index, conect, FALSE);
+ gmx_ffclose(out);
+ }
+ else
+ {
+ write_sto_conf_indexed(outfile, name, &atoms, x, bHaveV ? v : nullptr, ePBC, box, isize, index);
+ }
+ }
+ else
+ {
+ if (resnr_start >= 0)
+ {
+ renum_resnr(&atoms, atoms.nr, nullptr, resnr_start);
+ }
+
+ if ((outftp == efPDB) || (outftp == efPQR))
+ {
+ out = gmx_ffopen(outfile, "w");
+ if (bMead)
+ {
+ fprintf(out, "REMARK "
+ "The B-factors in this file hold atomic radii\n");
+ fprintf(out, "REMARK "
+ "The occupancy in this file hold atomic charges\n");
+ }
+ else if (bGrasp)
+ {
+ fprintf(out, "GRASP PDB FILE\nFORMAT NUMBER=1\n");
+ fprintf(out, "REMARK "
+ "The B-factors in this file hold atomic charges\n");
+ fprintf(out, "REMARK "
+ "The occupancy in this file hold atomic radii\n");
+ }
+ else if (opt2bSet("-bf", NFILE, fnm))
+ {
+ read_bfac(opt2fn("-bf", NFILE, fnm), &n_bfac, &bfac, &bfac_nr);
+ set_pdb_conf_bfac(atoms.nr, atoms.nres, &atoms,
+ n_bfac, bfac, bfac_nr, peratom);
+ }
+ if (opt2parg_bSet("-label", NPA, pa))
+ {
+ for (i = 0; (i < atoms.nr); i++)
+ {
+ atoms.resinfo[atoms.atom[i].resind].chainid = label[0];
+ }
+ }
+ /* Need to bypass the regular write_pdbfile because I don't want to change
+ * all instances to include the boolean flag for writing out PQR files.
+ */
+ int *index;
+ snew(index, atoms.nr);
+ for (int i = 0; i < atoms.nr; i++)
+ {
+ index[i] = i;
+ }
+ write_pdbfile_indexed(out, name, &atoms, x, ePBC, box, ' ', -1, atoms.nr, index, conect,
+ outftp == efPQR);
+ sfree(index);
+ if (bLegend)
+ {
+ pdb_legend(out, atoms.nr, atoms.nres, &atoms, x);
+ }
+ if (visbox[0] > 0)
+ {
+ visualize_box(out, bLegend ? atoms.nr+12 : atoms.nr,
+ bLegend ? atoms.nres + 12 : atoms.nres, box, visbox);
+ }
+ gmx_ffclose(out);
+ }
+ else
+ {
+ write_sto_conf(outfile, name, &atoms, x, bHaveV ? v : nullptr, ePBC, box);
+ }
+ }
+ done_atom(&atoms);
+ done_symtab(&symtab);
+ sfree(name);
+ if (x)
+ {
+ sfree(x);
+ }
+ if (v)
+ {
+ sfree(v);
+ }
+ do_view(oenv, outfile, nullptr);
+ output_env_done(oenv);
+
+ return 0;
+}
atom->q = q;
atom->m = m;
atom->ptype = pt;
- for (i = 0; (i < MAXFORCEPARAM); i++)
+ for (int i = 0; i < MAXFORCEPARAM; i++)
{
- param->c[i] = c[i];
+ forceParam[i] = c[i];
}
- if ((batype_nr = get_bond_atomtype_type(btype, bat)) == NOTSET)
- {
- add_bond_atomtype(bat, symtab, btype);
- }
- batype_nr = get_bond_atomtype_type(btype, bat);
+ InteractionOfType interactionType({}, forceParam, "");
+
+ batype_nr = bondAtomType->addBondAtomType(symtab, btype);
- if ((nr = get_atomtype_type(type, at)) != NOTSET)
+ if ((nr = at->atomTypeFromName(type)) != NOTSET)
{
- auto message = gmx::formatString("Overriding atomtype %s", type);
+ auto message = gmx::formatString
+ ("Atomtype %s was defined previously (e.g. in the forcefield files), "
+ "and has now been defined again. This could happen e.g. if you would "
+ "use a self-contained molecule .itp file that duplicates or replaces "
+ "the contents of the standard force-field files. You should check "
+ "the contents of your files and remove such repetition. If you know "
+ "you should override the previous definition, then you could choose "
+ "to suppress this warning with -maxwarn.", type);
warning(wi, message);
- if ((nr = set_atomtype(nr, at, symtab, atom, type, param, batype_nr,
- atomnr)) == NOTSET)
+ if ((nr = at->setType(nr, symtab, *atom, type, interactionType, batype_nr,
+ atomnr)) == NOTSET)
{
auto message = gmx::formatString("Replacing atomtype %s failed", type);
warning_error_and_exit(wi, message, FARGS);
--- /dev/null
- gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief This file defines functions for managing threading of listed
+ * interactions.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_listed_forces
+ */
+#include "gmxpre.h"
+
+#include "manage_threading.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <climits>
+#include <cstdlib>
+
+#include <algorithm>
+#include <string>
+
+#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/topology/ifunc.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "listed_internal.h"
+#include "utilities.h"
+
+/*! \brief struct for passing all data required for a function type */
+typedef struct {
+ const t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */
+ int ftype; /**< the function type index */
+ int nat; /**< nr of atoms involved in a single ftype interaction */
+} ilist_data_t;
+
+/*! \brief Divides listed interactions over threads
+ *
+ * This routine attempts to divide all interactions of the numType bonded
+ * types stored in ild over the threads such that each thread has roughly
+ * equal load and different threads avoid touching the same atoms as much
+ * as possible.
+ */
+static void divide_bondeds_by_locality(bonded_threading_t *bt,
+ int numType,
+ const ilist_data_t *ild)
+{
+ int nat_tot, nat_sum;
+ int ind[F_NRE]; /* index into the ild[].il->iatoms */
+ int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
+ int f, t;
+
+ assert(numType <= F_NRE);
+
+ nat_tot = 0;
+ for (f = 0; f < numType; f++)
+ {
+ /* Sum #bondeds*#atoms_per_bond over all bonded types */
+ nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
+ /* The start bound for thread 0 is 0 for all interactions */
+ ind[f] = 0;
+ /* Initialize the next atom index array */
+ assert(ild[f].il->nr > 0);
+ at_ind[f] = ild[f].il->iatoms[1];
+ }
+
+ nat_sum = 0;
+ /* Loop over the end bounds of the nthreads threads to determine
+ * which interactions threads 0 to nthreads shall calculate.
+ *
+ * NOTE: The cost of these combined loops is #interactions*numType.
+ * This code is running single threaded (difficult to parallelize
+ * over threads). So the relative cost of this function increases
+ * linearly with the number of threads. Since the inner-most loop
+ * is cheap and this is done only at DD repartitioning, the cost should
+ * be negligible. At high thread count many other parts of the code
+ * scale the same way, so it's (currently) not worth improving this.
+ */
+ for (t = 1; t <= bt->nthreads; t++)
+ {
+ int nat_thread;
+
+ /* Here we assume that the computational cost is proportional
+ * to the number of atoms in the interaction. This is a rough
+ * measure, but roughly correct. Usually there are very few
+ * interactions anyhow and they are distributed relatively
+ * uniformly. Proper and RB dihedrals are often distributed
+ * non-uniformly, but their cost is roughly equal.
+ */
+ nat_thread = (nat_tot*t)/bt->nthreads;
+
+ while (nat_sum < nat_thread)
+ {
+ /* To divide bonds based on atom order, we compare
+ * the index of the first atom in the bonded interaction.
+ * This works well, since the domain decomposition generates
+ * bondeds in order of the atoms by looking up interactions
+ * which are linked to the first atom in each interaction.
+ * It usually also works well without DD, since then the atoms
+ * in bonded interactions are usually in increasing order.
+ * If they are not assigned in increasing order, the balancing
+ * is still good, but the memory access and reduction cost will
+ * be higher.
+ */
+ int f_min;
+
+ /* Find out which of the types has the lowest atom index */
+ f_min = 0;
+ for (f = 1; f < numType; f++)
+ {
+ if (at_ind[f] < at_ind[f_min])
+ {
+ f_min = f;
+ }
+ }
+ assert(f_min >= 0 && f_min < numType);
+
+ /* Assign the interaction with the lowest atom index (of type
+ * index f_min) to thread t-1 by increasing ind.
+ */
+ ind[f_min] += ild[f_min].nat + 1;
+ nat_sum += ild[f_min].nat;
+
+ /* Update the first unassigned atom index for this type */
+ if (ind[f_min] < ild[f_min].il->nr)
+ {
+ at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
+ }
+ else
+ {
+ /* We have assigned all interactions of this type.
+ * Setting at_ind to INT_MAX ensures this type will not be
+ * chosen in the for loop above during next iterations.
+ */
+ at_ind[f_min] = INT_MAX;
+ }
+ }
+
+ /* Store the bonded end boundaries (at index t) for thread t-1 */
+ for (f = 0; f < numType; f++)
+ {
+ bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
+ }
+ }
+
+ for (f = 0; f < numType; f++)
+ {
+ assert(ind[f] == ild[f].il->nr);
+ }
+}
+
+//! Return whether function type \p ftype in \p idef has perturbed interactions
+static bool ftypeHasPerturbedEntries(const t_idef &idef,
+ int ftype)
+{
+ GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
+ "Perturbed interations should be sorted here");
+
+ const t_ilist &ilist = idef.il[ftype];
+
+ return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
+}
+
+//! Divides bonded interactions over threads and GPU
+static void divide_bondeds_over_threads(bonded_threading_t *bt,
+ bool useGpuForBondeds,
+ const t_idef &idef)
+{
+ ilist_data_t ild[F_NRE];
+
+ assert(bt->nthreads > 0);
+
+ bt->haveBondeds = false;
+ int numType = 0;
+ size_t fTypeGpuIndex = 0;
+ for (int fType = 0; fType < F_NRE; fType++)
+ {
+ if (!ftype_is_bonded_potential(fType))
+ {
+ continue;
+ }
+
+ const t_ilist &il = idef.il[fType];
+ int nrToAssignToCpuThreads = il.nr;
+
+ if (useGpuForBondeds &&
+ fTypeGpuIndex < gmx::fTypesOnGpu.size() &&
+ gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
+ {
+ fTypeGpuIndex++;
+
+ /* Perturbation is not implemented in the GPU bonded kernels.
+ * But instead of doing all on the CPU, we could do only
+ * the actually perturbed interactions on the CPU.
+ */
+ if (!ftypeHasPerturbedEntries(idef, fType))
+ {
+ /* We will assign this interaction type to the GPU */
+ nrToAssignToCpuThreads = 0;
+ }
+ }
+
+ if (nrToAssignToCpuThreads > 0)
+ {
+ bt->haveBondeds = true;
+ }
+
+ if (nrToAssignToCpuThreads == 0)
+ {
+ /* No interactions, avoid all the integer math below */
+ for (int t = 0; t <= bt->nthreads; t++)
+ {
+ bt->workDivision.setBound(fType, t, 0);
+ }
+ }
+ else if (bt->nthreads <= bt->max_nthread_uniform || fType == F_DISRES)
+ {
+ /* On up to 4 threads, load balancing the bonded work
+ * is more important than minimizing the reduction cost.
+ */
+
+ const int stride = 1 + NRAL(fType);
+
+ for (int t = 0; t <= bt->nthreads; t++)
+ {
+ /* Divide equally over the threads */
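+ /* The bound equals floor(numInteractions*t/nthreads) whole
+ * interactions, converted back to an iatoms index via the stride
+ * of one type index plus NRAL(fType) atom indices. */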
+ int nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride;
+
+ if (fType == F_DISRES)
+ {
+ /* Ensure that distance restraint pairs with the same label
+ * end up on the same thread.
+ */
+ while (nr_t > 0 && nr_t < nrToAssignToCpuThreads &&
+ idef.iparams[il.iatoms[nr_t]].disres.label ==
+ idef.iparams[il.iatoms[nr_t - stride]].disres.label)
+ {
+ nr_t += stride;
+ }
+ }
+
+ bt->workDivision.setBound(fType, t, nr_t);
+ }
+ }
+ else
+ {
+ /* Add this fType to the list to be distributed */
+ int nat = NRAL(fType);
+ ild[numType].ftype = fType;
+ ild[numType].il = &il;
+ ild[numType].nat = nat;
+
+ /* The first index for the thread division is always 0 */
+ bt->workDivision.setBound(fType, 0, 0);
+
+ numType++;
+ }
+ }
+
+ if (numType > 0)
+ {
+ divide_bondeds_by_locality(bt, numType, ild);
+ }
+
+ if (debug)
+ {
+ int f;
+
+ fprintf(debug, "Division of bondeds over threads:\n");
+ for (f = 0; f < F_NRE; f++)
+ {
+ if (ftype_is_bonded_potential(f) && idef.il[f].nr > 0)
+ {
+ int t;
+
+ fprintf(debug, "%16s", interaction_function[f].name);
+ for (t = 0; t < bt->nthreads; t++)
+ {
+ fprintf(debug, " %4d",
+ (bt->workDivision.bound(f, t + 1) -
+ bt->workDivision.bound(f, t))/
+ (1 + NRAL(f)));
+ }
+ fprintf(debug, "\n");
+ }
+ }
+ }
+}
+
+//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
+static void
+calc_bonded_reduction_mask(int natoms,
+ f_thread_t *f_thread,
+ const t_idef &idef,
+ int thread,
+ const bonded_threading_t &bondedThreading)
+{
+ static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");
+
+ if (bondedThreading.nthreads > BITMASK_SIZE)
+ {
+#pragma omp master
- gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
++ gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS passed to CMake.",
+ bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
+#pragma omp barrier
+ }
+ GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
+
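+ /* Round up: nblock = ceil(natoms/reduction_block_size), exploiting
+ * that the block size is the power of two 1 << reduction_block_bits. */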
+ int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
+
+ if (nblock > f_thread->block_nalloc)
+ {
+ f_thread->block_nalloc = over_alloc_large(nblock);
+ srenew(f_thread->mask, f_thread->block_nalloc);
+ srenew(f_thread->block_index, f_thread->block_nalloc);
+ // NOTE: It seems f_thread->f does not need to be aligned
+ sfree_aligned(f_thread->f);
+ snew_aligned(f_thread->f, f_thread->block_nalloc*reduction_block_size, 128);
+ }
+
+ gmx_bitmask_t *mask = f_thread->mask;
+
+ for (int b = 0; b < nblock; b++)
+ {
+ bitmask_clear(&mask[b]);
+ }
+
+ for (int ftype = 0; ftype < F_NRE; ftype++)
+ {
+ if (ftype_is_bonded_potential(ftype))
+ {
+ int nb = idef.il[ftype].nr;
+ if (nb > 0)
+ {
+ int nat1 = interaction_function[ftype].nratoms + 1;
+
+ int nb0 = bondedThreading.workDivision.bound(ftype, thread);
+ int nb1 = bondedThreading.workDivision.bound(ftype, thread + 1);
+
+ for (int i = nb0; i < nb1; i += nat1)
+ {
+ for (int a = 1; a < nat1; a++)
+ {
+ bitmask_set_bit(&mask[idef.il[ftype].iatoms[i+a] >> reduction_block_bits], thread);
+ }
+ }
+ }
+ }
+ }
+
+ /* Make an index of the blocks our thread touches, so we can do fast
+ * force buffer clearing.
+ */
+ f_thread->nblock_used = 0;
+ for (int b = 0; b < nblock; b++)
+ {
+ if (bitmask_is_set(mask[b], thread))
+ {
+ f_thread->block_index[f_thread->nblock_used++] = b;
+ }
+ }
+}
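+
+/* Illustration (assuming reduction_block_size == 32): an interaction on
+ * thread t touching atoms 3 and 40 sets bit t in the masks of blocks 0
+ * and 1, marking those force-buffer blocks for reduction. */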
+
+void setup_bonded_threading(bonded_threading_t *bt,
+ int numAtoms,
+ bool useGpuForBondeds,
+ const t_idef &idef)
+{
+ int ctot = 0;
+
+ assert(bt->nthreads >= 1);
+
+ /* Divide the bonded interaction over the threads */
+ divide_bondeds_over_threads(bt, useGpuForBondeds, idef);
+
+ if (!bt->haveBondeds)
+ {
+ /* We don't have bondeds, so there is nothing to reduce */
+ return;
+ }
+
+ /* Determine to which blocks each thread's bonded force calculation
+ * contributes. Store this as a mask for each thread.
+ */
+#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
+ for (int t = 0; t < bt->nthreads; t++)
+ {
+ try
+ {
+ calc_bonded_reduction_mask(numAtoms, bt->f_t[t].get(),
+ idef, t, *bt);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+
+ /* Reduce the masks over the threads and determine which blocks
+ * we need to reduce over.
+ */
+ int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;
+ /* Ensure we have sufficient space for all blocks */
+ if (static_cast<size_t>(nblock_tot) > bt->block_index.size())
+ {
+ bt->block_index.resize(nblock_tot);
+ }
+ if (static_cast<size_t>(nblock_tot) > bt->mask.size())
+ {
+ bt->mask.resize(nblock_tot);
+ }
+ bt->nblock_used = 0;
+ for (int b = 0; b < nblock_tot; b++)
+ {
+ gmx_bitmask_t *mask = &bt->mask[b];
+
+ /* Generate the union over the threads of the bitmask */
+ bitmask_clear(mask);
+ for (int t = 0; t < bt->nthreads; t++)
+ {
+ bitmask_union(mask, bt->f_t[t]->mask[b]);
+ }
+ if (!bitmask_is_zero(*mask))
+ {
+ bt->block_index[bt->nblock_used++] = b;
+ }
+
+ if (debug)
+ {
+ int c = 0;
+ for (int t = 0; t < bt->nthreads; t++)
+ {
+ if (bitmask_is_set(*mask, t))
+ {
+ c++;
+ }
+ }
+ ctot += c;
+
+ if (gmx_debug_at)
+ {
+ fprintf(debug, "block %d flags %s count %d\n",
+ b, to_hex_string(*mask).c_str(), c);
+ }
+ }
+ }
+ if (debug)
+ {
+ fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
+ reduction_block_size, bt->nblock_used);
+ fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
+ ctot*reduction_block_size/static_cast<double>(numAtoms),
+ ctot/static_cast<double>(bt->nblock_used));
+ }
+}
+
+void tear_down_bonded_threading(bonded_threading_t *bt)
+{
+ delete bt;
+}
+
+f_thread_t::f_thread_t(int numEnergyGroups) :
+ grpp(numEnergyGroups)
+{
+ snew(fshift, SHIFTS);
+}
+
+f_thread_t::~f_thread_t()
+{
+ sfree(mask);
+ sfree(fshift);
+ sfree(block_index);
+ sfree_aligned(f);
+}
+
+bonded_threading_t::bonded_threading_t(const int numThreads,
+ const int numEnergyGroups) :
+ nthreads(numThreads),
+ nblock_used(0),
+ haveBondeds(false),
+ workDivision(nthreads),
+ foreignLambdaWorkDivision(1)
+{
+ f_t.resize(numThreads);
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+ for (int t = 0; t < nthreads; t++)
+ {
+ try
+ {
+ /* Note that thread 0 uses the global fshift and energy arrays,
+ * but to keep the code simple, we initialize all data here.
+ */
+ f_t[t] = std::make_unique<f_thread_t>(numEnergyGroups);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+}
+
+bonded_threading_t *init_bonded_threading(FILE *fplog,
+ const int nenergrp)
+{
+ /* These thread local data structures are used for bondeds only.
+ *
+ * Note that we also use these structures when running single-threaded.
+ * This is because the bonded force buffer uses type rvec4, whereas
+ * the normal force buffer uses type rvec. This leads to a little
+ * reduction overhead, but the speed gain in the bonded calculations
+ * of doing transposeScatterIncr/DecrU with alignment 4 instead of 3
+ * is much larger than the reduction overhead.
+ */
+ bonded_threading_t *bt = new bonded_threading_t(gmx_omp_nthreads_get(emntBonded),
+ nenergrp);
+
+ /* The optimal value after which to switch from uniform to localized
+ * bonded interaction distribution is 3, 4 or 5 depending on the system
+ * and hardware.
+ */
+ const int max_nthread_uniform = 4;
+ char * ptr;
+
+ if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
+ {
+ sscanf(ptr, "%d", &bt->max_nthread_uniform);
+ if (fplog != nullptr)
+ {
+ fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
+ bt->max_nthread_uniform);
+ }
+ }
+ else
+ {
+ bt->max_nthread_uniform = max_nthread_uniform;
+ }
+
+ return bt;
+}
block_bc(cr, mtop->bIntermolecularInteractions);
if (mtop->bIntermolecularInteractions)
{
- mtop->intermolecular_ilist = std::make_unique<InteractionLists>();
+ if (!MASTER(cr))
+ {
- mtop->intermolecular_ilist = gmx::compat::make_unique<InteractionLists>();
++ mtop->intermolecular_ilist = std::make_unique<InteractionLists>();
+ }
bc_ilists(cr, mtop->intermolecular_ilist.get());
}
}
mtop->natoms -= n;
- state_change_natoms(state, state->natoms - n);
- snew(x_tmp, state->natoms);
- snew(v_tmp, state->natoms);
+ /* We cannot change the size of the state datastructures here
+ * because we still access the coordinate arrays for all positions
+ * before removing the molecules we want to remove.
+ */
+ const int newStateAtomNumber = state->natoms - n;
+ snew(x_tmp, newStateAtomNumber);
+ snew(v_tmp, newStateAtomNumber);
- for (int i = 0; i < egcNR; i++)
+ for (auto group : keysOf(groups->groupNumbers))
{
- if (groups->grpnr[i] != nullptr)
+ if (!groups->groupNumbers[group].empty())
{
- groups->groupNumbers[group].resize(state->natoms);
- new_egrp[group].resize(state->natoms);
- groups->ngrpnr[i] = newStateAtomNumber;
- snew(new_egrp[i], newStateAtomNumber);
++ groups->groupNumbers[group].resize(newStateAtomNumber);
++ new_egrp[group].resize(newStateAtomNumber);
}
}
"Checkpoint interval (minutes)" },
{ "-cpnum", FALSE, etBOOL, {&mdrunOptions.checkpointOptions.keepAndNumberCheckpointFiles},
"Keep and number checkpoint files" },
- { "-append", FALSE, etBOOL, {&bTryToAppendFiles},
+ { "-append", FALSE, etBOOL, {&appendOption},
"Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
{ "-nsteps", FALSE, etINT64, {&mdrunOptions.numStepsCommandline},
- "Run this number of steps, overrides .mdp file option (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
+ "Run this number of steps (-1 means infinite, -2 means use mdp option, smaller is invalid)" },
{ "-maxh", FALSE, etREAL, {&mdrunOptions.maximumHoursToRun},
"Terminate after 0.99 times this time (hours)" },
{ "-replex", FALSE, etINT, {&replExParams.exchangeInterval},
ObservablesHistory observablesHistory = {};
- ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions;
-
- if (continuationOptions.startedFromCheckpoint)
+ if (startingBehavior != StartingBehavior::NewSimulation)
{
- gmx_bool bReadEkin;
-
+ /* Check if checkpoint file exists before doing continuation.
+ * This way we can use identical input options for the first and subsequent runs...
+ */
+ if (mdrunOptions.numStepsCommandline > -2)
+ {
+ /* Temporarily set the number of steps to unlimited to avoid
+ * triggering the nsteps check in load_checkpoint().
+ * This hack will go away soon when the -nsteps option is removed.
+ */
+ inputrec->nsteps = -1;
+ }
+
load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
logFileHandle,
cr, domdecOptions.numCells,
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "gmxpre.h"
+
+#include "pairlist.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstring>
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm_geometry.h"
+#include "gromacs/nbnxm/nbnxm_simd.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/vector_operations.h"
+#include "gromacs/topology/block.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "clusterdistancekerneltype.h"
+#include "gridset.h"
+#include "pairlistset.h"
+#include "pairlistsets.h"
+#include "pairlistwork.h"
+#include "pairsearch.h"
+
+using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
+
+using BoundingBox = Nbnxm::BoundingBox; // TODO: Remove when refactoring this file
+using BoundingBox1D = Nbnxm::BoundingBox1D; // TODO: Remove when refactoring this file
+
+using Grid = Nbnxm::Grid; // TODO: Remove when refactoring this file
+
+// Convenience alias for partial Nbnxm namespace usage
+using InteractionLocality = Nbnxm::InteractionLocality;
+
+/* We shift the i-particles backward for PBC.
+ * This leads to more conditionals than shifting forward.
+ * We do this to get more balanced pair lists.
+ */
+constexpr bool c_pbcShiftBackward = true;
+
+/* Layout for the nonbonded NxN pair lists */
+enum class NbnxnLayout
+{
+ NoSimd4x4, // i-cluster size 4, j-cluster size 4
+ Simd4xN, // i-cluster size 4, j-cluster size SIMD width
+ Simd2xNN, // i-cluster size 4, j-cluster size half SIMD width
+ Gpu8x8x8 // i-cluster size 8, j-cluster size 8 + super-clustering
+};
+
+#if GMX_SIMD
+/* Returns the j-cluster size */
+template <NbnxnLayout layout>
+static constexpr int jClusterSize()
+{
+ static_assert(layout == NbnxnLayout::NoSimd4x4 || layout == NbnxnLayout::Simd4xN || layout == NbnxnLayout::Simd2xNN, "Currently jClusterSize only supports CPU layouts");
+
+ return layout == NbnxnLayout::Simd4xN ? GMX_SIMD_REAL_WIDTH : (layout == NbnxnLayout::Simd2xNN ? GMX_SIMD_REAL_WIDTH/2 : c_nbnxnCpuIClusterSize);
+}
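+/* For example, with GMX_SIMD_REAL_WIDTH == 8 this gives a j-cluster
+ * size of 8 for Simd4xN and 4 for Simd2xNN. */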
+
+/*! \brief Returns the j-cluster index given the i-cluster index.
+ *
+ * \tparam jClusterSize The number of atoms in a j-cluster
+ * \tparam jSubClusterIndex The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
+ * \param[in] ci The i-cluster index
+ */
+template <int jClusterSize, int jSubClusterIndex>
+static inline int cjFromCi(int ci)
+{
+ static_assert(jClusterSize == c_nbnxnCpuIClusterSize/2 || jClusterSize == c_nbnxnCpuIClusterSize || jClusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
+
+ static_assert(jSubClusterIndex == 0 || jSubClusterIndex == 1,
+ "Only sub-cluster indices 0 and 1 are supported");
+
+ if (jClusterSize == c_nbnxnCpuIClusterSize/2)
+ {
+ if (jSubClusterIndex == 0)
+ {
+ return ci << 1;
+ }
+ else
+ {
+ return ((ci + 1) << 1) - 1;
+ }
+ }
+ else if (jClusterSize == c_nbnxnCpuIClusterSize)
+ {
+ return ci;
+ }
+ else
+ {
+ return ci >> 1;
+ }
+}
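+/* For example, with c_nbnxnCpuIClusterSize == 4:
+ * jClusterSize 2: ci maps to cj = 2*ci or 2*ci + 1 (two j-clusters per i-cluster);
+ * jClusterSize 4: cj = ci;
+ * jClusterSize 8: cj = ci/2 (one j-cluster spans two i-clusters).
+ */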
+
+/*! \brief Returns the j-cluster index given the i-cluster index.
+ *
+ * \tparam layout The pair-list layout
+ * \tparam jSubClusterIndex The j-sub-cluster index (0/1), used when size(j-cluster) < size(i-cluster)
+ * \param[in] ci The i-cluster index
+ */
+template <NbnxnLayout layout, int jSubClusterIndex>
+static inline int cjFromCi(int ci)
+{
+ constexpr int clusterSize = jClusterSize<layout>();
+
+ return cjFromCi<clusterSize, jSubClusterIndex>(ci);
+}
+
+/* Returns the nbnxn coordinate data index given the i-cluster index */
+template <NbnxnLayout layout>
+static inline int xIndexFromCi(int ci)
+{
+ constexpr int clusterSize = jClusterSize<layout>();
+
+ static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
+
+ if (clusterSize <= c_nbnxnCpuIClusterSize)
+ {
+ /* Coordinates are stored packed in groups of 4 */
+ return ci*STRIDE_P4;
+ }
+ else
+ {
+ /* Coordinates packed in 8, i-cluster size is half the packing width */
+ return (ci >> 1)*STRIDE_P8 + (ci & 1)*(c_packX8 >> 1);
+ }
+}
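+
+/* Example of the index math above, assuming c_packX8 == 8 so that coordinates
+ * are packed as xxxxxxxx yyyyyyyy zzzzzzzz with stride STRIDE_P8: i-cluster 3
+ * (atoms 12-15) sits in the second half of packed block 1, so its x-coordinates
+ * start at 1*STRIDE_P8 + c_packX8/2.
+ */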
+
+/* Returns the nbnxn coordinate data index given the j-cluster index */
+template <NbnxnLayout layout>
+static inline int xIndexFromCj(int cj)
+{
+ constexpr int clusterSize = jClusterSize<layout>();
+
+ static_assert(clusterSize == c_nbnxnCpuIClusterSize/2 || clusterSize == c_nbnxnCpuIClusterSize || clusterSize == c_nbnxnCpuIClusterSize*2, "Only j-cluster sizes 2, 4 and 8 are currently implemented");
+
+ if (clusterSize == c_nbnxnCpuIClusterSize/2)
+ {
+ /* Coordinates are stored packed in groups of 4 */
+ return (cj >> 1)*STRIDE_P4 + (cj & 1)*(c_packX4 >> 1);
+ }
+ else if (clusterSize == c_nbnxnCpuIClusterSize)
+ {
+ /* Coordinates are stored packed in groups of 4 */
+ return cj*STRIDE_P4;
+ }
+ else
+ {
+ /* Coordinates are stored packed in groups of 8 */
+ return cj*STRIDE_P8;
+ }
+}
+#endif //GMX_SIMD
+
+
+void nbnxn_init_pairlist_fep(t_nblist *nl)
+{
+ nl->type = GMX_NBLIST_INTERACTION_FREE_ENERGY;
+ nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
+    /* The interaction functions are set in the free energy kernel function */
+ nl->ivdw = -1;
+ nl->ivdwmod = -1;
+ nl->ielec = -1;
+ nl->ielecmod = -1;
+
+ nl->maxnri = 0;
+ nl->maxnrj = 0;
+ nl->nri = 0;
+ nl->nrj = 0;
+ nl->iinr = nullptr;
+ nl->gid = nullptr;
+ nl->shift = nullptr;
+ nl->jindex = nullptr;
+ nl->jjnr = nullptr;
+ nl->excl_fep = nullptr;
+
+}
+
+static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
+ int natoms)
+{
+ flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
+ if (flags->nflag > flags->flag_nalloc)
+ {
+ flags->flag_nalloc = over_alloc_large(flags->nflag);
+ srenew(flags->flag, flags->flag_nalloc);
+ }
+ for (int b = 0; b < flags->nflag; b++)
+ {
+ bitmask_clear(&(flags->flag[b]));
+ }
+}
+
+/* Returns the pair-list cutoff between a bounding box and a grid cell given an atom-to-atom pair-list cutoff
+ *
+ * Given a cutoff distance between atoms, this function returns the cutoff
+ * distance between a bounding box of a group of atoms and a grid cell.
+ * Since atoms can be geometrically outside of the cell they have been
+ * assigned to (when atom groups instead of individual atoms are assigned
+ * to cells), the distance returned can be larger than the input.
+ */
+static real
+listRangeForBoundingBoxToGridCell(real rlist,
+ const Grid::Dimensions &gridDims)
+{
+ return rlist + gridDims.maxAtomGroupRadius;
+}
+
+/* Returns the pair-list cutoff between two grid cells given an atom-to-atom pair-list cutoff
+ *
+ * Given a cutoff distance between atoms, this function returns the cutoff
+ * distance between two grid cells.
+ * Since atoms can be geometrically outside of the cell they have been
+ * assigned to (when atom groups instead of individual atoms are assigned
+ * to cells), the distance returned can be larger than the input.
+ */
+static real
+listRangeForGridCellToGridCell(real rlist,
+ const Grid::Dimensions &iGridDims,
+ const Grid::Dimensions &jGridDims)
+{
+ return rlist + iGridDims.maxAtomGroupRadius + jGridDims.maxAtomGroupRadius;
+}
+
+/* Determines the cell range along one dimension that
+ * the bounding box b0 - b1 sees.
+ */
+template<int dim>
+static void get_cell_range(real b0, real b1,
+ const Grid::Dimensions &jGridDims,
+ real d2, real rlist, int *cf, int *cl)
+{
+ real listRangeBBToCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGridDims));
+ real distanceInCells = (b0 - jGridDims.lowerCorner[dim])*jGridDims.invCellSize[dim];
+ *cf = std::max(static_cast<int>(distanceInCells), 0);
+
+ while (*cf > 0 &&
+ d2 + gmx::square((b0 - jGridDims.lowerCorner[dim]) - (*cf - 1 + 1)*jGridDims.cellSize[dim]) < listRangeBBToCell2)
+ {
+ (*cf)--;
+ }
+
+ *cl = std::min(static_cast<int>((b1 - jGridDims.lowerCorner[dim])*jGridDims.invCellSize[dim]), jGridDims.numCells[dim] - 1);
+ while (*cl < jGridDims.numCells[dim] - 1 &&
+ d2 + gmx::square((*cl + 1)*jGridDims.cellSize[dim] - (b1 - jGridDims.lowerCorner[dim])) < listRangeBBToCell2)
+ {
+ (*cl)++;
+ }
+}
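+
+/* Illustration: if b0 lies 2.7 cell widths above the grid lower corner, the
+ * initial estimate is *cf = 2; the loops then conservatively extend the range
+ * (*cf downward, *cl upward) while the neighboring cell edge is still within
+ * the bounding-box-to-cell list range.
+ */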
+
+/* Reference code calculating the distance^2 between two bounding boxes */
+/*
+ static float box_dist2(float bx0, float bx1, float by0,
+ float by1, float bz0, float bz1,
+ const BoundingBox *bb)
+ {
+ float d2;
+ float dl, dh, dm, dm0;
+
+ d2 = 0;
+
+ dl = bx0 - bb->upper.x;
+ dh = bb->lower.x - bx1;
+ dm = std::max(dl, dh);
+ dm0 = std::max(dm, 0.0f);
+ d2 += dm0*dm0;
+
+ dl = by0 - bb->upper.y;
+ dh = bb->lower.y - by1;
+ dm = std::max(dl, dh);
+ dm0 = std::max(dm, 0.0f);
+ d2 += dm0*dm0;
+
+ dl = bz0 - bb->upper.z;
+ dh = bb->lower.z - bz1;
+ dm = std::max(dl, dh);
+ dm0 = std::max(dm, 0.0f);
+ d2 += dm0*dm0;
+
+ return d2;
+ }
+ */
+
+#if !NBNXN_SEARCH_BB_SIMD4
+
+/*! \brief Plain C code calculating the distance^2 between two bounding boxes in xyz0 format
+ *
+ * \param[in] bb_i First bounding box
+ * \param[in] bb_j Second bounding box
+ */
+static float clusterBoundingBoxDistance2(const BoundingBox &bb_i,
+ const BoundingBox &bb_j)
+{
+ float dl = bb_i.lower.x - bb_j.upper.x;
+ float dh = bb_j.lower.x - bb_i.upper.x;
+ float dm = std::max(dl, dh);
+ float dm0 = std::max(dm, 0.0f);
+ float d2 = dm0*dm0;
+
+ dl = bb_i.lower.y - bb_j.upper.y;
+ dh = bb_j.lower.y - bb_i.upper.y;
+ dm = std::max(dl, dh);
+ dm0 = std::max(dm, 0.0f);
+ d2 += dm0*dm0;
+
+ dl = bb_i.lower.z - bb_j.upper.z;
+ dh = bb_j.lower.z - bb_i.upper.z;
+ dm = std::max(dl, dh);
+ dm0 = std::max(dm, 0.0f);
+ d2 += dm0*dm0;
+
+ return d2;
+}
+
+#else /* NBNXN_SEARCH_BB_SIMD4 */
+
+/*! \brief 4-wide SIMD code calculating the distance^2 between two bounding boxes in xyz0 format
+ *
+ * \param[in] bb_i First bounding box, should be aligned for 4-wide SIMD
+ * \param[in] bb_j Second bounding box, should be aligned for 4-wide SIMD
+ */
+static float clusterBoundingBoxDistance2(const BoundingBox &bb_i,
+ const BoundingBox &bb_j)
+{
+ // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
+ using namespace gmx;
+
+ const Simd4Float bb_i_S0 = load4(bb_i.lower.ptr());
+ const Simd4Float bb_i_S1 = load4(bb_i.upper.ptr());
+ const Simd4Float bb_j_S0 = load4(bb_j.lower.ptr());
+ const Simd4Float bb_j_S1 = load4(bb_j.upper.ptr());
+
+ const Simd4Float dl_S = bb_i_S0 - bb_j_S1;
+ const Simd4Float dh_S = bb_j_S0 - bb_i_S1;
+
+ const Simd4Float dm_S = max(dl_S, dh_S);
+ const Simd4Float dm0_S = max(dm_S, simd4SetZeroF());
+
+ return dotProduct(dm0_S, dm0_S);
+}
+
+/* Calculate the bounding-box distances of bb_i[si,...,si+3] and store them in d2 */
+template <int boundingBoxStart>
+static inline void gmx_simdcall
+clusterBoundingBoxDistance2_xxxx_simd4_inner(const float *bb_i,
+ float *d2,
+ const Simd4Float xj_l,
+ const Simd4Float yj_l,
+ const Simd4Float zj_l,
+ const Simd4Float xj_h,
+ const Simd4Float yj_h,
+ const Simd4Float zj_h)
+{
+ constexpr int stride = c_packedBoundingBoxesDimSize;
+
+ const int shi = boundingBoxStart*Nbnxm::c_numBoundingBoxBounds1D*DIM;
+
+ const Simd4Float zero = setZero();
+
+ const Simd4Float xi_l = load4(bb_i + shi + 0*stride);
+ const Simd4Float yi_l = load4(bb_i + shi + 1*stride);
+ const Simd4Float zi_l = load4(bb_i + shi + 2*stride);
+ const Simd4Float xi_h = load4(bb_i + shi + 3*stride);
+ const Simd4Float yi_h = load4(bb_i + shi + 4*stride);
+ const Simd4Float zi_h = load4(bb_i + shi + 5*stride);
+
+ const Simd4Float dx_0 = xi_l - xj_h;
+ const Simd4Float dy_0 = yi_l - yj_h;
+ const Simd4Float dz_0 = zi_l - zj_h;
+
+ const Simd4Float dx_1 = xj_l - xi_h;
+ const Simd4Float dy_1 = yj_l - yi_h;
+ const Simd4Float dz_1 = zj_l - zi_h;
+
+ const Simd4Float mx = max(dx_0, dx_1);
+ const Simd4Float my = max(dy_0, dy_1);
+ const Simd4Float mz = max(dz_0, dz_1);
+
+ const Simd4Float m0x = max(mx, zero);
+ const Simd4Float m0y = max(my, zero);
+ const Simd4Float m0z = max(mz, zero);
+
+ const Simd4Float d2x = m0x * m0x;
+ const Simd4Float d2y = m0y * m0y;
+ const Simd4Float d2z = m0z * m0z;
+
+ const Simd4Float d2s = d2x + d2y;
+ const Simd4Float d2t = d2s + d2z;
+
+ store4(d2 + boundingBoxStart, d2t);
+}
+
+/* 4-wide SIMD code for nsi bb distances for bb format xxxxyyyyzzzz */
+static void
+clusterBoundingBoxDistance2_xxxx_simd4(const float *bb_j,
+ const int nsi,
+ const float *bb_i,
+ float *d2)
+{
+ constexpr int stride = c_packedBoundingBoxesDimSize;
+
+ // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
+ using namespace gmx;
+
+ const Simd4Float xj_l = Simd4Float(bb_j[0*stride]);
+ const Simd4Float yj_l = Simd4Float(bb_j[1*stride]);
+ const Simd4Float zj_l = Simd4Float(bb_j[2*stride]);
+ const Simd4Float xj_h = Simd4Float(bb_j[3*stride]);
+ const Simd4Float yj_h = Simd4Float(bb_j[4*stride]);
+ const Simd4Float zj_h = Simd4Float(bb_j[5*stride]);
+
+ /* Here we "loop" over si (0,stride) from 0 to nsi with step stride.
+ * But as we know the number of iterations is 1 or 2, we unroll manually.
+ */
+ clusterBoundingBoxDistance2_xxxx_simd4_inner<0>(bb_i, d2,
+ xj_l, yj_l, zj_l,
+ xj_h, yj_h, zj_h);
+ if (stride < nsi)
+ {
+ clusterBoundingBoxDistance2_xxxx_simd4_inner<stride>(bb_i, d2,
+ xj_l, yj_l, zj_l,
+ xj_h, yj_h, zj_h);
+ }
+}
+
+#endif /* NBNXN_SEARCH_BB_SIMD4 */
+
+
+/* Returns whether any atom pair from the two clusters is within distance sqrt(rlist2) */
+static inline gmx_bool
+clusterpair_in_range(const NbnxnPairlistGpuWork &work,
+ int si,
+ int csj, int stride, const real *x_j,
+ real rlist2)
+{
+#if !GMX_SIMD4_HAVE_REAL
+
+ /* Plain C version.
+ * All coordinates are stored as xyzxyz...
+ */
+
+ const real *x_i = work.iSuperClusterData.x.data();
+
+ for (int i = 0; i < c_nbnxnGpuClusterSize; i++)
+ {
+ int i0 = (si*c_nbnxnGpuClusterSize + i)*DIM;
+ for (int j = 0; j < c_nbnxnGpuClusterSize; j++)
+ {
+ int j0 = (csj*c_nbnxnGpuClusterSize + j)*stride;
+
+ real d2 = gmx::square(x_i[i0 ] - x_j[j0 ]) + gmx::square(x_i[i0+1] - x_j[j0+1]) + gmx::square(x_i[i0+2] - x_j[j0+2]);
+
+ if (d2 < rlist2)
+ {
+ return TRUE;
+ }
+ }
+ }
+
+ return FALSE;
+
+#else /* !GMX_SIMD4_HAVE_REAL */
+
+ /* 4-wide SIMD version.
+ * The coordinates x_i are stored as xxxxyyyy..., x_j is stored xyzxyz...
+ * Using 8-wide AVX(2) is not faster on Intel Sandy Bridge and Haswell.
+ */
+ static_assert(c_nbnxnGpuClusterSize == 8 || c_nbnxnGpuClusterSize == 4,
+ "A cluster is hard-coded to 4/8 atoms.");
+
+ Simd4Real rc2_S = Simd4Real(rlist2);
+
+ const real *x_i = work.iSuperClusterData.xSimd.data();
+
+ int dim_stride = c_nbnxnGpuClusterSize*DIM;
+ Simd4Real ix_S0 = load4(x_i + si*dim_stride + 0*GMX_SIMD4_WIDTH);
+ Simd4Real iy_S0 = load4(x_i + si*dim_stride + 1*GMX_SIMD4_WIDTH);
+ Simd4Real iz_S0 = load4(x_i + si*dim_stride + 2*GMX_SIMD4_WIDTH);
+
+ Simd4Real ix_S1, iy_S1, iz_S1;
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ ix_S1 = load4(x_i + si*dim_stride + 3*GMX_SIMD4_WIDTH);
+ iy_S1 = load4(x_i + si*dim_stride + 4*GMX_SIMD4_WIDTH);
+ iz_S1 = load4(x_i + si*dim_stride + 5*GMX_SIMD4_WIDTH);
+ }
+ /* We loop from the outer to the inner particles to maximize
+ * the chance that we find a pair in range quickly and return.
+ */
+ int j0 = csj*c_nbnxnGpuClusterSize;
+ int j1 = j0 + c_nbnxnGpuClusterSize - 1;
+ while (j0 < j1)
+ {
+ Simd4Real jx0_S, jy0_S, jz0_S;
+ Simd4Real jx1_S, jy1_S, jz1_S;
+
+ Simd4Real dx_S0, dy_S0, dz_S0;
+ Simd4Real dx_S1, dy_S1, dz_S1;
+ Simd4Real dx_S2, dy_S2, dz_S2;
+ Simd4Real dx_S3, dy_S3, dz_S3;
+
+ Simd4Real rsq_S0;
+ Simd4Real rsq_S1;
+ Simd4Real rsq_S2;
+ Simd4Real rsq_S3;
+
+ Simd4Bool wco_S0;
+ Simd4Bool wco_S1;
+ Simd4Bool wco_S2;
+ Simd4Bool wco_S3;
+ Simd4Bool wco_any_S01, wco_any_S23, wco_any_S;
+
+ jx0_S = Simd4Real(x_j[j0*stride+0]);
+ jy0_S = Simd4Real(x_j[j0*stride+1]);
+ jz0_S = Simd4Real(x_j[j0*stride+2]);
+
+ jx1_S = Simd4Real(x_j[j1*stride+0]);
+ jy1_S = Simd4Real(x_j[j1*stride+1]);
+ jz1_S = Simd4Real(x_j[j1*stride+2]);
+
+ /* Calculate distance */
+ dx_S0 = ix_S0 - jx0_S;
+ dy_S0 = iy_S0 - jy0_S;
+ dz_S0 = iz_S0 - jz0_S;
+ dx_S2 = ix_S0 - jx1_S;
+ dy_S2 = iy_S0 - jy1_S;
+ dz_S2 = iz_S0 - jz1_S;
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ dx_S1 = ix_S1 - jx0_S;
+ dy_S1 = iy_S1 - jy0_S;
+ dz_S1 = iz_S1 - jz0_S;
+ dx_S3 = ix_S1 - jx1_S;
+ dy_S3 = iy_S1 - jy1_S;
+ dz_S3 = iz_S1 - jz1_S;
+ }
+
+ /* rsq = dx*dx+dy*dy+dz*dz */
+ rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
+ rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
+ rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
+ }
+
+ wco_S0 = (rsq_S0 < rc2_S);
+ wco_S2 = (rsq_S2 < rc2_S);
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ wco_S1 = (rsq_S1 < rc2_S);
+ wco_S3 = (rsq_S3 < rc2_S);
+ }
+ if (c_nbnxnGpuClusterSize == 8)
+ {
+ wco_any_S01 = wco_S0 || wco_S1;
+ wco_any_S23 = wco_S2 || wco_S3;
+ wco_any_S = wco_any_S01 || wco_any_S23;
+ }
+ else
+ {
+ wco_any_S = wco_S0 || wco_S2;
+ }
+
+ if (anyTrue(wco_any_S))
+ {
+ return TRUE;
+ }
+
+ j0++;
+ j1--;
+ }
+
+ return FALSE;
+
+#endif /* !GMX_SIMD4_HAVE_REAL */
+}
+
+/* Returns the j-cluster index for index cjIndex in a cj list */
+static inline int nblCj(gmx::ArrayRef<const nbnxn_cj_t> cjList,
+ int cjIndex)
+{
+ return cjList[cjIndex].cj;
+}
+
+/* Returns the j-cluster index for index cjIndex in a cj4 list */
+static inline int nblCj(gmx::ArrayRef<const nbnxn_cj4_t> cj4List,
+ int cjIndex)
+{
+ return cj4List[cjIndex/c_nbnxnGpuJgroupSize].cj[cjIndex & (c_nbnxnGpuJgroupSize - 1)];
+}
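+
+/* In the cj4 variant above, the division selects the cj4 group and the mask
+ * selects the slot within it; the mask trick is valid because
+ * c_nbnxnGpuJgroupSize is a power of two (e.g. with a j-group size of 4,
+ * cjIndex 10 is slot 2 of group 2).
+ */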
+
+/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
+static unsigned int nbl_imask0(const NbnxnPairlistGpu *nbl, int cj_ind)
+{
+ return nbl->cj4[cj_ind/c_nbnxnGpuJgroupSize].imei[0].imask;
+}
+
+NbnxnPairlistCpu::NbnxnPairlistCpu() :
+ na_ci(c_nbnxnCpuIClusterSize),
+ na_cj(0),
+ rlist(0),
+ ncjInUse(0),
+ nci_tot(0),
+ work(std::make_unique<NbnxnPairlistCpuWork>())
+{
+}
+
+NbnxnPairlistGpu::NbnxnPairlistGpu(gmx::PinningPolicy pinningPolicy) :
+ na_ci(c_nbnxnGpuClusterSize),
+ na_cj(c_nbnxnGpuClusterSize),
+ na_sc(c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize),
+ rlist(0),
+ sci({}, {pinningPolicy}),
+ cj4({}, {pinningPolicy}),
+ excl({}, {pinningPolicy}),
+ nci_tot(0),
+ work(std::make_unique<NbnxnPairlistGpuWork>())
+{
+    static_assert(c_nbnxnGpuNumClusterPerSupercluster == c_gpuNumClusterPerCell,
+                  "The search code assumes that a super-cluster matches a search grid cell");
+
+ static_assert(sizeof(cj4[0].imei[0].imask)*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell,
+ "The i super-cluster cluster interaction mask does not contain a sufficient number of bits");
+
+ static_assert(sizeof(excl[0])*8 >= c_nbnxnGpuJgroupSize*c_gpuNumClusterPerCell, "The GPU exclusion mask does not contain a sufficient number of bits");
+
+ // We always want a first entry without any exclusions
+ excl.resize(1);
+}
+
+// TODO: Move to pairlistset.cpp
+PairlistSet::PairlistSet(const Nbnxm::InteractionLocality locality,
+ const PairlistParams &pairlistParams) :
+ locality_(locality),
+ params_(pairlistParams)
+{
+ isCpuType_ =
+ (params_.pairlistType == PairlistType::Simple4x2 ||
+ params_.pairlistType == PairlistType::Simple4x4 ||
+ params_.pairlistType == PairlistType::Simple4x8);
+ // Currently GPU lists are always combined
+ combineLists_ = !isCpuType_;
+
+ const int numLists = gmx_omp_nthreads_get(emntNonbonded);
+
+ if (!combineLists_ &&
+ numLists > NBNXN_BUFFERFLAG_MAX_THREADS)
+ {
+ gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
+ numLists, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
+ }
+
+ if (isCpuType_)
+ {
+ cpuLists_.resize(numLists);
+ if (numLists > 1)
+ {
+ cpuListsWork_.resize(numLists);
+ }
+ }
+ else
+ {
+ /* Only list 0 is used on the GPU, use normal allocation for i>0 */
+ gpuLists_.emplace_back(gmx::PinningPolicy::PinnedIfSupported);
+        /* Lists 0 to numLists-1 are used for constructing lists in parallel
+ * on the CPU using numLists threads (and then merged into list 0).
+ */
+ for (int i = 1; i < numLists; i++)
+ {
+ gpuLists_.emplace_back(gmx::PinningPolicy::CannotBePinned);
+ }
+ }
+ if (params_.haveFep)
+ {
+ fepLists_.resize(numLists);
+
+ /* Execute in order to avoid memory interleaving between threads */
+#pragma omp parallel for num_threads(numLists) schedule(static)
+ for (int i = 0; i < numLists; i++)
+ {
+ try
+ {
+ /* We used to allocate all normal lists locally on each thread
+                     * as well. The question is whether allocating the object on the
+ * master thread (but all contained list memory thread local)
+ * impacts performance.
+ */
+ fepLists_[i] = std::make_unique<t_nblist>();
+ nbnxn_init_pairlist_fep(fepLists_[i].get());
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+ }
+}
+
+/* Print statistics of a pair list, used for debug output */
+static void print_nblist_statistics(FILE *fp,
+ const NbnxnPairlistCpu &nbl,
+ const Nbnxm::GridSet &gridSet,
+ const real rl)
+{
+ const Grid &grid = gridSet.grids()[0];
+ const Grid::Dimensions &dims = grid.dimensions();
+
+ fprintf(fp, "nbl nci %zu ncj %d\n",
+ nbl.ci.size(), nbl.ncjInUse);
+ const int numAtomsJCluster = grid.geometry().numAtomsJCluster;
+ const double numAtomsPerCell = nbl.ncjInUse/static_cast<double>(grid.numCells())*numAtomsJCluster;
+ fprintf(fp, "nbl na_cj %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ nbl.na_cj, rl, nbl.ncjInUse, nbl.ncjInUse/static_cast<double>(grid.numCells()),
+ numAtomsPerCell,
+ numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numCells()*numAtomsJCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
+
+ fprintf(fp, "nbl average j cell list length %.1f\n",
+ 0.25*nbl.ncjInUse/std::max(static_cast<double>(nbl.ci.size()), 1.0));
+
+ int cs[SHIFTS] = { 0 };
+ int npexcl = 0;
+ for (const nbnxn_ci_t &ciEntry : nbl.ci)
+ {
+ cs[ciEntry.shift & NBNXN_CI_SHIFT] +=
+ ciEntry.cj_ind_end - ciEntry.cj_ind_start;
+
+ int j = ciEntry.cj_ind_start;
+ while (j < ciEntry.cj_ind_end &&
+ nbl.cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+ npexcl++;
+ j++;
+ }
+ }
+ fprintf(fp, "nbl cell pairs, total: %zu excl: %d %.1f%%\n",
+ nbl.cj.size(), npexcl, 100*npexcl/std::max(static_cast<double>(nbl.cj.size()), 1.0));
+ for (int s = 0; s < SHIFTS; s++)
+ {
+ if (cs[s] > 0)
+ {
+ fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
+ }
+ }
+}
+
+/* Print statistics of a pair list, used for debug output */
+static void print_nblist_statistics(FILE *fp,
+ const NbnxnPairlistGpu &nbl,
+ const Nbnxm::GridSet &gridSet,
+ const real rl)
+{
+ const Grid &grid = gridSet.grids()[0];
+ const Grid::Dimensions &dims = grid.dimensions();
+
+ fprintf(fp, "nbl nsci %zu ncj4 %zu nsi %d excl4 %zu\n",
+ nbl.sci.size(), nbl.cj4.size(), nbl.nci_tot, nbl.excl.size());
+ const int numAtomsCluster = grid.geometry().numAtomsICluster;
+ const double numAtomsPerCell = nbl.nci_tot/static_cast<double>(grid.numClusters())*numAtomsCluster;
+ fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
+ nbl.na_ci, rl, nbl.nci_tot, nbl.nci_tot/static_cast<double>(grid.numClusters()),
+ numAtomsPerCell,
+ numAtomsPerCell/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid.numClusters()*numAtomsCluster/(dims.gridSize[XX]*dims.gridSize[YY]*dims.gridSize[ZZ])));
+
+ double sum_nsp = 0;
+ double sum_nsp2 = 0;
+ int nsp_max = 0;
+ int c[c_gpuNumClusterPerCell + 1] = { 0 };
+ for (const nbnxn_sci_t &sci : nbl.sci)
+ {
+ int nsp = 0;
+ for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
+ {
+ for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
+ {
+ int b = 0;
+ for (int si = 0; si < c_gpuNumClusterPerCell; si++)
+ {
+ if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+ {
+ b++;
+ }
+ }
+ nsp += b;
+ c[b]++;
+ }
+ }
+ sum_nsp += nsp;
+ sum_nsp2 += nsp*nsp;
+ nsp_max = std::max(nsp_max, nsp);
+ }
+ if (!nbl.sci.empty())
+ {
+ sum_nsp /= nbl.sci.size();
+ sum_nsp2 /= nbl.sci.size();
+ }
+ fprintf(fp, "nbl #cluster-pairs: av %.1f stddev %.1f max %d\n",
+ sum_nsp, std::sqrt(sum_nsp2 - sum_nsp*sum_nsp), nsp_max);
+
+ if (!nbl.cj4.empty())
+ {
+ for (int b = 0; b <= c_gpuNumClusterPerCell; b++)
+ {
+ fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
+ b, c[b], 100.0*c[b]/size_t {nbl.cj4.size()*c_nbnxnGpuJgroupSize});
+ }
+ }
+}
+
+/* Returns a pointer to the exclusion mask for j-cluster-group \p cj4 and warp \p warp
+ * Generates a new exclusion entry when the j-cluster-group uses
+ * the default all-interaction mask at call time, so the returned mask
+ * can be modified when needed.
+ */
+static nbnxn_excl_t *get_exclusion_mask(NbnxnPairlistGpu *nbl,
+ int cj4,
+ int warp)
+{
+ if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
+ {
+ /* No exclusions set, make a new list entry */
+ const size_t oldSize = nbl->excl.size();
+ GMX_ASSERT(oldSize >= 1, "We should always have entry [0]");
+ /* Add entry with default values: no exclusions */
+ nbl->excl.resize(oldSize + 1);
+ nbl->cj4[cj4].imei[warp].excl_ind = oldSize;
+ }
+
+ return &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
+}
+
+static void set_self_and_newton_excls_supersub(NbnxnPairlistGpu *nbl,
+ int cj4_ind, int sj_offset,
+ int i_cluster_in_cell)
+{
+ nbnxn_excl_t *excl[c_nbnxnGpuClusterpairSplit];
+
+    /* Here we set only the self and double pair exclusions */
+
+ /* Reserve extra elements, so the resize() in get_exclusion_mask()
+ * will not invalidate excl entries in the loop below
+ */
+ nbl->excl.reserve(nbl->excl.size() + c_nbnxnGpuClusterpairSplit);
+ for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
+ {
+ excl[w] = get_exclusion_mask(nbl, cj4_ind, w);
+ }
+
+ /* Only minor < major bits set */
+ for (int ej = 0; ej < nbl->na_ci; ej++)
+ {
+ int w = (ej>>2);
+ for (int ei = ej; ei < nbl->na_ci; ei++)
+ {
+ excl[w]->pair[(ej & (c_nbnxnGpuJgroupSize-1))*nbl->na_ci + ei] &=
+ ~(1U << (sj_offset*c_gpuNumClusterPerCell + i_cluster_in_cell));
+ }
+ }
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
+static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
+{
+ return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
+gmx_unused static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
+{
+ return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
+ (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
+ NBNXN_INTERACTION_MASK_ALL));
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
+gmx_unused static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
+{
+ return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
+gmx_unused static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
+{
+ return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
+ (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
+ NBNXN_INTERACTION_MASK_ALL));
+}
+
+#if GMX_SIMD
+#if GMX_SIMD_REAL_WIDTH == 2
+#define get_imask_simd_4xn get_imask_simd_j2
+#endif
+#if GMX_SIMD_REAL_WIDTH == 4
+#define get_imask_simd_4xn get_imask_simd_j4
+#endif
+#if GMX_SIMD_REAL_WIDTH == 8
+#define get_imask_simd_4xn get_imask_simd_j8
+#define get_imask_simd_2xnn get_imask_simd_j4
+#endif
+#if GMX_SIMD_REAL_WIDTH == 16
+#define get_imask_simd_2xnn get_imask_simd_j8
+#endif
+#endif
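+
+/* With the mappings above, an 8-wide SIMD build, for example, selects the
+ * 8-atom j-cluster diagonal masks for the 4xN kernel layout and the 4-atom
+ * masks for the 2xNN layout at compile time.
+ */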
+
+/* Plain C code for checking and adding cluster-pairs to the list.
+ *
+ * \param[in] jGrid The j-grid
+ * \param[in,out] nbl The pair-list to store the cluster pairs in
+ * \param[in] icluster The index of the i-cluster
+ * \param[in] jclusterFirst The first cluster in the j-range
+ * \param[in] jclusterLast The last cluster in the j-range
+ * \param[in] excludeSubDiagonal Exclude atom pairs with i-index > j-index
+ * \param[in] x_j Coordinates for the j-atoms, in xyz format
+ * \param[in] rlist2 The squared list cut-off
+ * \param[in] rbb2 The squared cut-off for putting cluster-pairs in the list based on bounding box distance only
+ * \param[in,out] numDistanceChecks The number of distance checks performed
+ */
+static void
+makeClusterListSimple(const Grid &jGrid,
+ NbnxnPairlistCpu * nbl,
+ int icluster,
+ int jclusterFirst,
+ int jclusterLast,
+ bool excludeSubDiagonal,
+ const real * gmx_restrict x_j,
+ real rlist2,
+ float rbb2,
+ int * gmx_restrict numDistanceChecks)
+{
+ const BoundingBox * gmx_restrict bb_ci = nbl->work->iClusterData.bb.data();
+ const real * gmx_restrict x_ci = nbl->work->iClusterData.x.data();
+
+ gmx_bool InRange;
+
+ InRange = FALSE;
+ while (!InRange && jclusterFirst <= jclusterLast)
+ {
+ real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterFirst]);
+ *numDistanceChecks += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rlist2)
+ {
+ int cjf_gl = jGrid.cellOffset() + jclusterFirst;
+ for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
+ {
+ for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
+ {
+ InRange = InRange ||
+ (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
+ gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
+ gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
+ }
+ }
+ *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
+ }
+ if (!InRange)
+ {
+ jclusterFirst++;
+ }
+ }
+ if (!InRange)
+ {
+ return;
+ }
+
+ InRange = FALSE;
+ while (!InRange && jclusterLast > jclusterFirst)
+ {
+ real d2 = clusterBoundingBoxDistance2(bb_ci[0], jGrid.jBoundingBoxes()[jclusterLast]);
+ *numDistanceChecks += 2;
+
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off.
+ */
+ if (d2 < rbb2)
+ {
+ InRange = TRUE;
+ }
+ else if (d2 < rlist2)
+ {
+ int cjl_gl = jGrid.cellOffset() + jclusterLast;
+ for (int i = 0; i < c_nbnxnCpuIClusterSize && !InRange; i++)
+ {
+ for (int j = 0; j < c_nbnxnCpuIClusterSize; j++)
+ {
+ InRange = InRange ||
+ (gmx::square(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+XX]) +
+ gmx::square(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+YY]) +
+ gmx::square(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*c_nbnxnCpuIClusterSize+j)*STRIDE_XYZ+ZZ]) < rlist2);
+ }
+ }
+ *numDistanceChecks += c_nbnxnCpuIClusterSize*c_nbnxnCpuIClusterSize;
+ }
+ if (!InRange)
+ {
+ jclusterLast--;
+ }
+ }
+
+ if (jclusterFirst <= jclusterLast)
+ {
+ for (int jcluster = jclusterFirst; jcluster <= jclusterLast; jcluster++)
+ {
+ /* Store cj and the interaction mask */
+ nbnxn_cj_t cjEntry;
+ cjEntry.cj = jGrid.cellOffset() + jcluster;
+ cjEntry.excl = get_imask(excludeSubDiagonal, icluster, jcluster);
+ nbl->cj.push_back(cjEntry);
+ }
+ /* Increase the closing index in the i list */
+ nbl->ci.back().cj_ind_end = nbl->cj.size();
+ }
+}
+
+#ifdef GMX_NBNXN_SIMD_4XN
+#include "gromacs/nbnxm/pairlist_simd_4xm.h"
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+#include "gromacs/nbnxm/pairlist_simd_2xmm.h"
+#endif
+
+/* Plain C or SIMD4 code for making a pair list of super-cell sci vs scj.
+ * Checks bounding box distances and possibly atom pair distances.
+ */
+static void make_cluster_list_supersub(const Grid &iGrid,
+ const Grid &jGrid,
+ NbnxnPairlistGpu *nbl,
+ const int sci,
+ const int scj,
+ const bool excludeSubDiagonal,
+ const int stride,
+ const real *x,
+ const real rlist2,
+ const float rbb2,
+ int *numDistanceChecks)
+{
+ NbnxnPairlistGpuWork &work = *nbl->work;
+
+#if NBNXN_BBXXXX
+ const float *pbb_ci = work.iSuperClusterData.bbPacked.data();
+#else
+ const BoundingBox *bb_ci = work.iSuperClusterData.bb.data();
+#endif
+
+ assert(c_nbnxnGpuClusterSize == iGrid.geometry().numAtomsICluster);
+ assert(c_nbnxnGpuClusterSize == jGrid.geometry().numAtomsICluster);
+
+ /* We generate the pairlist mainly based on bounding-box distances
+ * and do atom pair distance based pruning on the GPU.
+ * Only if a j-group contains a single cluster-pair, we try to prune
+ * that pair based on atom distances on the CPU to avoid empty j-groups.
+ */
+#define PRUNE_LIST_CPU_ONE 1
+#define PRUNE_LIST_CPU_ALL 0
+
+#if PRUNE_LIST_CPU_ONE
+ int ci_last = -1;
+#endif
+
+ float *d2l = work.distanceBuffer.data();
+
+ for (int subc = 0; subc < jGrid.numClustersPerCell()[scj]; subc++)
+ {
+ const int cj4_ind = work.cj_ind/c_nbnxnGpuJgroupSize;
+ const int cj_offset = work.cj_ind - cj4_ind*c_nbnxnGpuJgroupSize;
+ const int cj = scj*c_gpuNumClusterPerCell + subc;
+
+ const int cj_gl = jGrid.cellOffset()*c_gpuNumClusterPerCell + cj;
+
+ int ci1;
+ if (excludeSubDiagonal && sci == scj)
+ {
+ ci1 = subc + 1;
+ }
+ else
+ {
+ ci1 = iGrid.numClustersPerCell()[sci];
+ }
+
+#if NBNXN_BBXXXX
+ /* Determine all ci1 bb distances in one call with SIMD4 */
+ const int offset = packedBoundingBoxesIndex(cj) + (cj & (c_packedBoundingBoxesDimSize - 1));
+ clusterBoundingBoxDistance2_xxxx_simd4(jGrid.packedBoundingBoxes().data() + offset,
+ ci1, pbb_ci, d2l);
+ *numDistanceChecks += c_nbnxnGpuClusterSize*2;
+#endif
+
+ int npair = 0;
+ unsigned int imask = 0;
+ /* We use a fixed upper-bound instead of ci1 to help optimization */
+ for (int ci = 0; ci < c_gpuNumClusterPerCell; ci++)
+ {
+ if (ci == ci1)
+ {
+ break;
+ }
+
+#if !NBNXN_BBXXXX
+ /* Determine the bb distance between ci and cj */
+ d2l[ci] = clusterBoundingBoxDistance2(bb_ci[ci], jGrid.jBoundingBoxes()[cj]);
+ *numDistanceChecks += 2;
+#endif
+ float d2 = d2l[ci];
+
+#if PRUNE_LIST_CPU_ALL
+ /* Check if the distance is within the distance where
+ * we use only the bounding box distance rbb,
+ * or within the cut-off and there is at least one atom pair
+ * within the cut-off. This check is very costly.
+ */
+ *numDistanceChecks += c_nbnxnGpuClusterSize*c_nbnxnGpuClusterSize;
+ if (d2 < rbb2 ||
+ (d2 < rlist2 &&
+ clusterpair_in_range(work, ci, cj_gl, stride, x, rlist2)))
+#else
+ /* Check if the distance between the two bounding boxes
+             * is within the pair-list cut-off.
+ */
+ if (d2 < rlist2)
+#endif
+ {
+ /* Flag this i-subcell to be taken into account */
+ imask |= (1U << (cj_offset*c_gpuNumClusterPerCell + ci));
+
+#if PRUNE_LIST_CPU_ONE
+ ci_last = ci;
+#endif
+
+ npair++;
+ }
+ }
+
+#if PRUNE_LIST_CPU_ONE
+ /* If we only found 1 pair, check if any atoms are actually
+ * within the cut-off, so we could get rid of it.
+ */
+ if (npair == 1 && d2l[ci_last] >= rbb2 &&
+ !clusterpair_in_range(work, ci_last, cj_gl, stride, x, rlist2))
+ {
+ imask &= ~(1U << (cj_offset*c_gpuNumClusterPerCell + ci_last));
+ npair--;
+ }
+#endif
+
+ if (npair > 0)
+ {
+ /* We have at least one cluster pair: add a j-entry */
+ if (static_cast<size_t>(cj4_ind) == nbl->cj4.size())
+ {
+ nbl->cj4.resize(nbl->cj4.size() + 1);
+ }
+ nbnxn_cj4_t *cj4 = &nbl->cj4[cj4_ind];
+
+ cj4->cj[cj_offset] = cj_gl;
+
+            /* Set the exclusions for the sci == scj entry.
+ * Here we don't bother to check if this entry is actually flagged,
+ * as it will nearly always be in the list.
+ */
+ if (excludeSubDiagonal && sci == scj)
+ {
+ set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, subc);
+ }
+
+ /* Copy the cluster interaction mask to the list */
+ for (int w = 0; w < c_nbnxnGpuClusterpairSplit; w++)
+ {
+ cj4->imei[w].imask |= imask;
+ }
+
+ nbl->work->cj_ind++;
+
+ /* Keep the count */
+ nbl->nci_tot += npair;
+
+ /* Increase the closing index in i super-cell list */
+ nbl->sci.back().cj4_ind_end =
+ (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
+ }
+ }
+}
+
+/* Returns the number of contiguous j-clusters, starting at cjIndexStart, in the j-list */
+template <typename CjListType>
+static int numContiguousJClusters(const int cjIndexStart,
+ const int cjIndexEnd,
+ gmx::ArrayRef<const CjListType> cjList)
+{
+ const int firstJCluster = nblCj(cjList, cjIndexStart);
+
+ int numContiguous = 0;
+
+ while (cjIndexStart + numContiguous < cjIndexEnd &&
+ nblCj(cjList, cjIndexStart + numContiguous) == firstJCluster + numContiguous)
+ {
+ numContiguous++;
+ }
+
+ return numContiguous;
+}
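+
+/* Example: for j-cluster values {10, 11, 12, 15, 16} in the given index range,
+ * this returns 3, since 10, 11, 12 are contiguous but 15 breaks the run.
+ */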
+
+/*! \internal
+ * \brief Helper struct for efficient searching for excluded atoms in a j-list
+ */
+struct JListRanges
+{
+ /*! \brief Constructs a j-list range from \p cjList with the given index range */
+ template <typename CjListType>
+ JListRanges(int cjIndexStart,
+ int cjIndexEnd,
+ gmx::ArrayRef<const CjListType> cjList);
+
+ int cjIndexStart; //!< The start index in the j-list
+ int cjIndexEnd; //!< The end index in the j-list
+ int cjFirst; //!< The j-cluster with index cjIndexStart
+ int cjLast; //!< The j-cluster with index cjIndexEnd-1
+ int numDirect; //!< Up to cjIndexStart+numDirect the j-clusters are cjFirst + the index offset
+};
+
+#ifndef DOXYGEN
+template <typename CjListType>
+JListRanges::JListRanges(int cjIndexStart,
+ int cjIndexEnd,
+ gmx::ArrayRef<const CjListType> cjList) :
+ cjIndexStart(cjIndexStart),
+ cjIndexEnd(cjIndexEnd)
+{
+ GMX_ASSERT(cjIndexEnd > cjIndexStart, "JListRanges should only be called with non-empty lists");
+
+ cjFirst = nblCj(cjList, cjIndexStart);
+ cjLast = nblCj(cjList, cjIndexEnd - 1);
+
+    /* Determine how many contiguous j-clusters we have starting
+     * from the first j-cluster. This number can be used to directly
+ * calculate j-cell indices for excluded atoms.
+ */
+ numDirect = numContiguousJClusters(cjIndexStart, cjIndexEnd, cjList);
+}
+#endif // !DOXYGEN
+
+/* Return the index of \p jCluster in the given range or -1 when not present
+ *
+ * Note: This code is executed very often and therefore performance is
+ * important. It should be inlined and fully optimized.
+ */
+template <typename CjListType>
+static inline int
+findJClusterInJList(int jCluster,
+ const JListRanges &ranges,
+ gmx::ArrayRef<const CjListType> cjList)
+{
+ int index;
+
+ if (jCluster < ranges.cjFirst + ranges.numDirect)
+ {
+ /* We can calculate the index directly using the offset */
+ index = ranges.cjIndexStart + jCluster - ranges.cjFirst;
+ }
+ else
+ {
+ /* Search for jCluster using bisection */
+ index = -1;
+ int rangeStart = ranges.cjIndexStart + ranges.numDirect;
+ int rangeEnd = ranges.cjIndexEnd;
+ int rangeMiddle;
+ while (index == -1 && rangeStart < rangeEnd)
+ {
+ rangeMiddle = (rangeStart + rangeEnd) >> 1;
+
+ const int clusterMiddle = nblCj(cjList, rangeMiddle);
+
+ if (jCluster == clusterMiddle)
+ {
+ index = rangeMiddle;
+ }
+ else if (jCluster < clusterMiddle)
+ {
+ rangeEnd = rangeMiddle;
+ }
+ else
+ {
+ rangeStart = rangeMiddle + 1;
+ }
+ }
+ }
+
+ return index;
+}
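+
+/* Note: the direct branch above exploits that the first numDirect entries
+ * follow cjFirst consecutively; the bisection branch assumes the remaining
+ * entries are stored in ascending j-cluster order.
+ */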
+
+// TODO: Get rid of the two functions below by renaming sci to ci (or something better)
+
+/* Return the i-entry in the list we are currently operating on */
+static nbnxn_ci_t *getOpenIEntry(NbnxnPairlistCpu *nbl)
+{
+ return &nbl->ci.back();
+}
+
+/* Return the i-entry in the list we are currently operating on */
+static nbnxn_sci_t *getOpenIEntry(NbnxnPairlistGpu *nbl)
+{
+ return &nbl->sci.back();
+}
+
+/* Set all atom-pair exclusions for a simple type list i-entry
+ *
+ * Set all atom-pair exclusions from the topology stored in exclusions
+ * as masks in the pair-list for simple list entry iEntry.
+ */
+static void
+setExclusionsForIEntry(const Nbnxm::GridSet &gridSet,
+ NbnxnPairlistCpu *nbl,
+ gmx_bool diagRemoved,
+ int na_cj_2log,
+ const nbnxn_ci_t &iEntry,
+ const t_blocka &exclusions)
+{
+ if (iEntry.cj_ind_end == iEntry.cj_ind_start)
+ {
+ /* Empty list: no exclusions */
+ return;
+ }
+
+ const JListRanges ranges(iEntry.cj_ind_start, iEntry.cj_ind_end, gmx::makeConstArrayRef(nbl->cj));
+
+ const int iCluster = iEntry.ci;
+
+ gmx::ArrayRef<const int> cell = gridSet.cells();
+ gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();
+
+ /* Loop over the atoms in the i-cluster */
+ for (int i = 0; i < nbl->na_ci; i++)
+ {
+ const int iIndex = iCluster*nbl->na_ci + i;
+ const int iAtom = atomIndices[iIndex];
+ if (iAtom >= 0)
+ {
+ /* Loop over the topology-based exclusions for this i-atom */
+ for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
+ {
+ const int jAtom = exclusions.a[exclIndex];
+
+ if (jAtom == iAtom)
+ {
+                    /* The self exclusions are already set, save some time */
+ continue;
+ }
+
+ /* Get the index of the j-atom in the nbnxn atom data */
+ const int jIndex = cell[jAtom];
+
+ /* Without shifts we only calculate interactions j>i
+ * for one-way pair-lists.
+ */
+ if (diagRemoved && jIndex <= iIndex)
+ {
+ continue;
+ }
+
+ const int jCluster = (jIndex >> na_cj_2log);
+
+                /* Could the cluster be in our list? */
+ if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
+ {
+ const int index =
+ findJClusterInJList(jCluster, ranges,
+ gmx::makeConstArrayRef(nbl->cj));
+
+ if (index >= 0)
+ {
+ /* We found an exclusion, clear the corresponding
+ * interaction bit.
+ */
+ const int innerJ = jIndex - (jCluster << na_cj_2log);
+
+ nbl->cj[index].excl &= ~(1U << ((i << na_cj_2log) + innerJ));
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Add a new i-entry to the FEP list and copy the i-properties */
+static inline void fep_list_new_nri_copy(t_nblist *nlist)
+{
+ /* Add a new i-entry */
+ nlist->nri++;
+
+ assert(nlist->nri < nlist->maxnri);
+
+ /* Duplicate the last i-entry, except for jindex, which continues */
+ nlist->iinr[nlist->nri] = nlist->iinr[nlist->nri-1];
+ nlist->shift[nlist->nri] = nlist->shift[nlist->nri-1];
+ nlist->gid[nlist->nri] = nlist->gid[nlist->nri-1];
+ nlist->jindex[nlist->nri] = nlist->nrj;
+}
+
+/* Reallocate the FEP list for size nl->maxnri. TODO: replace by C++ */
+static void reallocate_nblist(t_nblist *nl)
+{
+ if (gmx_debug_at)
+ {
+ fprintf(debug, "reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
+ nl->ielec, nl->ivdw, nl->igeometry, nl->type, nl->maxnri);
+ }
+ srenew(nl->iinr, nl->maxnri);
+ srenew(nl->gid, nl->maxnri);
+ srenew(nl->shift, nl->maxnri);
+ srenew(nl->jindex, nl->maxnri+1);
+}
+
+/* For load balancing of the free-energy lists over threads, we set
+ * the maximum nrj size of an i-entry to 40. This leads to good
+ * load balancing in the worst case scenario of a single perturbed
+ * particle on 16 threads, while not introducing significant overhead.
+ * Note that half of the perturbed pairs will end up in very small lists anyhow,
+ * since non-perturbed i-particles will see few perturbed j-particles.
+ */
+const int max_nrj_fep = 40;
+
+/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid
+ * singularities for overlapping particles (0/0), since the charges and
+ * LJ parameters have been zeroed in the nbnxn data structure.
+ * Simultaneously make a group pair list for the perturbed pairs.
+ */
+static void make_fep_list(gmx::ArrayRef<const int> atomIndices,
+ const nbnxn_atomdata_t *nbat,
+ NbnxnPairlistCpu *nbl,
+ gmx_bool bDiagRemoved,
+ nbnxn_ci_t *nbl_ci,
+ real gmx_unused shx,
+ real gmx_unused shy,
+ real gmx_unused shz,
+ real gmx_unused rlist_fep2,
+ const Grid &iGrid,
+ const Grid &jGrid,
+ t_nblist *nlist)
+{
+ int ci, cj_ind_start, cj_ind_end, cja, cjr;
+ int nri_max;
+ int gid_i = 0, gid_j, gid;
+ int egp_shift, egp_mask;
+ int gid_cj = 0;
+ int ind_i, ind_j, ai, aj;
+ int nri;
+ gmx_bool bFEP_i, bFEP_i_all;
+
+ if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
+ {
+ /* Empty list */
+ return;
+ }
+
+ ci = nbl_ci->ci;
+
+ cj_ind_start = nbl_ci->cj_ind_start;
+ cj_ind_end = nbl_ci->cj_ind_end;
+
+    /* In the worst case we have alternating energy groups
+     * and create one list per atom pair, which means we need the size
+ * of a cluster pair (na_ci*na_cj) times the number of cj's.
+ */
+ nri_max = nbl->na_ci*nbl->na_cj*(cj_ind_end - cj_ind_start);
+ if (nlist->nri + nri_max > nlist->maxnri)
+ {
+ nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
+ reallocate_nblist(nlist);
+ }
+
+ const int numAtomsJCluster = jGrid.geometry().numAtomsJCluster;
+
+ const nbnxn_atomdata_t::Params &nbatParams = nbat->params();
+
+ const int ngid = nbatParams.nenergrp;
+
+ /* TODO: Consider adding a check in grompp and changing this to an assert */
+ const int numBitsInEnergyGroupIdsForAtomsInJCluster = sizeof(gid_cj)*8;
+ if (ngid*numAtomsJCluster > numBitsInEnergyGroupIdsForAtomsInJCluster)
+ {
+ gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %zu energy groups",
+ iGrid.geometry().numAtomsICluster, numAtomsJCluster,
+ (sizeof(gid_cj)*8)/numAtomsJCluster);
+ }
+
+ egp_shift = nbatParams.neg_2log;
+ egp_mask = (1 << egp_shift) - 1;
+
+ /* Loop over the atoms in the i sub-cell */
+ bFEP_i_all = TRUE;
+ for (int i = 0; i < nbl->na_ci; i++)
+ {
+ ind_i = ci*nbl->na_ci + i;
+ ai = atomIndices[ind_i];
+ if (ai >= 0)
+ {
+ nri = nlist->nri;
+ nlist->jindex[nri+1] = nlist->jindex[nri];
+ nlist->iinr[nri] = ai;
+ /* The actual energy group pair index is set later */
+ nlist->gid[nri] = 0;
+ nlist->shift[nri] = nbl_ci->shift & NBNXN_CI_SHIFT;
+
+ bFEP_i = iGrid.atomIsPerturbed(ci - iGrid.cellOffset(), i);
+
+ bFEP_i_all = bFEP_i_all && bFEP_i;
+
+ if (nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj)
+ {
+ nlist->maxnrj = over_alloc_small(nlist->nrj + (cj_ind_end - cj_ind_start)*nbl->na_cj);
+ srenew(nlist->jjnr, nlist->maxnrj);
+ srenew(nlist->excl_fep, nlist->maxnrj);
+ }
+
+ if (ngid > 1)
+ {
+ gid_i = (nbatParams.energrp[ci] >> (egp_shift*i)) & egp_mask;
+ }
+
+ for (int cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++)
+ {
+ unsigned int fep_cj;
+
+ cja = nbl->cj[cj_ind].cj;
+
+ if (numAtomsJCluster == jGrid.geometry().numAtomsICluster)
+ {
+ cjr = cja - jGrid.cellOffset();
+ fep_cj = jGrid.fepBits(cjr);
+ if (ngid > 1)
+ {
+ gid_cj = nbatParams.energrp[cja];
+ }
+ }
+ else if (2*numAtomsJCluster == jGrid.geometry().numAtomsICluster)
+ {
+ cjr = cja - jGrid.cellOffset()*2;
+ /* Extract half of the ci fep/energrp mask */
+ fep_cj = (jGrid.fepBits(cjr >> 1) >> ((cjr & 1)*numAtomsJCluster)) & ((1 << numAtomsJCluster) - 1);
+ if (ngid > 1)
+ {
+ gid_cj = nbatParams.energrp[cja >> 1] >> ((cja & 1)*numAtomsJCluster*egp_shift) & ((1 << (numAtomsJCluster*egp_shift)) - 1);
+ }
+ }
+ else
+ {
+ cjr = cja - (jGrid.cellOffset() >> 1);
+ /* Combine two ci fep masks/energrp */
+ fep_cj = jGrid.fepBits(cjr*2) + (jGrid.fepBits(cjr*2 + 1) << jGrid.geometry().numAtomsICluster);
+ if (ngid > 1)
+ {
+ gid_cj = nbatParams.energrp[cja*2] + (nbatParams.energrp[cja*2+1] << (jGrid.geometry().numAtomsICluster*egp_shift));
+ }
+ }
+
+ if (bFEP_i || fep_cj != 0)
+ {
+ for (int j = 0; j < nbl->na_cj; j++)
+ {
+ /* Is this interaction perturbed and not excluded? */
+ ind_j = cja*nbl->na_cj + j;
+ aj = atomIndices[ind_j];
+ if (aj >= 0 &&
+ (bFEP_i || (fep_cj & (1 << j))) &&
+ (!bDiagRemoved || ind_j >= ind_i))
+ {
+ if (ngid > 1)
+ {
+ gid_j = (gid_cj >> (j*egp_shift)) & egp_mask;
+ gid = GID(gid_i, gid_j, ngid);
+
+ if (nlist->nrj > nlist->jindex[nri] &&
+ nlist->gid[nri] != gid)
+ {
+ /* Energy group pair changed: new list */
+ fep_list_new_nri_copy(nlist);
+ nri = nlist->nri;
+ }
+ nlist->gid[nri] = gid;
+ }
+
+ if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
+ {
+ fep_list_new_nri_copy(nlist);
+ nri = nlist->nri;
+ }
+
+ /* Add it to the FEP list */
+ nlist->jjnr[nlist->nrj] = aj;
+ nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1;
+ nlist->nrj++;
+
+ /* Exclude it from the normal list.
+ * Note that the charge has been set to zero,
+ * but we need to avoid 0/0, as perturbed atoms
+ * can be on top of each other.
+ */
+ nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j));
+ }
+ }
+ }
+ }
+
+ if (nlist->nrj > nlist->jindex[nri])
+ {
+ /* Actually add this new, non-empty, list */
+ nlist->nri++;
+ nlist->jindex[nlist->nri] = nlist->nrj;
+ }
+ }
+ }
+
+ if (bFEP_i_all)
+ {
+ /* All interactions are perturbed, we can skip this entry */
+ nbl_ci->cj_ind_end = cj_ind_start;
+ nbl->ncjInUse -= cj_ind_end - cj_ind_start;
+ }
+}
+
+/* Return the index of j-cluster cj within its cj4 group */
+static inline int cj_mod_cj4(int cj)
+{
+ return cj & (c_nbnxnGpuJgroupSize - 1);
+}
+
+/* Convert a j-cluster to a cj4 group */
+static inline int cj_to_cj4(int cj)
+{
+ return cj/c_nbnxnGpuJgroupSize;
+}
+
+/* Return the index of a j-atom within a warp */
+static inline int a_mod_wj(int a)
+{
+ return a & (c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit - 1);
+}
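+
+/* Example, assuming a cluster size of 8 split over 2 warps: the warp-local
+ * width is 4, so a_mod_wj(6) == 2, i.e. atom 6 is lane 2 of the second warp.
+ */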
+
+/* As make_fep_list above, but for super/sub lists. */
+static void make_fep_list(gmx::ArrayRef<const int> atomIndices,
+ const nbnxn_atomdata_t *nbat,
+ NbnxnPairlistGpu *nbl,
+ gmx_bool bDiagRemoved,
+ const nbnxn_sci_t *nbl_sci,
+ real shx,
+ real shy,
+ real shz,
+ real rlist_fep2,
+ const Grid &iGrid,
+ const Grid &jGrid,
+ t_nblist *nlist)
+{
+ int nri_max;
+ int c_abs;
+ int ind_i, ind_j, ai, aj;
+ int nri;
+ gmx_bool bFEP_i;
+ real xi, yi, zi;
+ const nbnxn_cj4_t *cj4;
+
+ const int numJClusterGroups = nbl_sci->numJClusterGroups();
+ if (numJClusterGroups == 0)
+ {
+ /* Empty list */
+ return;
+ }
+
+ const int sci = nbl_sci->sci;
+
+ const int cj4_ind_start = nbl_sci->cj4_ind_start;
+ const int cj4_ind_end = nbl_sci->cj4_ind_end;
+
+ /* Here we process one super-cell, max #atoms na_sc, versus a list
+     * of cj4 entries, each with max c_nbnxnGpuJgroupSize cj's, each
+ * of size na_cj atoms.
+ * On the GPU we don't support energy groups (yet).
+ * So for each of the na_sc i-atoms, we need max one FEP list
+ * for each max_nrj_fep j-atoms.
+ */
+ nri_max = nbl->na_sc*nbl->na_cj*(1 + (numJClusterGroups*c_nbnxnGpuJgroupSize)/max_nrj_fep);
+ if (nlist->nri + nri_max > nlist->maxnri)
+ {
+ nlist->maxnri = over_alloc_large(nlist->nri + nri_max);
+ reallocate_nblist(nlist);
+ }
+
+ /* Loop over the atoms in the i super-cluster */
+ for (int c = 0; c < c_gpuNumClusterPerCell; c++)
+ {
+ c_abs = sci*c_gpuNumClusterPerCell + c;
+
+ for (int i = 0; i < nbl->na_ci; i++)
+ {
+ ind_i = c_abs*nbl->na_ci + i;
+ ai = atomIndices[ind_i];
+ if (ai >= 0)
+ {
+ nri = nlist->nri;
+ nlist->jindex[nri+1] = nlist->jindex[nri];
+ nlist->iinr[nri] = ai;
+ /* With GPUs, energy groups are not supported */
+ nlist->gid[nri] = 0;
+ nlist->shift[nri] = nbl_sci->shift & NBNXN_CI_SHIFT;
+
+ bFEP_i = iGrid.atomIsPerturbed(c_abs - iGrid.cellOffset()*c_gpuNumClusterPerCell, i);
+
+ xi = nbat->x()[ind_i*nbat->xstride+XX] + shx;
+ yi = nbat->x()[ind_i*nbat->xstride+YY] + shy;
+ zi = nbat->x()[ind_i*nbat->xstride+ZZ] + shz;
+
+ const int nrjMax = nlist->nrj + numJClusterGroups*c_nbnxnGpuJgroupSize*nbl->na_cj;
+ if (nrjMax > nlist->maxnrj)
+ {
+ nlist->maxnrj = over_alloc_small(nrjMax);
+ srenew(nlist->jjnr, nlist->maxnrj);
+ srenew(nlist->excl_fep, nlist->maxnrj);
+ }
+
+ for (int cj4_ind = cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++)
+ {
+ cj4 = &nbl->cj4[cj4_ind];
+
+ for (int gcj = 0; gcj < c_nbnxnGpuJgroupSize; gcj++)
+ {
+ if ((cj4->imei[0].imask & (1U << (gcj*c_gpuNumClusterPerCell + c))) == 0)
+ {
+ /* Skip this ci for this cj */
+ continue;
+ }
+
+ const int cjr =
+ cj4->cj[gcj] - jGrid.cellOffset()*c_gpuNumClusterPerCell;
+
+ if (bFEP_i || jGrid.clusterIsPerturbed(cjr))
+ {
+ for (int j = 0; j < nbl->na_cj; j++)
+ {
+ /* Is this interaction perturbed and not excluded? */
+ ind_j = (jGrid.cellOffset()*c_gpuNumClusterPerCell + cjr)*nbl->na_cj + j;
+ aj = atomIndices[ind_j];
+ if (aj >= 0 &&
+ (bFEP_i || jGrid.atomIsPerturbed(cjr, j)) &&
+ (!bDiagRemoved || ind_j >= ind_i))
+ {
+ int excl_pair;
+ unsigned int excl_bit;
+ real dx, dy, dz;
+
+ const int jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
+ nbnxn_excl_t *excl =
+ get_exclusion_mask(nbl, cj4_ind, jHalf);
+
+ excl_pair = a_mod_wj(j)*nbl->na_ci + i;
+ excl_bit = (1U << (gcj*c_gpuNumClusterPerCell + c));
+
+ dx = nbat->x()[ind_j*nbat->xstride+XX] - xi;
+ dy = nbat->x()[ind_j*nbat->xstride+YY] - yi;
+ dz = nbat->x()[ind_j*nbat->xstride+ZZ] - zi;
+
+ /* The unpruned GPU list has more than 2/3
+ * of the atom pairs beyond rlist. Using
+ * this list will cause a lot of overhead
+ * in the CPU FEP kernels, especially
+ * relative to the fast GPU kernels.
+ * So we prune the FEP list here.
+ */
+ if (dx*dx + dy*dy + dz*dz < rlist_fep2)
+ {
+ if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep)
+ {
+ fep_list_new_nri_copy(nlist);
+ nri = nlist->nri;
+ }
+
+ /* Add it to the FEP list */
+ nlist->jjnr[nlist->nrj] = aj;
+ nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0;
+ nlist->nrj++;
+ }
+
+ /* Exclude it from the normal list.
+ * Note that the charge and LJ parameters have
+ * been set to zero, but we need to avoid 0/0,
+ * as perturbed atoms can be on top of each other.
+ */
+ excl->pair[excl_pair] &= ~excl_bit;
+ }
+ }
+
+ /* Note that we could mask out this pair in imask
+ * if all i- and/or all j-particles are perturbed.
+ * But since the perturbed pairs on the CPU will
+ * take an order of magnitude more time, the GPU
+ * will finish before the CPU and there is no gain.
+ */
+ }
+ }
+ }
+
+ if (nlist->nrj > nlist->jindex[nri])
+ {
+ /* Actually add this new, non-empty, list */
+ nlist->nri++;
+ nlist->jindex[nlist->nri] = nlist->nrj;
+ }
+ }
+ }
+ }
+}
+
+/* Set all atom-pair exclusions for a GPU type list i-entry
+ *
+ * Sets all atom-pair exclusions from the topology stored in exclusions
+ * as masks in the pair-list for i-super-cluster list entry iEntry.
+ */
+static void
+setExclusionsForIEntry(const Nbnxm::GridSet &gridSet,
+ NbnxnPairlistGpu *nbl,
+ gmx_bool diagRemoved,
+ int gmx_unused na_cj_2log,
+ const nbnxn_sci_t &iEntry,
+ const t_blocka &exclusions)
+{
+ if (iEntry.numJClusterGroups() == 0)
+ {
+ /* Empty list */
+ return;
+ }
+
+ /* Set the search ranges using start and end j-cluster indices.
+ * Note that here we can not use cj4_ind_end, since the last cj4
+ * can be only partially filled, so we use cj_ind.
+ */
+ const JListRanges ranges(iEntry.cj4_ind_start*c_nbnxnGpuJgroupSize,
+ nbl->work->cj_ind,
+ gmx::makeConstArrayRef(nbl->cj4));
+
+ GMX_ASSERT(nbl->na_ci == c_nbnxnGpuClusterSize, "na_ci should match the GPU cluster size");
+ constexpr int c_clusterSize = c_nbnxnGpuClusterSize;
+ constexpr int c_superClusterSize = c_nbnxnGpuNumClusterPerSupercluster*c_nbnxnGpuClusterSize;
+
+ const int iSuperCluster = iEntry.sci;
+
+ gmx::ArrayRef<const int> atomIndices = gridSet.atomIndices();
+ gmx::ArrayRef<const int> cell = gridSet.cells();
+
+ /* Loop over the atoms in the i super-cluster */
+ for (int i = 0; i < c_superClusterSize; i++)
+ {
+ const int iIndex = iSuperCluster*c_superClusterSize + i;
+ const int iAtom = atomIndices[iIndex];
+ if (iAtom >= 0)
+ {
+ const int iCluster = i/c_clusterSize;
+
+ /* Loop over the topology-based exclusions for this i-atom */
+ for (int exclIndex = exclusions.index[iAtom]; exclIndex < exclusions.index[iAtom + 1]; exclIndex++)
+ {
+ const int jAtom = exclusions.a[exclIndex];
+
+ if (jAtom == iAtom)
+ {
+ /* The self exclusions are already set, save some time */
+ continue;
+ }
+
+ /* Get the index of the j-atom in the nbnxn atom data */
+ const int jIndex = cell[jAtom];
+
+ /* Without shifts we only calculate interactions j>i
+ * for one-way pair-lists.
+ */
+ /* NOTE: We would like to use iIndex on the right hand side,
+ * but that makes this routine 25% slower with gcc6/7.
+ * Even using c_superClusterSize makes it slower.
+ * Either of these changes triggers peeling of the exclIndex
+ * loop, which apparently leads to far less efficient code.
+ */
+ if (diagRemoved && jIndex <= iSuperCluster*nbl->na_sc + i)
+ {
+ continue;
+ }
+
+ const int jCluster = jIndex/c_clusterSize;
+
+                /* Check whether the cluster could be in our list */
+ if (jCluster >= ranges.cjFirst && jCluster <= ranges.cjLast)
+ {
+ const int index =
+ findJClusterInJList(jCluster, ranges,
+ gmx::makeConstArrayRef(nbl->cj4));
+
+ if (index >= 0)
+ {
+ /* We found an exclusion, clear the corresponding
+ * interaction bit.
+ */
+ const unsigned int pairMask = (1U << (cj_mod_cj4(index)*c_gpuNumClusterPerCell + iCluster));
+ /* Check if the i-cluster interacts with the j-cluster */
+ if (nbl_imask0(nbl, index) & pairMask)
+ {
+ const int innerI = (i & (c_clusterSize - 1));
+ const int innerJ = (jIndex & (c_clusterSize - 1));
+
+ /* Determine which j-half (CUDA warp) we are in */
+ const int jHalf = innerJ/(c_clusterSize/c_nbnxnGpuClusterpairSplit);
+
+ nbnxn_excl_t *interactionMask =
+ get_exclusion_mask(nbl, cj_to_cj4(index), jHalf);
+
+ interactionMask->pair[a_mod_wj(innerJ)*c_clusterSize + innerI] &= ~pairMask;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Make a new ci entry at the back of nbl->ci */
+static void addNewIEntry(NbnxnPairlistCpu *nbl, int ci, int shift, int flags)
+{
+ nbnxn_ci_t ciEntry;
+ ciEntry.ci = ci;
+ ciEntry.shift = shift;
+ /* Store the interaction flags along with the shift */
+ ciEntry.shift |= flags;
+ ciEntry.cj_ind_start = nbl->cj.size();
+ ciEntry.cj_ind_end = nbl->cj.size();
+ nbl->ci.push_back(ciEntry);
+}
+
+/* Make a new sci entry at index nbl->nsci */
+static void addNewIEntry(NbnxnPairlistGpu *nbl, int sci, int shift, int gmx_unused flags)
+{
+ nbnxn_sci_t sciEntry;
+ sciEntry.sci = sci;
+ sciEntry.shift = shift;
+ sciEntry.cj4_ind_start = nbl->cj4.size();
+ sciEntry.cj4_ind_end = nbl->cj4.size();
+
+ nbl->sci.push_back(sciEntry);
+}
+
+/* Sort the simple j-list cj on exclusions.
+ * Entries with exclusions will all be sorted to the beginning of the list.
+ */
+static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
+ NbnxnPairlistCpuWork *work)
+{
+ work->cj.resize(ncj);
+
+ /* Make a list of the j-cells involving exclusions */
+ int jnew = 0;
+ for (int j = 0; j < ncj; j++)
+ {
+ if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
+ {
+ work->cj[jnew++] = cj[j];
+ }
+ }
+    /* Only reorder when there are exclusions and they are not already confined to the first entry */
+ if (!((jnew == 0) ||
+ (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
+ {
+ for (int j = 0; j < ncj; j++)
+ {
+ if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
+ {
+ work->cj[jnew++] = cj[j];
+ }
+ }
+ for (int j = 0; j < ncj; j++)
+ {
+ cj[j] = work->cj[j];
+ }
+ }
+}
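+
+/* Note that sort_cj_excl is a stable two-pass partition: entries with
+ * exclusions keep their relative order at the front, followed by the
+ * fully-interacting entries.
+ */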
+
+/* Close this simple list i entry */
+static void closeIEntry(NbnxnPairlistCpu *nbl,
+ int gmx_unused sp_max_av,
+ gmx_bool gmx_unused progBal,
+ float gmx_unused nsp_tot_est,
+ int gmx_unused thread,
+ int gmx_unused nthread)
+{
+ nbnxn_ci_t &ciEntry = nbl->ci.back();
+
+    /* All content of the new ci entry has already been filled correctly,
+ * we only need to sort and increase counts or remove the entry when empty.
+ */
+ const int jlen = ciEntry.cj_ind_end - ciEntry.cj_ind_start;
+ if (jlen > 0)
+ {
+ sort_cj_excl(nbl->cj.data() + ciEntry.cj_ind_start, jlen, nbl->work.get());
+
+ /* The counts below are used for non-bonded pair/flop counts
+ * and should therefore match the available kernel setups.
+ */
+ if (!(ciEntry.shift & NBNXN_CI_DO_COUL(0)))
+ {
+ nbl->work->ncj_noq += jlen;
+ }
+ else if ((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) ||
+ !(ciEntry.shift & NBNXN_CI_DO_LJ(0)))
+ {
+ nbl->work->ncj_hlj += jlen;
+ }
+ }
+ else
+ {
+ /* Entry is empty: remove it */
+ nbl->ci.pop_back();
+ }
+}
+
+/* Split sci entry for load balancing on the GPU.
+ * Splitting ensures we have enough lists to fully utilize the whole GPU.
+ * With progBal we generate progressively smaller lists, which improves
+ * load balancing. As we only know the current count on our own thread,
+ * we will need to estimate the current total amount of i-entries.
+ * As the lists get concatenated later, this estimate depends
+ * both on nthread and our own thread index.
+ */
+static void split_sci_entry(NbnxnPairlistGpu *nbl,
+ int nsp_target_av,
+ gmx_bool progBal, float nsp_tot_est,
+ int thread, int nthread)
+{
+ int nsp_max;
+
+ if (progBal)
+ {
+ float nsp_est;
+
+        /* Estimate the total number of ci's of the nblist combined
+ * over all threads using the target number of ci's.
+ */
+ nsp_est = (nsp_tot_est*thread)/nthread + nbl->nci_tot;
+
+ /* The first ci blocks should be larger, to avoid overhead.
+ * The last ci blocks should be smaller, to improve load balancing.
+ * The factor 3/2 makes the first block 3/2 times the target average
+         * and ensures that the total number of blocks ends up equal to
+ * that of equally sized blocks of size nsp_target_av.
+ */
+ nsp_max = static_cast<int>(nsp_target_av*(nsp_tot_est*1.5/(nsp_est + nsp_tot_est)));
+ }
+ else
+ {
+ nsp_max = nsp_target_av;
+ }
+
+ const int cj4_start = nbl->sci.back().cj4_ind_start;
+ const int cj4_end = nbl->sci.back().cj4_ind_end;
+ const int j4len = cj4_end - cj4_start;
+
+ if (j4len > 1 && j4len*c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize > nsp_max)
+ {
+ /* Modify the last ci entry and process the cj4's again */
+
+ int nsp = 0;
+ int nsp_sci = 0;
+ int nsp_cj4_e = 0;
+ int nsp_cj4 = 0;
+ for (int cj4 = cj4_start; cj4 < cj4_end; cj4++)
+ {
+ int nsp_cj4_p = nsp_cj4;
+ /* Count the number of cluster pairs in this cj4 group */
+ nsp_cj4 = 0;
+ for (int p = 0; p < c_gpuNumClusterPerCell*c_nbnxnGpuJgroupSize; p++)
+ {
+ nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
+ }
+
+ /* If adding the current cj4 with nsp_cj4 pairs gets us further
+ * away from our target nsp_max, split the list before this cj4.
+ */
+ if (nsp > 0 && nsp_max - nsp < nsp + nsp_cj4 - nsp_max)
+ {
+ /* Split the list at cj4 */
+ nbl->sci.back().cj4_ind_end = cj4;
+ /* Create a new sci entry */
+ nbnxn_sci_t sciNew;
+ sciNew.sci = nbl->sci.back().sci;
+ sciNew.shift = nbl->sci.back().shift;
+ sciNew.cj4_ind_start = cj4;
+ nbl->sci.push_back(sciNew);
+
+ nsp_sci = nsp;
+ nsp_cj4_e = nsp_cj4_p;
+ nsp = 0;
+ }
+ nsp += nsp_cj4;
+ }
+
+ /* Put the remaining cj4's in the last sci entry */
+ nbl->sci.back().cj4_ind_end = cj4_end;
+
+ /* Possibly balance out the last two sci's
+ * by moving the last cj4 of the second last sci.
+ */
+ if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
+ {
+ GMX_ASSERT(nbl->sci.size() >= 2, "We expect at least two elements");
+ nbl->sci[nbl->sci.size() - 2].cj4_ind_end--;
+ nbl->sci[nbl->sci.size() - 1].cj4_ind_start--;
+ }
+ }
+}
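+
+/* Worked example of the progressive splitting above, for illustration only:
+ * with nsp_target_av = 36 and nsp_tot_est = 10000, the first entry on
+ * thread 0 has nsp_est = 0, so nsp_max = 36*1.5 = 54. Once nsp_est
+ * approaches nsp_tot_est the factor tends to 1.5/2, giving nsp_max = 27.
+ * Blocks thus shrink from 1.5x to 0.75x the target while averaging 1x,
+ * which is what keeps the total block count unchanged.
+ */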
+
+/* Close this super/sub list i entry */
+static void closeIEntry(NbnxnPairlistGpu *nbl,
+ int nsp_max_av,
+ gmx_bool progBal, float nsp_tot_est,
+ int thread, int nthread)
+{
+ nbnxn_sci_t &sciEntry = *getOpenIEntry(nbl);
+
+ /* All content of the new sci entry has already been filled correctly;
+ * we only need to, potentially, split the entry or remove it when empty.
+ */
+ int j4len = sciEntry.numJClusterGroups();
+ if (j4len > 0)
+ {
+ /* We can only have complete blocks of 4 j-entries in a list,
+ * so round the count up before closing.
+ */
+ int ncj4 = (nbl->work->cj_ind + c_nbnxnGpuJgroupSize - 1)/c_nbnxnGpuJgroupSize;
+ nbl->work->cj_ind = ncj4*c_nbnxnGpuJgroupSize;
+
+ if (nsp_max_av > 0)
+ {
+ /* Measure the size of the new entry and potentially split it */
+ split_sci_entry(nbl, nsp_max_av, progBal, nsp_tot_est,
+ thread, nthread);
+ }
+ }
+ else
+ {
+ /* Entry is empty: remove it */
+ nbl->sci.pop_back();
+ }
+}
+
+/* Syncs the working array before adding another grid pair to the CPU list; no work is needed for the CPU layout */
+static void sync_work(NbnxnPairlistCpu gmx_unused *nbl)
+{
+}
+
+/* Syncs the working array before adding another grid pair to the GPU list */
+static void sync_work(NbnxnPairlistGpu *nbl)
+{
+ nbl->work->cj_ind = nbl->cj4.size()*c_nbnxnGpuJgroupSize;
+}
+
+/* Clears an NbnxnPairlistCpu data structure */
+static void clear_pairlist(NbnxnPairlistCpu *nbl)
+{
+ nbl->ci.clear();
+ nbl->cj.clear();
+ nbl->ncjInUse = 0;
+ nbl->nci_tot = 0;
+ nbl->ciOuter.clear();
+ nbl->cjOuter.clear();
+
+ nbl->work->ncj_noq = 0;
+ nbl->work->ncj_hlj = 0;
+}
+
+/* Clears an NbnxnPairlistGpu data structure */
+static void clear_pairlist(NbnxnPairlistGpu *nbl)
+{
+ nbl->sci.clear();
+ nbl->cj4.clear();
+ nbl->excl.resize(1);
+ nbl->nci_tot = 0;
+}
+
+/* Clears a group scheme pair list */
+static void clear_pairlist_fep(t_nblist *nl)
+{
+ nl->nri = 0;
+ nl->nrj = 0;
+ if (nl->jindex == nullptr)
+ {
+ snew(nl->jindex, 1);
+ }
+ nl->jindex[0] = 0;
+}
+
+/* Sets a simple list i-cell bounding box, including PBC shift */
+static inline void set_icell_bb_simple(gmx::ArrayRef<const BoundingBox> bb,
+ int ci,
+ real shx, real shy, real shz,
+ BoundingBox *bb_ci)
+{
+ bb_ci->lower.x = bb[ci].lower.x + shx;
+ bb_ci->lower.y = bb[ci].lower.y + shy;
+ bb_ci->lower.z = bb[ci].lower.z + shz;
+ bb_ci->upper.x = bb[ci].upper.x + shx;
+ bb_ci->upper.y = bb[ci].upper.y + shy;
+ bb_ci->upper.z = bb[ci].upper.z + shz;
+}
+
+/* Sets a simple list i-cell bounding box, including PBC shift */
+static inline void set_icell_bb(const Grid &iGrid,
+ int ci,
+ real shx, real shy, real shz,
+ NbnxnPairlistCpuWork *work)
+{
+ set_icell_bb_simple(iGrid.iBoundingBoxes(), ci, shx, shy, shz,
+ &work->iClusterData.bb[0]);
+}
+
+#if NBNXN_BBXXXX
+/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
+static void set_icell_bbxxxx_supersub(gmx::ArrayRef<const float> bb,
+ int ci,
+ real shx, real shy, real shz,
+ float *bb_ci)
+{
+ constexpr int cellBBStride = packedBoundingBoxesIndex(c_gpuNumClusterPerCell);
+ constexpr int pbbStride = c_packedBoundingBoxesDimSize;
+ const int ia = ci*cellBBStride;
+ for (int m = 0; m < cellBBStride; m += c_packedBoundingBoxesSize)
+ {
+ for (int i = 0; i < pbbStride; i++)
+ {
+ bb_ci[m + 0*pbbStride + i] = bb[ia + m + 0*pbbStride + i] + shx;
+ bb_ci[m + 1*pbbStride + i] = bb[ia + m + 1*pbbStride + i] + shy;
+ bb_ci[m + 2*pbbStride + i] = bb[ia + m + 2*pbbStride + i] + shz;
+ bb_ci[m + 3*pbbStride + i] = bb[ia + m + 3*pbbStride + i] + shx;
+ bb_ci[m + 4*pbbStride + i] = bb[ia + m + 4*pbbStride + i] + shy;
+ bb_ci[m + 5*pbbStride + i] = bb[ia + m + 5*pbbStride + i] + shz;
+ }
+ }
+}
+#endif
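+
+/* The packed layout used above stores, for each group of
+ * c_packedBoundingBoxesDimSize sub-cells, all lower-x coordinates
+ * contiguously, then lower-y, lower-z, upper-x, upper-y and upper-z.
+ * This way a SIMD distance kernel can load one coordinate component
+ * for a whole group of sub-cells with a single aligned load.
+ */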
+
+/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
+gmx_unused static void set_icell_bb_supersub(gmx::ArrayRef<const BoundingBox> bb,
+ int ci,
+ real shx, real shy, real shz,
+ BoundingBox *bb_ci)
+{
+ for (int i = 0; i < c_gpuNumClusterPerCell; i++)
+ {
+ set_icell_bb_simple(bb, ci*c_gpuNumClusterPerCell+i,
+ shx, shy, shz,
+ &bb_ci[i]);
+ }
+}
+
+/* Sets a super-cell and sub cell bounding boxes, including PBC shift */
+gmx_unused static void set_icell_bb(const Grid &iGrid,
+ int ci,
+ real shx, real shy, real shz,
+ NbnxnPairlistGpuWork *work)
+{
+#if NBNXN_BBXXXX
+ set_icell_bbxxxx_supersub(iGrid.packedBoundingBoxes(), ci, shx, shy, shz,
+ work->iSuperClusterData.bbPacked.data());
+#else
+ set_icell_bb_supersub(iGrid.iBoundingBoxes(), ci, shx, shy, shz,
+ work->iSuperClusterData.bb.data());
+#endif
+}
+
+/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
+static void icell_set_x_simple(int ci,
+ real shx, real shy, real shz,
+ int stride, const real *x,
+ NbnxnPairlistCpuWork::IClusterData *iClusterData)
+{
+ const int ia = ci*c_nbnxnCpuIClusterSize;
+
+ for (int i = 0; i < c_nbnxnCpuIClusterSize; i++)
+ {
+ iClusterData->x[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
+ iClusterData->x[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
+ iClusterData->x[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
+ }
+}
+
+static void icell_set_x(int ci,
+ real shx, real shy, real shz,
+ int stride, const real *x,
+ const ClusterDistanceKernelType kernelType,
+ NbnxnPairlistCpuWork *work)
+{
+ switch (kernelType)
+ {
+#if GMX_SIMD
+#ifdef GMX_NBNXN_SIMD_4XN
+ case ClusterDistanceKernelType::CpuSimd_4xM:
+ icell_set_x_simd_4xn(ci, shx, shy, shz, stride, x, work);
+ break;
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case ClusterDistanceKernelType::CpuSimd_2xMM:
+ icell_set_x_simd_2xnn(ci, shx, shy, shz, stride, x, work);
+ break;
+#endif
+#endif
+ case ClusterDistanceKernelType::CpuPlainC:
+ icell_set_x_simple(ci, shx, shy, shz, stride, x, &work->iClusterData);
+ break;
+ default:
+ GMX_ASSERT(false, "Unhandled case");
+ break;
+ }
+}
+
+/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
+static void icell_set_x(int ci,
+ real shx, real shy, real shz,
+ int stride, const real *x,
+ ClusterDistanceKernelType gmx_unused kernelType,
+ NbnxnPairlistGpuWork *work)
+{
+#if !GMX_SIMD4_HAVE_REAL
+
+ real * x_ci = work->iSuperClusterData.x.data();
+
+ int ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize;
+ for (int i = 0; i < c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize; i++)
+ {
+ x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
+ x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
+ x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
+ }
+
+#else /* !GMX_SIMD4_HAVE_REAL */
+
+ real * x_ci = work->iSuperClusterData.xSimd.data();
+
+ for (int si = 0; si < c_gpuNumClusterPerCell; si++)
+ {
+ for (int i = 0; i < c_nbnxnGpuClusterSize; i += GMX_SIMD4_WIDTH)
+ {
+ int io = si*c_nbnxnGpuClusterSize + i;
+ int ia = ci*c_gpuNumClusterPerCell*c_nbnxnGpuClusterSize + io;
+ for (int j = 0; j < GMX_SIMD4_WIDTH; j++)
+ {
+ x_ci[io*DIM + j + XX*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + XX] + shx;
+ x_ci[io*DIM + j + YY*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + YY] + shy;
+ x_ci[io*DIM + j + ZZ*GMX_SIMD4_WIDTH] = x[(ia + j)*stride + ZZ] + shz;
+ }
+ }
+ }
+
+#endif /* !GMX_SIMD4_HAVE_REAL */
+}
+
+static real minimum_subgrid_size_xy(const Grid &grid)
+{
+ const Grid::Dimensions &dims = grid.dimensions();
+
+ if (grid.geometry().isSimple)
+ {
+ return std::min(dims.cellSize[XX], dims.cellSize[YY]);
+ }
+ else
+ {
+ return std::min(dims.cellSize[XX]/c_gpuNumClusterPerCellX,
+ dims.cellSize[YY]/c_gpuNumClusterPerCellY);
+ }
+}
+
+static real effective_buffer_1x1_vs_MxN(const Grid &iGrid,
+ const Grid &jGrid)
+{
+ const real eff_1x1_buffer_fac_overest = 0.1;
+
+ /* Determine an atom-pair list cut-off buffer size for atom pairs,
+ * to be added to rlist (including buffer) used for MxN.
+ * This is for converting an MxN list to a 1x1 list. This means we can't
+ * use the normal buffer estimate, as we have an MxN list in which
+ * some atom pairs beyond rlist are missing. We want to capture
+ * the beneficial effect of buffering by extra pairs just outside rlist,
+ * while removing the useless pairs that are further away from rlist.
+ * (Also the buffer could have been set manually not using the estimate.)
+ * This buffer size is an overestimate.
+ * We add 10% of the smallest grid sub-cell dimensions.
+ * Note that the z-size differs per cell and we don't use this,
+ * so we overestimate.
+ * With PME, the 10% value gives a buffer that is somewhat larger
+ * than the effective buffer with a tolerance of 0.005 kJ/mol/ps.
+ * Smaller tolerances or using RF lead to a smaller effective buffer,
+ * so 10% gives a safe overestimate.
+ */
+ return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(iGrid) +
+ minimum_subgrid_size_xy(jGrid));
+}
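+
+/* Example of the estimate above, for illustration only: with cubic
+ * sub-cells of 0.3 nm on both the i- and j-grid, the extra 1x1 buffer
+ * becomes 0.1*(0.3 + 0.3) = 0.06 nm on top of rlist.
+ */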
+
+/* Estimates the interaction volume^2 for non-local interactions */
+static real nonlocal_vol2(const struct gmx_domdec_zones_t *zones, const rvec ls, real r)
+{
+ real cl, ca, za;
+ real vold_est;
+ real vol2_est_tot;
+
+ vol2_est_tot = 0;
+
+ /* Here we simply add up the interaction volume^2 of the non-home
+ * zones that have a single 1D decomposition shift (1, 2 or 3 zones).
+ * As these volumes are not additive, this is an overestimate, but it
+ * would only be significant in the limit of small cells, where we
+ * anyhow need to split the lists into as small parts as possible.
+ */
+
+ for (int z = 0; z < zones->n; z++)
+ {
+ if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
+ {
+ cl = 0;
+ ca = 1;
+ za = 1;
+ for (int d = 0; d < DIM; d++)
+ {
+ if (zones->shift[z][d] == 0)
+ {
+ cl += 0.5*ls[d];
+ ca *= ls[d];
+ za *= zones->size[z].x1[d] - zones->size[z].x0[d];
+ }
+ }
+
+ /* 4 octants of a sphere */
+ vold_est = 0.25*M_PI*r*r*r*r;
+ /* 4 quarter pie slices on the edges */
+ vold_est += 4*cl*M_PI/6.0*r*r*r;
+ /* One rectangular volume on a face */
+ vold_est += ca*0.5*r*r;
+
+ vol2_est_tot += vold_est*za;
+ }
+ }
+
+ return vol2_est_tot;
+}
+
+/* Estimates the average size of a full j-list for super/sub setup */
+static void get_nsubpair_target(const Nbnxm::GridSet &gridSet,
+ const InteractionLocality iloc,
+ const real rlist,
+ const int min_ci_balanced,
+ int *nsubpair_target,
+ float *nsubpair_tot_est)
+{
+ /* The target value of 36 seems to be the optimum for Kepler.
+ * Maxwell is less sensitive to the exact value.
+ */
+ const int nsubpair_target_min = 36;
+ real r_eff_sup, vol_est, nsp_est, nsp_est_nl;
+
+ const Grid &grid = gridSet.grids()[0];
+
+ /* We don't need to balance list sizes if:
+ * - We didn't request balancing.
+ * - The number of grid cells >= the number of lists requested,
+ * since we will always generate at least #cells lists.
+ * - We don't have any cells, since then there won't be any lists.
+ */
+ if (min_ci_balanced <= 0 || grid.numCells() >= min_ci_balanced || grid.numCells() == 0)
+ {
+ /* nsubpair_target==0 signals no balancing */
+ *nsubpair_target = 0;
+ *nsubpair_tot_est = 0;
+
+ return;
+ }
+
+ gmx::RVec ls;
+ const int numAtomsCluster = grid.geometry().numAtomsICluster;
+ const Grid::Dimensions &dims = grid.dimensions();
+
+ ls[XX] = dims.cellSize[XX]/c_gpuNumClusterPerCellX;
+ ls[YY] = dims.cellSize[YY]/c_gpuNumClusterPerCellY;
+ ls[ZZ] = numAtomsCluster/(dims.atomDensity*ls[XX]*ls[YY]);
+
+ /* The formulas below are a heuristic estimate of the average nsj per si */
+ r_eff_sup = rlist + nbnxn_get_rlist_effective_inc(numAtomsCluster, ls);
+
+ if (!gridSet.domainSetup().haveMultipleDomains ||
+ gridSet.domainSetup().zones->n == 1)
+ {
+ nsp_est_nl = 0;
+ }
+ else
+ {
+ nsp_est_nl =
+ gmx::square(dims.atomDensity/numAtomsCluster)*
+ nonlocal_vol2(gridSet.domainSetup().zones, ls, r_eff_sup);
+ }
+
+ if (iloc == InteractionLocality::Local)
+ {
+ /* Sub-cell interacts with itself */
+ vol_est = ls[XX]*ls[YY]*ls[ZZ];
+ /* 6/2 rectangular volume on the faces */
+ vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
+ /* 12/2 quarter pie slices on the edges */
+ vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*gmx::square(r_eff_sup);
+ /* 4 octants of a sphere */
+ vol_est += 0.5*4.0/3.0*M_PI*gmx::power3(r_eff_sup);
+
+ /* Estimate the number of cluster pairs as the local number of
+ * clusters times the volume they interact with times the density.
+ */
+ nsp_est = grid.numClusters()*vol_est*dims.atomDensity/numAtomsCluster;
+
+ /* Subtract the non-local pair count */
+ nsp_est -= nsp_est_nl;
+
+ /* For small cut-offs nsp_est will be an underestimate.
+ * With DD nsp_est_nl is an overestimate so nsp_est can get negative.
+ * So to avoid too small or negative nsp_est we set a minimum of
+ * all cells interacting with all 3^3 direct neighbors (3^3-1)/2+1=14.
+ * This might be a slight overestimate for small non-periodic groups of
+ * atoms as will occur for a local domain with DD, but for small
+ * groups of atoms we'll anyhow be limited by nsubpair_target_min,
+ * so this overestimation will not matter.
+ */
+ nsp_est = std::max(nsp_est, grid.numClusters()*14._real);
+
+ if (debug)
+ {
+ fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
+ nsp_est, nsp_est_nl);
+ }
+ }
+ else
+ {
+ nsp_est = nsp_est_nl;
+ }
+
+ /* Thus the (average) maximum j-list size should be as follows.
+ * Since there is overhead, we shouldn't make the lists too small
+ * (and we can't chop up j-groups) so we use a minimum target size of 36.
+ */
+ *nsubpair_target = std::max(nsubpair_target_min,
+ roundToInt(nsp_est/min_ci_balanced));
+ *nsubpair_tot_est = static_cast<int>(nsp_est);
+
+ if (debug)
+ {
+ fprintf(debug, "nbl nsp estimate %.1f, nsubpair_target %d\n",
+ nsp_est, *nsubpair_target);
+ }
+}
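+
+/* Example of the target computation above, for illustration only:
+ * with nsp_est = 1e5 and min_ci_balanced = 2000 we get 1e5/2000 = 50,
+ * which exceeds nsubpair_target_min, so nsubpair_target = 50. With
+ * min_ci_balanced = 4000 the ratio is 25 and is clamped up to 36.
+ */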
+
+/* Debug list print function */
+static void print_nblist_ci_cj(FILE *fp,
+ const NbnxnPairlistCpu &nbl)
+{
+ for (const nbnxn_ci_t &ciEntry : nbl.ci)
+ {
+ fprintf(fp, "ci %4d shift %2d ncj %3d\n",
+ ciEntry.ci, ciEntry.shift,
+ ciEntry.cj_ind_end - ciEntry.cj_ind_start);
+
+ for (int j = ciEntry.cj_ind_start; j < ciEntry.cj_ind_end; j++)
+ {
+ fprintf(fp, " cj %5d imask %x\n",
+ nbl.cj[j].cj,
+ nbl.cj[j].excl);
+ }
+ }
+}
+
+/* Debug list print function */
+static void print_nblist_sci_cj(FILE *fp,
+ const NbnxnPairlistGpu &nbl)
+{
+ for (const nbnxn_sci_t &sci : nbl.sci)
+ {
+ fprintf(fp, "ci %4d shift %2d ncj4 %2d\n",
+ sci.sci, sci.shift,
+ sci.numJClusterGroups());
+
+ int ncp = 0;
+ for (int j4 = sci.cj4_ind_start; j4 < sci.cj4_ind_end; j4++)
+ {
+ for (int j = 0; j < c_nbnxnGpuJgroupSize; j++)
+ {
+ fprintf(fp, " sj %5d imask %x\n",
+ nbl.cj4[j4].cj[j],
+ nbl.cj4[j4].imei[0].imask);
+ for (int si = 0; si < c_gpuNumClusterPerCell; si++)
+ {
+ if (nbl.cj4[j4].imei[0].imask & (1U << (j*c_gpuNumClusterPerCell + si)))
+ {
+ ncp++;
+ }
+ }
+ }
+ }
+ fprintf(fp, "ci %4d shift %2d ncj4 %2d ncp %3d\n",
+ sci.sci, sci.shift,
+ sci.numJClusterGroups(),
+ ncp);
+ }
+}
+
+/* Combine the pair lists nbls, generated on multiple threads, into the single list nblc */
+static void combine_nblists(gmx::ArrayRef<const NbnxnPairlistGpu> nbls,
+ NbnxnPairlistGpu *nblc)
+{
+ int nsci = nblc->sci.size();
+ int ncj4 = nblc->cj4.size();
+ int nexcl = nblc->excl.size();
+ for (auto &nbl : nbls)
+ {
+ nsci += nbl.sci.size();
+ ncj4 += nbl.cj4.size();
+ nexcl += nbl.excl.size();
+ }
+
+ /* Resize with the final, combined size, so we can fill in parallel */
+ /* NOTE: For better performance we should use default initialization */
+ nblc->sci.resize(nsci);
+ nblc->cj4.resize(ncj4);
+ nblc->excl.resize(nexcl);
+
+ /* Each thread should copy its own data to the combined arrays,
+ * as otherwise data will go back and forth between different caches.
+ */
+#if GMX_OPENMP && !(defined __clang_analyzer__)
+ int nthreads = gmx_omp_nthreads_get(emntPairsearch);
+#endif
+
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+ for (int n = 0; n < nbls.ssize(); n++)
+ {
+ try
+ {
+ /* Determine the offset in the combined data for our thread.
+ * Note that the original sizes in nblc are lost.
+ */
+ int sci_offset = nsci;
+ int cj4_offset = ncj4;
+ int excl_offset = nexcl;
+
+ for (int i = n; i < nbls.ssize(); i++)
+ {
+ sci_offset -= nbls[i].sci.size();
+ cj4_offset -= nbls[i].cj4.size();
+ excl_offset -= nbls[i].excl.size();
+ }
+
+ const NbnxnPairlistGpu &nbli = nbls[n];
+
+ for (size_t i = 0; i < nbli.sci.size(); i++)
+ {
+ nblc->sci[sci_offset + i] = nbli.sci[i];
+ nblc->sci[sci_offset + i].cj4_ind_start += cj4_offset;
+ nblc->sci[sci_offset + i].cj4_ind_end += cj4_offset;
+ }
+
+ for (size_t j4 = 0; j4 < nbli.cj4.size(); j4++)
+ {
+ nblc->cj4[cj4_offset + j4] = nbli.cj4[j4];
+ nblc->cj4[cj4_offset + j4].imei[0].excl_ind += excl_offset;
+ nblc->cj4[cj4_offset + j4].imei[1].excl_ind += excl_offset;
+ }
+
+ for (size_t j4 = 0; j4 < nbli.excl.size(); j4++)
+ {
+ nblc->excl[excl_offset + j4] = nbli.excl[j4];
+ }
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+
+ for (auto &nbl : nbls)
+ {
+ nblc->nci_tot += nbl.nci_tot;
+ }
+}
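+
+/* The offset computation above is an exclusive prefix sum over the list
+ * sizes, written as "total minus suffix" so every thread can derive its
+ * own offsets without synchronization. A serial sketch of the same
+ * result, for illustration only (offset[], size[] and destOriginalSize
+ * are hypothetical names):
+ *
+ *   int sciOffset = destOriginalSize;
+ *   for (int i = 0; i < numLists; i++)
+ *   {
+ *       offset[i]  = sciOffset;
+ *       sciOffset += size[i];
+ *   }
+ */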
+
+static void balance_fep_lists(gmx::ArrayRef < std::unique_ptr < t_nblist>> fepLists,
+ gmx::ArrayRef<PairsearchWork> work)
+{
+ const int numLists = fepLists.ssize();
+
+ if (numLists == 1)
+ {
+ /* Nothing to balance */
+ return;
+ }
+
+ /* Count the total i-lists and pairs */
+ int nri_tot = 0;
+ int nrj_tot = 0;
+ for (const auto &list : fepLists)
+ {
+ nri_tot += list->nri;
+ nrj_tot += list->nrj;
+ }
+
+ const int nrj_target = (nrj_tot + numLists - 1)/numLists;
+
+ GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == numLists,
+ "We should have as many work objects as FEP lists");
+
+#pragma omp parallel for schedule(static) num_threads(numLists)
+ for (int th = 0; th < numLists; th++)
+ {
+ try
+ {
+ t_nblist *nbl = work[th].nbl_fep.get();
+
+ /* Note that here we allocate for the total size, instead of
+ * a per-thread estimate (which is hard to obtain).
+ */
+ if (nri_tot > nbl->maxnri)
+ {
+ nbl->maxnri = over_alloc_large(nri_tot);
+ reallocate_nblist(nbl);
+ }
+ if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj)
+ {
+ nbl->maxnrj = over_alloc_small(nrj_tot);
+ srenew(nbl->jjnr, nbl->maxnrj);
+ srenew(nbl->excl_fep, nbl->maxnrj);
+ }
+
+ clear_pairlist_fep(nbl);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+
+ /* Loop over the source lists and assign and copy i-entries */
+ int th_dest = 0;
+ t_nblist *nbld = work[th_dest].nbl_fep.get();
+ for (int th = 0; th < numLists; th++)
+ {
+ const t_nblist *nbls = fepLists[th].get();
+
+ for (int i = 0; i < nbls->nri; i++)
+ {
+ int nrj;
+
+ /* The number of pairs in this i-entry */
+ nrj = nbls->jindex[i+1] - nbls->jindex[i];
+
+ /* Decide whether list th_dest is too large and we should proceed
+ * to the next destination list.
+ */
+ if (th_dest + 1 < numLists && nbld->nrj > 0 &&
+ nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj)
+ {
+ th_dest++;
+ nbld = work[th_dest].nbl_fep.get();
+ }
+
+ nbld->iinr[nbld->nri] = nbls->iinr[i];
+ nbld->gid[nbld->nri] = nbls->gid[i];
+ nbld->shift[nbld->nri] = nbls->shift[i];
+
+ for (int j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++)
+ {
+ nbld->jjnr[nbld->nrj] = nbls->jjnr[j];
+ nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j];
+ nbld->nrj++;
+ }
+ nbld->nri++;
+ nbld->jindex[nbld->nri] = nbld->nrj;
+ }
+ }
+
+ /* Swap the list pointers */
+ for (int th = 0; th < numLists; th++)
+ {
+ fepLists[th].swap(work[th].nbl_fep);
+
+ if (debug)
+ {
+ fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n",
+ th,
+ fepLists[th]->nri,
+ fepLists[th]->nrj);
+ }
+ }
+}
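+
+/* The split condition above advances to the next destination list when
+ * adding the i-entry would overshoot nrj_target by more than the current
+ * shortfall. Example, for illustration only: with nrj_target = 100, a
+ * destination list holding 90 pairs accepts an i-entry of 15 pairs
+ * (105 is closer to the target than 90), but an i-entry of 25 pairs
+ * (115) starts the next list.
+ */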
+
+/* Returns the next ci to be processed by our thread */
+static gmx_bool next_ci(const Grid &grid,
+ int nth, int ci_block,
+ int *ci_x, int *ci_y,
+ int *ci_b, int *ci)
+{
+ (*ci_b)++;
+ (*ci)++;
+
+ if (*ci_b == ci_block)
+ {
+ /* Jump to the next block assigned to this task */
+ *ci += (nth - 1)*ci_block;
+ *ci_b = 0;
+ }
+
+ if (*ci >= grid.numCells())
+ {
+ return FALSE;
+ }
+
+ while (*ci >= grid.firstCellInColumn(*ci_x*grid.dimensions().numCells[YY] + *ci_y + 1))
+ {
+ *ci_y += 1;
+ if (*ci_y == grid.dimensions().numCells[YY])
+ {
+ *ci_x += 1;
+ *ci_y = 0;
+ }
+ }
+
+ return TRUE;
+}
+
+/* Returns the distance^2 for which we put cell pairs in the list
+ * without checking atom pair distances. This is usually < rlist^2.
+ */
+static float boundingbox_only_distance2(const Grid::Dimensions &iGridDims,
+ const Grid::Dimensions &jGridDims,
+ real rlist,
+ gmx_bool simple)
+{
+ /* If the distance between two sub-cell bounding boxes is less
+ * than this distance, do not check the distance between
+ * all particle pairs in the sub-cell, since then it is likely
+ * that the box pair has atom pairs within the cut-off.
+ * We use the nblist cut-off minus 0.5 times the average x/y diagonal
+ * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
+ * Using more than 0.5 gains at most 0.5%.
+ * If forces are calculated more than twice, the performance gain
+ * in the force calculation outweighs the cost of checking.
+ * Note that with subcell lists, the atom-pair distance check
+ * is only performed when only 1 out of 8 sub-cells is within range;
+ * this is because the GPU is much faster than the CPU.
+ */
+ real bbx, bby;
+ real rbb2;
+
+ bbx = 0.5*(iGridDims.cellSize[XX] + jGridDims.cellSize[XX]);
+ bby = 0.5*(iGridDims.cellSize[YY] + jGridDims.cellSize[YY]);
+ if (!simple)
+ {
+ bbx /= c_gpuNumClusterPerCellX;
+ bby /= c_gpuNumClusterPerCellY;
+ }
+
+ rbb2 = std::max(0.0, rlist - 0.5*std::sqrt(bbx*bbx + bby*bby));
+ rbb2 = rbb2 * rbb2;
+
+#if !GMX_DOUBLE
+ return rbb2;
+#else
+ return static_cast<float>((1 + GMX_FLOAT_EPS)*rbb2);
+#endif
+}
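+
+/* Example of the distance above, for illustration only: for a simple grid
+ * with 0.5 nm cells in x and y and rlist = 1.0 nm, half the average x/y
+ * diagonal is 0.5*sqrt(0.5^2 + 0.5^2) = 0.354 nm, so
+ * rbb2 = (1.0 - 0.354)^2 = 0.418 nm^2; bounding-box pairs closer than
+ * this are put in the list without a per-atom distance check.
+ */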
+
+static int get_ci_block_size(const Grid &iGrid,
+ const bool haveMultipleDomains,
+ const int numLists)
+{
+ const int ci_block_enum = 5;
+ const int ci_block_denom = 11;
+ const int ci_block_min_atoms = 16;
+ int ci_block;
+
+ /* Here we decide how to distribute the blocks over the threads.
+ * We use prime numbers to try to avoid the grid size becoming
+ * a multiple of the number of threads, which would lead to some
+ * threads getting "inner" pairs and others getting boundary pairs,
+ * which in turn leads to load imbalance between threads.
+ * Set the block size as 5/11/ntask times the average number of cells
+ * in a y,z slab. This should ensure a quite uniform distribution
+ * of the grid parts of the different threads along all three grid
+ * zone boundaries with 3D domain decomposition. At the same time
+ * the blocks will not become too small.
+ */
+ GMX_ASSERT(iGrid.dimensions().numCells[XX] > 0, "Grid can't be empty");
+ GMX_ASSERT(numLists > 0, "We need at least one list");
+ ci_block = (iGrid.numCells()*ci_block_enum)/(ci_block_denom*iGrid.dimensions().numCells[XX]*numLists);
+
+ const int numAtomsPerCell = iGrid.geometry().numAtomsPerCell;
+
+ /* Ensure the blocks are not too small: avoids cache invalidation */
+ if (ci_block*numAtomsPerCell < ci_block_min_atoms)
+ {
+ ci_block = (ci_block_min_atoms + numAtomsPerCell - 1)/numAtomsPerCell;
+ }
+
+ /* Without domain decomposition,
+ * or with fewer than 3 blocks per task, divide into numLists blocks.
+ */
+ if (!haveMultipleDomains || numLists*3*ci_block > iGrid.numCells())
+ {
+ ci_block = (iGrid.numCells() + numLists - 1)/numLists;
+ }
+
+ if (ci_block > 1 && (numLists - 1)*ci_block >= iGrid.numCells())
+ {
+ /* Some threads have no work. Although reducing the block size
+ * does not decrease the block count on the first few threads,
+ * with GPUs better mixing of "upper" cells that have more empty
+ * clusters results in a somewhat lower max load over all threads.
+ * Without GPUs the regime of so few atoms per thread is less
+ * performance relevant, but with 8-wide SIMD the same reasoning
+ * applies, since the pair list uses 4 i-atom "sub-clusters".
+ */
+ ci_block--;
+ }
+
+ return ci_block;
+}
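+
+/* Example of the block size above, for illustration only: a grid of
+ * 1000 cells with 20 cells along x and 4 lists gives
+ * ci_block = (1000*5)/(11*20*4) = 5 (integer division), so blocks of
+ * 5 cells are handed out round-robin over the 4 threads, which the
+ * minimum-size and domain-decomposition checks may then adjust.
+ */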
+
+/* Returns the number of bits to right-shift a cluster index to obtain
+ * the corresponding force buffer flag index.
+ */
+static int getBufferFlagShift(int numAtomsPerCluster)
+{
+ int bufferFlagShift = 0;
+ while ((numAtomsPerCluster << bufferFlagShift) < NBNXN_BUFFERFLAG_SIZE)
+ {
+ bufferFlagShift++;
+ }
+
+ return bufferFlagShift;
+}
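+
+/* Example, for illustration only, assuming NBNXN_BUFFERFLAG_SIZE were 16:
+ * with 4 atoms per cluster the loop returns 2, since 4 << 2 = 16, so one
+ * flag block covers 2^2 = 4 clusters. For powers of two this equals
+ * ceil(log2(NBNXN_BUFFERFLAG_SIZE/numAtomsPerCluster)).
+ */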
+
+static bool pairlistIsSimple(const NbnxnPairlistCpu gmx_unused &pairlist)
+{
+ return true;
+}
+
+static bool pairlistIsSimple(const NbnxnPairlistGpu gmx_unused &pairlist)
+{
+ return false;
+}
+
+static void
+makeClusterListWrapper(NbnxnPairlistCpu *nbl,
+ const Grid gmx_unused &iGrid,
+ const int ci,
+ const Grid &jGrid,
+ const int firstCell,
+ const int lastCell,
+ const bool excludeSubDiagonal,
+ const nbnxn_atomdata_t *nbat,
+ const real rlist2,
+ const real rbb2,
+ const ClusterDistanceKernelType kernelType,
+ int *numDistanceChecks)
+{
+ switch (kernelType)
+ {
+ case ClusterDistanceKernelType::CpuPlainC:
+ makeClusterListSimple(jGrid,
+ nbl, ci, firstCell, lastCell,
+ excludeSubDiagonal,
+ nbat->x().data(),
+ rlist2, rbb2,
+ numDistanceChecks);
+ break;
+#ifdef GMX_NBNXN_SIMD_4XN
+ case ClusterDistanceKernelType::CpuSimd_4xM:
+ makeClusterListSimd4xn(jGrid,
+ nbl, ci, firstCell, lastCell,
+ excludeSubDiagonal,
+ nbat->x().data(),
+ rlist2, rbb2,
+ numDistanceChecks);
+ break;
+#endif
+#ifdef GMX_NBNXN_SIMD_2XNN
+ case ClusterDistanceKernelType::CpuSimd_2xMM:
+ makeClusterListSimd2xnn(jGrid,
+ nbl, ci, firstCell, lastCell,
+ excludeSubDiagonal,
+ nbat->x().data(),
+ rlist2, rbb2,
+ numDistanceChecks);
+ break;
+#endif
+ default:
+ GMX_ASSERT(false, "Unhandled kernel type");
+ }
+}
+
+static void
+makeClusterListWrapper(NbnxnPairlistGpu *nbl,
+ const Grid &gmx_unused iGrid,
+ const int ci,
+ const Grid &jGrid,
+ const int firstCell,
+ const int lastCell,
+ const bool excludeSubDiagonal,
+ const nbnxn_atomdata_t *nbat,
+ const real rlist2,
+ const real rbb2,
+ ClusterDistanceKernelType gmx_unused kernelType,
+ int *numDistanceChecks)
+{
+ for (int cj = firstCell; cj <= lastCell; cj++)
+ {
+ make_cluster_list_supersub(iGrid, jGrid,
+ nbl, ci, cj,
+ excludeSubDiagonal,
+ nbat->xstride, nbat->x().data(),
+ rlist2, rbb2,
+ numDistanceChecks);
+ }
+}
+
+static int getNumSimpleJClustersInList(const NbnxnPairlistCpu &nbl)
+{
+ return nbl.cj.size();
+}
+
+static int getNumSimpleJClustersInList(const gmx_unused NbnxnPairlistGpu &nbl)
+{
+ return 0;
+}
+
+static void incrementNumSimpleJClustersInList(NbnxnPairlistCpu *nbl,
+ int ncj_old_j)
+{
+ nbl->ncjInUse += nbl->cj.size() - ncj_old_j;
+}
+
+static void incrementNumSimpleJClustersInList(NbnxnPairlistGpu gmx_unused *nbl,
+ int gmx_unused ncj_old_j)
+{
+}
+
+static void checkListSizeConsistency(const NbnxnPairlistCpu &nbl,
+ const bool haveFreeEnergy)
+{
+ GMX_RELEASE_ASSERT(static_cast<size_t>(nbl.ncjInUse) == nbl.cj.size() || haveFreeEnergy,
+ "Without free-energy all cj pair-list entries should be in use. "
+ "Note that subsequent code does not make use of the equality, "
+ "this check is only here to catch bugs");
+}
+
+static void checkListSizeConsistency(const NbnxnPairlistGpu gmx_unused &nbl,
+ bool gmx_unused haveFreeEnergy)
+{
+ /* We currently cannot check consistency here */
+}
+
+/* Set the buffer flags for newly added entries in the list */
+static void setBufferFlags(const NbnxnPairlistCpu &nbl,
+ const int ncj_old_j,
+ const int gridj_flag_shift,
+ gmx_bitmask_t *gridj_flag,
+ const int th)
+{
+ if (gmx::ssize(nbl.cj) > ncj_old_j)
+ {
+ int cbFirst = nbl.cj[ncj_old_j].cj >> gridj_flag_shift;
+ int cbLast = nbl.cj.back().cj >> gridj_flag_shift;
+ for (int cb = cbFirst; cb <= cbLast; cb++)
+ {
+ bitmask_init_bit(&gridj_flag[cb], th);
+ }
+ }
+}
+
+static void setBufferFlags(const NbnxnPairlistGpu gmx_unused &nbl,
+ int gmx_unused ncj_old_j,
+ int gmx_unused gridj_flag_shift,
+ gmx_bitmask_t gmx_unused *gridj_flag,
+ int gmx_unused th)
+{
+ GMX_ASSERT(false, "This function should never be called");
+}
+
+/* Generates the part of pair-list nbl assigned to our thread */
+template <typename T>
+static void nbnxn_make_pairlist_part(const Nbnxm::GridSet &gridSet,
+ const Grid &iGrid,
+ const Grid &jGrid,
+ PairsearchWork *work,
+ const nbnxn_atomdata_t *nbat,
+ const t_blocka &exclusions,
+ real rlist,
+ const PairlistType pairlistType,
+ int ci_block,
+ gmx_bool bFBufferFlag,
+ int nsubpair_max,
+ gmx_bool progBal,
+ float nsubpair_tot_est,
+ int th, int nth,
+ T *nbl,
+ t_nblist *nbl_fep)
+{
+ int na_cj_2log;
+ matrix box;
+ real rl_fep2 = 0;
+ float rbb2;
+ int ci_b, ci, ci_x, ci_y, ci_xy;
+ ivec shp;
+ real bx0, bx1, by0, by1, bz0, bz1;
+ real bz1_frac;
+ real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
+ int cxf, cxl, cyf, cyf_x, cyl;
+ int numDistanceChecks;
+ int gridi_flag_shift = 0, gridj_flag_shift = 0;
+ gmx_bitmask_t *gridj_flag = nullptr;
+ int ncj_old_i, ncj_old_j;
+
+ if (jGrid.geometry().isSimple != pairlistIsSimple(*nbl) ||
+ iGrid.geometry().isSimple != pairlistIsSimple(*nbl))
+ {
+ gmx_incons("Grid incompatible with pair-list");
+ }
+
+ sync_work(nbl);
+ GMX_ASSERT(nbl->na_ci == jGrid.geometry().numAtomsICluster,
+ "The cluster sizes in the list and grid should match");
+ nbl->na_cj = JClusterSizePerListType[pairlistType];
+ na_cj_2log = get_2log(nbl->na_cj);
+
+ nbl->rlist = rlist;
+
+ if (bFBufferFlag)
+ {
+ /* Determine conversion of clusters to flag blocks */
+ gridi_flag_shift = getBufferFlagShift(nbl->na_ci);
+ gridj_flag_shift = getBufferFlagShift(nbl->na_cj);
+
+ gridj_flag = work->buffer_flags.flag;
+ }
+
+ gridSet.getBox(box);
+
+ const bool haveFep = gridSet.haveFep();
+
+ const real rlist2 = nbl->rlist*nbl->rlist;
+
+ // Select the cluster pair distance kernel type
+ const ClusterDistanceKernelType kernelType =
+ getClusterDistanceKernelType(pairlistType, *nbat);
+
+ if (haveFep && !pairlistIsSimple(*nbl))
+ {
+ /* Determine an atom-pair list cut-off distance for FEP atom pairs.
+ * We should not simply use rlist, since then we would not have
+ * the small, effective buffering of the NxN lists.
+ * The buffer is an overestimate, but the resulting cost for pairs
+ * beyond rlist is negligible compared to the FEP pairs within rlist.
+ */
+ rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(iGrid, jGrid);
+
+ if (debug)
+ {
+ fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2);
+ }
+ rl_fep2 = rl_fep2*rl_fep2;
+ }
+
+ const Grid::Dimensions &iGridDims = iGrid.dimensions();
+ const Grid::Dimensions &jGridDims = jGrid.dimensions();
+
+ rbb2 = boundingbox_only_distance2(iGridDims, jGridDims, nbl->rlist, pairlistIsSimple(*nbl));
+
+ if (debug)
+ {
+ fprintf(debug, "nbl bounding box only distance %f\n", std::sqrt(rbb2));
+ }
+
+ const bool isIntraGridList = (&iGrid == &jGrid);
+
+ /* Set the shift range */
+ for (int d = 0; d < DIM; d++)
+ {
+ /* Check if we need periodicity shifts.
+ * Without PBC or with domain decomposition we don't need them.
+ */
+ if (d >= ePBC2npbcdim(gridSet.domainSetup().ePBC) ||
+ gridSet.domainSetup().haveMultipleDomainsPerDim[d])
+ {
+ shp[d] = 0;
+ }
+ else
+ {
+ const real listRangeCellToCell =
+ listRangeForGridCellToGridCell(rlist, iGrid.dimensions(), jGrid.dimensions());
+ if (d == XX &&
+ box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < listRangeCellToCell)
+ {
+ shp[d] = 2;
+ }
+ else
+ {
+ shp[d] = 1;
+ }
+ }
+ }
+ const bool bSimple = pairlistIsSimple(*nbl);
+ gmx::ArrayRef<const BoundingBox> bb_i;
+#if NBNXN_BBXXXX
+ gmx::ArrayRef<const float> pbb_i;
+ if (bSimple)
+ {
+ bb_i = iGrid.iBoundingBoxes();
+ }
+ else
+ {
+ pbb_i = iGrid.packedBoundingBoxes();
+ }
+#else
+ /* We use the normal bounding box format for both grid types */
+ bb_i = iGrid.iBoundingBoxes();
+#endif
+ gmx::ArrayRef<const BoundingBox1D> bbcz_i = iGrid.zBoundingBoxes();
+ gmx::ArrayRef<const int> flags_i = iGrid.clusterFlags();
+ gmx::ArrayRef<const BoundingBox1D> bbcz_j = jGrid.zBoundingBoxes();
+ int cell0_i = iGrid.cellOffset();
+
+ if (debug)
+ {
+ fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
+ iGrid.numCells(), iGrid.numCells()/static_cast<double>(iGrid.numColumns()), ci_block);
+ }
+
+ numDistanceChecks = 0;
+
+ const real listRangeBBToJCell2 = gmx::square(listRangeForBoundingBoxToGridCell(rlist, jGrid.dimensions()));
+
+ /* Initialize ci_b and ci to 1 before where we want them to start,
+ * as they will both be incremented in next_ci.
+ */
+ ci_b = -1;
+ ci = th*ci_block - 1;
+ ci_x = 0;
+ ci_y = 0;
+ while (next_ci(iGrid, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
+ {
+ if (bSimple && flags_i[ci] == 0)
+ {
+ continue;
+ }
+
+ ncj_old_i = getNumSimpleJClustersInList(*nbl);
+
+ d2cx = 0;
+ if (!isIntraGridList && shp[XX] == 0)
+ {
+ if (bSimple)
+ {
+ bx1 = bb_i[ci].upper.x;
+ }
+ else
+ {
+ bx1 = iGridDims.lowerCorner[XX] + (ci_x+1)*iGridDims.cellSize[XX];
+ }
+ if (bx1 < jGridDims.lowerCorner[XX])
+ {
+ d2cx = gmx::square(jGridDims.lowerCorner[XX] - bx1);
+
+ if (d2cx >= listRangeBBToJCell2)
+ {
+ continue;
+ }
+ }
+ }
+
+ ci_xy = ci_x*iGridDims.numCells[YY] + ci_y;
+
+ /* Loop over shift vectors in three dimensions */
+ for (int tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
+ {
+ const real shz = tz*box[ZZ][ZZ];
+
+ bz0 = bbcz_i[ci].lower + shz;
+ bz1 = bbcz_i[ci].upper + shz;
+
+ if (tz == 0)
+ {
+ d2z = 0;
+ }
+ else if (tz < 0)
+ {
+ d2z = gmx::square(bz1);
+ }
+ else
+ {
+ d2z = gmx::square(bz0 - box[ZZ][ZZ]);
+ }
+
+ d2z_cx = d2z + d2cx;
+
+ if (d2z_cx >= rlist2)
+ {
+ continue;
+ }
+
+ bz1_frac = bz1/iGrid.numCellsInColumn(ci_xy);
+ if (bz1_frac < 0)
+ {
+ bz1_frac = 0;
+ }
+ /* The check with bz1_frac close to or larger than 1 comes later */
+
+ for (int ty = -shp[YY]; ty <= shp[YY]; ty++)
+ {
+ const real shy = ty*box[YY][YY] + tz*box[ZZ][YY];
+
+ if (bSimple)
+ {
+ by0 = bb_i[ci].lower.y + shy;
+ by1 = bb_i[ci].upper.y + shy;
+ }
+ else
+ {
+ by0 = iGridDims.lowerCorner[YY] + (ci_y )*iGridDims.cellSize[YY] + shy;
+ by1 = iGridDims.lowerCorner[YY] + (ci_y + 1)*iGridDims.cellSize[YY] + shy;
+ }
+
+ get_cell_range<YY>(by0, by1,
+ jGridDims,
+ d2z_cx, rlist,
+ &cyf, &cyl);
+
+ if (cyf > cyl)
+ {
+ continue;
+ }
+
+ d2z_cy = d2z;
+ if (by1 < jGridDims.lowerCorner[YY])
+ {
+ d2z_cy += gmx::square(jGridDims.lowerCorner[YY] - by1);
+ }
+ else if (by0 > jGridDims.upperCorner[YY])
+ {
+ d2z_cy += gmx::square(by0 - jGridDims.upperCorner[YY]);
+ }
+
+ for (int tx = -shp[XX]; tx <= shp[XX]; tx++)
+ {
+ const int shift = XYZ2IS(tx, ty, tz);
+
+ const bool excludeSubDiagonal = (isIntraGridList && shift == CENTRAL);
+
+ if (c_pbcShiftBackward && isIntraGridList && shift > CENTRAL)
+ {
+ continue;
+ }
+
+ const real shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
+
+ if (bSimple)
+ {
+ bx0 = bb_i[ci].lower.x + shx;
+ bx1 = bb_i[ci].upper.x + shx;
+ }
+ else
+ {
+ bx0 = iGridDims.lowerCorner[XX] + (ci_x )*iGridDims.cellSize[XX] + shx;
+ bx1 = iGridDims.lowerCorner[XX] + (ci_x+1)*iGridDims.cellSize[XX] + shx;
+ }
+
+ get_cell_range<XX>(bx0, bx1,
+ jGridDims,
+ d2z_cy, rlist,
+ &cxf, &cxl);
+
+ if (cxf > cxl)
+ {
+ continue;
+ }
+
+ addNewIEntry(nbl, cell0_i+ci, shift, flags_i[ci]);
+
+ if ((!c_pbcShiftBackward || excludeSubDiagonal) &&
+ cxf < ci_x)
+ {
+ /* Leave the pairs with i > j.
+ * x is the major index, so skip half of it.
+ */
+ cxf = ci_x;
+ }
+
+ set_icell_bb(iGrid, ci, shx, shy, shz,
+ nbl->work.get());
+
+ icell_set_x(cell0_i+ci, shx, shy, shz,
+ nbat->xstride, nbat->x().data(),
+ kernelType,
+ nbl->work.get());
+
+ for (int cx = cxf; cx <= cxl; cx++)
+ {
+ d2zx = d2z;
+ if (jGridDims.lowerCorner[XX] + cx*jGridDims.cellSize[XX] > bx1)
+ {
+ d2zx += gmx::square(jGridDims.lowerCorner[XX] + cx*jGridDims.cellSize[XX] - bx1);
+ }
+ else if (jGridDims.lowerCorner[XX] + (cx+1)*jGridDims.cellSize[XX] < bx0)
+ {
+ d2zx += gmx::square(jGridDims.lowerCorner[XX] + (cx+1)*jGridDims.cellSize[XX] - bx0);
+ }
+
+ if (isIntraGridList &&
+ cx == 0 &&
+ (!c_pbcShiftBackward || shift == CENTRAL) &&
+ cyf < ci_y)
+ {
+ /* Leave the pairs with i > j.
+ * Skip half of y when i and j have the same x.
+ */
+ cyf_x = ci_y;
+ }
+ else
+ {
+ cyf_x = cyf;
+ }
+
+ for (int cy = cyf_x; cy <= cyl; cy++)
+ {
+ const int columnStart = jGrid.firstCellInColumn(cx*jGridDims.numCells[YY] + cy);
+ const int columnEnd = jGrid.firstCellInColumn(cx*jGridDims.numCells[YY] + cy + 1);
+
+ d2zxy = d2zx;
+ if (jGridDims.lowerCorner[YY] + cy*jGridDims.cellSize[YY] > by1)
+ {
+ d2zxy += gmx::square(jGridDims.lowerCorner[YY] + cy*jGridDims.cellSize[YY] - by1);
+ }
+ else if (jGridDims.lowerCorner[YY] + (cy + 1)*jGridDims.cellSize[YY] < by0)
+ {
+ d2zxy += gmx::square(jGridDims.lowerCorner[YY] + (cy + 1)*jGridDims.cellSize[YY] - by0);
+ }
+ if (columnStart < columnEnd && d2zxy < listRangeBBToJCell2)
+ {
+ /* To improve efficiency in the common case
+ * of a homogeneous particle distribution,
+ * we estimate the index of the middle cell
+ * in range (midCell). We search down and up
+ * starting from this index.
+ *
+ * Note that the bbcz_j array contains bounds
+ * for i-clusters, thus for clusters of 4 atoms.
+ * For the common case where the j-cluster size
+ * is 8, we could step with a stride of 2,
+ * but we do not do this because it would
+ * complicate this code even more.
+ */
+ int midCell = columnStart + static_cast<int>(bz1_frac*(columnEnd - columnStart));
+ if (midCell >= columnEnd)
+ {
+ midCell = columnEnd - 1;
+ }
+
+ d2xy = d2zxy - d2z;
+
+ /* Find the lowest cell that can possibly
+ * be within range.
+ * Check if we hit the bottom of the grid,
+ * if the j-cell is below the i-cell and if so,
+ * if it is within range.
+ */
+ int downTestCell = midCell;
+ while (downTestCell >= columnStart &&
+ (bbcz_j[downTestCell].upper >= bz0 ||
+ d2xy + gmx::square(bbcz_j[downTestCell].upper - bz0) < rlist2))
+ {
+ downTestCell--;
+ }
+ int firstCell = downTestCell + 1;
+
+ /* Find the highest cell that can possibly
+ * be within range.
+ * Check if we hit the top of the grid,
+ * if the j-cell is above the i-cell and if so,
+ * if it is within range.
+ */
+ int upTestCell = midCell + 1;
+ while (upTestCell < columnEnd &&
+ (bbcz_j[upTestCell].lower <= bz1 ||
+ d2xy + gmx::square(bbcz_j[upTestCell].lower - bz1) < rlist2))
+ {
+ upTestCell++;
+ }
+ int lastCell = upTestCell - 1;
+
+#define NBNXN_REFCODE 0
+#if NBNXN_REFCODE
+ {
+ /* Simple reference code, for debugging,
+ * overrides the more complex code above.
+ */
+ firstCell = columnEnd;
+ lastCell = -1;
+ for (int k = columnStart; k < columnEnd; k++)
+ {
+ if (d2xy + gmx::square(bbcz_j[k].upper - bz0) < rlist2 &&
+ k < firstCell)
+ {
+ firstCell = k;
+ }
+ if (d2xy + gmx::square(bbcz_j[k].lower - bz1) < rlist2 &&
+ k > lastCell)
+ {
+ lastCell = k;
+ }
+ }
+ }
+#endif
+
+ if (isIntraGridList)
+ {
+ /* We want each atom/cell pair only once,
+ * only use cj >= ci.
+ */
+ if (!c_pbcShiftBackward || shift == CENTRAL)
+ {
+ firstCell = std::max(firstCell, ci);
+ }
+ }
+
+ if (firstCell <= lastCell)
+ {
+ GMX_ASSERT(firstCell >= columnStart && lastCell < columnEnd, "The range should reside within the current grid column");
+
+ /* For f buffer flags with simple lists */
+ ncj_old_j = getNumSimpleJClustersInList(*nbl);
+
+ makeClusterListWrapper(nbl,
+ iGrid, ci,
+ jGrid, firstCell, lastCell,
+ excludeSubDiagonal,
+ nbat,
+ rlist2, rbb2,
+ kernelType,
+ &numDistanceChecks);
+
+ if (bFBufferFlag)
+ {
+ setBufferFlags(*nbl, ncj_old_j, gridj_flag_shift,
+ gridj_flag, th);
+ }
+
+ incrementNumSimpleJClustersInList(nbl, ncj_old_j);
+ }
+ }
+ }
+ }
+
+ /* Set the exclusions for this ci list */
+ setExclusionsForIEntry(gridSet,
+ nbl,
+ excludeSubDiagonal,
+ na_cj_2log,
+ *getOpenIEntry(nbl),
+ exclusions);
+
+ if (haveFep)
+ {
+ make_fep_list(gridSet.atomIndices(), nbat, nbl,
+ excludeSubDiagonal,
+ getOpenIEntry(nbl),
+ shx, shy, shz,
+ rl_fep2,
+ iGrid, jGrid, nbl_fep);
+ }
+
+ /* Close this ci list */
+ closeIEntry(nbl,
+ nsubpair_max,
+ progBal, nsubpair_tot_est,
+ th, nth);
+ }
+ }
+ }
+
+ if (bFBufferFlag && getNumSimpleJClustersInList(*nbl) > ncj_old_i)
+ {
+ bitmask_init_bit(&(work->buffer_flags.flag[(iGrid.cellOffset() + ci) >> gridi_flag_shift]), th);
+ }
+ }
+
+ work->ndistc = numDistanceChecks;
+
+ checkListSizeConsistency(*nbl, haveFep);
+
+ if (debug)
+ {
+ fprintf(debug, "number of distance checks %d\n", numDistanceChecks);
+
+ print_nblist_statistics(debug, *nbl, gridSet, rlist);
+
+ if (haveFep)
+ {
+ fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj);
+ }
+ }
+}
+
+static void reduce_buffer_flags(gmx::ArrayRef<PairsearchWork> searchWork,
+ int nsrc,
+ const nbnxn_buffer_flags_t *dest)
+{
+ for (int s = 0; s < nsrc; s++)
+ {
+ gmx_bitmask_t * flag = searchWork[s].buffer_flags.flag;
+
+ for (int b = 0; b < dest->nflag; b++)
+ {
+ bitmask_union(&(dest->flag[b]), flag[b]);
+ }
+ }
+}
+
+static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
+{
+ int nelem, nkeep, ncopy, nred, out;
+ gmx_bitmask_t mask_0;
+
+ nelem = 0;
+ nkeep = 0;
+ ncopy = 0;
+ nred = 0;
+ bitmask_init_bit(&mask_0, 0);
+ for (int b = 0; b < flags->nflag; b++)
+ {
+ if (bitmask_is_equal(flags->flag[b], mask_0))
+ {
+ /* Only flag 0 is set, no copy or reduction required */
+ nelem++;
+ nkeep++;
+ }
+ else if (!bitmask_is_zero(flags->flag[b]))
+ {
+ int c = 0;
+ for (out = 0; out < nout; out++)
+ {
+ if (bitmask_is_set(flags->flag[b], out))
+ {
+ c++;
+ }
+ }
+ nelem += c;
+ if (c == 1)
+ {
+ ncopy++;
+ }
+ else
+ {
+ nred += c;
+ }
+ }
+ }
+
+ fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
+ flags->nflag, nout,
+ nelem/static_cast<double>(flags->nflag),
+ nkeep/static_cast<double>(flags->nflag),
+ ncopy/static_cast<double>(flags->nflag),
+ nred/static_cast<double>(flags->nflag));
+}
+
+/* Copies the list entries from src to dest when cjStart <= *cjGlobal < cjEnd.
+ * *cjGlobal is updated with the cj count in src.
+ * When setFlags==true, flag bit t is set in flag for all i and j clusters.
+ */
+template<bool setFlags>
+static void copySelectedListRange(const nbnxn_ci_t * gmx_restrict srcCi,
+ const NbnxnPairlistCpu * gmx_restrict src,
+ NbnxnPairlistCpu * gmx_restrict dest,
+ gmx_bitmask_t *flag,
+ int iFlagShift, int jFlagShift, int t)
+{
+ const int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;
+
+ dest->ci.push_back(*srcCi);
+ dest->ci.back().cj_ind_start = dest->cj.size();
+ dest->ci.back().cj_ind_end = dest->cj.size() + ncj;
+
+ if (setFlags)
+ {
+ bitmask_init_bit(&flag[srcCi->ci >> iFlagShift], t);
+ }
+
+ for (int j = srcCi->cj_ind_start; j < srcCi->cj_ind_end; j++)
+ {
+ dest->cj.push_back(src->cj[j]);
+
+ if (setFlags)
+ {
+ /* NOTE: This is relatively expensive, since this
+ * operation is done for all elements in the list,
+ * whereas at list generation this is done only
+ * once for each flag entry.
+ */
+ bitmask_init_bit(&flag[src->cj[j].cj >> jFlagShift], t);
+ }
+ }
+}
+
++#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
++/* Avoid gcc 7 avx512 loop vectorization bug (actually only needed with -mavx512f) */
++#pragma GCC push_options
++#pragma GCC optimize ("no-tree-vectorize")
++#endif
++
++/* Returns the number of cluster pairs that are in use summed over all lists */
++static int countClusterpairs(gmx::ArrayRef<const NbnxnPairlistCpu> pairlists)
++{
++ /* gcc 7 with -mavx512f can miss the contributions of 16 consecutive
++ * elements to the sum calculated in this loop. Above we have disabled
++ * loop vectorization to avoid this bug.
++ */
++ int ncjTotal = 0;
++ for (const auto &pairlist : pairlists)
++ {
++ ncjTotal += pairlist.ncjInUse;
++ }
++ return ncjTotal;
++}
++
++#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && __GNUC__ == 7
++#pragma GCC pop_options
++#endif
++
+/* This routine re-balances the pairlists such that all are nearly equally
+ * sized. Only whole i-entries are moved between lists. These are moved
+ * between the ends of the lists, such that the buffer reduction cost should
+ * not change significantly.
+ * Note that all original reduction flags are currently kept. This can lead
+ * to reduction of parts of the force buffer that could be avoided. But since
+ * the original lists are quite balanced, this will only give minor overhead.
+ */
+static void rebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> srcSet,
+ gmx::ArrayRef<NbnxnPairlistCpu> destSet,
+ gmx::ArrayRef<PairsearchWork> searchWork)
+{
- int ncjTotalNew = 0;
- for (auto &dest : destSet)
- {
- ncjTotalNew += dest.ncjInUse;
- }
++ const int ncjTotal = countClusterpairs(srcSet);
+ const int numLists = srcSet.ssize();
+ const int ncjTarget = (ncjTotal + numLists - 1)/numLists;
+
+#pragma omp parallel num_threads(numLists)
+ {
+ int t = gmx_omp_get_thread_num();
+
+ int cjStart = ncjTarget* t;
+ int cjEnd = ncjTarget*(t + 1);
+
+ /* The destination pair-list for task/thread t */
+ NbnxnPairlistCpu &dest = destSet[t];
+
+ clear_pairlist(&dest);
+ dest.na_cj = srcSet[0].na_cj;
+
+ /* Note that the flags in the work struct (still) contain flags
+ * for all entries that are present in srcSet[t].
+ */
+ gmx_bitmask_t *flag = searchWork[t].buffer_flags.flag;
+
+ int iFlagShift = getBufferFlagShift(dest.na_ci);
+ int jFlagShift = getBufferFlagShift(dest.na_cj);
+
+ int cjGlobal = 0;
+ for (int s = 0; s < numLists && cjGlobal < cjEnd; s++)
+ {
+ const NbnxnPairlistCpu *src = &srcSet[s];
+
+ if (cjGlobal + src->ncjInUse > cjStart)
+ {
+ for (gmx::index i = 0; i < gmx::ssize(src->ci) && cjGlobal < cjEnd; i++)
+ {
+ const nbnxn_ci_t *srcCi = &src->ci[i];
+ int ncj = srcCi->cj_ind_end - srcCi->cj_ind_start;
+ if (cjGlobal >= cjStart)
+ {
+ /* If the source list is not our own, we need to set
+ * extra flags (the template bool parameter).
+ */
+ if (s != t)
+ {
+ copySelectedListRange
+ <true>
+ (srcCi, src, &dest,
+ flag, iFlagShift, jFlagShift, t);
+ }
+ else
+ {
+ copySelectedListRange
+ <false>
+ (srcCi, src,
+ &dest, flag, iFlagShift, jFlagShift, t);
+ }
+ }
+ cjGlobal += ncj;
+ }
+ }
+ else
+ {
+ cjGlobal += src->ncjInUse;
+ }
+ }
+
+ dest.ncjInUse = dest.cj.size();
+ }
+
+#ifndef NDEBUG
++ const int ncjTotalNew = countClusterpairs(destSet);
+ GMX_RELEASE_ASSERT(ncjTotalNew == ncjTotal, "The total size of the lists before and after rebalancing should match");
+#endif
+}
+
+/* Returns if the pairlists are so imbalanced that it is worth rebalancing. */
+static bool checkRebalanceSimpleLists(gmx::ArrayRef<const NbnxnPairlistCpu> lists)
+{
+ int numLists = lists.ssize();
+ int ncjMax = 0;
+ int ncjTotal = 0;
+ for (int s = 0; s < numLists; s++)
+ {
+ ncjMax = std::max(ncjMax, lists[s].ncjInUse);
+ ncjTotal += lists[s].ncjInUse;
+ }
+ if (debug)
+ {
+ fprintf(debug, "Pair-list ncjMax %d ncjTotal %d\n", ncjMax, ncjTotal);
+ }
+ /* The rebalancing adds 3% extra time to the search. Heuristically we
+ * determined that under common conditions the non-bonded kernel balance
+ * improvement will outweigh this when the imbalance is more than 3%.
+ * But this will, obviously, depend on search vs kernel time and nstlist.
+ */
+ const real rebalanceTolerance = 1.03;
+
+ return numLists*ncjMax > ncjTotal*rebalanceTolerance;
+}
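+
+/* Example of the check above, for illustration only: with 4 lists of
+ * sizes {90, 100, 110, 100}, ncjMax = 110 and ncjTotal = 400, so we
+ * compare 4*110 = 440 against 400*1.03 = 412. Since 440 > 412, the 10%
+ * imbalance exceeds the 3% tolerance and rebalancing pays off.
+ */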
+
+/* Perform a count (linear) sort to sort the smaller lists to the end.
+ * This avoids load imbalance on the GPU, as large lists will be
+ * scheduled and executed first and the smaller lists later.
+ * Load balancing between multi-processors only happens at the end
+ * and there smaller lists lead to more effective load balancing.
+ * The sorting is done on the cj4 count, not on the actual pair counts.
+ * Not only does this make the sort faster, but it also results in
+ * better load balancing than using a list sorted on exact load.
+ * This function swaps the sci buffers in the pair list to avoid a copy operation.
+ */
+static void sort_sci(NbnxnPairlistGpu *nbl)
+{
+ if (nbl->cj4.size() <= nbl->sci.size())
+ {
+ /* nsci = 0 or all sci have size 1, sorting won't change the order */
+ return;
+ }
+
+ NbnxnPairlistGpuWork &work = *nbl->work;
+
+ /* We will distinguish differences up to double the average */
+ const int m = (2*nbl->cj4.size())/nbl->sci.size();
+
+ /* Resize work.sci_sort so we can sort into it */
+ work.sci_sort.resize(nbl->sci.size());
+
+ std::vector<int> &sort = work.sortBuffer;
+ /* Set up m + 1 entries in sort, initialized at 0 */
+ sort.clear();
+ sort.resize(m + 1, 0);
+ /* Count the entries of each size */
+ for (const nbnxn_sci_t &sci : nbl->sci)
+ {
+ int i = std::min(m, sci.numJClusterGroups());
+ sort[i]++;
+ }
+ /* Calculate the offset for each count */
+ int s0 = sort[m];
+ sort[m] = 0;
+ for (int i = m - 1; i >= 0; i--)
+ {
+ int s1 = sort[i];
+ sort[i] = sort[i + 1] + s0;
+ s0 = s1;
+ }
+
+ /* Sort entries directly into place */
+ gmx::ArrayRef<nbnxn_sci_t> sci_sort = work.sci_sort;
+ for (const nbnxn_sci_t &sci : nbl->sci)
+ {
+ int i = std::min(m, sci.numJClusterGroups());
+ sci_sort[sort[i]++] = sci;
+ }
+
+ /* Swap the sci pointers so we use the new, sorted list */
+ std::swap(nbl->sci, work.sci_sort);
+}
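+
+/* Worked example of the count sort above, for illustration only: with
+ * four sci entries of {1, 3, 2, 1} cj4 groups, nbl->cj4.size() = 7 and
+ * m = (2*7)/4 = 3. Counting gives sort[1] = 2, sort[2] = 1, sort[3] = 1;
+ * the descending offset pass then yields sort[3] = 0, sort[2] = 1 and
+ * sort[1] = 2, so the entries are placed in the order {3, 2, 1, 1}:
+ * the largest lists end up first, as desired for GPU scheduling.
+ */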
+
+//! Prepares CPU lists produced by the search for dynamic pruning
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists);
+
+void
+PairlistSet::constructPairlists(const Nbnxm::GridSet &gridSet,
+ gmx::ArrayRef<PairsearchWork> searchWork,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ const int minimumIlistCountForGpuBalancing,
+ t_nrnb *nrnb,
+ SearchCycleCounting *searchCycleCounting)
+{
+ const real rlist = params_.rlistOuter;
+
+ int nsubpair_target;
+ float nsubpair_tot_est;
+ int ci_block;
+ gmx_bool progBal;
+ int np_tot, np_noq, np_hlj, nap;
+
+ const int numLists = (isCpuType_ ? cpuLists_.size() : gpuLists_.size());
+
+ if (debug)
+ {
+ fprintf(debug, "ns making %d nblists\n", numLists);
+ }
+
+ nbat->bUseBufferFlags = (nbat->out.size() > 1);
+ /* We should re-init the flags before making the first list */
+ if (nbat->bUseBufferFlags && locality_ == InteractionLocality::Local)
+ {
+ init_buffer_flags(&nbat->buffer_flags, nbat->numAtoms());
+ }
+
+ int nzi;
+ if (locality_ == InteractionLocality::Local)
+ {
+ /* Only zone (grid) 0 vs 0 */
+ nzi = 1;
+ }
+ else
+ {
+ nzi = gridSet.domainSetup().zones->nizone;
+ }
+
+ if (!isCpuType_ && minimumIlistCountForGpuBalancing > 0)
+ {
+ get_nsubpair_target(gridSet, locality_, rlist, minimumIlistCountForGpuBalancing,
+ &nsubpair_target, &nsubpair_tot_est);
+ }
+ else
+ {
+ nsubpair_target = 0;
+ nsubpair_tot_est = 0;
+ }
+
+ /* Clear all pair-lists */
+ for (int th = 0; th < numLists; th++)
+ {
+ if (isCpuType_)
+ {
+ clear_pairlist(&cpuLists_[th]);
+ }
+ else
+ {
+ clear_pairlist(&gpuLists_[th]);
+ }
+
+ if (params_.haveFep)
+ {
+ clear_pairlist_fep(fepLists_[th].get());
+ }
+ }
+
+ const gmx_domdec_zones_t *ddZones = gridSet.domainSetup().zones;
+
+ for (int zi = 0; zi < nzi; zi++)
+ {
+ const Grid &iGrid = gridSet.grids()[zi];
+
+ int zj0;
+ int zj1;
+ if (locality_ == InteractionLocality::Local)
+ {
+ zj0 = 0;
+ zj1 = 1;
+ }
+ else
+ {
+ zj0 = ddZones->izone[zi].j0;
+ zj1 = ddZones->izone[zi].j1;
+ if (zi == 0)
+ {
+ zj0++;
+ }
+ }
+ for (int zj = zj0; zj < zj1; zj++)
+ {
+ const Grid &jGrid = gridSet.grids()[zj];
+
+ if (debug)
+ {
+ fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
+ }
+
+ searchCycleCounting->start(enbsCCsearch);
+
+ ci_block = get_ci_block_size(iGrid, gridSet.domainSetup().haveMultipleDomains, numLists);
+
+ /* With GPU: generate progressively smaller lists for
+ * load balancing for local only or non-local with 2 zones.
+ */
+ progBal = (locality_ == InteractionLocality::Local || ddZones->n <= 2);
+
+#pragma omp parallel for num_threads(numLists) schedule(static)
+ for (int th = 0; th < numLists; th++)
+ {
+ try
+ {
+ /* Re-init the thread-local work flag data before making
+ * the first list (not an elegant conditional).
+ */
+ if (nbat->bUseBufferFlags && (zi == 0 && zj == 0))
+ {
+ init_buffer_flags(&searchWork[th].buffer_flags, nbat->numAtoms());
+ }
+
+ if (combineLists_ && th > 0)
+ {
+ GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
+
+ clear_pairlist(&gpuLists_[th]);
+ }
+
+ PairsearchWork &work = searchWork[th];
+
+ work.cycleCounter.start();
+
+ t_nblist *fepListPtr = (fepLists_.empty() ? nullptr : fepLists_[th].get());
+
+ /* Divide the i cells equally over the pairlists */
+ if (isCpuType_)
+ {
+ nbnxn_make_pairlist_part(gridSet, iGrid, jGrid,
+ &work, nbat, *excl,
+ rlist,
+ params_.pairlistType,
+ ci_block,
+ nbat->bUseBufferFlags,
+ nsubpair_target,
+ progBal, nsubpair_tot_est,
+ th, numLists,
+ &cpuLists_[th],
+ fepListPtr);
+ }
+ else
+ {
+ nbnxn_make_pairlist_part(gridSet, iGrid, jGrid,
+ &work, nbat, *excl,
+ rlist,
+ params_.pairlistType,
+ ci_block,
+ nbat->bUseBufferFlags,
+ nsubpair_target,
+ progBal, nsubpair_tot_est,
+ th, numLists,
+ &gpuLists_[th],
+ fepListPtr);
+ }
+
+ work.cycleCounter.stop();
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+ searchCycleCounting->stop(enbsCCsearch);
+
+ np_tot = 0;
+ np_noq = 0;
+ np_hlj = 0;
+ for (int th = 0; th < numLists; th++)
+ {
+ inc_nrnb(nrnb, eNR_NBNXN_DIST2, searchWork[th].ndistc);
+
+ if (isCpuType_)
+ {
+ const NbnxnPairlistCpu &nbl = cpuLists_[th];
+ np_tot += nbl.cj.size();
+ np_noq += nbl.work->ncj_noq;
+ np_hlj += nbl.work->ncj_hlj;
+ }
+ else
+ {
+ const NbnxnPairlistGpu &nbl = gpuLists_[th];
+ /* This count ignores potential subsequent pair pruning */
+ np_tot += nbl.nci_tot;
+ }
+ }
+ if (isCpuType_)
+ {
+ nap = cpuLists_[0].na_ci*cpuLists_[0].na_cj;
+ }
+ else
+ {
+ nap = gmx::square(gpuLists_[0].na_ci);
+ }
+ natpair_ljq_ = (np_tot - np_noq)*nap - np_hlj*nap/2;
+ natpair_lj_ = np_noq*nap;
+ natpair_q_ = np_hlj*nap/2;
+
+ if (combineLists_ && numLists > 1)
+ {
+ GMX_ASSERT(!isCpuType_, "Can only combine GPU lists");
+
+ searchCycleCounting->start(enbsCCcombine);
+
+ combine_nblists(gmx::constArrayRefFromArray(&gpuLists_[1], numLists - 1),
+ &gpuLists_[0]);
+
+ searchCycleCounting->stop(enbsCCcombine);
+ }
+ }
+ }
+
+ if (isCpuType_)
+ {
+ if (numLists > 1 && checkRebalanceSimpleLists(cpuLists_))
+ {
+ rebalanceSimpleLists(cpuLists_, cpuListsWork_, searchWork);
+
+ /* Swap the sets of pair lists */
+ cpuLists_.swap(cpuListsWork_);
+ }
+ }
+ else
+ {
+ /* Sort the entries on size, large ones first */
+ if (combineLists_ || gpuLists_.size() == 1)
+ {
+ sort_sci(&gpuLists_[0]);
+ }
+ else
+ {
+#pragma omp parallel for num_threads(numLists) schedule(static)
+ for (int th = 0; th < numLists; th++)
+ {
+ try
+ {
+ sort_sci(&gpuLists_[th]);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+ }
+ }
+ }
+
+ if (nbat->bUseBufferFlags)
+ {
+ reduce_buffer_flags(searchWork, numLists, &nbat->buffer_flags);
+ }
+
+ if (gridSet.haveFep())
+ {
+ /* Balance the free-energy lists over all the threads */
+ balance_fep_lists(fepLists_, searchWork);
+ }
+
+ if (isCpuType_)
+ {
+ /* This is a fresh list, so not pruned, stored using ci.
+ * ciOuter is invalid at this point.
+ */
+ GMX_ASSERT(cpuLists_[0].ciOuter.empty(), "ciOuter is invalid so it should be empty");
+ }
+
+ /* If we have more than one list, they were either rebalanced (CPU)
+ * or combined (GPU), so we should dump the final result to debug.
+ */
+ if (debug)
+ {
+ if (isCpuType_ && cpuLists_.size() > 1)
+ {
+ for (auto &cpuList : cpuLists_)
+ {
+ print_nblist_statistics(debug, cpuList, gridSet, rlist);
+ }
+ }
+ else if (!isCpuType_ && gpuLists_.size() > 1)
+ {
+ print_nblist_statistics(debug, gpuLists_[0], gridSet, rlist);
+ }
+ }
+
+ if (debug)
+ {
+ if (gmx_debug_at)
+ {
+ if (isCpuType_)
+ {
+ for (auto &cpuList : cpuLists_)
+ {
+ print_nblist_ci_cj(debug, cpuList);
+ }
+ }
+ else
+ {
+ print_nblist_sci_cj(debug, gpuLists_[0]);
+ }
+ }
+
+ if (nbat->bUseBufferFlags)
+ {
+ print_reduction_cost(&nbat->buffer_flags, numLists);
+ }
+ }
+
+ if (params_.useDynamicPruning && isCpuType_)
+ {
+ prepareListsForDynamicPruning(cpuLists_);
+ }
+}
+
+void
+PairlistSets::construct(const InteractionLocality iLocality,
+ PairSearch *pairSearch,
+ nbnxn_atomdata_t *nbat,
+ const t_blocka *excl,
+ const int64_t step,
+ t_nrnb *nrnb)
+{
+ pairlistSet(iLocality).constructPairlists(pairSearch->gridSet(), pairSearch->work(),
+ nbat, excl, minimumIlistCountForGpuBalancing_,
+ nrnb, &pairSearch->cycleCounting_);
+
+ if (iLocality == Nbnxm::InteractionLocality::Local)
+ {
+ outerListCreationStep_ = step;
+ }
+ else
+ {
+ GMX_RELEASE_ASSERT(outerListCreationStep_ == step,
+ "Outer list should be created at the same step as the inner list");
+ }
+
+ /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
+ if (iLocality == InteractionLocality::Local)
+ {
+ pairSearch->cycleCounting_.searchCount_++;
+ }
+ if (pairSearch->cycleCounting_.recordCycles_ &&
+ (!pairSearch->gridSet().domainSetup().haveMultipleDomains || iLocality == InteractionLocality::NonLocal) &&
+ pairSearch->cycleCounting_.searchCount_ % 100 == 0)
+ {
+ pairSearch->cycleCounting_.printCycles(stderr, pairSearch->work());
+ }
+}
+
+void
+nonbonded_verlet_t::constructPairlist(const Nbnxm::InteractionLocality iLocality,
+ const t_blocka *excl,
+ int64_t step,
+ t_nrnb *nrnb)
+{
+ pairlistSets_->construct(iLocality, pairSearch_.get(), nbat.get(), excl,
+ step, nrnb);
+
+ if (useGpu())
+ {
+ /* Launch the transfer of the pairlist to the GPU.
+ *
+ * NOTE: The launch overhead is currently not timed separately
+ */
+ Nbnxm::gpu_init_pairlist(gpu_nbv,
+ pairlistSets().pairlistSet(iLocality).gpuList(),
+ iLocality);
+ }
+}
+
+static void prepareListsForDynamicPruning(gmx::ArrayRef<NbnxnPairlistCpu> lists)
+{
+ /* TODO: Restructure the lists so we have actual outer and inner
+ * list objects so we can set a single pointer instead of
+ * swapping several pointers.
+ */
+
+ for (auto &list : lists)
+ {
+ /* The search produced a list in ci/cj.
+ * Swap the list pointers so that the outer list ends up in ciOuter,cjOuter
+ * and we can prune that to get an inner list in ci/cj.
+ */
+ GMX_RELEASE_ASSERT(list.ciOuter.empty() && list.cjOuter.empty(),
+ "The outer lists should be empty before preparation");
+
+ std::swap(list.ci, list.ciOuter);
+ std::swap(list.cj, list.cjOuter);
+ }
+}
--- /dev/null
- Run this number of steps, overrides .mdp file option (-1 means
- infinite, -2 means use mdp option, smaller is invalid)
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="referencedata.xsl"?>
+<ReferenceData>
+ <String Name="Help string">SYNOPSIS
+
+gmx [-s [<.tpr>]] [-cpi [<.cpt>]] [-table [<.xvg>]] [-tablep [<.xvg>]]
+ [-tableb [<.xvg> [...]]] [-rerun [<.xtc/.trr/...>]] [-ei [<.edi>]]
+ [-multidir [<dir> [...]]] [-awh [<.xvg>]] [-membed [<.dat>]]
+ [-mp [<.top>]] [-mn [<.ndx>]] [-o [<.trr/.cpt/...>]] [-x [<.xtc/.tng>]]
+ [-cpo [<.cpt>]] [-c [<.gro/.g96/...>]] [-e [<.edr>]] [-g [<.log>]]
+ [-dhdl [<.xvg>]] [-field [<.xvg>]] [-tpi [<.xvg>]] [-tpid [<.xvg>]]
+ [-eo [<.xvg>]] [-px [<.xvg>]] [-pf [<.xvg>]] [-ro [<.xvg>]]
+ [-ra [<.log>]] [-rs [<.log>]] [-rt [<.log>]] [-mtx [<.mtx>]]
+ [-if [<.xvg>]] [-swap [<.xvg>]] [-deffnm <string>] [-xvg <enum>]
+ [-dd <vector>] [-ddorder <enum>] [-npme <int>] [-nt <int>] [-ntmpi <int>]
+ [-ntomp <int>] [-ntomp_pme <int>] [-pin <enum>] [-pinoffset <int>]
+ [-pinstride <int>] [-gpu_id <string>] [-gputasks <string>] [-[no]ddcheck]
+ [-rdd <real>] [-rcon <real>] [-dlb <enum>] [-dds <real>] [-nb <enum>]
+ [-nstlist <int>] [-[no]tunepme] [-pme <enum>] [-pmefft <enum>]
+ [-bonded <enum>] [-[no]v] [-pforce <real>] [-[no]reprod] [-cpt <real>]
+ [-[no]cpnum] [-[no]append] [-nsteps <int>] [-maxh <real>] [-replex <int>]
+ [-nex <int>] [-reseed <int>]
+
+DESCRIPTION
+
+[THISMODULE] is the main computational chemistry engine within GROMACS.
+Obviously, it performs Molecular Dynamics simulations, but it can also perform
+Stochastic Dynamics, Energy Minimization, test particle insertion or
+(re)calculation of energies. Normal mode analysis is another option. In this
+case mdrun builds a Hessian matrix from a single conformation. For usual Normal
+Modes-like calculations, make sure that the structure provided is properly
+energy-minimized. The generated matrix can be diagonalized by [gmx-nmeig].
+
+The mdrun program reads the run input file (-s) and distributes the topology
+over ranks if needed. mdrun produces at least four output files. A single log
+file (-g) is written. The trajectory file (-o) contains coordinates,
+velocities and optionally forces. The structure file (-c) contains the
+coordinates and velocities of the last step. The energy file (-e) contains
+energies, temperature, pressure, etc.; many of these quantities are also
+printed in the log file. Optionally, coordinates can be written to a compressed
+trajectory file (-x).
+
+The option -dhdl is only used when free energy calculation is turned on.
+
+Running mdrun efficiently in parallel is a complex topic, many aspects of
+which are covered in the online User Guide. You should look there for
+practical advice on using many of the options available in mdrun.
+
+ED (essential dynamics) sampling and/or additional flooding potentials are
+switched on by using the -ei flag followed by an .edi file. The .edi file can
+be produced with the make_edi tool or by using options in the essdyn menu of
+the WHAT IF program. mdrun produces a .xvg output file that contains
+projections of positions, velocities and forces onto selected eigenvectors.
+
+When user-defined potential functions have been selected in the .mdp file,
+the -table option is used to pass mdrun a formatted table with potential
+functions. The file is read from either the current directory or from the
+GMXLIB directory. A number of pre-formatted tables are provided in the GMXLIB
+directory, for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with normal
+Coulomb. When pair interactions are present, a separate table for pair
+interaction functions is read using the -tablep option.
+
+When tabulated bonded functions are present in the topology, interaction
+functions are read using the -tableb option. For each different tabulated
+interaction type used, a table file name must be given. For the topology to
+work, a file name given here must match a character sequence before the file
+extension. That sequence is: an underscore, then a 'b' for bonds, an 'a' for
+angles or a 'd' for dihedrals, and finally the matching table number index
+used in the topology. Note that these options are deprecated, and in the
+future will be available via grompp.
+
+The options -px and -pf are used for writing pull COM coordinates and forces
+when pulling is selected in the .mdp file.
+
+The option -membed does what used to be g_membed, i.e. embed a protein into a
+membrane. This module requires a number of settings that are provided in a
+data file that is the argument of this option. For more details on membrane
+embedding, see the documentation in the user guide. The options -mn and -mp
+are used to provide the index and topology files used for the embedding.
+
+The option -pforce is useful when you suspect a simulation is crashing due to
+overly large forces. With this option, coordinates and forces of atoms with a
+force larger than a certain value will be printed to stderr. It will also
+terminate the run when non-finite forces are present.
+
+Checkpoints containing the complete state of the system are written at regular
+intervals (option -cpt) to the file -cpo, unless option -cpt is set to -1. The
+previous checkpoint is backed up to state_prev.cpt to make sure that a recent
+state of the system is always available, even when the simulation is
+terminated while writing a checkpoint. With -cpnum all checkpoint files are
+kept and appended with the step number. A simulation can be continued by
+reading the full state from file with option -cpi. This option is intelligent
+in the way that if no checkpoint file is found, GROMACS just assumes a normal
+run and starts from the first step of the .tpr file. By default the output
+will be appending to the existing output files. The checkpoint file contains
+checksums of all output files, such that you will never loose data when some
+output files are modified, corrupt or removed. There are three scenarios with
+-cpi:
+
+* no files with matching names are present: new output files are written
+
+* all files are present with names and checksums matching those stored in the
+checkpoint file: files are appended
+
+* otherwise no files are modified and a fatal error is generated
+
+With -noappend new output files are opened and the simulation part number is
+added to all output file names. Note that in all cases the checkpoint file
+itself is not renamed and will be overwritten, unless its name does not match
+the -cpo option.
+
+With checkpointing the output is appended to previously written output files,
+unless -noappend is used or none of the previous output files are present
+(except for the checkpoint file). The integrity of the files to be appended is
+verified using checksums which are stored in the checkpoint file. This ensures
+that output cannot be mixed up or corrupted due to file appending. When only
+some of the previous output files are present, a fatal error is generated;
+no old output files are modified and no new output files are opened. The
+result with appending will be the same as from a single run. The contents will
+be binary identical, unless you use a different number of ranks or dynamic
+load balancing or the FFT library uses optimizations through timing.
+
+With option -maxh a simulation is terminated and a checkpoint file is written
+at the first neighbor search step where the run time exceeds -maxh*0.99 hours.
+This option is particularly useful in combination with setting nsteps to -1
+either in the mdp or using the similarly named command line option (although
+the latter is deprecated). This results in an infinite run, terminated only
+when the time limit set by -maxh is reached (if any) or upon receiving a
+signal.
+
+Interactive molecular dynamics (IMD) can be activated by using at least one of
+the three IMD switches: The -imdterm switch allows one to terminate the
+simulation from the molecular viewer (e.g. VMD). With -imdwait, mdrun pauses
+whenever no IMD client is connected. Pulling from the IMD remote can be turned
+on by -imdpull. The port mdrun listens to can be altered by -imdport. The file
+pointed to by -if contains atom indices and forces if IMD pulling is used.
+
+OPTIONS
+
+Options to specify input files:
+
+ -s [<.tpr>] (topol.tpr)
+ Portable xdr run input file
+ -cpi [<.cpt>] (state.cpt) (Opt.)
+ Checkpoint file
+ -table [<.xvg>] (table.xvg) (Opt.)
+ xvgr/xmgr file
+ -tablep [<.xvg>] (tablep.xvg) (Opt.)
+ xvgr/xmgr file
+ -tableb [<.xvg> [...]] (table.xvg) (Opt.)
+ xvgr/xmgr file
+ -rerun [<.xtc/.trr/...>] (rerun.xtc) (Opt.)
+ Trajectory: xtc trr cpt gro g96 pdb tng
+ -ei [<.edi>] (sam.edi) (Opt.)
+ ED sampling input
+ -multidir [<dir> [...]] (rundir) (Opt.)
+ Run directory
+ -awh [<.xvg>] (awhinit.xvg) (Opt.)
+ xvgr/xmgr file
+ -membed [<.dat>] (membed.dat) (Opt.)
+ Generic data file
+ -mp [<.top>] (membed.top) (Opt.)
+ Topology file
+ -mn [<.ndx>] (membed.ndx) (Opt.)
+ Index file
+
+Options to specify output files:
+
+ -o [<.trr/.cpt/...>] (traj.trr)
+ Full precision trajectory: trr cpt tng
+ -x [<.xtc/.tng>] (traj_comp.xtc) (Opt.)
+ Compressed trajectory (tng format or portable xdr format)
+ -cpo [<.cpt>] (state.cpt) (Opt.)
+ Checkpoint file
+ -c [<.gro/.g96/...>] (confout.gro)
+ Structure file: gro g96 pdb brk ent esp
+ -e [<.edr>] (ener.edr)
+ Energy file
+ -g [<.log>] (md.log)
+ Log file
+ -dhdl [<.xvg>] (dhdl.xvg) (Opt.)
+ xvgr/xmgr file
+ -field [<.xvg>] (field.xvg) (Opt.)
+ xvgr/xmgr file
+ -tpi [<.xvg>] (tpi.xvg) (Opt.)
+ xvgr/xmgr file
+ -tpid [<.xvg>] (tpidist.xvg) (Opt.)
+ xvgr/xmgr file
+ -eo [<.xvg>] (edsam.xvg) (Opt.)
+ xvgr/xmgr file
+ -px [<.xvg>] (pullx.xvg) (Opt.)
+ xvgr/xmgr file
+ -pf [<.xvg>] (pullf.xvg) (Opt.)
+ xvgr/xmgr file
+ -ro [<.xvg>] (rotation.xvg) (Opt.)
+ xvgr/xmgr file
+ -ra [<.log>] (rotangles.log) (Opt.)
+ Log file
+ -rs [<.log>] (rotslabs.log) (Opt.)
+ Log file
+ -rt [<.log>] (rottorque.log) (Opt.)
+ Log file
+ -mtx [<.mtx>] (nm.mtx) (Opt.)
+ Hessian matrix
+ -if [<.xvg>] (imdforces.xvg) (Opt.)
+ xvgr/xmgr file
+ -swap [<.xvg>] (swapions.xvg) (Opt.)
+ xvgr/xmgr file
+
+Other options:
+
+ -deffnm <string>
+ Set the default filename for all file options
+ -xvg <enum> (xmgrace)
+ xvg plot formatting: xmgrace, xmgr, none
+ -dd <vector> (0 0 0)
+ Domain decomposition grid, 0 is optimize
+ -ddorder <enum> (interleave)
+ DD rank order: interleave, pp_pme, cartesian
+ -npme <int> (-1)
+ Number of separate ranks to be used for PME, -1 is guess
+ -nt <int> (0)
+ Total number of threads to start (0 is guess)
+ -ntmpi <int> (0)
+ Number of thread-MPI ranks to start (0 is guess)
+ -ntomp <int> (0)
+ Number of OpenMP threads per MPI rank to start (0 is guess)
+ -ntomp_pme <int> (0)
+ Number of OpenMP threads per MPI rank to start (0 is -ntomp)
+ -pin <enum> (auto)
+ Whether mdrun should try to set thread affinities: auto, on, off
+ -pinoffset <int> (0)
+ The lowest logical core number to which mdrun should pin the first
+ thread
+ -pinstride <int> (0)
+ Pinning distance in logical cores for threads, use 0 to minimize
+ the number of threads per physical core
+ -gpu_id <string>
+ List of unique GPU device IDs available to use
+ -gputasks <string>
+ List of GPU device IDs, mapping each PP task on each node to a
+ device
+ -[no]ddcheck (yes)
+ Check for all bonded interactions with DD
+ -rdd <real> (0)
+ The maximum distance for bonded interactions with DD (nm), 0 means
+ determine from initial coordinates
+ -rcon <real> (0)
+ Maximum distance for P-LINCS (nm), 0 is estimate
+ -dlb <enum> (auto)
+ Dynamic load balancing (with DD): auto, no, yes
+ -dds <real> (0.8)
+ Fraction in (0,1) by whose reciprocal the initial DD cell size will
+ be increased in order to provide a margin in which dynamic load
+ balancing can act while preserving the minimum cell size.
+ -nb <enum> (auto)
+ Calculate non-bonded interactions on: auto, cpu, gpu
+ -nstlist <int> (0)
+ Set nstlist when using a Verlet buffer tolerance (0 is guess)
+ -[no]tunepme (yes)
+ Optimize PME load between PP/PME ranks or GPU/CPU (only with the
+ Verlet cut-off scheme)
+ -pme <enum> (auto)
+ Perform PME calculations on: auto, cpu, gpu
+ -pmefft <enum> (auto)
+ Perform PME FFT calculations on: auto, cpu, gpu
+ -bonded <enum> (auto)
+ Perform bonded calculations on: auto, cpu, gpu
+ -[no]v (no)
+ Be loud and noisy
+ -pforce <real> (-1)
+ Print all forces larger than this (kJ/mol nm)
+ -[no]reprod (no)
+ Try to avoid optimizations that affect binary reproducibility
+ -cpt <real> (15)
+ Checkpoint interval (minutes)
+ -[no]cpnum (no)
+ Keep and number checkpoint files
+ -[no]append (yes)
+ Append to previous output files when continuing from checkpoint
+ instead of adding the simulation part number to all file names
+ -nsteps <int> (-2)
++ Run this number of steps (-1 means infinite, -2 means use mdp
++ option, smaller is invalid)
+ -maxh <real> (-1)
+ Terminate after 0.99 times this time (hours)
+ -replex <int> (0)
+ Attempt replica exchange periodically with this period (steps)
+ -nex <int> (0)
+ Number of random exchanges to carry out each exchange interval (N^3
+ is one suggestion). -nex zero or not specified gives neighbor
+ replica exchange.
+ -reseed <int> (-1)
+ Seed for replica exchange, -1 is generate a seed
+</String>
+</ReferenceData>