how-to/visualize.rst
install-guide/index.rst
release-notes/index.rst
+ release-notes/highlights.rst
+ release-notes/features.rst
+ release-notes/performance.rst
+ release-notes/tools.rst
+ release-notes/bugs-fixed.rst
+ release-notes/removed-functionality.rst
+ release-notes/deprecated-functionality.rst
+ release-notes/portability.rst
+ release-notes/miscellaneous.rst
+ release-notes/2019/2019.2.rst
release-notes/2019/2019.1.rst
release-notes/2019/major/highlights.rst
release-notes/2019/major/features.rst
void PmeTestEnvironment::SetUp()
{
- hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
+ hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
hardwareInfo_ = hardwareInit();
- if (!pme_gpu_supports_build(*hardwareInfo_, nullptr))
+ if (!pme_gpu_supports_build(nullptr) ||
+ !pme_gpu_supports_hardware(*hardwareInfo_, nullptr))
{
// PME can only run on the CPU, so don't make any more test contexts.
return;
--- /dev/null
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "insert_molecules.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "gromacs/commandline/cmdlineoptionsmodule.h"
+#include "gromacs/fileio/confio.h"
+#include "gromacs/fileio/filetypes.h"
+#include "gromacs/fileio/xvgr.h"
+#include "gromacs/gmxlib/conformation_utilities.h"
+#include "gromacs/gmxpreprocess/makeexclusiondistances.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/options/basicoptions.h"
+#include "gromacs/options/filenameoption.h"
+#include "gromacs/options/ioptionscontainer.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/random/threefry.h"
+#include "gromacs/random/uniformrealdistribution.h"
+#include "gromacs/selection/nbsearch.h"
+#include "gromacs/selection/selection.h"
+#include "gromacs/selection/selectioncollection.h"
+#include "gromacs/selection/selectionoption.h"
+#include "gromacs/selection/selectionoptionbehavior.h"
+#include "gromacs/topology/atomprop.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/atomsbuilder.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/symtab.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/trajectory/trajectoryframe.h"
- #include "gromacs/utility/unique_cptr.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
- gmx_mtop_t *getTopology(bool /*required*/) override { return topInfo_.mtop(); }
+
+using gmx::RVec;
+
+/* enum for random rotations of inserted solutes */
+enum RotationType {
+ en_rotXYZ, en_rotZ, en_rotNone
+};
+const char *const cRotationEnum[] = {"xyz", "z", "none"};
+
+static void center_molecule(gmx::ArrayRef<RVec> x)
+{
+ rvec center = {0};
+ for (auto &xi : x)
+ {
+ rvec_inc(center, xi);
+ }
+ svmul(1.0/x.size(), center, center);
+ for (auto &xi : x)
+ {
+ rvec_dec(xi, center);
+ }
+}
+
+static void generate_trial_conf(gmx::ArrayRef<RVec> xin,
+ const rvec offset, RotationType enum_rot,
+ gmx::DefaultRandomEngine * rng,
+ std::vector<RVec> *xout)
+{
+ gmx::UniformRealDistribution<real> dist(0, 2.0*M_PI);
+ xout->assign(xin.begin(), xin.end());
+
+ real alfa = 0.0, beta = 0.0, gamma = 0.0;
+ switch (enum_rot)
+ {
+ case en_rotXYZ:
+ alfa = dist(*rng);
+ beta = dist(*rng);
+ gamma = dist(*rng);
+ break;
+ case en_rotZ:
+ alfa = beta = 0.;
+ gamma = dist(*rng);
+ break;
+ case en_rotNone:
+ alfa = beta = gamma = 0.;
+ break;
+ }
+ if (enum_rot == en_rotXYZ || enum_rot == en_rotZ)
+ {
+ rotate_conf(xout->size(), as_rvec_array(xout->data()), nullptr, alfa, beta, gamma);
+ }
+ for (size_t i = 0; i < xout->size(); ++i)
+ {
+ rvec_inc((*xout)[i], offset);
+ }
+}
+
+static bool isInsertionAllowed(gmx::AnalysisNeighborhoodSearch *search,
+ const std::vector<real> &exclusionDistances,
+ const std::vector<RVec> &x,
+ const std::vector<real> &exclusionDistances_insrt,
+ const t_atoms &atoms,
+ const std::set<int> &removableAtoms,
+ gmx::AtomsRemover *remover)
+{
+ gmx::AnalysisNeighborhoodPositions pos(x);
+ gmx::AnalysisNeighborhoodPairSearch pairSearch = search->startPairSearch(pos);
+ gmx::AnalysisNeighborhoodPair pair;
+ while (pairSearch.findNextPair(&pair))
+ {
+ const real r1 = exclusionDistances[pair.refIndex()];
+ const real r2 = exclusionDistances_insrt[pair.testIndex()];
+ if (pair.distance2() < gmx::square(r1 + r2))
+ {
+ if (removableAtoms.count(pair.refIndex()) == 0)
+ {
+ return false;
+ }
+ // TODO: If molecule information is available, this should ideally
+ // use it to remove whole molecules.
+ remover->markResidue(atoms, pair.refIndex(), true);
+ }
+ }
+ return true;
+}
+
+static void insert_mols(int nmol_insrt, int ntry, int seed,
+ real defaultDistance, real scaleFactor,
+ t_atoms *atoms, t_symtab *symtab, std::vector<RVec> *x,
+ const std::set<int> &removableAtoms,
+ const t_atoms &atoms_insrt, gmx::ArrayRef<RVec> x_insrt,
+ int ePBC, matrix box,
+ const std::string &posfn, const rvec deltaR,
+ RotationType enum_rot)
+{
+ fprintf(stderr, "Initialising inter-atomic distances...\n");
+ AtomProperties aps;
+ std::vector<real> exclusionDistances(
+ makeExclusionDistances(atoms, &aps, defaultDistance, scaleFactor));
+ const std::vector<real> exclusionDistances_insrt(
+ makeExclusionDistances(&atoms_insrt, &aps, defaultDistance, scaleFactor));
+
+ const real maxInsertRadius
+ = *std::max_element(exclusionDistances_insrt.begin(),
+ exclusionDistances_insrt.end());
+ real maxRadius = maxInsertRadius;
+ if (!exclusionDistances.empty())
+ {
+ const real maxExistingRadius
+ = *std::max_element(exclusionDistances.begin(),
+ exclusionDistances.end());
+ maxRadius = std::max(maxInsertRadius, maxExistingRadius);
+ }
+
+ // TODO: Make all of this exception-safe.
+ gmx::AnalysisNeighborhood nb;
+ nb.setCutoff(maxInsertRadius + maxRadius);
+
+
+ if (seed == 0)
+ {
+ seed = static_cast<int>(gmx::makeRandomSeed());
+ }
+ fprintf(stderr, "Using random seed %d\n", seed);
+
+ gmx::DefaultRandomEngine rng(seed);
+
+ t_pbc pbc;
+ set_pbc(&pbc, ePBC, box);
+
+ /* With -ip, take nmol_insrt from file posfn */
+ double **rpos = nullptr;
+ const bool insertAtPositions = !posfn.empty();
+ if (insertAtPositions)
+ {
+ int ncol;
+ nmol_insrt = read_xvg(posfn.c_str(), &rpos, &ncol);
+ if (ncol != 3)
+ {
+ gmx_fatal(FARGS, "Expected 3 columns (x/y/z coordinates) in file %s\n",
+ posfn.c_str());
+ }
+ fprintf(stderr, "Read %d positions from file %s\n\n",
+ nmol_insrt, posfn.c_str());
+ }
+
+ gmx::AtomsBuilder builder(atoms, symtab);
+ gmx::AtomsRemover remover(*atoms);
+ {
+ const int finalAtomCount = atoms->nr + nmol_insrt * atoms_insrt.nr;
+ const int finalResidueCount = atoms->nres + nmol_insrt * atoms_insrt.nres;
+ builder.reserve(finalAtomCount, finalResidueCount);
+ x->reserve(finalAtomCount);
+ exclusionDistances.reserve(finalAtomCount);
+ }
+
+ std::vector<RVec> x_n(x_insrt.size());
+
+ int mol = 0;
+ int trial = 0;
+ int firstTrial = 0;
+ int failed = 0;
+ gmx::UniformRealDistribution<real> dist;
+
+ while (mol < nmol_insrt && trial < ntry*nmol_insrt)
+ {
+ rvec offset_x;
+ if (!insertAtPositions)
+ {
+ // Insert at random positions.
+ offset_x[XX] = box[XX][XX] * dist(rng);
+ offset_x[YY] = box[YY][YY] * dist(rng);
+ offset_x[ZZ] = box[ZZ][ZZ] * dist(rng);
+ }
+ else
+ {
+ // Skip a position if ntry trials were not successful.
+ if (trial >= firstTrial + ntry)
+ {
+ fprintf(stderr, " skipped position (%.3f, %.3f, %.3f)\n",
+ rpos[XX][mol], rpos[YY][mol], rpos[ZZ][mol]);
+ ++mol;
+ ++failed;
+ firstTrial = trial;
+ continue;
+ }
+ // Insert at positions taken from option -ip file.
+ offset_x[XX] = rpos[XX][mol] + deltaR[XX]*(2 * dist(rng)-1);
+ offset_x[YY] = rpos[YY][mol] + deltaR[YY]*(2 * dist(rng)-1);
+ offset_x[ZZ] = rpos[ZZ][mol] + deltaR[ZZ]*(2 * dist(rng)-1);
+ }
+ fprintf(stderr, "\rTry %d", ++trial);
+ fflush(stderr);
+
+ generate_trial_conf(x_insrt, offset_x, enum_rot, &rng, &x_n);
+ gmx::AnalysisNeighborhoodPositions pos(*x);
+ gmx::AnalysisNeighborhoodSearch search = nb.initSearch(&pbc, pos);
+ if (isInsertionAllowed(&search, exclusionDistances, x_n, exclusionDistances_insrt,
+ *atoms, removableAtoms, &remover))
+ {
+ x->insert(x->end(), x_n.begin(), x_n.end());
+ exclusionDistances.insert(exclusionDistances.end(),
+ exclusionDistances_insrt.begin(),
+ exclusionDistances_insrt.end());
+ builder.mergeAtoms(atoms_insrt);
+ ++mol;
+ firstTrial = trial;
+ fprintf(stderr, " success (now %d atoms)!\n", builder.currentAtomCount());
+ }
+ }
+
+ fprintf(stderr, "\n");
+ /* print number of molecules added */
+ fprintf(stderr, "Added %d molecules (out of %d requested)\n",
+ mol - failed, nmol_insrt);
+
+ const int originalAtomCount = atoms->nr;
+ const int originalResidueCount = atoms->nres;
+ remover.refreshAtomCount(*atoms);
+ remover.removeMarkedElements(x);
+ remover.removeMarkedAtoms(atoms);
+ if (atoms->nr < originalAtomCount)
+ {
+ fprintf(stderr, "Replaced %d residues (%d atoms)\n",
+ originalResidueCount - atoms->nres,
+ originalAtomCount - atoms->nr);
+ }
+
+ if (rpos != nullptr)
+ {
+ for (int i = 0; i < DIM; ++i)
+ {
+ sfree(rpos[i]);
+ }
+ sfree(rpos);
+ }
+}
+
+namespace gmx
+{
+
+namespace
+{
+
+class InsertMolecules : public ICommandLineOptionsModule, public ITopologyProvider
+{
+ public:
+ InsertMolecules()
+ : bBox_(false), nmolIns_(0), nmolTry_(10), seed_(0),
+ defaultDistance_(0.105), scaleFactor_(0.57), enumRot_(en_rotXYZ)
+ {
+ clear_rvec(newBox_);
+ clear_rvec(deltaR_);
++ clear_mat(box_);
+ }
+
+ // From ITopologyProvider
- TopologyInformation topInfo_;
++ gmx_mtop_t *getTopology(bool /*required*/) override { return &top_; }
+ int getAtomCount() override { return 0; }
+
+ // From ICommandLineOptionsModule
+ void init(CommandLineModuleSettings * /*settings*/) override
+ {
+ }
+ void initOptions(IOptionsContainer *options,
+ ICommandLineOptionsModuleSettings *settings) override;
+ void optionsFinished() override;
+ int run() override;
+
+ private:
+ SelectionCollection selections_;
+
+ std::string inputConfFile_;
+ std::string insertConfFile_;
+ std::string positionFile_;
+ std::string outputConfFile_;
+ rvec newBox_;
+ bool bBox_;
+ int nmolIns_;
+ int nmolTry_;
+ int seed_;
+ real defaultDistance_;
+ real scaleFactor_;
+ rvec deltaR_;
+ RotationType enumRot_;
+ Selection replaceSel_;
+
- topInfo_.fillFromInputFile(inputConfFile_);
- if (topInfo_.mtop()->natoms == 0)
++ gmx_mtop_t top_;
++ std::vector<RVec> x_;
++ matrix box_;
++ int ePBC_;
+};
+
+void InsertMolecules::initOptions(IOptionsContainer *options,
+ ICommandLineOptionsModuleSettings *settings)
+{
+ const char *const desc[] = {
+ "[THISMODULE] inserts [TT]-nmol[tt] copies of the system specified in",
+ "the [TT]-ci[tt] input file. The insertions take place either into",
+ "vacant space in the solute conformation given with [TT]-f[tt], or",
+ "into an empty box given by [TT]-box[tt]. Specifying both [TT]-f[tt]",
+ "and [TT]-box[tt] behaves like [TT]-f[tt], but places a new box",
+ "around the solute before insertions. Any velocities present are",
+ "discarded.",
+ "",
+ "It is possible to also insert into a solvated configuration and",
+ "replace solvent atoms with the inserted atoms. To do this, use",
+ "[TT]-replace[tt] to specify a selection that identifies the atoms",
+ "that can be replaced. The tool assumes that all molecules in this",
+ "selection consist of single residues: each residue from this",
+ "selection that overlaps with the inserted molecules will be removed",
+ "instead of preventing insertion.",
+ "",
+ "By default, the insertion positions are random (with initial seed",
+ "specified by [TT]-seed[tt]). The program iterates until [TT]-nmol[tt]",
+ "molecules have been inserted in the box. Molecules are not inserted",
+ "where the distance between any existing atom and any atom of the",
+ "inserted molecule is less than the sum based on the van der Waals",
+ "radii of both atoms. A database ([TT]vdwradii.dat[tt]) of van der",
+ "Waals radii is read by the program, and the resulting radii scaled",
+ "by [TT]-scale[tt]. If radii are not found in the database, those",
+ "atoms are assigned the (pre-scaled) distance [TT]-radius[tt].",
+ "Note that the usefulness of those radii depends on the atom names,",
+ "and thus varies widely with force field.",
+ "",
+ "A total of [TT]-nmol[tt] * [TT]-try[tt] insertion attempts are made",
+ "before giving up. Increase [TT]-try[tt] if you have several small",
+ "holes to fill. Option [TT]-rot[tt] specifies whether the insertion",
+ "molecules are randomly oriented before insertion attempts.",
+ "",
+ "Alternatively, the molecules can be inserted only at positions defined in",
+ "positions.dat ([TT]-ip[tt]). That file should have 3 columns (x,y,z),",
+ "that give the displacements compared to the input molecule position",
+ "([TT]-ci[tt]). Hence, if that file should contain the absolute",
+ "positions, the molecule must be centered on (0,0,0) before using",
+ "[THISMODULE] (e.g. from [gmx-editconf] [TT]-center[tt]).",
+ "Comments in that file starting with # are ignored. Option [TT]-dr[tt]",
+ "defines the maximally allowed displacements during insertial trials.",
+ "[TT]-try[tt] and [TT]-rot[tt] work as in the default mode (see above)."
+ };
+
+ settings->setHelpText(desc);
+
+ std::shared_ptr<SelectionOptionBehavior> selectionOptionBehavior(
+ new SelectionOptionBehavior(&selections_, this));
+ settings->addOptionsBehavior(selectionOptionBehavior);
+
+ // TODO: Replace use of legacyType.
+ options->addOption(FileNameOption("f")
+ .legacyType(efSTX).inputFile()
+ .store(&inputConfFile_)
+ .defaultBasename("protein")
+ .description("Existing configuration to insert into"));
+ options->addOption(FileNameOption("ci")
+ .legacyType(efSTX).inputFile().required()
+ .store(&insertConfFile_)
+ .defaultBasename("insert")
+ .description("Configuration to insert"));
+ options->addOption(FileNameOption("ip")
+ .filetype(eftGenericData).inputFile()
+ .store(&positionFile_)
+ .defaultBasename("positions")
+ .description("Predefined insertion trial positions"));
+ options->addOption(FileNameOption("o")
+ .legacyType(efSTO).outputFile().required()
+ .store(&outputConfFile_)
+ .defaultBasename("out")
+ .description("Output configuration after insertion"));
+
+ options->addOption(SelectionOption("replace").onlyAtoms()
+ .store(&replaceSel_)
+ .description("Atoms that can be removed if overlapping"));
+ selectionOptionBehavior->initOptions(options);
+
+ options->addOption(RealOption("box").vector()
+ .store(newBox_).storeIsSet(&bBox_)
+ .description("Box size (in nm)"));
+ options->addOption(IntegerOption("nmol")
+ .store(&nmolIns_)
+ .description("Number of extra molecules to insert"));
+ options->addOption(IntegerOption("try")
+ .store(&nmolTry_)
+ .description("Try inserting [TT]-nmol[tt] times [TT]-try[tt] times"));
+ options->addOption(IntegerOption("seed")
+ .store(&seed_)
+ .description("Random generator seed (0 means generate)"));
+ options->addOption(RealOption("radius")
+ .store(&defaultDistance_)
+ .description("Default van der Waals distance"));
+ options->addOption(RealOption("scale")
+ .store(&scaleFactor_)
+ .description("Scale factor to multiply Van der Waals radii from the database in share/gromacs/top/vdwradii.dat. The default value of 0.57 yields density close to 1000 g/l for proteins in water."));
+ options->addOption(RealOption("dr").vector()
+ .store(deltaR_)
+ .description("Allowed displacement in x/y/z from positions in [TT]-ip[tt] file"));
+ options->addOption(EnumOption<RotationType>("rot").enumValue(cRotationEnum)
+ .store(&enumRot_)
+ .description("Rotate inserted molecules randomly"));
+}
+
+void InsertMolecules::optionsFinished()
+{
+ if (nmolIns_ <= 0 && positionFile_.empty())
+ {
+ GMX_THROW(InconsistentInputError("Either -nmol must be larger than 0, "
+ "or positions must be given with -ip."));
+ }
+ if (inputConfFile_.empty() && !bBox_)
+ {
+ GMX_THROW(InconsistentInputError("When no solute (-f) is specified, "
+ "a box size (-box) must be specified."));
+ }
+ if (replaceSel_.isValid() && inputConfFile_.empty())
+ {
+ GMX_THROW(InconsistentInputError("Replacement (-replace) only makes sense "
+ "together with an existing configuration (-f)."));
+ }
+
+ if (!inputConfFile_.empty())
+ {
++ bool bTprFileWasRead;
++ rvec *temporaryX = nullptr;
+ fprintf(stderr, "Reading solute configuration\n");
- const char *outputTitle = topInfo_.name();
- std::vector<RVec> xOutput;
- matrix box = {{ 0 }};
- if (topInfo_.hasTopology())
- {
- xOutput = copyOf(topInfo_.x());
- topInfo_.getBox(box);
- }
- else
- {
- xOutput.resize(topInfo_.mtop()->natoms);
- }
- auto atomsSolute = topInfo_.copyAtoms();
++ readConfAndTopology(inputConfFile_.c_str(), &bTprFileWasRead, &top_,
++ &ePBC_, &temporaryX, nullptr, box_);
++ x_.assign(temporaryX, temporaryX + top_.natoms);
++ sfree(temporaryX);
++ if (top_.natoms == 0)
+ {
+ fprintf(stderr, "Note: no atoms in %s\n", inputConfFile_.c_str());
+ }
+ }
+}
+
+int InsertMolecules::run()
+{
- set_pbc(&pbc, topInfo_.ePBC(), box);
+ std::set<int> removableAtoms;
+ if (replaceSel_.isValid())
+ {
+ t_pbc pbc;
- fr->natoms = topInfo_.mtop()->natoms;
++ set_pbc(&pbc, ePBC_, box_);
+ t_trxframe *fr;
+ snew(fr, 1);
- fr->x = as_rvec_array(xOutput.data());
++ fr->natoms = x_.size();
+ fr->bX = TRUE;
- int ePBCForOutput = topInfo_.ePBC();
++ fr->x = as_rvec_array(x_.data());
+ selections_.evaluate(fr, &pbc);
+ sfree(fr);
+ removableAtoms.insert(replaceSel_.atomIndices().begin(),
+ replaceSel_.atomIndices().end());
+ // TODO: It could be nice to check that removableAtoms contains full
+ // residues, since we anyways remove whole residues instead of
+ // individual atoms.
+ }
+
- clear_mat(box);
- box[XX][XX] = newBox_[XX];
- box[YY][YY] = newBox_[YY];
- box[ZZ][ZZ] = newBox_[ZZ];
++ int ePBCForOutput = ePBC_;
+ if (bBox_)
+ {
+ ePBCForOutput = epbcXYZ;
- if (det(box) == 0)
++ clear_mat(box_);
++ box_[XX][XX] = newBox_[XX];
++ box_[YY][YY] = newBox_[YY];
++ box_[ZZ][ZZ] = newBox_[ZZ];
+ }
- fprintf(stderr, "Reading molecule configuration\n");
- TopologyInformation topInfoForInsertedMolecule;
- topInfoForInsertedMolecule.fillFromInputFile(insertConfFile_);
- auto atomsInserted = topInfoForInsertedMolecule.atoms();
- std::vector<RVec> xInserted = copyOf(topInfoForInsertedMolecule.x());
-
- if (topInfoForInsertedMolecule.mtop()->natoms == 0)
- {
- gmx_fatal(FARGS, "No molecule in %s, please check your input",
- insertConfFile_.c_str());
- }
- if (outputTitle == nullptr)
- {
- outputTitle = topInfoForInsertedMolecule.name();
- }
- if (positionFile_.empty())
++ if (det(box_) == 0)
+ {
+ gmx_fatal(FARGS, "Undefined solute box.\nCreate one with gmx editconf "
+ "or give explicit -box command line option");
+ }
+
- center_molecule(xInserted);
++ gmx_mtop_t topInserted;
++ t_atoms atomsInserted;
++ std::vector<RVec> xInserted;
+ {
- auto symtabInserted = duplicateSymtab(&topInfo_.mtop()->symtab);
- const sfree_guard symtabInsertedGuard(symtabInserted);
++ bool bTprFileWasRead;
++ int ePBC_dummy;
++ matrix box_dummy;
++ rvec *temporaryX;
++ readConfAndTopology(insertConfFile_.c_str(), &bTprFileWasRead, &topInserted,
++ &ePBC_dummy, &temporaryX, nullptr, box_dummy);
++ xInserted.assign(temporaryX, temporaryX + topInserted.natoms);
++ sfree(temporaryX);
++ atomsInserted = gmx_mtop_global_atoms(&topInserted);
++ if (atomsInserted.nr == 0)
++ {
++ gmx_fatal(FARGS, "No molecule in %s, please check your input",
++ insertConfFile_.c_str());
++ }
++ if (top_.name == nullptr)
++ {
++ top_.name = topInserted.name;
++ }
++ if (positionFile_.empty())
++ {
++ center_molecule(xInserted);
++ }
+ }
+
- atomsSolute.get(), symtabInserted, &xOutput, removableAtoms, *atomsInserted, xInserted,
- ePBCForOutput, box, positionFile_, deltaR_, enumRot_);
++ t_atoms atoms = gmx_mtop_global_atoms(&top_);
++
+ /* add nmol_ins molecules of atoms_ins
+ in random orientation at random place */
+ insert_mols(nmolIns_, nmolTry_, seed_, defaultDistance_, scaleFactor_,
- write_sto_conf(outputConfFile_.c_str(), outputTitle, atomsSolute.get(),
- as_rvec_array(xOutput.data()), nullptr, ePBCForOutput, box);
++ &atoms, &top_.symtab, &x_, removableAtoms, atomsInserted, xInserted,
++ ePBCForOutput, box_, positionFile_, deltaR_, enumRot_);
+
+ /* write new configuration to file confout */
+ fprintf(stderr, "Writing generated configuration to %s\n",
+ outputConfFile_.c_str());
- atomsSolute->nr, atomsSolute->nres);
- done_symtab(symtabInserted);
++ write_sto_conf(outputConfFile_.c_str(), *top_.name, &atoms,
++ as_rvec_array(x_.data()), nullptr, ePBCForOutput, box_);
+
+ /* print size of generated configuration */
+ fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
++ atoms.nr, atoms.nres);
++
++ done_atom(&atoms);
++ done_atom(&atomsInserted);
++
+ return 0;
+}
+
+} // namespace
+
+
+const char* InsertMoleculesInfo::name()
+{
+ static const char* name = "insert-molecules";
+ return name;
+}
+
+const char* InsertMoleculesInfo::shortDescription()
+{
+ static const char* shortDescription =
+ "Insert molecules into existing vacancies";
+ return shortDescription;
+}
+
+ICommandLineOptionsModulePointer InsertMoleculesInfo::create()
+{
+ return ICommandLineOptionsModulePointer(new InsertMolecules());
+}
+
+} // namespace gmx
remover.removeMarkedAtoms(atoms);
}
- static void add_solv(const char *filename,
- const gmx::TopologyInformation &topInfo,
- t_atoms *atoms,
+ static void add_solv(const char *filename, t_atoms *atoms,
+ t_symtab *symtab,
std::vector<RVec> *x, std::vector<RVec> *v,
- matrix box, AtomProperties *aps,
- int ePBC, matrix box, gmx_atomprop_t aps,
++ int ePBC, matrix box, AtomProperties *aps,
real defaultDistance, real scaleFactor,
real rshell, int max_sol)
{
done_atom(newatoms);
sfree(newatoms);
}
+ if (atomsSolvent)
+ {
+ done_atom(atomsSolvent);
+ sfree(atomsSolvent);
+ }
}
-static void update_top(t_atoms *atoms, int firstSolventResidueIndex, matrix box, int NFILE, t_filenm fnm[],
- gmx_atomprop_t aps)
+static void update_top(t_atoms *atoms,
+ int firstSolventResidueIndex,
+ matrix box,
+ int NFILE,
+ t_filenm fnm[],
+ AtomProperties *aps)
{
FILE *fpin, *fpout;
char buf[STRLEN*2], buf2[STRLEN], *temp;
"a box size (-box) must be specified");
}
- aps = gmx_atomprop_init();
+ AtomProperties aps;
- gmx::TopologyInformation topInfo;
- std::vector<RVec> x, v;
- matrix box = {{ 0 }};
+ /* solute configuration data */
+ gmx_mtop_t top;
+ std::vector<RVec> x, v;
+ matrix box = {{ 0 }};
+ int ePBC = -1;
+ t_atoms *atoms;
+ snew(atoms, 1);
if (bProt)
{
/* Generate a solute configuration */
"or give explicit -box command line option");
}
- add_solv(solventFileName, topInfo, atoms.get(), &x, &v, box,
+ add_solv(solventFileName, atoms, &top.symtab, &x, &v, ePBCForOutput, box,
- aps, defaultDistance, scaleFactor, r_shell, max_sol);
+ &aps, defaultDistance, scaleFactor, r_shell, max_sol);
/* write new configuration 1 to file confout */
confout = ftp2fn(efSTO, NFILE, fnm);
/* print size of generated configuration */
fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
atoms->nr, atoms->nres);
- update_top(atoms.get(), firstSolventResidueIndex, box, NFILE, fnm, &aps);
- update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, aps);
++ update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, &aps);
+
+ done_atom(atoms);
+ sfree(atoms);
- gmx_atomprop_destroy(aps);
output_env_done(oenv);
return 0;
}
/* launch local nonbonded work on GPU */
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
wallcycle_start(wcycle, ewcLAUNCH_GPU);
/* launch non-local nonbonded tasks on GPU */
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork)
--- /dev/null
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Utility constant and function declaration for the CUDA non-bonded kernels.
+ * This header should be included once at the top level, just before the
+ * kernels are included (has to be preceded by nbnxn_cuda_types.h).
+ *
+ * \author Szilárd Páll <pall.szilard@gmail.com>
+ * \ingroup module_nbnxm
+ */
+#include <assert.h>
+
+/* Note that floating-point constants in CUDA code should be suffixed
+ * with f (e.g. 0.5f), to stop the compiler producing intermediate
+ * code that is in double precision.
+ */
+
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+
+#include "nbnxm_cuda_types.h"
+
+#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
+#define NBNXM_CUDA_KERNEL_UTILS_CUH
+
+/*! \brief Log of the i and j cluster size.
+ * change this together with c_clSize !*/
+static const int __device__ c_clSizeLog2 = 3;
+/*! \brief Square of cluster size. */
+static const int __device__ c_clSizeSq = c_clSize*c_clSize;
+/*! \brief j-cluster size after split (4 in the current implementation). */
+static const int __device__ c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+/*! \brief Stride in the force accumualation buffer */
+static const int __device__ c_fbufStride = c_clSizeSq;
+/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
+static const unsigned __device__ superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+
+static const float __device__ c_oneSixth = 0.16666667f;
+static const float __device__ c_oneTwelveth = 0.08333333f;
+
+
+/*! Convert LJ sigma,epsilon parameters to C6,C12. */
+static __forceinline__ __device__
+void convert_sigma_epsilon_to_c6_c12(const float sigma,
+ const float epsilon,
+ float *c6,
+ float *c12)
+{
+ float sigma2, sigma6;
+
+ sigma2 = sigma * sigma;
+ sigma6 = sigma2 *sigma2 * sigma2;
+ *c6 = epsilon * sigma6;
+ *c12 = *c6 * sigma6;
+}
+
+/*! Apply force switch, force + energy version. */
+static __forceinline__ __device__
+void calculate_force_switch_F(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force-only version. */
+static __forceinline__ __device__
+void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+ float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
+ float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
+ float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
+ float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ *E_lj +=
+ c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+static __forceinline__ __device__
+void calculate_potential_switch_F(const cu_nbparam_t nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam.vdw_switch.c3;
+ float switch_V4 = nbparam.vdw_switch.c4;
+ float switch_V5 = nbparam.vdw_switch.c5;
+ float switch_F2 = 3*nbparam.vdw_switch.c3;
+ float switch_F3 = 4*nbparam.vdw_switch.c4;
+ float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+
+ /* Unlike in the F+E kernel, conditional is faster here */
+ if (r_switch > 0.0f)
+ {
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ }
+}
+
+/*! Apply potential switch, force + energy version. */
+static __forceinline__ __device__
+void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam.vdw_switch.c3;
+ float switch_V4 = nbparam.vdw_switch.c4;
+ float switch_V5 = nbparam.vdw_switch.c5;
+ float switch_F2 = 3*nbparam.vdw_switch.c3;
+ float switch_F3 = 4*nbparam.vdw_switch.c4;
+ float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ /* Unlike in the F-only kernel, masking is faster here */
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ *E_lj *= sw;
+}
+
+
+/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
+ int typei,
+ int typej)
+{
+#if DISABLE_CUDA_TEXTURES
+ return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
+#else
+ return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
+#endif /* DISABLE_CUDA_TEXTURES */
+}
+
+
+/*! Calculate LJ-PME grid force contribution with
+ * geometric combination rule.
+ */
+static __forceinline__ __device__
+void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+ c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * geometric combination rule.
+ */
+static __forceinline__ __device__
+void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+ c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam.sh_lj_ewald*int_bit;
+ *E_lj += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Fetch per-type LJ parameters.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
+ int type)
+{
+ float2 c6c12;
+#if DISABLE_CUDA_TEXTURES
+ /* Force an 8-byte fetch to save a memory instruction. */
+ float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
+ c6c12 = LDG(&nbfp_comb[type]);
+#else
+ /* NOTE: as we always do 8-byte aligned loads, we could
+ fetch float2 here too just as above. */
+ c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
+ c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
+#endif /* DISABLE_CUDA_TEXTURES */
+
+ return c6c12;
+}
+
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
+ * Lorentz-Berthelot combination rule.
+ * We use a single F+E kernel with conditional because the performance impact
+ * of this is pretty small and LB on the CPU is anyway very slow.
+ */
+static __forceinline__ __device__
+void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ float sigma, sigma2, epsilon;
+
+ /* sigma and epsilon are scaled to give 6*C6 */
+ float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
+ float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
+
+ sigma = c6c12_i.x + c6c12_j.x;
+ epsilon = c6c12_i.y * c6c12_j.y;
+
+ sigma2 = sigma*sigma;
+ c6grid = epsilon*sigma2*sigma2*sigma2;
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ if (E_lj != nullptr)
+ {
+ float sh_mask;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam.sh_lj_ewald*int_bit;
+ *E_lj += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ }
+}
+
+
+/*! Fetch two consecutive values from the Ewald correction F*r table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
+ int index)
+{
+ float2 d;
+
+#if DISABLE_CUDA_TEXTURES
+ /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
+ d.x = LDG(&nbparam.coulomb_tab[index]);
+ d.y = LDG(&nbparam.coulomb_tab[index + 1]);
+#else
+ d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+ d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+#endif // DISABLE_CUDA_TEXTURES
+
+ return d;
+}
+
+/*! Linear interpolation using exactly two FMA operations.
+ *
+ * Implements numeric equivalent of: (1-t)*d0 + t*d1
+ * Note that CUDA does not have fnms, otherwise we'd use
+ * fma(t, d1, fnms(t, d0, d0)
+ * but input modifiers are designed for this and are fast.
+ */
+template <typename T>
+__forceinline__ __host__ __device__
+T lerp(T d0, T d1, T t)
+{
+ return fma(t, d1, fma(-t, d0, d0));
+}
+
+/*! Interpolate Ewald coulomb force correction using the F*r table.
+ */
+static __forceinline__ __device__
+float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
+ float r)
+{
+ float normalized = nbparam.coulomb_tab_scale * r;
+ int index = (int) normalized;
+ float fraction = normalized - index;
+
+ float2 d01 = fetch_coulomb_force_r(nbparam, index);
+
+ return lerp(d01.x, d01.y, fraction);
+}
+
+/*! Fetch C6 and C12 from the parameter table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+void fetch_nbfp_c6_c12(float &c6,
+ float &c12,
+ const cu_nbparam_t nbparam,
+ int baseIndex)
+{
+#if DISABLE_CUDA_TEXTURES
+ /* Force an 8-byte fetch to save a memory instruction. */
+ float2 *nbfp = (float2 *)nbparam.nbfp;
+ float2 c6c12;
+ c6c12 = LDG(&nbfp[baseIndex]);
+ c6 = c6c12.x;
+ c12 = c6c12.y;
+#else
+ /* NOTE: as we always do 8-byte aligned loads, we could
+ fetch float2 here too just as above. */
+ c6 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
+ c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
+#endif // DISABLE_CUDA_TEXTURES
+}
+
+
+/*! Calculate analytical Ewald correction term. */
+static __forceinline__ __device__
+float pmecorrF(float z2)
+{
+ const float FN6 = -1.7357322914161492954e-8f;
+ const float FN5 = 1.4703624142580877519e-6f;
+ const float FN4 = -0.000053401640219807709149f;
+ const float FN3 = 0.0010054721316683106153f;
+ const float FN2 = -0.019278317264888380590f;
+ const float FN1 = 0.069670166153766424023f;
+ const float FN0 = -0.75225204789749321333f;
+
+ const float FD4 = 0.0011193462567257629232f;
+ const float FD3 = 0.014866955030185295499f;
+ const float FD2 = 0.11583842382862377919f;
+ const float FD1 = 0.50736591960530292870f;
+ const float FD0 = 1.0f;
+
+ float z4;
+ float polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = z2*z2;
+
+ polyFD0 = FD4*z4 + FD2;
+ polyFD1 = FD3*z4 + FD1;
+ polyFD0 = polyFD0*z4 + FD0;
+ polyFD0 = polyFD1*z2 + polyFD0;
+
+ polyFD0 = 1.0f/polyFD0;
+
+ polyFN0 = FN6*z4 + FN4;
+ polyFN1 = FN5*z4 + FN3;
+ polyFN0 = polyFN0*z4 + FN2;
+ polyFN1 = polyFN1*z4 + FN1;
+ polyFN0 = polyFN0*z4 + FN0;
+ polyFN0 = polyFN1*z2 + polyFN0;
+
+ return polyFN0*polyFD0;
+}
+
+/*! Final j-force reduction; this generic implementation works with
+ * arbitrary array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_j_generic(float *f_buf, float3 *fout,
+ int tidxi, int tidxj, int aidx)
+{
+ if (tidxi < 3)
+ {
+ float f = 0.0f;
+ for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+ {
+ f += f_buf[c_fbufStride * tidxi + j];
+ }
+
+ atomicAdd((&fout[aidx].x)+tidxi, f);
+ }
+}
+
+/*! Final j-force reduction; this implementation only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_j_warp_shfl(float3 f, float3 *fout,
+ int tidxi, int aidx,
+ const unsigned int activemask)
+{
+ f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+ f.y += gmx_shfl_up_sync (activemask, f.y, 1);
+ f.z += gmx_shfl_down_sync(activemask, f.z, 1);
+
+ if (tidxi & 1)
+ {
+ f.x = f.y;
+ }
+
+ f.x += gmx_shfl_down_sync(activemask, f.x, 2);
+ f.z += gmx_shfl_up_sync (activemask, f.z, 2);
+
+ if (tidxi & 2)
+ {
+ f.x = f.z;
+ }
+
+ f.x += gmx_shfl_down_sync(activemask, f.x, 4);
+
+ if (tidxi < 3)
+ {
+ atomicAdd((&fout[aidx].x) + tidxi, f.x);
+ }
+}
+
+/*! Final i-force reduction; this generic implementation works with
+ * arbitrary array sizes.
+ * TODO: add the tidxi < 3 trick
+ */
+static __forceinline__ __device__
+void reduce_force_i_generic(float *f_buf, float3 *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int aidx)
+{
+ if (tidxj < 3)
+ {
+ float f = 0.0f;
+ for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
+ {
+ f += f_buf[tidxj * c_fbufStride + j];
+ }
+
+ atomicAdd(&fout[aidx].x + tidxj, f);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += f;
+ }
+ }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int aidx)
+{
+ int i, j;
+ float f;
+
+ assert(c_clSize == 1 << c_clSizeLog2);
+
+ /* Reduce the initial c_clSize values for each i atom to half
+ * every step by using c_clSize * i threads.
+ * Can't just use i as loop variable because than nvcc refuses to unroll.
+ */
+ i = c_clSize/2;
+#pragma unroll 5
+ for (j = c_clSizeLog2 - 1; j > 0; j--)
+ {
+ if (tidxj < i)
+ {
+
+ f_buf[ tidxj * c_clSize + tidxi] += f_buf[ (tidxj + i) * c_clSize + tidxi];
+ f_buf[ c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[ c_fbufStride + (tidxj + i) * c_clSize + tidxi];
+ f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
+ }
+ i >>= 1;
+ }
+
+ /* i == 1, last reduction step, writing to global mem */
+ if (tidxj < 3)
+ {
+ /* tidxj*c_fbufStride selects x, y or z */
+ f = f_buf[tidxj * c_fbufStride + tidxi] +
+ f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
+
+ atomicAdd(&(fout[aidx].x) + tidxj, f);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += f;
+ }
+ }
+
+}
+
+/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
+ * on whether the size of the array to be reduced is power of two or not.
+ */
+static __forceinline__ __device__
+void reduce_force_i(float *f_buf, float3 *f,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxi, int tidxj, int ai)
+{
+ if ((c_clSize & (c_clSize - 1)))
+ {
+ reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ }
+ else
+ {
+ reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ }
+}
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
+ float *fshift_buf, bool bCalcFshift,
+ int tidxj, int aidx,
+ const unsigned int activemask)
+{
+ fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
+ fin.y += gmx_shfl_up_sync (activemask, fin.y, c_clSize);
+ fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
+
+ if (tidxj & 1)
+ {
+ fin.x = fin.y;
+ }
+
+ fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
+ fin.z += gmx_shfl_up_sync (activemask, fin.z, 2*c_clSize);
+
+ if (tidxj & 2)
+ {
+ fin.x = fin.z;
+ }
+
+ /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
+ if ((tidxj & 3) < 3)
+ {
+ atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
+
+ if (bCalcFshift)
+ {
+ *fshift_buf += fin.x;
+ }
+ }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_energy_pow2(volatile float *buf,
+ float *e_lj, float *e_el,
+ unsigned int tidx)
+{
+ float e1, e2;
+
+ unsigned int i = warp_size/2;
+
+ /* Can't just use i as loop variable because than nvcc refuses to unroll. */
+#pragma unroll 10
+ for (int j = warp_size_log2 - 1; j > 0; j--)
+ {
+ if (tidx < i)
+ {
+ buf[ tidx] += buf[ tidx + i];
+ buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
+ }
+ i >>= 1;
+ }
+
+ // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
+ // memory locations to make sense
+ /* last reduction step, writing to global mem */
+ if (tidx == 0)
+ {
+ e1 = buf[ tidx] + buf[ tidx + i];
+ e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
+
+ atomicAdd(e_lj, e1);
+ atomicAdd(e_el, e2);
+ }
+}
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ */
+static __forceinline__ __device__
+void reduce_energy_warp_shfl(float E_lj, float E_el,
+ float *e_lj, float *e_el,
+ int tidx,
+ const unsigned int activemask)
+{
+ int i, sh;
+
+ sh = 1;
+#pragma unroll 5
+ for (i = 0; i < 5; i++)
+ {
+ E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
+ E_el += gmx_shfl_down_sync(activemask, E_el, sh);
+ sh += sh;
+ }
+
+ /* The first thread in the warp writes the reduced energies */
+ if (tidx == 0 || tidx == warp_size)
+ {
+ atomicAdd(e_lj, E_lj);
+ atomicAdd(e_el, E_el);
+ }
+}
+
+#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
--- /dev/null
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#define GMX_DOUBLE 0
+
+#include "gromacs/gpu_utils/device_utils.clh"
+#include "gromacs/gpu_utils/vectype_ops.clh"
+#include "gromacs/nbnxm/constants.h"
+#include "gromacs/pbcutil/ishift.h"
+
+#include "nbnxm_ocl_consts.h"
+
+#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
+#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster
+
+#define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
+
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
+ * Note that this should precede the kernel_utils include.
+ */
+#define USE_CJ_PREFETCH 1
+#else
+#define USE_CJ_PREFETCH 0
+#endif
+
+#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
+#define HAVE_SUBGROUP 1
+#else
+#define HAVE_SUBGROUP 0
+#endif
+
+#ifdef cl_intel_subgroups
+#define HAVE_INTEL_SUBGROUP 1
+#else
+#define HAVE_INTEL_SUBGROUP 0
+#endif
+
+#if defined _INTEL_SOURCE_
+#define SUBGROUP_SIZE 8
+#elif defined _AMD_SOURCE_
+#define SUBGROUP_SIZE 64
+#else
+#define SUBGROUP_SIZE 32
+#endif
+
+#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
+#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
+
+/* 1.0 / sqrt(M_PI) */
+#define M_FLOAT_1_SQRTPI 0.564189583547756f
+
+//-------------------
+
+#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
+#define NBNXN_OPENCL_KERNEL_UTILS_CLH
+
+#if CL_SIZE == 8
+#define WARP_SIZE_LOG2 (5)
+#define CL_SIZE_LOG2 (3)
+#elif CL_SIZE == 4
+#define WARP_SIZE_LOG2 (3)
+#define CL_SIZE_LOG2 (2)
+#else
+#error unsupported CL_SIZE
+#endif
+
+#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE (CL_SIZE_SQ)
+
+#define ONE_SIXTH_F 0.16666667f
+#define ONE_TWELVETH_F 0.08333333f
+
+
+#ifdef __GNUC__
+/* GCC, clang, and some ICC pretending to be GCC */
+# define gmx_unused __attribute__ ((unused))
+#else
+# define gmx_unused
+#endif
+
+// Data structures shared between OpenCL device code and OpenCL host code
+// TODO: review, improve
+// Replaced real by float for now, to avoid including any other header
+typedef struct {
+ float c2;
+ float c3;
+ float cpot;
+} shift_consts_t;
+
+/* Used with potential switching:
+ * rsw = max(r - r_switch, 0)
+ * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ * force = force*dsw - potential*sw
+ * potential *= sw
+ */
+typedef struct {
+ float c3;
+ float c4;
+ float c5;
+} switch_consts_t;
+
+// Data structure shared between the OpenCL device code and OpenCL host code
+// Must not contain OpenCL objects (buffers)
+typedef struct cl_nbparam_params
+{
+
+ int eeltype; /**< type of electrostatics, takes values from #eelCu */
+ int vdwtype; /**< type of VdW impl., takes values from #evdwCu */
+
+ float epsfac; /**< charge multiplication factor */
+ float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
+ float two_k_rf; /**< Reaction-field electrostatics constant */
+ float ewald_beta; /**< Ewald/PME parameter */
+ float sh_ewald; /**< Ewald/PME correction term substracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
+ float rcoulomb_sq; /**< Coulomb cut-off squared */
+
+ float rvdw_sq; /**< VdW cut-off squared */
+ float rvdw_switch; /**< VdW switched cut-off */
+ float rlistOuter_sq; /**< Full, outer pair-list cut-off squared */
+ float rlistInner_sq; /**< Inner, dynamic pruned pair-list cut-off squared XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
+
+ shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */
+ shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
+ switch_consts_t vdw_switch; /**< VdW switch constants */
+
+ /* Ewald Coulomb force table data - accessed through texture memory */
+ float coulomb_tab_scale; /**< table scale/spacing */
+}cl_nbparam_params_t;
+
+typedef struct {
+ int sci; /* i-super-cluster */
+ int shift; /* Shift vector index plus possible flags */
+ int cj4_ind_start; /* Start index into cj4 */
+ int cj4_ind_end; /* End index into cj4 */
+} nbnxn_sci_t;
+
+typedef struct {
+ unsigned int imask; /* The i-cluster interactions mask for 1 warp */
+ int excl_ind; /* Index into the exclusion array for 1 warp */
+} nbnxn_im_ei_t;
+
+typedef struct {
+ int cj[4]; /* The 4 j-clusters */
+ nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
+} nbnxn_cj4_t;
+
+
+typedef struct {
+ unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
+ * each unsigned has bitS for 4*8 i clusters
+ */
+} nbnxn_excl_t;
+
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
+gmx_opencl_inline
+void preloadCj4Generic(__local int *sm_cjPreload,
+ const __global int *gm_cj,
+ int tidxi,
+ int tidxj,
+ bool gmx_unused iMaskCond)
+{
+ /* Pre-load cj into shared memory */
+#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+ if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
+ {
+ sm_cjPreload[tidxi] = gm_cj[tidxi];
+ }
+#else
+ const int c_clSize = CL_SIZE;
+ const int c_nbnxnGpuClusterpairSplit = 2;
+ const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
+ {
+ sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
+ }
+#endif
+}
+
+
+#if USE_SUBGROUP_PRELOAD
+gmx_opencl_inline
+int preloadCj4Subgroup(const __global int *gm_cj)
+{
+ //loads subgroup-size # of elements (8) instead of the 4 required
+ //equivalent to *cjs = *gm_cj
+ return intel_sub_group_block_read((const __global uint *)gm_cj);
+}
+#endif //USE_SUBGROUP_PRELOAD
+
+#if USE_SUBGROUP_PRELOAD
+typedef size_t CjType;
+#else
+typedef __local int* CjType;
+#endif
+
+/*! \brief Preload cj4
+ *
+ * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
+ * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
+ * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
+ *
+ * It is the caller's responsibility to make sure that data is consumed only when
+ * it's ready. This function does not call a barrier.
+ */
+gmx_opencl_inline
+void preloadCj4(CjType gmx_unused *cjs,
+ const __global int gmx_unused *gm_cj,
+ int gmx_unused tidxi,
+ int gmx_unused tidxj,
+ bool gmx_unused iMaskCond)
+{
+#if USE_SUBGROUP_PRELOAD
+ *cjs = preloadCj4Subgroup(gm_cj);
+#elif USE_CJ_PREFETCH
+ preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
+#else
+ //nothing to do
+#endif
+}
+
+gmx_opencl_inline
+int loadCjPreload(__local int * sm_cjPreload,
+ int jm,
+ int gmx_unused tidxi,
+ int gmx_unused tidxj)
+{
+#if defined _AMD_SOURCE_
+ int warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+#else
+ const int c_clSize = CL_SIZE;
+ const int c_nbnxnGpuClusterpairSplit = 2;
+ const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
+#endif
+ return sm_cjPreload[jm + warpLoadOffset];
+}
+
+/* \brief Load a cj given a jm index.
+ *
+ * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
+ */
+gmx_opencl_inline
+int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
+ int jm, int gmx_unused tidxi, int gmx_unused tidxj)
+{
+#if USE_SUBGROUP_PRELOAD
+ return sub_group_broadcast(cjs, jm);
+#elif USE_CJ_PREFETCH
+ return loadCjPreload(cjs, jm, tidxi, tidxj);
+#else
+ return gm_cj[jm];
+#endif
+}
+
+/*! Convert LJ sigma,epsilon parameters to C6,C12. */
+gmx_opencl_inline
+void convert_sigma_epsilon_to_c6_c12(const float sigma,
+ const float epsilon,
+ float *c6,
+ float *c12)
+{
+ float sigma2, sigma6;
+
+ sigma2 = sigma * sigma;
+ sigma6 = sigma2 *sigma2 * sigma2;
+ *c6 = epsilon * sigma6;
+ *c12 = *c6 * sigma6;
+}
+
+
+/*! Apply force switch, force + energy version. */
+gmx_opencl_inline
+void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
- c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force-only version. */
+gmx_opencl_inline
+void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ float repu_shift_V3 = nbparam->repulsion_shift.c3;
+
+ float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
+ float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
+ float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
+ float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++ c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ *E_lj +=
+ c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+gmx_opencl_inline
+void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ const float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam->vdw_switch.c3;
+ float switch_V4 = nbparam->vdw_switch.c4;
+ float switch_V5 = nbparam->vdw_switch.c5;
+ float switch_F2 = nbparam->vdw_switch.c3;
+ float switch_F3 = nbparam->vdw_switch.c4;
+ float switch_F4 = nbparam->vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+
+ /* Unlike in the F+E kernel, conditional is faster here */
+ if (r_switch > 0.0f)
+ {
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ }
+}
+
+/*! Apply potential switch, force + energy version. */
+gmx_opencl_inline
+void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam->vdw_switch.c3;
+ float switch_V4 = nbparam->vdw_switch.c4;
+ float switch_V5 = nbparam->vdw_switch.c5;
+ float switch_F2 = nbparam->vdw_switch.c3;
+ float switch_F3 = nbparam->vdw_switch.c4;
+ float switch_F4 = nbparam->vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam->rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ /* Unlike in the F-only kernel, masking is faster here */
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ *E_lj *= sw;
+}
+
+/*! Calculate LJ-PME grid force contribution with
+ * geometric combination rule.
+ */
+gmx_opencl_inline
+void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+ c6grid = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * geometric combination rule.
+ */
+gmx_opencl_inline
+void calculate_lj_ewald_comb_geom_F_E(__constant const float *nbfp_comb_climg2d,
+ const cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+ c6grid = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam->sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
+ * Lorentz-Berthelot combination rule.
+ * We use a single F+E kernel with conditional because the performance impact
+ * of this is pretty small and LB on the CPU is anyway very slow.
+ */
+gmx_opencl_inline
+void calculate_lj_ewald_comb_LB_F_E(__constant const float *nbfp_comb_climg2d,
+ const cl_nbparam_params_t *nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ bool with_E_lj,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ float sigma, sigma2, epsilon;
+
+ /* sigma and epsilon are scaled to give 6*C6 */
+ sigma = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
+
+ epsilon = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
+
+ sigma2 = sigma*sigma;
+ c6grid = epsilon*sigma2*sigma2*sigma2;
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = exp(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ if (with_E_lj)
+ {
+ float sh_mask;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam->sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ }
+}
+
+/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
+ * Original idea: from the OpenMM project
+ */
+gmx_opencl_inline float
+interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
+ float r,
+ float scale)
+{
+ float normalized = scale * r;
+ int index = (int) normalized;
+ float fract2 = normalized - index;
+ float fract1 = 1.0f - fract2;
+
+ return fract1*coulomb_tab_climg2d[index] +
+ fract2*coulomb_tab_climg2d[index + 1];
+}
+
+/*! Calculate analytical Ewald correction term. */
+gmx_opencl_inline
+float pmecorrF(float z2)
+{
+ const float FN6 = -1.7357322914161492954e-8f;
+ const float FN5 = 1.4703624142580877519e-6f;
+ const float FN4 = -0.000053401640219807709149f;
+ const float FN3 = 0.0010054721316683106153f;
+ const float FN2 = -0.019278317264888380590f;
+ const float FN1 = 0.069670166153766424023f;
+ const float FN0 = -0.75225204789749321333f;
+
+ const float FD4 = 0.0011193462567257629232f;
+ const float FD3 = 0.014866955030185295499f;
+ const float FD2 = 0.11583842382862377919f;
+ const float FD1 = 0.50736591960530292870f;
+ const float FD0 = 1.0f;
+
+ float z4;
+ float polyFN0, polyFN1, polyFD0, polyFD1;
+
+ z4 = z2*z2;
+
+ polyFD0 = FD4*z4 + FD2;
+ polyFD1 = FD3*z4 + FD1;
+ polyFD0 = polyFD0*z4 + FD0;
+ polyFD0 = polyFD1*z2 + polyFD0;
+
+ polyFD0 = 1.0f/polyFD0;
+
+ polyFN0 = FN6*z4 + FN4;
+ polyFN1 = FN5*z4 + FN3;
+ polyFN0 = polyFN0*z4 + FN2;
+ polyFN1 = polyFN1*z4 + FN1;
+ polyFN0 = polyFN0*z4 + FN0;
+ polyFN0 = polyFN1*z2 + polyFN0;
+
+ return polyFN0*polyFD0;
+}
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_j_shfl(float3 fin, __global float *fout,
+ int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
+{
+ /* Only does reduction over 4 elements in cluster. Needs to be changed
+ * for CL_SIZE>4. See CUDA code for required code */
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
+ fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1);
+ fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
+ if ((tidxi & 1) == 1)
+ {
+ fin.x = fin.y;
+ }
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
+ fin.z += intel_sub_group_shuffle_up (fin.z, fin.z, 2);
+ if (tidxi == 2)
+ {
+ fin.x = fin.z;
+ }
+ if (tidxi < 3)
+ {
+ atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
+ }
+}
+#endif
+
+gmx_opencl_inline
+void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+ int tidx = tidxi + tidxj*CL_SIZE;
+ f_buf[ tidx] = fcj_buf.x;
+ f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
+ /* Split the reduction between the first 3 column threads
+ Threads with column id 0 will do the reduction for (float3).x components
+ Threads with column id 1 will do the reduction for (float3).y components
+ Threads with column id 2 will do the reduction for (float3).z components.
+ The reduction is performed for each line tidxj of f_buf. */
+ if (tidxi < 3)
+ {
+ float f = 0.0f;
+ for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
+ {
+ f += f_buf[FBUF_STRIDE * tidxi + j];
+ }
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+ }
+}
+
+/*! Final j-force reduction
+ */
+gmx_opencl_inline
+void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
+ int tidxi, int tidxj, int aidx)
+{
+#if REDUCE_SHUFFLE
+ reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
+#else
+ reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
+#endif
+}
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
+ bool bCalcFshift, int tidxi, int tidxj,
+ int sci, int shift, __global float *fshift)
+{
+ /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
+ * for CL_SIZE>4.*/
+ float2 fshift_buf = 0;
+ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ {
+ int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ float3 fin = fci_buf[ci_offset];
+ fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
+ fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, CL_SIZE);
+ fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
+
+ if (tidxj & 1)
+ {
+ fin.x = fin.y;
+ }
+ /* Threads 0,1 and 2,3 increment x,y for their warp */
+ atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
+ if (bCalcFshift)
+ {
+ fshift_buf[0] += fin.x;
+ }
+ /* Threads 0 and 2 increment z for their warp */
+ if ((tidxj & 1) == 0)
+ {
+ atomicAdd_g_f(&fout[3*aidx+2], fin.z);
+ if (bCalcFshift)
+ {
+ fshift_buf[1] += fin.z;
+ }
+ }
+ }
+ /* add up local shift forces into global mem */
+ if (bCalcFshift)
+ {
+ //Threads 0,1 and 2,3 update x,y
+ atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
+ //Threads 0 and 2 update z
+ if ((tidxj & 1) == 0)
+ {
+ atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
+ }
+ }
+}
+#endif
+
+/*! Final i-force reduction; this implementation works only with power of two
+ * array sizes.
+ */
+gmx_opencl_inline
+void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
+ __global float *fout,
+ bool bCalcFshift,
+ int tidxi, int tidxj,
+ int sci, int shift, __global float *fshift)
+{
+ float fshift_buf = 0;
+ for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ {
+ int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ int tidx = tidxi + tidxj*CL_SIZE;
+ /* store i forces in shmem */
+ f_buf[ tidx] = fci_buf[ci_offset].x;
+ f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+ f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int i, j;
+ /* Reduce the initial CL_SIZE values for each i atom to half
+ * every step by using CL_SIZE * i threads.
+ * Can't just use i as loop variable because than nvcc refuses to unroll.
+ */
+ i = CL_SIZE/2;
+ for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+ {
+ if (tidxj < i)
+ {
+
+ f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ }
+ i >>= 1;
+ }
+ /* needed because
+ * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
+ * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
+ * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
+ */
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* i == 1, last reduction step, writing to global mem */
+ /* Split the reduction between the first 3 line threads
+ Threads with line id 0 will do the reduction for (float3).x components
+ Threads with line id 1 will do the reduction for (float3).y components
+ Threads with line id 2 will do the reduction for (float3).z components. */
+ if (tidxj < 3)
+ {
+ float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
+
+ atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+ if (bCalcFshift)
+ {
+ fshift_buf += f;
+ }
+ }
+ }
+ /* add up local shift forces into global mem */
+ if (bCalcFshift)
+ {
+ /* Only threads with tidxj < 3 will update fshift.
+ The threads performing the update, must be the same as the threads
+ storing the reduction result above.
+ */
+ if (tidxj < 3)
+ {
+ atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
+ }
+ }
+}
+
+/*! Final i-force reduction
+ */
+gmx_opencl_inline
+void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
+ bool bCalcFshift, int tidxi, int tidxj, int sci,
+ int shift, __global float *fshift)
+{
+#if REDUCE_SHUFFLE
+ reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
+ sci, shift, fshift);
+#else
+ reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
+ sci, shift, fshift);
+#endif
+}
+
+
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_energy_shfl(float E_lj, float E_el,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+ E_lj = sub_group_reduce_add(E_lj);
+ E_el = sub_group_reduce_add(E_el);
+ /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
+ * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
+ * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
+ * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
+ * (by supporting c_nbnxnGpuClusterpairSplit=1). */
+ if (tidx == 0 || tidx == WARP_SIZE)
+ {
+ atomicAdd_g_f(e_lj, E_lj);
+ atomicAdd_g_f(e_el, E_el);
+ }
+}
+#endif
+
+/*! Energy reduction; this implementation works only with power of two
+ * array sizes.
+ */
+gmx_opencl_inline
+void reduce_energy_pow2(volatile __local float *buf,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+ int j;
+
+ unsigned int i = WARP_SIZE/2;
+
+ /* Can't just use i as loop variable because than nvcc refuses to unroll. */
+ for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
+ {
+ if (tidx < i)
+ {
+ buf[ tidx] += buf[ tidx + i];
+ buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+ }
+ i >>= 1;
+ }
+
+ /* last reduction step, writing to global mem */
+ if (tidx == 0)
+ {
+ float e1 = buf[ tidx] + buf[ tidx + i];
+ float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+
+ atomicAdd_g_f(e_lj, e1);
+ atomicAdd_g_f(e_el, e2);
+ }
+}
+
+gmx_opencl_inline
+void reduce_energy(volatile __local float gmx_unused *buf,
+ float E_lj, float E_el,
+ volatile __global float *e_lj,
+ volatile __global float *e_el,
+ unsigned int tidx)
+{
+#if REDUCE_SHUFFLE
+ reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
+#else
+ /* flush the energies to shmem and reduce them */
+ buf[ tidx] = E_lj;
+ buf[FBUF_STRIDE + tidx] = E_el;
+ reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+#endif
+}
+
+gmx_opencl_inline
+bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
+{
+ if (pred)
+ {
+ warp_any[widx] = 1;
+ }
+
+ bool ret = warp_any[widx];
+
+ warp_any[widx] = 0;
+
+ return ret;
+}
+
+//! Returns a true if predicate is true for any work item in warp
+gmx_opencl_inline
+bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
+{
+#if USE_SUBGROUP_ANY
+ return sub_group_any(pred);
+#else
+ return gmx_sub_group_any_localmem(warp_any, widx, pred);
+#endif
+}
+
+#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */