Merge branch 'release-2019' into master

author Paul Bauer <paul.bauer.q@gmail.com>

Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)

committer Paul Bauer <paul.bauer.q@gmail.com>

Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
author Paul Bauer <paul.bauer.q@gmail.com>
Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
diff --cc cmake/gmxManageNvccConfig.cmake
Simple merge
diff --cc cmake/gmxSimdFlags.cmake
Simple merge
diff --cc cmake/gmxVersionInfo.cmake
Simple merge
diff --cc docs/CMakeLists.txt

index 8ad2e04f5a0fa9e773899a049354432bb002a513,65c8c5756f5b14ece9c420aa7be1ef112ef0bec5..9f6c80457a8492f81970588fc1ee8a9412cb69bf
--- 1/docs/CMakeLists.txt
--- 2/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@@ -368,15 -367,7 +368,16 @@@ if (SPHINX_FOUND
           how-to/visualize.rst
           install-guide/index.rst
           release-notes/index.rst
+ +        release-notes/highlights.rst
+ +        release-notes/features.rst
+ +        release-notes/performance.rst
+ +        release-notes/tools.rst
+ +        release-notes/bugs-fixed.rst
+ +        release-notes/removed-functionality.rst
+ +        release-notes/deprecated-functionality.rst
+ +        release-notes/portability.rst
+ +        release-notes/miscellaneous.rst
+         release-notes/2019/2019.2.rst
           release-notes/2019/2019.1.rst
           release-notes/2019/major/highlights.rst
           release-notes/2019/major/features.rst
diff --cc docs/install-guide/index.rst
Simple merge
diff --cc docs/release-notes/index.rst
Simple merge
diff --cc docs/user-guide/mdp-options.rst
Simple merge
diff --cc src/gromacs/CMakeLists.txt
Simple merge
diff --cc src/gromacs/domdec/partition.cpp
Simple merge
diff --cc src/gromacs/ewald/pme.cpp
Simple merge
diff --cc src/gromacs/ewald/tests/pmetestcommon.cpp
Simple merge
diff --cc src/gromacs/ewald/tests/testhardwarecontexts.cpp

index 6157cd3dd1bc251fe7a692e58c5800ba70865aab,5cbbcc311951ab6a4620288092dbdabd36c00943..e15c37c3a1f1c2eae738e2b2f7fd84c77d6e1682
--- 1/src/gromacs/ewald/tests/testhardwarecontexts.cpp
--- 2/src/gromacs/ewald/tests/testhardwarecontexts.cpp
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
@@@ -110,10 -109,11 +110,11 @@@ static gmx_hw_info_t *hardwareInit(
   
   void PmeTestEnvironment::SetUp()
   {
- -    hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
+ +    hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
   
       hardwareInfo_ = hardwareInit();
-     if (!pme_gpu_supports_build(*hardwareInfo_, nullptr))
+     if (!pme_gpu_supports_build(nullptr) ||
+         !pme_gpu_supports_hardware(*hardwareInfo_, nullptr))
       {
           // PME can only run on the CPU, so don't make any more test contexts.
           return;
diff --cc src/gromacs/gmxpreprocess/insert_molecules.cpp

index 572c385516faf0094710afd2bfaeb227ca9dfcde,0000000000000000000000000000000000000000..8f7df5bbbce7ffc533f485cd36be65b0f8e33dde

mode 100644,000000..100644
--- 1/src/gromacs/gmxpreprocess/insert_molecules.cpp
--- /dev/null
+++ b/src/gromacs/gmxpreprocess/insert_molecules.cpp
@@@ -1,616 -1,0 +1,621 @@@
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team.
+ + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +#include "gmxpre.h"
+ +
+ +#include "insert_molecules.h"
+ +
+ +#include <algorithm>
+ +#include <memory>
+ +#include <set>
+ +#include <string>
+ +#include <vector>
+ +
+ +#include "gromacs/commandline/cmdlineoptionsmodule.h"
+ +#include "gromacs/fileio/confio.h"
+ +#include "gromacs/fileio/filetypes.h"
+ +#include "gromacs/fileio/xvgr.h"
+ +#include "gromacs/gmxlib/conformation_utilities.h"
+ +#include "gromacs/gmxpreprocess/makeexclusiondistances.h"
+ +#include "gromacs/math/functions.h"
+ +#include "gromacs/math/utilities.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/options/basicoptions.h"
+ +#include "gromacs/options/filenameoption.h"
+ +#include "gromacs/options/ioptionscontainer.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/random/threefry.h"
+ +#include "gromacs/random/uniformrealdistribution.h"
+ +#include "gromacs/selection/nbsearch.h"
+ +#include "gromacs/selection/selection.h"
+ +#include "gromacs/selection/selectioncollection.h"
+ +#include "gromacs/selection/selectionoption.h"
+ +#include "gromacs/selection/selectionoptionbehavior.h"
+ +#include "gromacs/topology/atomprop.h"
+ +#include "gromacs/topology/atoms.h"
+ +#include "gromacs/topology/atomsbuilder.h"
+ +#include "gromacs/topology/mtop_util.h"
+ +#include "gromacs/topology/symtab.h"
+ +#include "gromacs/topology/topology.h"
+ +#include "gromacs/trajectory/trajectoryframe.h"
- #include "gromacs/utility/unique_cptr.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/exceptions.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/smalloc.h"
-         gmx_mtop_t *getTopology(bool /*required*/) override { return topInfo_.mtop(); }
+ +
+ +using gmx::RVec;
+ +
+ +/* enum for random rotations of inserted solutes */
+ +enum RotationType {
+ +    en_rotXYZ, en_rotZ, en_rotNone
+ +};
+ +const char *const cRotationEnum[] = {"xyz", "z", "none"};
+ +
+ +static void center_molecule(gmx::ArrayRef<RVec> x)
+ +{
+ +    rvec center = {0};
+ +    for (auto &xi : x)
+ +    {
+ +        rvec_inc(center, xi);
+ +    }
+ +    svmul(1.0/x.size(), center, center);
+ +    for (auto &xi : x)
+ +    {
+ +        rvec_dec(xi, center);
+ +    }
+ +}
+ +
+ +static void generate_trial_conf(gmx::ArrayRef<RVec> xin,
+ +                                const rvec offset, RotationType enum_rot,
+ +                                gmx::DefaultRandomEngine * rng,
+ +                                std::vector<RVec> *xout)
+ +{
+ +    gmx::UniformRealDistribution<real> dist(0, 2.0*M_PI);
+ +    xout->assign(xin.begin(), xin.end());
+ +
+ +    real alfa = 0.0, beta = 0.0, gamma = 0.0;
+ +    switch (enum_rot)
+ +    {
+ +        case en_rotXYZ:
+ +            alfa  = dist(*rng);
+ +            beta  = dist(*rng);
+ +            gamma = dist(*rng);
+ +            break;
+ +        case en_rotZ:
+ +            alfa  = beta = 0.;
+ +            gamma = dist(*rng);
+ +            break;
+ +        case en_rotNone:
+ +            alfa = beta = gamma = 0.;
+ +            break;
+ +    }
+ +    if (enum_rot == en_rotXYZ || enum_rot == en_rotZ)
+ +    {
+ +        rotate_conf(xout->size(), as_rvec_array(xout->data()), nullptr, alfa, beta, gamma);
+ +    }
+ +    for (size_t i = 0; i < xout->size(); ++i)
+ +    {
+ +        rvec_inc((*xout)[i], offset);
+ +    }
+ +}
+ +
+ +static bool isInsertionAllowed(gmx::AnalysisNeighborhoodSearch *search,
+ +                               const std::vector<real>         &exclusionDistances,
+ +                               const std::vector<RVec>         &x,
+ +                               const std::vector<real>         &exclusionDistances_insrt,
+ +                               const t_atoms                   &atoms,
+ +                               const std::set<int>             &removableAtoms,
+ +                               gmx::AtomsRemover               *remover)
+ +{
+ +    gmx::AnalysisNeighborhoodPositions  pos(x);
+ +    gmx::AnalysisNeighborhoodPairSearch pairSearch = search->startPairSearch(pos);
+ +    gmx::AnalysisNeighborhoodPair       pair;
+ +    while (pairSearch.findNextPair(&pair))
+ +    {
+ +        const real r1 = exclusionDistances[pair.refIndex()];
+ +        const real r2 = exclusionDistances_insrt[pair.testIndex()];
+ +        if (pair.distance2() < gmx::square(r1 + r2))
+ +        {
+ +            if (removableAtoms.count(pair.refIndex()) == 0)
+ +            {
+ +                return false;
+ +            }
+ +            // TODO: If molecule information is available, this should ideally
+ +            // use it to remove whole molecules.
+ +            remover->markResidue(atoms, pair.refIndex(), true);
+ +        }
+ +    }
+ +    return true;
+ +}
+ +
+ +static void insert_mols(int nmol_insrt, int ntry, int seed,
+ +                        real defaultDistance, real scaleFactor,
+ +                        t_atoms *atoms, t_symtab *symtab, std::vector<RVec> *x,
+ +                        const std::set<int> &removableAtoms,
+ +                        const t_atoms &atoms_insrt, gmx::ArrayRef<RVec> x_insrt,
+ +                        int ePBC, matrix box,
+ +                        const std::string &posfn, const rvec deltaR,
+ +                        RotationType enum_rot)
+ +{
+ +    fprintf(stderr, "Initialising inter-atomic distances...\n");
+ +    AtomProperties          aps;
+ +    std::vector<real>       exclusionDistances(
+ +            makeExclusionDistances(atoms, &aps, defaultDistance, scaleFactor));
+ +    const std::vector<real> exclusionDistances_insrt(
+ +            makeExclusionDistances(&atoms_insrt, &aps, defaultDistance, scaleFactor));
+ +
+ +    const real       maxInsertRadius
+ +        = *std::max_element(exclusionDistances_insrt.begin(),
+ +                            exclusionDistances_insrt.end());
+ +    real             maxRadius = maxInsertRadius;
+ +    if (!exclusionDistances.empty())
+ +    {
+ +        const real maxExistingRadius
+ +            = *std::max_element(exclusionDistances.begin(),
+ +                                exclusionDistances.end());
+ +        maxRadius = std::max(maxInsertRadius, maxExistingRadius);
+ +    }
+ +
+ +    // TODO: Make all of this exception-safe.
+ +    gmx::AnalysisNeighborhood nb;
+ +    nb.setCutoff(maxInsertRadius + maxRadius);
+ +
+ +
+ +    if (seed == 0)
+ +    {
+ +        seed = static_cast<int>(gmx::makeRandomSeed());
+ +    }
+ +    fprintf(stderr, "Using random seed %d\n", seed);
+ +
+ +    gmx::DefaultRandomEngine rng(seed);
+ +
+ +    t_pbc                    pbc;
+ +    set_pbc(&pbc, ePBC, box);
+ +
+ +    /* With -ip, take nmol_insrt from file posfn */
+ +    double     **rpos              = nullptr;
+ +    const bool   insertAtPositions = !posfn.empty();
+ +    if (insertAtPositions)
+ +    {
+ +        int ncol;
+ +        nmol_insrt = read_xvg(posfn.c_str(), &rpos, &ncol);
+ +        if (ncol != 3)
+ +        {
+ +            gmx_fatal(FARGS, "Expected 3 columns (x/y/z coordinates) in file %s\n",
+ +                      posfn.c_str());
+ +        }
+ +        fprintf(stderr, "Read %d positions from file %s\n\n",
+ +                nmol_insrt, posfn.c_str());
+ +    }
+ +
+ +    gmx::AtomsBuilder builder(atoms, symtab);
+ +    gmx::AtomsRemover remover(*atoms);
+ +    {
+ +        const int finalAtomCount    = atoms->nr + nmol_insrt * atoms_insrt.nr;
+ +        const int finalResidueCount = atoms->nres + nmol_insrt * atoms_insrt.nres;
+ +        builder.reserve(finalAtomCount, finalResidueCount);
+ +        x->reserve(finalAtomCount);
+ +        exclusionDistances.reserve(finalAtomCount);
+ +    }
+ +
+ +    std::vector<RVec>                    x_n(x_insrt.size());
+ +
+ +    int                                  mol        = 0;
+ +    int                                  trial      = 0;
+ +    int                                  firstTrial = 0;
+ +    int                                  failed     = 0;
+ +    gmx::UniformRealDistribution<real>   dist;
+ +
+ +    while (mol < nmol_insrt && trial < ntry*nmol_insrt)
+ +    {
+ +        rvec offset_x;
+ +        if (!insertAtPositions)
+ +        {
+ +            // Insert at random positions.
+ +            offset_x[XX] = box[XX][XX] * dist(rng);
+ +            offset_x[YY] = box[YY][YY] * dist(rng);
+ +            offset_x[ZZ] = box[ZZ][ZZ] * dist(rng);
+ +        }
+ +        else
+ +        {
+ +            // Skip a position if ntry trials were not successful.
+ +            if (trial >= firstTrial + ntry)
+ +            {
+ +                fprintf(stderr, " skipped position (%.3f, %.3f, %.3f)\n",
+ +                        rpos[XX][mol], rpos[YY][mol], rpos[ZZ][mol]);
+ +                ++mol;
+ +                ++failed;
+ +                firstTrial = trial;
+ +                continue;
+ +            }
+ +            // Insert at positions taken from option -ip file.
+ +            offset_x[XX] = rpos[XX][mol] + deltaR[XX]*(2 * dist(rng)-1);
+ +            offset_x[YY] = rpos[YY][mol] + deltaR[YY]*(2 * dist(rng)-1);
+ +            offset_x[ZZ] = rpos[ZZ][mol] + deltaR[ZZ]*(2 * dist(rng)-1);
+ +        }
+ +        fprintf(stderr, "\rTry %d", ++trial);
+ +        fflush(stderr);
+ +
+ +        generate_trial_conf(x_insrt, offset_x, enum_rot, &rng, &x_n);
+ +        gmx::AnalysisNeighborhoodPositions pos(*x);
+ +        gmx::AnalysisNeighborhoodSearch    search = nb.initSearch(&pbc, pos);
+ +        if (isInsertionAllowed(&search, exclusionDistances, x_n, exclusionDistances_insrt,
+ +                               *atoms, removableAtoms, &remover))
+ +        {
+ +            x->insert(x->end(), x_n.begin(), x_n.end());
+ +            exclusionDistances.insert(exclusionDistances.end(),
+ +                                      exclusionDistances_insrt.begin(),
+ +                                      exclusionDistances_insrt.end());
+ +            builder.mergeAtoms(atoms_insrt);
+ +            ++mol;
+ +            firstTrial = trial;
+ +            fprintf(stderr, " success (now %d atoms)!\n", builder.currentAtomCount());
+ +        }
+ +    }
+ +
+ +    fprintf(stderr, "\n");
+ +    /* print number of molecules added */
+ +    fprintf(stderr, "Added %d molecules (out of %d requested)\n",
+ +            mol - failed, nmol_insrt);
+ +
+ +    const int originalAtomCount    = atoms->nr;
+ +    const int originalResidueCount = atoms->nres;
+ +    remover.refreshAtomCount(*atoms);
+ +    remover.removeMarkedElements(x);
+ +    remover.removeMarkedAtoms(atoms);
+ +    if (atoms->nr < originalAtomCount)
+ +    {
+ +        fprintf(stderr, "Replaced %d residues (%d atoms)\n",
+ +                originalResidueCount - atoms->nres,
+ +                originalAtomCount - atoms->nr);
+ +    }
+ +
+ +    if (rpos != nullptr)
+ +    {
+ +        for (int i = 0; i < DIM; ++i)
+ +        {
+ +            sfree(rpos[i]);
+ +        }
+ +        sfree(rpos);
+ +    }
+ +}
+ +
+ +namespace gmx
+ +{
+ +
+ +namespace
+ +{
+ +
+ +class InsertMolecules : public ICommandLineOptionsModule, public ITopologyProvider
+ +{
+ +    public:
+ +        InsertMolecules()
+ +            : bBox_(false), nmolIns_(0), nmolTry_(10), seed_(0),
+ +              defaultDistance_(0.105), scaleFactor_(0.57), enumRot_(en_rotXYZ)
+ +        {
+ +            clear_rvec(newBox_);
+ +            clear_rvec(deltaR_);
++            clear_mat(box_);
+ +        }
+ +
+ +        // From ITopologyProvider
-         TopologyInformation topInfo_;
++        gmx_mtop_t *getTopology(bool /*required*/) override { return &top_; }
+ +        int getAtomCount() override { return 0; }
+ +
+ +        // From ICommandLineOptionsModule
+ +        void init(CommandLineModuleSettings * /*settings*/) override
+ +        {
+ +        }
+ +        void initOptions(IOptionsContainer                 *options,
+ +                         ICommandLineOptionsModuleSettings *settings) override;
+ +        void optionsFinished() override;
+ +        int run() override;
+ +
+ +    private:
+ +        SelectionCollection selections_;
+ +
+ +        std::string         inputConfFile_;
+ +        std::string         insertConfFile_;
+ +        std::string         positionFile_;
+ +        std::string         outputConfFile_;
+ +        rvec                newBox_;
+ +        bool                bBox_;
+ +        int                 nmolIns_;
+ +        int                 nmolTry_;
+ +        int                 seed_;
+ +        real                defaultDistance_;
+ +        real                scaleFactor_;
+ +        rvec                deltaR_;
+ +        RotationType        enumRot_;
+ +        Selection           replaceSel_;
+ +
-         topInfo_.fillFromInputFile(inputConfFile_);
-         if (topInfo_.mtop()->natoms == 0)
++        gmx_mtop_t          top_;
++        std::vector<RVec>   x_;
++        matrix              box_;
++        int                 ePBC_;
+ +};
+ +
+ +void InsertMolecules::initOptions(IOptionsContainer                 *options,
+ +                                  ICommandLineOptionsModuleSettings *settings)
+ +{
+ +    const char *const desc[] = {
+ +        "[THISMODULE] inserts [TT]-nmol[tt] copies of the system specified in",
+ +        "the [TT]-ci[tt] input file. The insertions take place either into",
+ +        "vacant space in the solute conformation given with [TT]-f[tt], or",
+ +        "into an empty box given by [TT]-box[tt]. Specifying both [TT]-f[tt]",
+ +        "and [TT]-box[tt] behaves like [TT]-f[tt], but places a new box",
+ +        "around the solute before insertions. Any velocities present are",
+ +        "discarded.",
+ +        "",
+ +        "It is possible to also insert into a solvated configuration and",
+ +        "replace solvent atoms with the inserted atoms. To do this, use",
+ +        "[TT]-replace[tt] to specify a selection that identifies the atoms",
+ +        "that can be replaced. The tool assumes that all molecules in this",
+ +        "selection consist of single residues: each residue from this",
+ +        "selection that overlaps with the inserted molecules will be removed",
+ +        "instead of preventing insertion.",
+ +        "",
+ +        "By default, the insertion positions are random (with initial seed",
+ +        "specified by [TT]-seed[tt]). The program iterates until [TT]-nmol[tt]",
+ +        "molecules have been inserted in the box. Molecules are not inserted",
+ +        "where the distance between any existing atom and any atom of the",
+ +        "inserted molecule is less than the sum based on the van der Waals",
+ +        "radii of both atoms. A database ([TT]vdwradii.dat[tt]) of van der",
+ +        "Waals radii is read by the program, and the resulting radii scaled",
+ +        "by [TT]-scale[tt]. If radii are not found in the database, those",
+ +        "atoms are assigned the (pre-scaled) distance [TT]-radius[tt].",
+ +        "Note that the usefulness of those radii depends on the atom names,",
+ +        "and thus varies widely with force field.",
+ +        "",
+ +        "A total of [TT]-nmol[tt] * [TT]-try[tt] insertion attempts are made",
+ +        "before giving up. Increase [TT]-try[tt] if you have several small",
+ +        "holes to fill. Option [TT]-rot[tt] specifies whether the insertion",
+ +        "molecules are randomly oriented before insertion attempts.",
+ +        "",
+ +        "Alternatively, the molecules can be inserted only at positions defined in",
+ +        "positions.dat ([TT]-ip[tt]). That file should have 3 columns (x,y,z),",
+ +        "that give the displacements compared to the input molecule position",
+ +        "([TT]-ci[tt]). Hence, if that file should contain the absolute",
+ +        "positions, the molecule must be centered on (0,0,0) before using",
+ +        "[THISMODULE] (e.g. from [gmx-editconf] [TT]-center[tt]).",
+ +        "Comments in that file starting with # are ignored. Option [TT]-dr[tt]",
+ +        "defines the maximally allowed displacements during insertial trials.",
+ +        "[TT]-try[tt] and [TT]-rot[tt] work as in the default mode (see above)."
+ +    };
+ +
+ +    settings->setHelpText(desc);
+ +
+ +    std::shared_ptr<SelectionOptionBehavior> selectionOptionBehavior(
+ +            new SelectionOptionBehavior(&selections_, this));
+ +    settings->addOptionsBehavior(selectionOptionBehavior);
+ +
+ +    // TODO: Replace use of legacyType.
+ +    options->addOption(FileNameOption("f")
+ +                           .legacyType(efSTX).inputFile()
+ +                           .store(&inputConfFile_)
+ +                           .defaultBasename("protein")
+ +                           .description("Existing configuration to insert into"));
+ +    options->addOption(FileNameOption("ci")
+ +                           .legacyType(efSTX).inputFile().required()
+ +                           .store(&insertConfFile_)
+ +                           .defaultBasename("insert")
+ +                           .description("Configuration to insert"));
+ +    options->addOption(FileNameOption("ip")
+ +                           .filetype(eftGenericData).inputFile()
+ +                           .store(&positionFile_)
+ +                           .defaultBasename("positions")
+ +                           .description("Predefined insertion trial positions"));
+ +    options->addOption(FileNameOption("o")
+ +                           .legacyType(efSTO).outputFile().required()
+ +                           .store(&outputConfFile_)
+ +                           .defaultBasename("out")
+ +                           .description("Output configuration after insertion"));
+ +
+ +    options->addOption(SelectionOption("replace").onlyAtoms()
+ +                           .store(&replaceSel_)
+ +                           .description("Atoms that can be removed if overlapping"));
+ +    selectionOptionBehavior->initOptions(options);
+ +
+ +    options->addOption(RealOption("box").vector()
+ +                           .store(newBox_).storeIsSet(&bBox_)
+ +                           .description("Box size (in nm)"));
+ +    options->addOption(IntegerOption("nmol")
+ +                           .store(&nmolIns_)
+ +                           .description("Number of extra molecules to insert"));
+ +    options->addOption(IntegerOption("try")
+ +                           .store(&nmolTry_)
+ +                           .description("Try inserting [TT]-nmol[tt] times [TT]-try[tt] times"));
+ +    options->addOption(IntegerOption("seed")
+ +                           .store(&seed_)
+ +                           .description("Random generator seed (0 means generate)"));
+ +    options->addOption(RealOption("radius")
+ +                           .store(&defaultDistance_)
+ +                           .description("Default van der Waals distance"));
+ +    options->addOption(RealOption("scale")
+ +                           .store(&scaleFactor_)
+ +                           .description("Scale factor to multiply Van der Waals radii from the database in share/gromacs/top/vdwradii.dat. The default value of 0.57 yields density close to 1000 g/l for proteins in water."));
+ +    options->addOption(RealOption("dr").vector()
+ +                           .store(deltaR_)
+ +                           .description("Allowed displacement in x/y/z from positions in [TT]-ip[tt] file"));
+ +    options->addOption(EnumOption<RotationType>("rot").enumValue(cRotationEnum)
+ +                           .store(&enumRot_)
+ +                           .description("Rotate inserted molecules randomly"));
+ +}
+ +
+ +void InsertMolecules::optionsFinished()
+ +{
+ +    if (nmolIns_ <= 0 && positionFile_.empty())
+ +    {
+ +        GMX_THROW(InconsistentInputError("Either -nmol must be larger than 0, "
+ +                                         "or positions must be given with -ip."));
+ +    }
+ +    if (inputConfFile_.empty() && !bBox_)
+ +    {
+ +        GMX_THROW(InconsistentInputError("When no solute (-f) is specified, "
+ +                                         "a box size (-box) must be specified."));
+ +    }
+ +    if (replaceSel_.isValid() && inputConfFile_.empty())
+ +    {
+ +        GMX_THROW(InconsistentInputError("Replacement (-replace) only makes sense "
+ +                                         "together with an existing configuration (-f)."));
+ +    }
+ +
+ +    if (!inputConfFile_.empty())
+ +    {
++        bool  bTprFileWasRead;
++        rvec *temporaryX = nullptr;
+ +        fprintf(stderr, "Reading solute configuration\n");
-     const char       *outputTitle = topInfo_.name();
-     std::vector<RVec> xOutput;
-     matrix            box = {{ 0 }};
-     if (topInfo_.hasTopology())
-     {
-         xOutput = copyOf(topInfo_.x());
-         topInfo_.getBox(box);
-     }
-     else
-     {
-         xOutput.resize(topInfo_.mtop()->natoms);
-     }
-     auto          atomsSolute = topInfo_.copyAtoms();
++        readConfAndTopology(inputConfFile_.c_str(), &bTprFileWasRead, &top_,
++                            &ePBC_, &temporaryX, nullptr, box_);
++        x_.assign(temporaryX, temporaryX + top_.natoms);
++        sfree(temporaryX);
++        if (top_.natoms == 0)
+ +        {
+ +            fprintf(stderr, "Note: no atoms in %s\n", inputConfFile_.c_str());
+ +        }
+ +    }
+ +}
+ +
+ +int InsertMolecules::run()
+ +{
-         set_pbc(&pbc, topInfo_.ePBC(), box);
+ +    std::set<int> removableAtoms;
+ +    if (replaceSel_.isValid())
+ +    {
+ +        t_pbc       pbc;
-         fr->natoms = topInfo_.mtop()->natoms;
++        set_pbc(&pbc, ePBC_, box_);
+ +        t_trxframe *fr;
+ +        snew(fr, 1);
-         fr->x      = as_rvec_array(xOutput.data());
++        fr->natoms = x_.size();
+ +        fr->bX     = TRUE;
-     int ePBCForOutput = topInfo_.ePBC();
++        fr->x      = as_rvec_array(x_.data());
+ +        selections_.evaluate(fr, &pbc);
+ +        sfree(fr);
+ +        removableAtoms.insert(replaceSel_.atomIndices().begin(),
+ +                              replaceSel_.atomIndices().end());
+ +        // TODO: It could be nice to check that removableAtoms contains full
+ +        // residues, since we anyways remove whole residues instead of
+ +        // individual atoms.
+ +    }
+ +
-         clear_mat(box);
-         box[XX][XX] = newBox_[XX];
-         box[YY][YY] = newBox_[YY];
-         box[ZZ][ZZ] = newBox_[ZZ];
++    int ePBCForOutput = ePBC_;
+ +    if (bBox_)
+ +    {
+ +        ePBCForOutput = epbcXYZ;
-     if (det(box) == 0)
++        clear_mat(box_);
++        box_[XX][XX] = newBox_[XX];
++        box_[YY][YY] = newBox_[YY];
++        box_[ZZ][ZZ] = newBox_[ZZ];
+ +    }
-     fprintf(stderr, "Reading molecule configuration\n");
-     TopologyInformation      topInfoForInsertedMolecule;
-     topInfoForInsertedMolecule.fillFromInputFile(insertConfFile_);
-     auto                     atomsInserted = topInfoForInsertedMolecule.atoms();
-     std::vector<RVec>        xInserted     = copyOf(topInfoForInsertedMolecule.x());
- 
-     if (topInfoForInsertedMolecule.mtop()->natoms == 0)
-     {
-         gmx_fatal(FARGS, "No molecule in %s, please check your input",
-                   insertConfFile_.c_str());
-     }
-     if (outputTitle == nullptr)
-     {
-         outputTitle = topInfoForInsertedMolecule.name();
-     }
-     if (positionFile_.empty())
++    if (det(box_) == 0)
+ +    {
+ +        gmx_fatal(FARGS, "Undefined solute box.\nCreate one with gmx editconf "
+ +                  "or give explicit -box command line option");
+ +    }
+ +
-         center_molecule(xInserted);
++    gmx_mtop_t         topInserted;
++    t_atoms            atomsInserted;
++    std::vector<RVec>  xInserted;
+ +    {
-     auto              symtabInserted = duplicateSymtab(&topInfo_.mtop()->symtab);
-     const sfree_guard symtabInsertedGuard(symtabInserted);
++        bool        bTprFileWasRead;
++        int         ePBC_dummy;
++        matrix      box_dummy;
++        rvec       *temporaryX;
++        readConfAndTopology(insertConfFile_.c_str(), &bTprFileWasRead, &topInserted,
++                            &ePBC_dummy, &temporaryX, nullptr, box_dummy);
++        xInserted.assign(temporaryX, temporaryX + topInserted.natoms);
++        sfree(temporaryX);
++        atomsInserted = gmx_mtop_global_atoms(&topInserted);
++        if (atomsInserted.nr == 0)
++        {
++            gmx_fatal(FARGS, "No molecule in %s, please check your input",
++                      insertConfFile_.c_str());
++        }
++        if (top_.name == nullptr)
++        {
++            top_.name = topInserted.name;
++        }
++        if (positionFile_.empty())
++        {
++            center_molecule(xInserted);
++        }
+ +    }
+ +
-                 atomsSolute.get(), symtabInserted, &xOutput, removableAtoms, *atomsInserted, xInserted,
-                 ePBCForOutput, box, positionFile_, deltaR_, enumRot_);
++    t_atoms atoms = gmx_mtop_global_atoms(&top_);
++
+ +    /* add nmol_ins molecules of atoms_ins
+ +       in random orientation at random place */
+ +    insert_mols(nmolIns_, nmolTry_, seed_, defaultDistance_, scaleFactor_,
-     write_sto_conf(outputConfFile_.c_str(), outputTitle, atomsSolute.get(),
-                    as_rvec_array(xOutput.data()), nullptr, ePBCForOutput, box);
++                &atoms, &top_.symtab, &x_, removableAtoms, atomsInserted, xInserted,
++                ePBCForOutput, box_, positionFile_, deltaR_, enumRot_);
+ +
+ +    /* write new configuration to file confout */
+ +    fprintf(stderr, "Writing generated configuration to %s\n",
+ +            outputConfFile_.c_str());
-             atomsSolute->nr, atomsSolute->nres);
-     done_symtab(symtabInserted);
++    write_sto_conf(outputConfFile_.c_str(), *top_.name, &atoms,
++                   as_rvec_array(x_.data()), nullptr, ePBCForOutput, box_);
+ +
+ +    /* print size of generated configuration */
+ +    fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
++            atoms.nr, atoms.nres);
++
++    done_atom(&atoms);
++    done_atom(&atomsInserted);
++
+ +    return 0;
+ +}
+ +
+ +}   // namespace
+ +
+ +
+ +const char* InsertMoleculesInfo::name()
+ +{
+ +    static const char* name = "insert-molecules";
+ +    return name;
+ +}
+ +
+ +const char* InsertMoleculesInfo::shortDescription()
+ +{
+ +    static const char* shortDescription =
+ +        "Insert molecules into existing vacancies";
+ +    return shortDescription;
+ +}
+ +
+ +ICommandLineOptionsModulePointer InsertMoleculesInfo::create()
+ +{
+ +    return ICommandLineOptionsModulePointer(new InsertMolecules());
+ +}
+ +
+ +} // namespace gmx
diff --cc src/gromacs/gmxpreprocess/readir.cpp
Simple merge
diff --cc src/gromacs/gmxpreprocess/solvate.cpp

index 60fa35e97cd941905fa394c0208f3cd9db5ec6b3,c6e6eba38e77f5d2db51b47add28790f379c4b3e..bf44bd395e803a013bc8a71a4191f255e7942e83
--- 1/src/gromacs/gmxpreprocess/solvate.cpp
--- 2/src/gromacs/gmxpreprocess/solvate.cpp
+++ b/src/gromacs/gmxpreprocess/solvate.cpp
@@@ -638,11 -637,10 +637,10 @@@ static void removeExtraSolventMolecules
       remover.removeMarkedAtoms(atoms);
   }
   
- static void add_solv(const char *filename,
-                      const gmx::TopologyInformation &topInfo,
-                      t_atoms *atoms,
+ static void add_solv(const char *filename, t_atoms *atoms,
+                      t_symtab *symtab,
                        std::vector<RVec> *x, std::vector<RVec> *v,
-                      matrix box, AtomProperties *aps,
- -                     int ePBC, matrix box, gmx_atomprop_t aps,
++                     int ePBC, matrix box, AtomProperties *aps,
                        real defaultDistance, real scaleFactor,
                        real rshell, int max_sol)
   {
@@@ -741,14 -744,15 +744,19 @@@
           done_atom(newatoms);
           sfree(newatoms);
       }
+     if (atomsSolvent)
+     {
+         done_atom(atomsSolvent);
+         sfree(atomsSolvent);
+     }
   }
   
- -static void update_top(t_atoms *atoms, int firstSolventResidueIndex, matrix box, int NFILE, t_filenm fnm[],
- -                       gmx_atomprop_t aps)
+ +static void update_top(t_atoms        *atoms,
+ +                       int             firstSolventResidueIndex,
+ +                       matrix          box,
+ +                       int             NFILE,
+ +                       t_filenm        fnm[],
+ +                       AtomProperties *aps)
   {
       FILE        *fpin, *fpout;
       char         buf[STRLEN*2], buf2[STRLEN], *temp;
@@@ -971,11 -976,15 +979,15 @@@ int gmx_solvate(int argc, char *argv[]
                     "a box size (-box) must be specified");
       }
   
- -    aps = gmx_atomprop_init();
+ +    AtomProperties           aps;
   
-     gmx::TopologyInformation topInfo;
-     std::vector<RVec>        x, v;
-     matrix                   box = {{ 0 }};
+     /* solute configuration data */
+     gmx_mtop_t        top;
+     std::vector<RVec> x, v;
+     matrix            box  = {{ 0 }};
+     int               ePBC = -1;
+     t_atoms          *atoms;
+     snew(atoms, 1);
       if (bProt)
       {
           /* Generate a solute configuration */
@@@ -1019,8 -1032,8 +1035,8 @@@
                     "or give explicit -box command line option");
       }
   
-     add_solv(solventFileName, topInfo, atoms.get(), &x, &v, box,
+     add_solv(solventFileName, atoms, &top.symtab, &x, &v, ePBCForOutput, box,
- -             aps, defaultDistance, scaleFactor, r_shell, max_sol);
+ +             &aps, defaultDistance, scaleFactor, r_shell, max_sol);
   
       /* write new configuration 1 to file confout */
       confout = ftp2fn(efSTO, NFILE, fnm);
@@@ -1032,7 -1045,11 +1048,10 @@@
       /* print size of generated configuration */
       fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
               atoms->nr, atoms->nres);
-     update_top(atoms.get(), firstSolventResidueIndex, box, NFILE, fnm, &aps);
- -    update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, aps);
++    update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, &aps);
+ 
+     done_atom(atoms);
+     sfree(atoms);
- -    gmx_atomprop_destroy(aps);
       output_env_done(oenv);
   
       return 0;
diff --cc src/gromacs/mdlib/constr.cpp
Simple merge
diff --cc src/gromacs/mdlib/constr.h
Simple merge
diff --cc src/gromacs/mdlib/force.cpp
Simple merge
diff --cc src/gromacs/mdlib/sim_util.cpp

index 6dedbb795b4a78c9bf5664e3dc175846d0ef0b43,4b0992916ab6f12b19a59b5ead139840cf39b156..3d5578a664b07f0849877da60a1a25f0b68985c1
--- 1/src/gromacs/mdlib/sim_util.cpp
--- 2/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@@ -1202,8 -1290,8 +1202,8 @@@ static void do_force_cutsVERLET(FILE *f
           }
   
           /* launch local nonbonded work on GPU */
-         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- -        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
+ +        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                        step, nrnb, wcycle);
           wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
           wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@@ -1267,8 -1352,8 +1267,8 @@@
               wallcycle_start(wcycle, ewcLAUNCH_GPU);
   
               /* launch non-local nonbonded tasks on GPU */
-             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- -            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
+ +            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
               wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
   
               if (ppForceWorkload->haveGpuBondedWork)
diff --cc src/gromacs/mdrun/rerun.cpp
Simple merge
diff --cc src/gromacs/mdrun/runner.cpp
Simple merge
diff --cc src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh

index 34b932394b97f81c62fa1710c6529d8729c7902a,0000000000000000000000000000000000000000..758e5ec55d16c6ea3799dda3c8844016c03e6e5f

mode 100644,000000..100644
--- 1/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
--- /dev/null
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@@ -1,737 -1,0 +1,737 @@@
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +/*! \internal \file
+ + *  \brief
+ + *  Utility constant and function declaration for the CUDA non-bonded kernels.
+ + *  This header should be included once at the top level, just before the
+ + *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
+ + *
+ + *  \author Szilárd Páll <pall.szilard@gmail.com>
+ + *  \ingroup module_nbnxm
+ + */
+ +#include <assert.h>
+ +
+ +/* Note that floating-point constants in CUDA code should be suffixed
+ + * with f (e.g. 0.5f), to stop the compiler producing intermediate
+ + * code that is in double precision.
+ + */
+ +
+ +#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+ +#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+ +#include "gromacs/gpu_utils/vectype_ops.cuh"
+ +
+ +#include "nbnxm_cuda_types.h"
+ +
+ +#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
+ +#define NBNXM_CUDA_KERNEL_UTILS_CUH
+ +
+ +/*! \brief Log of the i and j cluster size.
+ + *  change this together with c_clSize !*/
+ +static const int __device__          c_clSizeLog2  = 3;
+ +/*! \brief Square of cluster size. */
+ +static const int __device__          c_clSizeSq    = c_clSize*c_clSize;
+ +/*! \brief j-cluster size after split (4 in the current implementation). */
+ +static const int __device__          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+ +/*! \brief Stride in the force accumualation buffer */
+ +static const int __device__          c_fbufStride  = c_clSizeSq;
+ +/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
+ +static const unsigned __device__     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+ +
+ +static const float __device__        c_oneSixth    = 0.16666667f;
+ +static const float __device__        c_oneTwelveth = 0.08333333f;
+ +
+ +
+ +/*! Convert LJ sigma,epsilon parameters to C6,C12. */
+ +static __forceinline__ __device__
+ +void convert_sigma_epsilon_to_c6_c12(const float  sigma,
+ +                                     const float  epsilon,
+ +                                     float       *c6,
+ +                                     float       *c12)
+ +{
+ +    float sigma2, sigma6;
+ +
+ +    sigma2 = sigma * sigma;
+ +    sigma6 = sigma2 *sigma2 * sigma2;
+ +    *c6    = epsilon * sigma6;
+ +    *c12   = *c6 * sigma6;
+ +}
+ +
+ +/*! Apply force switch,  force + energy version. */
+ +static __forceinline__ __device__
+ +void calculate_force_switch_F(const  cu_nbparam_t nbparam,
+ +                              float               c6,
+ +                              float               c12,
+ +                              float               inv_r,
+ +                              float               r2,
+ +                              float              *F_invr)
+ +{
+ +    float r, r_switch;
+ +
+ +    /* force switch constants */
+ +    float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ +    float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ +    float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ +    float repu_shift_V3 = nbparam.repulsion_shift.c3;
+ +
+ +    r         = r2 * inv_r;
+ +    r_switch  = r - nbparam.rvdw_switch;
+ +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+ +
+ +    *F_invr  +=
+ +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ +}
+ +
+ +/*! Apply force switch, force-only version. */
+ +static __forceinline__ __device__
+ +void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
+ +                                float               c6,
+ +                                float               c12,
+ +                                float               inv_r,
+ +                                float               r2,
+ +                                float              *F_invr,
+ +                                float              *E_lj)
+ +{
+ +    float r, r_switch;
+ +
+ +    /* force switch constants */
+ +    float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ +    float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ +    float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ +    float repu_shift_V3 = nbparam.repulsion_shift.c3;
+ +
+ +    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
+ +    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
+ +    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
+ +    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
+ +
+ +    r         = r2 * inv_r;
+ +    r_switch  = r - nbparam.rvdw_switch;
+ +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+ +
+ +    *F_invr  +=
+ +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ +    *E_lj    +=
+ +        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ +        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+ +}
+ +
+ +/*! Apply potential switch, force-only version. */
+ +static __forceinline__ __device__
+ +void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
+ +                                  float               inv_r,
+ +                                  float               r2,
+ +                                  float              *F_invr,
+ +                                  float              *E_lj)
+ +{
+ +    float r, r_switch;
+ +    float sw, dsw;
+ +
+ +    /* potential switch constants */
+ +    float switch_V3 = nbparam.vdw_switch.c3;
+ +    float switch_V4 = nbparam.vdw_switch.c4;
+ +    float switch_V5 = nbparam.vdw_switch.c5;
+ +    float switch_F2 = 3*nbparam.vdw_switch.c3;
+ +    float switch_F3 = 4*nbparam.vdw_switch.c4;
+ +    float switch_F4 = 5*nbparam.vdw_switch.c5;
+ +
+ +    r        = r2 * inv_r;
+ +    r_switch = r - nbparam.rvdw_switch;
+ +
+ +    /* Unlike in the F+E kernel, conditional is faster here */
+ +    if (r_switch > 0.0f)
+ +    {
+ +        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ +        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+ +
+ +        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ +    }
+ +}
+ +
+ +/*! Apply potential switch, force + energy version. */
+ +static __forceinline__ __device__
+ +void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
+ +                                    float               inv_r,
+ +                                    float               r2,
+ +                                    float              *F_invr,
+ +                                    float              *E_lj)
+ +{
+ +    float r, r_switch;
+ +    float sw, dsw;
+ +
+ +    /* potential switch constants */
+ +    float switch_V3 = nbparam.vdw_switch.c3;
+ +    float switch_V4 = nbparam.vdw_switch.c4;
+ +    float switch_V5 = nbparam.vdw_switch.c5;
+ +    float switch_F2 = 3*nbparam.vdw_switch.c3;
+ +    float switch_F3 = 4*nbparam.vdw_switch.c4;
+ +    float switch_F4 = 5*nbparam.vdw_switch.c5;
+ +
+ +    r        = r2 * inv_r;
+ +    r_switch = r - nbparam.rvdw_switch;
+ +    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ +
+ +    /* Unlike in the F-only kernel, masking is faster here */
+ +    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ +    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+ +
+ +    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ +    *E_lj   *= sw;
+ +}
+ +
+ +
+ +/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ + *
+ + *  Depending on what is supported, it fetches parameters either
+ + *  using direct load, texture objects, or texrefs.
+ + */
+ +static __forceinline__ __device__
+ +float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
+ +                                int                typei,
+ +                                int                typej)
+ +{
+ +#if DISABLE_CUDA_TEXTURES
+ +    return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
+ +#else
+ +    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
+ +#endif /* DISABLE_CUDA_TEXTURES */
+ +}
+ +
+ +
+ +/*! Calculate LJ-PME grid force contribution with
+ + *  geometric combination rule.
+ + */
+ +static __forceinline__ __device__
+ +void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
+ +                                    int                typei,
+ +                                    int                typej,
+ +                                    float              r2,
+ +                                    float              inv_r2,
+ +                                    float              lje_coeff2,
+ +                                    float              lje_coeff6_6,
+ +                                    float             *F_invr)
+ +{
+ +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ +
+ +    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
+ +
+ +    /* Recalculate inv_r6 without exclusion mask */
+ +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ +    cr2       = lje_coeff2*r2;
+ +    expmcr2   = expf(-cr2);
+ +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+ +
+ +    /* Subtract the grid force from the total LJ force */
+ +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+ +}
+ +
+ +
+ +/*! Calculate LJ-PME grid force + energy contribution with
+ + *  geometric combination rule.
+ + */
+ +static __forceinline__ __device__
+ +void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
+ +                                      int                typei,
+ +                                      int                typej,
+ +                                      float              r2,
+ +                                      float              inv_r2,
+ +                                      float              lje_coeff2,
+ +                                      float              lje_coeff6_6,
+ +                                      float              int_bit,
+ +                                      float             *F_invr,
+ +                                      float             *E_lj)
+ +{
+ +    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+ +
+ +    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
+ +
+ +    /* Recalculate inv_r6 without exclusion mask */
+ +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ +    cr2       = lje_coeff2*r2;
+ +    expmcr2   = expf(-cr2);
+ +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+ +
+ +    /* Subtract the grid force from the total LJ force */
+ +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+ +
+ +    /* Shift should be applied only to real LJ pairs */
+ +    sh_mask   = nbparam.sh_lj_ewald*int_bit;
+ +    *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ +}
+ +
+ +/*! Fetch per-type LJ parameters.
+ + *
+ + *  Depending on what is supported, it fetches parameters either
+ + *  using direct load, texture objects, or texrefs.
+ + */
+ +static __forceinline__ __device__
+ +float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
+ +                              int                type)
+ +{
+ +    float2 c6c12;
+ +#if DISABLE_CUDA_TEXTURES
+ +    /* Force an 8-byte fetch to save a memory instruction. */
+ +    float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
+ +    c6c12 = LDG(&nbfp_comb[type]);
+ +#else
+ +    /* NOTE: as we always do 8-byte aligned loads, we could
+ +       fetch float2 here too just as above. */
+ +    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
+ +    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
+ +#endif /* DISABLE_CUDA_TEXTURES */
+ +
+ +    return c6c12;
+ +}
+ +
+ +
+ +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
+ + *  Lorentz-Berthelot combination rule.
+ + *  We use a single F+E kernel with conditional because the performance impact
+ + *  of this is pretty small and LB on the CPU is anyway very slow.
+ + */
+ +static __forceinline__ __device__
+ +void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
+ +                                    int                typei,
+ +                                    int                typej,
+ +                                    float              r2,
+ +                                    float              inv_r2,
+ +                                    float              lje_coeff2,
+ +                                    float              lje_coeff6_6,
+ +                                    float              int_bit,
+ +                                    float             *F_invr,
+ +                                    float             *E_lj)
+ +{
+ +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ +    float sigma, sigma2, epsilon;
+ +
+ +    /* sigma and epsilon are scaled to give 6*C6 */
+ +    float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
+ +    float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
+ +
+ +    sigma   = c6c12_i.x + c6c12_j.x;
+ +    epsilon = c6c12_i.y * c6c12_j.y;
+ +
+ +    sigma2  = sigma*sigma;
+ +    c6grid  = epsilon*sigma2*sigma2*sigma2;
+ +
+ +    /* Recalculate inv_r6 without exclusion mask */
+ +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ +    cr2       = lje_coeff2*r2;
+ +    expmcr2   = expf(-cr2);
+ +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+ +
+ +    /* Subtract the grid force from the total LJ force */
+ +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+ +
+ +    if (E_lj != nullptr)
+ +    {
+ +        float sh_mask;
+ +
+ +        /* Shift should be applied only to real LJ pairs */
+ +        sh_mask   = nbparam.sh_lj_ewald*int_bit;
+ +        *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ +    }
+ +}
+ +
+ +
+ +/*! Fetch two consecutive values from the Ewald correction F*r table.
+ + *
+ + *  Depending on what is supported, it fetches parameters either
+ + *  using direct load, texture objects, or texrefs.
+ + */
+ +static __forceinline__ __device__
+ +float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
+ +                             int                index)
+ +{
+ +    float2 d;
+ +
+ +#if DISABLE_CUDA_TEXTURES
+ +    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
+ +    d.x = LDG(&nbparam.coulomb_tab[index]);
+ +    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
+ +#else
+ +    d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+ +    d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+ +#endif // DISABLE_CUDA_TEXTURES
+ +
+ +    return d;
+ +}
+ +
+ +/*! Linear interpolation using exactly two FMA operations.
+ + *
+ + *  Implements numeric equivalent of: (1-t)*d0 + t*d1
+ + *  Note that CUDA does not have fnms, otherwise we'd use
+ + *  fma(t, d1, fnms(t, d0, d0)
+ + *  but input modifiers are designed for this and are fast.
+ + */
+ +template <typename T>
+ +__forceinline__ __host__ __device__
+ +T lerp(T d0, T d1, T t)
+ +{
+ +    return fma(t, d1, fma(-t, d0, d0));
+ +}
+ +
+ +/*! Interpolate Ewald coulomb force correction using the F*r table.
+ + */
+ +static __forceinline__ __device__
+ +float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
+ +                                  float              r)
+ +{
+ +    float  normalized = nbparam.coulomb_tab_scale * r;
+ +    int    index      = (int) normalized;
+ +    float  fraction   = normalized - index;
+ +
+ +    float2 d01 = fetch_coulomb_force_r(nbparam, index);
+ +
+ +    return lerp(d01.x, d01.y, fraction);
+ +}
+ +
+ +/*! Fetch C6 and C12 from the parameter table.
+ + *
+ + *  Depending on what is supported, it fetches parameters either
+ + *  using direct load, texture objects, or texrefs.
+ + */
+ +static __forceinline__ __device__
+ +void fetch_nbfp_c6_c12(float               &c6,
+ +                       float               &c12,
+ +                       const cu_nbparam_t   nbparam,
+ +                       int                  baseIndex)
+ +{
+ +#if DISABLE_CUDA_TEXTURES
+ +    /* Force an 8-byte fetch to save a memory instruction. */
+ +    float2 *nbfp = (float2 *)nbparam.nbfp;
+ +    float2  c6c12;
+ +    c6c12 = LDG(&nbfp[baseIndex]);
+ +    c6    = c6c12.x;
+ +    c12   = c6c12.y;
+ +#else
+ +    /* NOTE: as we always do 8-byte aligned loads, we could
+ +       fetch float2 here too just as above. */
+ +    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
+ +    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
+ +#endif // DISABLE_CUDA_TEXTURES
+ +}
+ +
+ +
+ +/*! Calculate analytical Ewald correction term. */
+ +static __forceinline__ __device__
+ +float pmecorrF(float z2)
+ +{
+ +    const float FN6 = -1.7357322914161492954e-8f;
+ +    const float FN5 = 1.4703624142580877519e-6f;
+ +    const float FN4 = -0.000053401640219807709149f;
+ +    const float FN3 = 0.0010054721316683106153f;
+ +    const float FN2 = -0.019278317264888380590f;
+ +    const float FN1 = 0.069670166153766424023f;
+ +    const float FN0 = -0.75225204789749321333f;
+ +
+ +    const float FD4 = 0.0011193462567257629232f;
+ +    const float FD3 = 0.014866955030185295499f;
+ +    const float FD2 = 0.11583842382862377919f;
+ +    const float FD1 = 0.50736591960530292870f;
+ +    const float FD0 = 1.0f;
+ +
+ +    float       z4;
+ +    float       polyFN0, polyFN1, polyFD0, polyFD1;
+ +
+ +    z4          = z2*z2;
+ +
+ +    polyFD0     = FD4*z4 + FD2;
+ +    polyFD1     = FD3*z4 + FD1;
+ +    polyFD0     = polyFD0*z4 + FD0;
+ +    polyFD0     = polyFD1*z2 + polyFD0;
+ +
+ +    polyFD0     = 1.0f/polyFD0;
+ +
+ +    polyFN0     = FN6*z4 + FN4;
+ +    polyFN1     = FN5*z4 + FN3;
+ +    polyFN0     = polyFN0*z4 + FN2;
+ +    polyFN1     = polyFN1*z4 + FN1;
+ +    polyFN0     = polyFN0*z4 + FN0;
+ +    polyFN0     = polyFN1*z2 + polyFN0;
+ +
+ +    return polyFN0*polyFD0;
+ +}
+ +
+ +/*! Final j-force reduction; this generic implementation works with
+ + *  arbitrary array sizes.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_force_j_generic(float *f_buf, float3 *fout,
+ +                            int tidxi, int tidxj, int aidx)
+ +{
+ +    if (tidxi < 3)
+ +    {
+ +        float f = 0.0f;
+ +        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
+ +        {
+ +            f += f_buf[c_fbufStride * tidxi + j];
+ +        }
+ +
+ +        atomicAdd((&fout[aidx].x)+tidxi, f);
+ +    }
+ +}
+ +
+ +/*! Final j-force reduction; this implementation only with power of two
+ + *  array sizes.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_force_j_warp_shfl(float3 f, float3 *fout,
+ +                              int tidxi, int aidx,
+ +                              const unsigned int activemask)
+ +{
+ +    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+ +    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
+ +    f.z += gmx_shfl_down_sync(activemask, f.z, 1);
+ +
+ +    if (tidxi & 1)
+ +    {
+ +        f.x = f.y;
+ +    }
+ +
+ +    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
+ +    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
+ +
+ +    if (tidxi & 2)
+ +    {
+ +        f.x = f.z;
+ +    }
+ +
+ +    f.x += gmx_shfl_down_sync(activemask, f.x, 4);
+ +
+ +    if (tidxi < 3)
+ +    {
+ +        atomicAdd((&fout[aidx].x) + tidxi, f.x);
+ +    }
+ +}
+ +
+ +/*! Final i-force reduction; this generic implementation works with
+ + *  arbitrary array sizes.
+ + * TODO: add the tidxi < 3 trick
+ + */
+ +static __forceinline__ __device__
+ +void reduce_force_i_generic(float *f_buf, float3 *fout,
+ +                            float *fshift_buf, bool bCalcFshift,
+ +                            int tidxi, int tidxj, int aidx)
+ +{
+ +    if (tidxj < 3)
+ +    {
+ +        float f = 0.0f;
+ +        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
+ +        {
+ +            f += f_buf[tidxj * c_fbufStride + j];
+ +        }
+ +
+ +        atomicAdd(&fout[aidx].x + tidxj, f);
+ +
+ +        if (bCalcFshift)
+ +        {
+ +            *fshift_buf += f;
+ +        }
+ +    }
+ +}
+ +
+ +/*! Final i-force reduction; this implementation works only with power of two
+ + *  array sizes.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
+ +                         float *fshift_buf, bool bCalcFshift,
+ +                         int tidxi, int tidxj, int aidx)
+ +{
+ +    int     i, j;
+ +    float   f;
+ +
+ +    assert(c_clSize == 1 << c_clSizeLog2);
+ +
+ +    /* Reduce the initial c_clSize values for each i atom to half
+ +     * every step by using c_clSize * i threads.
+ +     * Can't just use i as loop variable because than nvcc refuses to unroll.
+ +     */
+ +    i = c_clSize/2;
+ +#pragma unroll 5
+ +    for (j = c_clSizeLog2 - 1; j > 0; j--)
+ +    {
+ +        if (tidxj < i)
+ +        {
+ +
+ +            f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
+ +            f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
+ +            f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
+ +        }
+ +        i >>= 1;
+ +    }
+ +
+ +    /* i == 1, last reduction step, writing to global mem */
+ +    if (tidxj < 3)
+ +    {
+ +        /* tidxj*c_fbufStride selects x, y or z */
+ +        f = f_buf[tidxj * c_fbufStride               + tidxi] +
+ +            f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
+ +
+ +        atomicAdd(&(fout[aidx].x) + tidxj, f);
+ +
+ +        if (bCalcFshift)
+ +        {
+ +            *fshift_buf += f;
+ +        }
+ +    }
+ +
+ +}
+ +
+ +/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
+ + *  on whether the size of the array to be reduced is power of two or not.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_force_i(float *f_buf, float3 *f,
+ +                    float *fshift_buf, bool bCalcFshift,
+ +                    int tidxi, int tidxj, int ai)
+ +{
+ +    if ((c_clSize & (c_clSize - 1)))
+ +    {
+ +        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ +    }
+ +    else
+ +    {
+ +        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+ +    }
+ +}
+ +
+ +/*! Final i-force reduction; this implementation works only with power of two
+ + *  array sizes.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
+ +                              float *fshift_buf, bool bCalcFshift,
+ +                              int tidxj, int aidx,
+ +                              const unsigned int activemask)
+ +{
+ +    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
+ +    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
+ +    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
+ +
+ +    if (tidxj & 1)
+ +    {
+ +        fin.x = fin.y;
+ +    }
+ +
+ +    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
+ +    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
+ +
+ +    if (tidxj & 2)
+ +    {
+ +        fin.x = fin.z;
+ +    }
+ +
+ +    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
+ +    if ((tidxj & 3) < 3)
+ +    {
+ +        atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
+ +
+ +        if (bCalcFshift)
+ +        {
+ +            *fshift_buf += fin.x;
+ +        }
+ +    }
+ +}
+ +
+ +/*! Energy reduction; this implementation works only with power of two
+ + *  array sizes.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_energy_pow2(volatile float *buf,
+ +                        float *e_lj, float *e_el,
+ +                        unsigned int tidx)
+ +{
+ +    float        e1, e2;
+ +
+ +    unsigned int i = warp_size/2;
+ +
+ +    /* Can't just use i as loop variable because than nvcc refuses to unroll. */
+ +#pragma unroll 10
+ +    for (int j = warp_size_log2 - 1; j > 0; j--)
+ +    {
+ +        if (tidx < i)
+ +        {
+ +            buf[               tidx] += buf[               tidx + i];
+ +            buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
+ +        }
+ +        i >>= 1;
+ +    }
+ +
+ +    // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
+ +    // memory locations to make sense
+ +    /* last reduction step, writing to global mem */
+ +    if (tidx == 0)
+ +    {
+ +        e1 = buf[               tidx] + buf[               tidx + i];
+ +        e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
+ +
+ +        atomicAdd(e_lj, e1);
+ +        atomicAdd(e_el, e2);
+ +    }
+ +}
+ +
+ +/*! Energy reduction; this implementation works only with power of two
+ + *  array sizes.
+ + */
+ +static __forceinline__ __device__
+ +void reduce_energy_warp_shfl(float E_lj, float E_el,
+ +                             float *e_lj, float *e_el,
+ +                             int tidx,
+ +                             const unsigned int activemask)
+ +{
+ +    int i, sh;
+ +
+ +    sh = 1;
+ +#pragma unroll 5
+ +    for (i = 0; i < 5; i++)
+ +    {
+ +        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
+ +        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
+ +        sh   += sh;
+ +    }
+ +
+ +    /* The first thread in the warp writes the reduced energies */
+ +    if (tidx == 0 || tidx == warp_size)
+ +    {
+ +        atomicAdd(e_lj, E_lj);
+ +        atomicAdd(e_el, E_el);
+ +    }
+ +}
+ +
+ +#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
diff --cc src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh

index 74fcc398ac8080973c369dd0727da5c987ff5fbb,0000000000000000000000000000000000000000..c63c10d9571919fe14c3f15d12c9990d4eb6f1a3

mode 100644,000000..100644
--- 1/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
--- /dev/null
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
@@@ -1,913 -1,0 +1,913 @@@
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +
+ +#define GMX_DOUBLE 0
+ +
+ +#include "gromacs/gpu_utils/device_utils.clh"
+ +#include "gromacs/gpu_utils/vectype_ops.clh"
+ +#include "gromacs/nbnxm/constants.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +
+ +#include "nbnxm_ocl_consts.h"
+ +
+ +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
+ +#define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
+ +
+ +#define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
+ +
+ +#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+ +/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
+ + * Note that this should precede the kernel_utils include.
+ + */
+ +#define USE_CJ_PREFETCH 1
+ +#else
+ +#define USE_CJ_PREFETCH 0
+ +#endif
+ +
+ +#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
+ +#define HAVE_SUBGROUP 1
+ +#else
+ +#define HAVE_SUBGROUP 0
+ +#endif
+ +
+ +#ifdef cl_intel_subgroups
+ +#define HAVE_INTEL_SUBGROUP 1
+ +#else
+ +#define HAVE_INTEL_SUBGROUP 0
+ +#endif
+ +
+ +#if defined _INTEL_SOURCE_
+ +#define SUBGROUP_SIZE 8
+ +#elif defined _AMD_SOURCE_
+ +#define SUBGROUP_SIZE 64
+ +#else
+ +#define SUBGROUP_SIZE 32
+ +#endif
+ +
+ +#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
+ +#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
+ +#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
+ +
+ +/* 1.0 / sqrt(M_PI) */
+ +#define M_FLOAT_1_SQRTPI 0.564189583547756f
+ +
+ +//-------------------
+ +
+ +#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
+ +#define NBNXN_OPENCL_KERNEL_UTILS_CLH
+ +
+ +#if CL_SIZE == 8
+ +#define WARP_SIZE_LOG2  (5)
+ +#define CL_SIZE_LOG2    (3)
+ +#elif CL_SIZE == 4
+ +#define WARP_SIZE_LOG2  (3)
+ +#define CL_SIZE_LOG2    (2)
+ +#else
+ +#error unsupported CL_SIZE
+ +#endif
+ +
+ +#define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
+ +#define FBUF_STRIDE     (CL_SIZE_SQ)
+ +
+ +#define ONE_SIXTH_F     0.16666667f
+ +#define ONE_TWELVETH_F  0.08333333f
+ +
+ +
+ +#ifdef __GNUC__
+ +/* GCC, clang, and some ICC pretending to be GCC */
+ +#  define gmx_unused __attribute__ ((unused))
+ +#else
+ +#  define gmx_unused
+ +#endif
+ +
+ +// Data structures shared between OpenCL device code and OpenCL host code
+ +// TODO: review, improve
+ +// Replaced real by float for now, to avoid including any other header
+ +typedef struct {
+ +    float c2;
+ +    float c3;
+ +    float cpot;
+ +} shift_consts_t;
+ +
+ +/* Used with potential switching:
+ + * rsw        = max(r - r_switch, 0)
+ + * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
+ + * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
+ + * force      = force*dsw - potential*sw
+ + * potential *= sw
+ + */
+ +typedef struct {
+ +    float c3;
+ +    float c4;
+ +    float c5;
+ +} switch_consts_t;
+ +
+ +// Data structure shared between the OpenCL device code and OpenCL host code
+ +// Must not contain OpenCL objects (buffers)
+ +typedef struct cl_nbparam_params
+ +{
+ +
+ +    int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
+ +    int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
+ +
+ +    float           epsfac;           /**< charge multiplication factor                      */
+ +    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
+ +    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
+ +    float           ewald_beta;       /**< Ewald/PME parameter                               */
+ +    float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
+ +    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
+ +    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
+ +
+ +    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
+ +
+ +    float           rvdw_sq;          /**< VdW cut-off squared                               */
+ +    float           rvdw_switch;      /**< VdW switched cut-off                              */
+ +    float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
+ +    float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
+ +
+ +    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
+ +    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
+ +    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
+ +
+ +    /* Ewald Coulomb force table data - accessed through texture memory */
+ +    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
+ +}cl_nbparam_params_t;
+ +
+ +typedef struct {
+ +    int sci;            /* i-super-cluster       */
+ +    int shift;          /* Shift vector index plus possible flags */
+ +    int cj4_ind_start;  /* Start index into cj4  */
+ +    int cj4_ind_end;    /* End index into cj4    */
+ +} nbnxn_sci_t;
+ +
+ +typedef struct {
+ +    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
+ +    int          excl_ind; /* Index into the exclusion array for 1 warp   */
+ +} nbnxn_im_ei_t;
+ +
+ +typedef struct {
+ +    int           cj[4];   /* The 4 j-clusters                            */
+ +    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
+ +} nbnxn_cj4_t;
+ +
+ +
+ +typedef struct {
+ +    unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
+ +                                           * each unsigned has bitS for 4*8 i clusters
+ +                                           */
+ +} nbnxn_excl_t;
+ +
+ +/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+ +__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+ +
+ +gmx_opencl_inline
+ +void preloadCj4Generic(__local int        *sm_cjPreload,
+ +                       const __global int *gm_cj,
+ +                       int                 tidxi,
+ +                       int                 tidxj,
+ +                       bool gmx_unused     iMaskCond)
+ +{
+ +    /* Pre-load cj into shared memory */
+ +#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+ +    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
+ +    {
+ +        sm_cjPreload[tidxi] = gm_cj[tidxi];
+ +    }
+ +#else
+ +    const int c_clSize                   = CL_SIZE;
+ +    const int c_nbnxnGpuClusterpairSplit = 2;
+ +    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
+ +    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
+ +    {
+ +        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
+ +    }
+ +#endif
+ +}
+ +
+ +
+ +#if USE_SUBGROUP_PRELOAD
+ +gmx_opencl_inline
+ +int  preloadCj4Subgroup(const __global int *gm_cj)
+ +{
+ +    //loads subgroup-size # of elements (8) instead of the 4 required
+ +    //equivalent to *cjs = *gm_cj
+ +    return intel_sub_group_block_read((const __global uint *)gm_cj);
+ +}
+ +#endif //USE_SUBGROUP_PRELOAD
+ +
+ +#if USE_SUBGROUP_PRELOAD
+ +typedef size_t CjType;
+ +#else
+ +typedef __local int* CjType;
+ +#endif
+ +
+ +/*! \brief Preload cj4
+ + *
+ + * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
+ + * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
+ + * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
+ + *
+ + * It is the caller's responsibility to make sure that data is consumed only when
+ + * it's ready. This function does not call a barrier.
+ + */
+ +gmx_opencl_inline
+ +void preloadCj4(CjType gmx_unused             *cjs,
+ +                const __global int gmx_unused *gm_cj,
+ +                int gmx_unused                 tidxi,
+ +                int gmx_unused                 tidxj,
+ +                bool gmx_unused                iMaskCond)
+ +{
+ +#if USE_SUBGROUP_PRELOAD
+ +    *cjs = preloadCj4Subgroup(gm_cj);
+ +#elif USE_CJ_PREFETCH
+ +    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
+ +#else
+ +    //nothing to do
+ +#endif
+ +}
+ +
+ +gmx_opencl_inline
+ +int loadCjPreload(__local int           *        sm_cjPreload,
+ +                  int                            jm,
+ +                  int gmx_unused                 tidxi,
+ +                  int gmx_unused                 tidxj)
+ +{
+ +#if defined _AMD_SOURCE_
+ +    int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
+ +#else
+ +    const int c_clSize                   = CL_SIZE;
+ +    const int c_nbnxnGpuClusterpairSplit = 2;
+ +    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
+ +    int       warpLoadOffset             = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
+ +#endif
+ +    return sm_cjPreload[jm + warpLoadOffset];
+ +}
+ +
+ +/* \brief Load a cj given a jm index.
+ + *
+ + * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
+ + */
+ +gmx_opencl_inline
+ +int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
+ +           int jm, int gmx_unused tidxi, int gmx_unused tidxj)
+ +{
+ +#if USE_SUBGROUP_PRELOAD
+ +    return sub_group_broadcast(cjs, jm);
+ +#elif USE_CJ_PREFETCH
+ +    return loadCjPreload(cjs, jm, tidxi, tidxj);
+ +#else
+ +    return gm_cj[jm];
+ +#endif
+ +}
+ +
+ +/*! Convert LJ sigma,epsilon parameters to C6,C12. */
+ +gmx_opencl_inline
+ +void convert_sigma_epsilon_to_c6_c12(const float  sigma,
+ +                                     const float  epsilon,
+ +                                     float       *c6,
+ +                                     float       *c12)
+ +{
+ +    float sigma2, sigma6;
+ +
+ +    sigma2 = sigma * sigma;
+ +    sigma6 = sigma2 *sigma2 * sigma2;
+ +    *c6    = epsilon * sigma6;
+ +    *c12   = *c6 * sigma6;
+ +}
+ +
+ +
+ +/*! Apply force switch,  force + energy version. */
+ +gmx_opencl_inline
+ +void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
+ +                              float                      c6,
+ +                              float                      c12,
+ +                              float                      inv_r,
+ +                              float                      r2,
+ +                              float                     *F_invr)
+ +{
+ +    float r, r_switch;
+ +
+ +    /* force switch constants */
+ +    float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ +    float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ +    float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ +    float repu_shift_V3 = nbparam->repulsion_shift.c3;
+ +
+ +    r         = r2 * inv_r;
+ +    r_switch  = r - nbparam->rvdw_switch;
+ +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+ +
+ +    *F_invr  +=
+ +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ +}
+ +
+ +/*! Apply force switch, force-only version. */
+ +gmx_opencl_inline
+ +void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
+ +                                float                      c6,
+ +                                float                      c12,
+ +                                float                      inv_r,
+ +                                float                      r2,
+ +                                float                     *F_invr,
+ +                                float                     *E_lj)
+ +{
+ +    float r, r_switch;
+ +
+ +    /* force switch constants */
+ +    float disp_shift_V2 = nbparam->dispersion_shift.c2;
+ +    float disp_shift_V3 = nbparam->dispersion_shift.c3;
+ +    float repu_shift_V2 = nbparam->repulsion_shift.c2;
+ +    float repu_shift_V3 = nbparam->repulsion_shift.c3;
+ +
+ +    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
+ +    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
+ +    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
+ +    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
+ +
+ +    r         = r2 * inv_r;
+ +    r_switch  = r - nbparam->rvdw_switch;
+ +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+ +
+ +    *F_invr  +=
+ +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ +    *E_lj    +=
+ +        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ +        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+ +}
+ +
+ +/*! Apply potential switch, force-only version. */
+ +gmx_opencl_inline
+ +void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
+ +                                  float                      inv_r,
+ +                                  float                      r2,
+ +                                  float                     *F_invr,
+ +                                  const float               *E_lj)
+ +{
+ +    float r, r_switch;
+ +    float sw, dsw;
+ +
+ +    /* potential switch constants */
+ +    float switch_V3 = nbparam->vdw_switch.c3;
+ +    float switch_V4 = nbparam->vdw_switch.c4;
+ +    float switch_V5 = nbparam->vdw_switch.c5;
+ +    float switch_F2 = nbparam->vdw_switch.c3;
+ +    float switch_F3 = nbparam->vdw_switch.c4;
+ +    float switch_F4 = nbparam->vdw_switch.c5;
+ +
+ +    r        = r2 * inv_r;
+ +    r_switch = r - nbparam->rvdw_switch;
+ +
+ +    /* Unlike in the F+E kernel, conditional is faster here */
+ +    if (r_switch > 0.0f)
+ +    {
+ +        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ +        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+ +
+ +        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ +    }
+ +}
+ +
+ +/*! Apply potential switch, force + energy version. */
+ +gmx_opencl_inline
+ +void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
+ +                                    float                      inv_r,
+ +                                    float                      r2,
+ +                                    float                     *F_invr,
+ +                                    float                     *E_lj)
+ +{
+ +    float r, r_switch;
+ +    float sw, dsw;
+ +
+ +    /* potential switch constants */
+ +    float switch_V3 = nbparam->vdw_switch.c3;
+ +    float switch_V4 = nbparam->vdw_switch.c4;
+ +    float switch_V5 = nbparam->vdw_switch.c5;
+ +    float switch_F2 = nbparam->vdw_switch.c3;
+ +    float switch_F3 = nbparam->vdw_switch.c4;
+ +    float switch_F4 = nbparam->vdw_switch.c5;
+ +
+ +    r        = r2 * inv_r;
+ +    r_switch = r - nbparam->rvdw_switch;
+ +    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+ +
+ +    /* Unlike in the F-only kernel, masking is faster here */
+ +    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ +    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+ +
+ +    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ +    *E_lj   *= sw;
+ +}
+ +
+ +/*! Calculate LJ-PME grid force contribution with
+ + *  geometric combination rule.
+ + */
+ +gmx_opencl_inline
+ +void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
+ +                                    int                     typei,
+ +                                    int                     typej,
+ +                                    float                   r2,
+ +                                    float                   inv_r2,
+ +                                    float                   lje_coeff2,
+ +                                    float                   lje_coeff6_6,
+ +                                    float                  *F_invr)
+ +{
+ +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ +
+ +    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+ +
+ +    /* Recalculate inv_r6 without exclusion mask */
+ +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ +    cr2       = lje_coeff2*r2;
+ +    expmcr2   = exp(-cr2);
+ +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+ +
+ +    /* Subtract the grid force from the total LJ force */
+ +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+ +}
+ +
+ +/*! Calculate LJ-PME grid force + energy contribution with
+ + *  geometric combination rule.
+ + */
+ +gmx_opencl_inline
+ +void calculate_lj_ewald_comb_geom_F_E(__constant const float    *nbfp_comb_climg2d,
+ +                                      const cl_nbparam_params_t *nbparam,
+ +                                      int                        typei,
+ +                                      int                        typej,
+ +                                      float                      r2,
+ +                                      float                      inv_r2,
+ +                                      float                      lje_coeff2,
+ +                                      float                      lje_coeff6_6,
+ +                                      float                      int_bit,
+ +                                      float                     *F_invr,
+ +                                      float                     *E_lj)
+ +{
+ +    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+ +
+ +    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
+ +
+ +    /* Recalculate inv_r6 without exclusion mask */
+ +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ +    cr2       = lje_coeff2*r2;
+ +    expmcr2   = exp(-cr2);
+ +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+ +
+ +    /* Subtract the grid force from the total LJ force */
+ +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+ +
+ +    /* Shift should be applied only to real LJ pairs */
+ +    sh_mask   = nbparam->sh_lj_ewald*int_bit;
+ +    *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ +}
+ +
+ +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
+ + *  Lorentz-Berthelot combination rule.
+ + *  We use a single F+E kernel with conditional because the performance impact
+ + *  of this is pretty small and LB on the CPU is anyway very slow.
+ + */
+ +gmx_opencl_inline
+ +void calculate_lj_ewald_comb_LB_F_E(__constant const float    *nbfp_comb_climg2d,
+ +                                    const cl_nbparam_params_t *nbparam,
+ +                                    int                        typei,
+ +                                    int                        typej,
+ +                                    float                      r2,
+ +                                    float                      inv_r2,
+ +                                    float                      lje_coeff2,
+ +                                    float                      lje_coeff6_6,
+ +                                    float                      int_bit,
+ +                                    bool                       with_E_lj,
+ +                                    float                     *F_invr,
+ +                                    float                     *E_lj)
+ +{
+ +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ +    float sigma, sigma2, epsilon;
+ +
+ +    /* sigma and epsilon are scaled to give 6*C6 */
+ +    sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
+ +
+ +    epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
+ +
+ +    sigma2  = sigma*sigma;
+ +    c6grid  = epsilon*sigma2*sigma2*sigma2;
+ +
+ +    /* Recalculate inv_r6 without exclusion mask */
+ +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ +    cr2       = lje_coeff2*r2;
+ +    expmcr2   = exp(-cr2);
+ +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
+ +
+ +    /* Subtract the grid force from the total LJ force */
+ +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+ +
+ +    if (with_E_lj)
+ +    {
+ +        float sh_mask;
+ +
+ +        /* Shift should be applied only to real LJ pairs */
+ +        sh_mask   = nbparam->sh_lj_ewald*int_bit;
+ +        *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ +    }
+ +}
+ +
+ +/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
+ + *  Original idea: from the OpenMM project
+ + */
+ +gmx_opencl_inline float
+ +interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
+ +                            float                   r,
+ +                            float                   scale)
+ +{
+ +    float   normalized = scale * r;
+ +    int     index      = (int) normalized;
+ +    float   fract2     = normalized - index;
+ +    float   fract1     = 1.0f - fract2;
+ +
+ +    return fract1*coulomb_tab_climg2d[index] +
+ +           fract2*coulomb_tab_climg2d[index + 1];
+ +}
+ +
+ +/*! Calculate analytical Ewald correction term. */
+ +gmx_opencl_inline
+ +float pmecorrF(float z2)
+ +{
+ +    const float FN6 = -1.7357322914161492954e-8f;
+ +    const float FN5 = 1.4703624142580877519e-6f;
+ +    const float FN4 = -0.000053401640219807709149f;
+ +    const float FN3 = 0.0010054721316683106153f;
+ +    const float FN2 = -0.019278317264888380590f;
+ +    const float FN1 = 0.069670166153766424023f;
+ +    const float FN0 = -0.75225204789749321333f;
+ +
+ +    const float FD4 = 0.0011193462567257629232f;
+ +    const float FD3 = 0.014866955030185295499f;
+ +    const float FD2 = 0.11583842382862377919f;
+ +    const float FD1 = 0.50736591960530292870f;
+ +    const float FD0 = 1.0f;
+ +
+ +    float       z4;
+ +    float       polyFN0, polyFN1, polyFD0, polyFD1;
+ +
+ +    z4          = z2*z2;
+ +
+ +    polyFD0     = FD4*z4 + FD2;
+ +    polyFD1     = FD3*z4 + FD1;
+ +    polyFD0     = polyFD0*z4 + FD0;
+ +    polyFD0     = polyFD1*z2 + polyFD0;
+ +
+ +    polyFD0     = 1.0f/polyFD0;
+ +
+ +    polyFN0     = FN6*z4 + FN4;
+ +    polyFN1     = FN5*z4 + FN3;
+ +    polyFN0     = polyFN0*z4 + FN2;
+ +    polyFN1     = polyFN1*z4 + FN1;
+ +    polyFN0     = polyFN0*z4 + FN0;
+ +    polyFN0     = polyFN1*z2 + polyFN0;
+ +
+ +    return polyFN0*polyFD0;
+ +}
+ +
+ +#if REDUCE_SHUFFLE
+ +gmx_opencl_inline
+ +void reduce_force_j_shfl(float3 fin, __global float *fout,
+ +                         int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
+ +{
+ +    /* Only does reduction over 4 elements in cluster. Needs to be changed
+ +     * for CL_SIZE>4. See CUDA code for required code */
+ +    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
+ +    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
+ +    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
+ +    if ((tidxi & 1) == 1)
+ +    {
+ +        fin.x = fin.y;
+ +    }
+ +    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
+ +    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
+ +    if (tidxi == 2)
+ +    {
+ +        fin.x = fin.z;
+ +    }
+ +    if (tidxi < 3)
+ +    {
+ +        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
+ +    }
+ +}
+ +#endif
+ +
+ +gmx_opencl_inline
+ +void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
+ +                            int tidxi, int tidxj, int aidx)
+ +{
+ +    int tidx = tidxi + tidxj*CL_SIZE;
+ +    f_buf[                  tidx] = fcj_buf.x;
+ +    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+ +    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+ +
+ +    /* Split the reduction between the first 3 column threads
+ +       Threads with column id 0 will do the reduction for (float3).x components
+ +       Threads with column id 1 will do the reduction for (float3).y components
+ +       Threads with column id 2 will do the reduction for (float3).z components.
+ +       The reduction is performed for each line tidxj of f_buf. */
+ +    if (tidxi < 3)
+ +    {
+ +        float f = 0.0f;
+ +        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
+ +        {
+ +            f += f_buf[FBUF_STRIDE * tidxi + j];
+ +        }
+ +
+ +        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
+ +    }
+ +}
+ +
+ +/*! Final j-force reduction
+ + */
+ +gmx_opencl_inline
+ +void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
+ +                    int tidxi, int tidxj, int aidx)
+ +{
+ +#if REDUCE_SHUFFLE
+ +    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
+ +#else
+ +    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
+ +#endif
+ +}
+ +
+ +#if REDUCE_SHUFFLE
+ +gmx_opencl_inline
+ +void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
+ +                                   bool bCalcFshift, int tidxi, int tidxj,
+ +                                   int sci, int shift, __global float *fshift)
+ +{
+ +    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
+ +     * for CL_SIZE>4.*/
+ +    float2 fshift_buf = 0;
+ +    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ +    {
+ +        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ +        float3 fin  = fci_buf[ci_offset];
+ +        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
+ +        fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
+ +        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
+ +
+ +        if (tidxj & 1)
+ +        {
+ +            fin.x = fin.y;
+ +        }
+ +        /* Threads 0,1 and 2,3 increment x,y for their warp */
+ +        atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
+ +        if (bCalcFshift)
+ +        {
+ +            fshift_buf[0] += fin.x;
+ +        }
+ +        /* Threads 0 and 2 increment z for their warp */
+ +        if ((tidxj & 1) == 0)
+ +        {
+ +            atomicAdd_g_f(&fout[3*aidx+2], fin.z);
+ +            if (bCalcFshift)
+ +            {
+ +                fshift_buf[1] += fin.z;
+ +            }
+ +        }
+ +    }
+ +    /* add up local shift forces into global mem */
+ +    if (bCalcFshift)
+ +    {
+ +        //Threads 0,1 and 2,3 update x,y
+ +        atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
+ +        //Threads 0 and 2 update z
+ +        if ((tidxj & 1) == 0)
+ +        {
+ +            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
+ +        }
+ +    }
+ +}
+ +#endif
+ +
+ +/*! Final i-force reduction; this implementation works only with power of two
+ + *  array sizes.
+ + */
+ +gmx_opencl_inline
+ +void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
+ +                                   __global float *fout,
+ +                                   bool bCalcFshift,
+ +                                   int tidxi, int tidxj,
+ +                                   int sci, int shift, __global float *fshift)
+ +{
+ +    float fshift_buf = 0;
+ +    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
+ +    {
+ +        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+ +        int tidx = tidxi + tidxj*CL_SIZE;
+ +        /* store i forces in shmem */
+ +        f_buf[                  tidx] = fci_buf[ci_offset].x;
+ +        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+ +        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+ +        barrier(CLK_LOCAL_MEM_FENCE);
+ +
+ +        int     i, j;
+ +        /* Reduce the initial CL_SIZE values for each i atom to half
+ +         * every step by using CL_SIZE * i threads.
+ +         * Can't just use i as loop variable because than nvcc refuses to unroll.
+ +         */
+ +        i = CL_SIZE/2;
+ +        for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+ +        {
+ +            if (tidxj < i)
+ +            {
+ +
+ +                f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
+ +                f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ +                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+ +            }
+ +            i >>= 1;
+ +        }
+ +        /* needed because
+ +         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
+ +         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
+ +         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
+ +         */
+ +        barrier(CLK_LOCAL_MEM_FENCE);
+ +
+ +        /* i == 1, last reduction step, writing to global mem */
+ +        /* Split the reduction between the first 3 line threads
+ +           Threads with line id 0 will do the reduction for (float3).x components
+ +           Threads with line id 1 will do the reduction for (float3).y components
+ +           Threads with line id 2 will do the reduction for (float3).z components. */
+ +        if (tidxj < 3)
+ +        {
+ +            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
+ +
+ +            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+ +
+ +            if (bCalcFshift)
+ +            {
+ +                fshift_buf += f;
+ +            }
+ +        }
+ +    }
+ +    /* add up local shift forces into global mem */
+ +    if (bCalcFshift)
+ +    {
+ +        /* Only threads with tidxj < 3 will update fshift.
+ +           The threads performing the update, must be the same as the threads
+ +           storing the reduction result above.
+ +         */
+ +        if (tidxj < 3)
+ +        {
+ +            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
+ +        }
+ +    }
+ +}
+ +
+ +/*! Final i-force reduction
+ + */
+ +gmx_opencl_inline
+ +void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
+ +                              bool bCalcFshift, int tidxi, int tidxj, int sci,
+ +                              int shift, __global float *fshift)
+ +{
+ +#if REDUCE_SHUFFLE
+ +    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
+ +                                  sci, shift, fshift);
+ +#else
+ +    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
+ +                                  sci, shift, fshift);
+ +#endif
+ +}
+ +
+ +
+ +
+ +#if REDUCE_SHUFFLE
+ +gmx_opencl_inline
+ +void reduce_energy_shfl(float E_lj, float E_el,
+ +                        volatile __global float *e_lj,
+ +                        volatile __global float *e_el,
+ +                        unsigned int    tidx)
+ +{
+ +    E_lj = sub_group_reduce_add(E_lj);
+ +    E_el = sub_group_reduce_add(E_el);
+ +    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
+ +     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
+ +     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
+ +     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
+ +     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
+ +    if (tidx == 0 || tidx == WARP_SIZE)
+ +    {
+ +        atomicAdd_g_f(e_lj, E_lj);
+ +        atomicAdd_g_f(e_el, E_el);
+ +    }
+ +}
+ +#endif
+ +
+ +/*! Energy reduction; this implementation works only with power of two
+ + *  array sizes.
+ + */
+ +gmx_opencl_inline
+ +void reduce_energy_pow2(volatile __local float  *buf,
+ +                        volatile __global float *e_lj,
+ +                        volatile __global float *e_el,
+ +                        unsigned int             tidx)
+ +{
+ +    int          j;
+ +
+ +    unsigned int i = WARP_SIZE/2;
+ +
+ +    /* Can't just use i as loop variable because than nvcc refuses to unroll. */
+ +    for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
+ +    {
+ +        if (tidx < i)
+ +        {
+ +            buf[              tidx] += buf[              tidx + i];
+ +            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
+ +        }
+ +        i >>= 1;
+ +    }
+ +
+ +    /* last reduction step, writing to global mem */
+ +    if (tidx == 0)
+ +    {
+ +        float e1 = buf[              tidx] + buf[              tidx + i];
+ +        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
+ +
+ +        atomicAdd_g_f(e_lj, e1);
+ +        atomicAdd_g_f(e_el, e2);
+ +    }
+ +}
+ +
+ +gmx_opencl_inline
+ +void reduce_energy(volatile __local float  gmx_unused *buf,
+ +                   float E_lj, float E_el,
+ +                   volatile __global float *e_lj,
+ +                   volatile __global float *e_el,
+ +                   unsigned int             tidx)
+ +{
+ +#if REDUCE_SHUFFLE
+ +    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
+ +#else
+ +    /* flush the energies to shmem and reduce them */
+ +    buf[              tidx] = E_lj;
+ +    buf[FBUF_STRIDE + tidx] = E_el;
+ +    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+ +#endif
+ +}
+ +
+ +gmx_opencl_inline
+ +bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
+ +{
+ +    if (pred)
+ +    {
+ +        warp_any[widx] = 1;
+ +    }
+ +
+ +    bool ret = warp_any[widx];
+ +
+ +    warp_any[widx] = 0;
+ +
+ +    return ret;
+ +}
+ +
+ +//! Returns a true if predicate is true for any work item in warp
+ +gmx_opencl_inline
+ +bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
+ +{
+ +#if USE_SUBGROUP_ANY
+ +    return sub_group_any(pred);
+ +#else
+ +    return gmx_sub_group_any_localmem(warp_any, widx, pred);
+ +#endif
+ +}
+ +
+ +#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
diff --cc src/gromacs/taskassignment/decidegpuusage.cpp
Simple merge
diff --cc src/gromacs/taskassignment/resourcedivision.cpp
Simple merge
diff --cc src/gromacs/trajectoryanalysis/topologyinformation.cpp
Simple merge
author	Paul Bauer <paul.bauer.q@gmail.com>
	Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
		1	2
cmake/gmxManageNvccConfig.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxSimdFlags.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
cmake/gmxVersionInfo.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
docs/CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
docs/install-guide/index.rst	patch \|	diff1 \|	diff2 \|	blob \| history
docs/release-notes/index.rst	patch \|	diff1 \|	diff2 \|	blob \| history
docs/user-guide/mdp-options.rst	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/CMakeLists.txt	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/domdec/partition.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/ewald/pme.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/ewald/tests/testhardwarecontexts.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxpreprocess/insert_molecules.cpp	patch \|	diff1 \|	\|	blob \| history
src/gromacs/gmxpreprocess/readir.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxpreprocess/solvate.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/constr.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/constr.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/force.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/sim_util.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdrun/rerun.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdrun/runner.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh	patch \|	diff1 \|	\|	blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh	patch \|	diff1 \|	\|	blob \| history
src/gromacs/taskassignment/decidegpuusage.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/taskassignment/resourcedivision.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/trajectoryanalysis/topologyinformation.cpp	patch \|	diff1 \|	diff2 \|	blob \| history