Merge branch 'release-2019' into master
authorPaul Bauer <paul.bauer.q@gmail.com>
Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Wed, 20 Feb 2019 12:50:49 +0000 (13:50 +0100)
Kept the way topologyinformation was removed from solvate and
insert_molecules in 2019.

Resolved Conflicts:
cmake/gmxVersionInfo.cmake
docs/CMakeLists.txt
docs/user-guide/mdp-options.rst
src/gromacs/gmxpreprocess/insert-molecules.cpp
src/gromacs/gmxpreprocess/solvate.cpp
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
src/gromacs/mdlib/sim_util.cpp
src/gromacs/trajectoryanalysis/topologyinformation.cpp

Change-Id: I2576133c4bc7a3c55cedb29e9832bb5afa8fe76f

26 files changed:
1  2 
cmake/gmxManageNvccConfig.cmake
cmake/gmxSimdFlags.cmake
cmake/gmxVersionInfo.cmake
docs/CMakeLists.txt
docs/install-guide/index.rst
docs/release-notes/index.rst
docs/user-guide/mdp-options.rst
src/gromacs/CMakeLists.txt
src/gromacs/domdec/partition.cpp
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/testhardwarecontexts.cpp
src/gromacs/gmxpreprocess/insert_molecules.cpp
src/gromacs/gmxpreprocess/readir.cpp
src/gromacs/gmxpreprocess/solvate.cpp
src/gromacs/mdlib/constr.cpp
src/gromacs/mdlib/constr.h
src/gromacs/mdlib/force.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/rerun.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/resourcedivision.cpp
src/gromacs/trajectoryanalysis/topologyinformation.cpp

Simple merge
Simple merge
Simple merge
index 8ad2e04f5a0fa9e773899a049354432bb002a513,65c8c5756f5b14ece9c420aa7be1ef112ef0bec5..9f6c80457a8492f81970588fc1ee8a9412cb69bf
@@@ -368,15 -367,7 +368,16 @@@ if (SPHINX_FOUND
          how-to/visualize.rst
          install-guide/index.rst
          release-notes/index.rst
 +        release-notes/highlights.rst
 +        release-notes/features.rst
 +        release-notes/performance.rst
 +        release-notes/tools.rst
 +        release-notes/bugs-fixed.rst
 +        release-notes/removed-functionality.rst
 +        release-notes/deprecated-functionality.rst
 +        release-notes/portability.rst
 +        release-notes/miscellaneous.rst
+         release-notes/2019/2019.2.rst
          release-notes/2019/2019.1.rst
          release-notes/2019/major/highlights.rst
          release-notes/2019/major/features.rst
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 6157cd3dd1bc251fe7a692e58c5800ba70865aab,5cbbcc311951ab6a4620288092dbdabd36c00943..e15c37c3a1f1c2eae738e2b2f7fd84c77d6e1682
@@@ -110,10 -109,11 +110,11 @@@ static gmx_hw_info_t *hardwareInit(
  
  void PmeTestEnvironment::SetUp()
  {
 -    hardwareContexts_.emplace_back(compat::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
 +    hardwareContexts_.emplace_back(std::make_unique<TestHardwareContext>(CodePath::CPU, "", nullptr));
  
      hardwareInfo_ = hardwareInit();
-     if (!pme_gpu_supports_build(*hardwareInfo_, nullptr))
+     if (!pme_gpu_supports_build(nullptr) ||
+         !pme_gpu_supports_hardware(*hardwareInfo_, nullptr))
      {
          // PME can only run on the CPU, so don't make any more test contexts.
          return;
index 572c385516faf0094710afd2bfaeb227ca9dfcde,0000000000000000000000000000000000000000..8f7df5bbbce7ffc533f485cd36be65b0f8e33dde
mode 100644,000000..100644
--- /dev/null
@@@ -1,616 -1,0 +1,621 @@@
- #include "gromacs/trajectoryanalysis/topologyinformation.h"
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#include "gmxpre.h"
 +
 +#include "insert_molecules.h"
 +
 +#include <algorithm>
 +#include <memory>
 +#include <set>
 +#include <string>
 +#include <vector>
 +
 +#include "gromacs/commandline/cmdlineoptionsmodule.h"
 +#include "gromacs/fileio/confio.h"
 +#include "gromacs/fileio/filetypes.h"
 +#include "gromacs/fileio/xvgr.h"
 +#include "gromacs/gmxlib/conformation_utilities.h"
 +#include "gromacs/gmxpreprocess/makeexclusiondistances.h"
 +#include "gromacs/math/functions.h"
 +#include "gromacs/math/utilities.h"
 +#include "gromacs/math/vec.h"
 +#include "gromacs/options/basicoptions.h"
 +#include "gromacs/options/filenameoption.h"
 +#include "gromacs/options/ioptionscontainer.h"
 +#include "gromacs/pbcutil/pbc.h"
 +#include "gromacs/random/threefry.h"
 +#include "gromacs/random/uniformrealdistribution.h"
 +#include "gromacs/selection/nbsearch.h"
 +#include "gromacs/selection/selection.h"
 +#include "gromacs/selection/selectioncollection.h"
 +#include "gromacs/selection/selectionoption.h"
 +#include "gromacs/selection/selectionoptionbehavior.h"
 +#include "gromacs/topology/atomprop.h"
 +#include "gromacs/topology/atoms.h"
 +#include "gromacs/topology/atomsbuilder.h"
 +#include "gromacs/topology/mtop_util.h"
 +#include "gromacs/topology/symtab.h"
 +#include "gromacs/topology/topology.h"
 +#include "gromacs/trajectory/trajectoryframe.h"
- #include "gromacs/utility/unique_cptr.h"
 +#include "gromacs/utility/cstringutil.h"
 +#include "gromacs/utility/exceptions.h"
 +#include "gromacs/utility/fatalerror.h"
 +#include "gromacs/utility/smalloc.h"
-         gmx_mtop_t *getTopology(bool /*required*/) override { return topInfo_.mtop(); }
 +
 +using gmx::RVec;
 +
 +/* enum for random rotations of inserted solutes */
 +enum RotationType {
 +    en_rotXYZ, en_rotZ, en_rotNone
 +};
 +const char *const cRotationEnum[] = {"xyz", "z", "none"};
 +
 +static void center_molecule(gmx::ArrayRef<RVec> x)
 +{
 +    rvec center = {0};
 +    for (auto &xi : x)
 +    {
 +        rvec_inc(center, xi);
 +    }
 +    svmul(1.0/x.size(), center, center);
 +    for (auto &xi : x)
 +    {
 +        rvec_dec(xi, center);
 +    }
 +}
 +
 +static void generate_trial_conf(gmx::ArrayRef<RVec> xin,
 +                                const rvec offset, RotationType enum_rot,
 +                                gmx::DefaultRandomEngine * rng,
 +                                std::vector<RVec> *xout)
 +{
 +    gmx::UniformRealDistribution<real> dist(0, 2.0*M_PI);
 +    xout->assign(xin.begin(), xin.end());
 +
 +    real alfa = 0.0, beta = 0.0, gamma = 0.0;
 +    switch (enum_rot)
 +    {
 +        case en_rotXYZ:
 +            alfa  = dist(*rng);
 +            beta  = dist(*rng);
 +            gamma = dist(*rng);
 +            break;
 +        case en_rotZ:
 +            alfa  = beta = 0.;
 +            gamma = dist(*rng);
 +            break;
 +        case en_rotNone:
 +            alfa = beta = gamma = 0.;
 +            break;
 +    }
 +    if (enum_rot == en_rotXYZ || enum_rot == en_rotZ)
 +    {
 +        rotate_conf(xout->size(), as_rvec_array(xout->data()), nullptr, alfa, beta, gamma);
 +    }
 +    for (size_t i = 0; i < xout->size(); ++i)
 +    {
 +        rvec_inc((*xout)[i], offset);
 +    }
 +}
 +
 +static bool isInsertionAllowed(gmx::AnalysisNeighborhoodSearch *search,
 +                               const std::vector<real>         &exclusionDistances,
 +                               const std::vector<RVec>         &x,
 +                               const std::vector<real>         &exclusionDistances_insrt,
 +                               const t_atoms                   &atoms,
 +                               const std::set<int>             &removableAtoms,
 +                               gmx::AtomsRemover               *remover)
 +{
 +    gmx::AnalysisNeighborhoodPositions  pos(x);
 +    gmx::AnalysisNeighborhoodPairSearch pairSearch = search->startPairSearch(pos);
 +    gmx::AnalysisNeighborhoodPair       pair;
 +    while (pairSearch.findNextPair(&pair))
 +    {
 +        const real r1 = exclusionDistances[pair.refIndex()];
 +        const real r2 = exclusionDistances_insrt[pair.testIndex()];
 +        if (pair.distance2() < gmx::square(r1 + r2))
 +        {
 +            if (removableAtoms.count(pair.refIndex()) == 0)
 +            {
 +                return false;
 +            }
 +            // TODO: If molecule information is available, this should ideally
 +            // use it to remove whole molecules.
 +            remover->markResidue(atoms, pair.refIndex(), true);
 +        }
 +    }
 +    return true;
 +}
 +
 +static void insert_mols(int nmol_insrt, int ntry, int seed,
 +                        real defaultDistance, real scaleFactor,
 +                        t_atoms *atoms, t_symtab *symtab, std::vector<RVec> *x,
 +                        const std::set<int> &removableAtoms,
 +                        const t_atoms &atoms_insrt, gmx::ArrayRef<RVec> x_insrt,
 +                        int ePBC, matrix box,
 +                        const std::string &posfn, const rvec deltaR,
 +                        RotationType enum_rot)
 +{
 +    fprintf(stderr, "Initialising inter-atomic distances...\n");
 +    AtomProperties          aps;
 +    std::vector<real>       exclusionDistances(
 +            makeExclusionDistances(atoms, &aps, defaultDistance, scaleFactor));
 +    const std::vector<real> exclusionDistances_insrt(
 +            makeExclusionDistances(&atoms_insrt, &aps, defaultDistance, scaleFactor));
 +
 +    const real       maxInsertRadius
 +        = *std::max_element(exclusionDistances_insrt.begin(),
 +                            exclusionDistances_insrt.end());
 +    real             maxRadius = maxInsertRadius;
 +    if (!exclusionDistances.empty())
 +    {
 +        const real maxExistingRadius
 +            = *std::max_element(exclusionDistances.begin(),
 +                                exclusionDistances.end());
 +        maxRadius = std::max(maxInsertRadius, maxExistingRadius);
 +    }
 +
 +    // TODO: Make all of this exception-safe.
 +    gmx::AnalysisNeighborhood nb;
 +    nb.setCutoff(maxInsertRadius + maxRadius);
 +
 +
 +    if (seed == 0)
 +    {
 +        seed = static_cast<int>(gmx::makeRandomSeed());
 +    }
 +    fprintf(stderr, "Using random seed %d\n", seed);
 +
 +    gmx::DefaultRandomEngine rng(seed);
 +
 +    t_pbc                    pbc;
 +    set_pbc(&pbc, ePBC, box);
 +
 +    /* With -ip, take nmol_insrt from file posfn */
 +    double     **rpos              = nullptr;
 +    const bool   insertAtPositions = !posfn.empty();
 +    if (insertAtPositions)
 +    {
 +        int ncol;
 +        nmol_insrt = read_xvg(posfn.c_str(), &rpos, &ncol);
 +        if (ncol != 3)
 +        {
 +            gmx_fatal(FARGS, "Expected 3 columns (x/y/z coordinates) in file %s\n",
 +                      posfn.c_str());
 +        }
 +        fprintf(stderr, "Read %d positions from file %s\n\n",
 +                nmol_insrt, posfn.c_str());
 +    }
 +
 +    gmx::AtomsBuilder builder(atoms, symtab);
 +    gmx::AtomsRemover remover(*atoms);
 +    {
 +        const int finalAtomCount    = atoms->nr + nmol_insrt * atoms_insrt.nr;
 +        const int finalResidueCount = atoms->nres + nmol_insrt * atoms_insrt.nres;
 +        builder.reserve(finalAtomCount, finalResidueCount);
 +        x->reserve(finalAtomCount);
 +        exclusionDistances.reserve(finalAtomCount);
 +    }
 +
 +    std::vector<RVec>                    x_n(x_insrt.size());
 +
 +    int                                  mol        = 0;
 +    int                                  trial      = 0;
 +    int                                  firstTrial = 0;
 +    int                                  failed     = 0;
 +    gmx::UniformRealDistribution<real>   dist;
 +
 +    while (mol < nmol_insrt && trial < ntry*nmol_insrt)
 +    {
 +        rvec offset_x;
 +        if (!insertAtPositions)
 +        {
 +            // Insert at random positions.
 +            offset_x[XX] = box[XX][XX] * dist(rng);
 +            offset_x[YY] = box[YY][YY] * dist(rng);
 +            offset_x[ZZ] = box[ZZ][ZZ] * dist(rng);
 +        }
 +        else
 +        {
 +            // Skip a position if ntry trials were not successful.
 +            if (trial >= firstTrial + ntry)
 +            {
 +                fprintf(stderr, " skipped position (%.3f, %.3f, %.3f)\n",
 +                        rpos[XX][mol], rpos[YY][mol], rpos[ZZ][mol]);
 +                ++mol;
 +                ++failed;
 +                firstTrial = trial;
 +                continue;
 +            }
 +            // Insert at positions taken from option -ip file.
 +            offset_x[XX] = rpos[XX][mol] + deltaR[XX]*(2 * dist(rng)-1);
 +            offset_x[YY] = rpos[YY][mol] + deltaR[YY]*(2 * dist(rng)-1);
 +            offset_x[ZZ] = rpos[ZZ][mol] + deltaR[ZZ]*(2 * dist(rng)-1);
 +        }
 +        fprintf(stderr, "\rTry %d", ++trial);
 +        fflush(stderr);
 +
 +        generate_trial_conf(x_insrt, offset_x, enum_rot, &rng, &x_n);
 +        gmx::AnalysisNeighborhoodPositions pos(*x);
 +        gmx::AnalysisNeighborhoodSearch    search = nb.initSearch(&pbc, pos);
 +        if (isInsertionAllowed(&search, exclusionDistances, x_n, exclusionDistances_insrt,
 +                               *atoms, removableAtoms, &remover))
 +        {
 +            x->insert(x->end(), x_n.begin(), x_n.end());
 +            exclusionDistances.insert(exclusionDistances.end(),
 +                                      exclusionDistances_insrt.begin(),
 +                                      exclusionDistances_insrt.end());
 +            builder.mergeAtoms(atoms_insrt);
 +            ++mol;
 +            firstTrial = trial;
 +            fprintf(stderr, " success (now %d atoms)!\n", builder.currentAtomCount());
 +        }
 +    }
 +
 +    fprintf(stderr, "\n");
 +    /* print number of molecules added */
 +    fprintf(stderr, "Added %d molecules (out of %d requested)\n",
 +            mol - failed, nmol_insrt);
 +
 +    const int originalAtomCount    = atoms->nr;
 +    const int originalResidueCount = atoms->nres;
 +    remover.refreshAtomCount(*atoms);
 +    remover.removeMarkedElements(x);
 +    remover.removeMarkedAtoms(atoms);
 +    if (atoms->nr < originalAtomCount)
 +    {
 +        fprintf(stderr, "Replaced %d residues (%d atoms)\n",
 +                originalResidueCount - atoms->nres,
 +                originalAtomCount - atoms->nr);
 +    }
 +
 +    if (rpos != nullptr)
 +    {
 +        for (int i = 0; i < DIM; ++i)
 +        {
 +            sfree(rpos[i]);
 +        }
 +        sfree(rpos);
 +    }
 +}
 +
 +namespace gmx
 +{
 +
 +namespace
 +{
 +
 +class InsertMolecules : public ICommandLineOptionsModule, public ITopologyProvider
 +{
 +    public:
 +        InsertMolecules()
 +            : bBox_(false), nmolIns_(0), nmolTry_(10), seed_(0),
 +              defaultDistance_(0.105), scaleFactor_(0.57), enumRot_(en_rotXYZ)
 +        {
 +            clear_rvec(newBox_);
 +            clear_rvec(deltaR_);
++            clear_mat(box_);
 +        }
 +
 +        // From ITopologyProvider
-         TopologyInformation topInfo_;
++        gmx_mtop_t *getTopology(bool /*required*/) override { return &top_; }
 +        int getAtomCount() override { return 0; }
 +
 +        // From ICommandLineOptionsModule
 +        void init(CommandLineModuleSettings * /*settings*/) override
 +        {
 +        }
 +        void initOptions(IOptionsContainer                 *options,
 +                         ICommandLineOptionsModuleSettings *settings) override;
 +        void optionsFinished() override;
 +        int run() override;
 +
 +    private:
 +        SelectionCollection selections_;
 +
 +        std::string         inputConfFile_;
 +        std::string         insertConfFile_;
 +        std::string         positionFile_;
 +        std::string         outputConfFile_;
 +        rvec                newBox_;
 +        bool                bBox_;
 +        int                 nmolIns_;
 +        int                 nmolTry_;
 +        int                 seed_;
 +        real                defaultDistance_;
 +        real                scaleFactor_;
 +        rvec                deltaR_;
 +        RotationType        enumRot_;
 +        Selection           replaceSel_;
 +
-         topInfo_.fillFromInputFile(inputConfFile_);
-         if (topInfo_.mtop()->natoms == 0)
++        gmx_mtop_t          top_;
++        std::vector<RVec>   x_;
++        matrix              box_;
++        int                 ePBC_;
 +};
 +
 +void InsertMolecules::initOptions(IOptionsContainer                 *options,
 +                                  ICommandLineOptionsModuleSettings *settings)
 +{
 +    const char *const desc[] = {
 +        "[THISMODULE] inserts [TT]-nmol[tt] copies of the system specified in",
 +        "the [TT]-ci[tt] input file. The insertions take place either into",
 +        "vacant space in the solute conformation given with [TT]-f[tt], or",
 +        "into an empty box given by [TT]-box[tt]. Specifying both [TT]-f[tt]",
 +        "and [TT]-box[tt] behaves like [TT]-f[tt], but places a new box",
 +        "around the solute before insertions. Any velocities present are",
 +        "discarded.",
 +        "",
 +        "It is possible to also insert into a solvated configuration and",
 +        "replace solvent atoms with the inserted atoms. To do this, use",
 +        "[TT]-replace[tt] to specify a selection that identifies the atoms",
 +        "that can be replaced. The tool assumes that all molecules in this",
 +        "selection consist of single residues: each residue from this",
 +        "selection that overlaps with the inserted molecules will be removed",
 +        "instead of preventing insertion.",
 +        "",
 +        "By default, the insertion positions are random (with initial seed",
 +        "specified by [TT]-seed[tt]). The program iterates until [TT]-nmol[tt]",
 +        "molecules have been inserted in the box. Molecules are not inserted",
 +        "where the distance between any existing atom and any atom of the",
 +        "inserted molecule is less than the sum based on the van der Waals",
 +        "radii of both atoms. A database ([TT]vdwradii.dat[tt]) of van der",
 +        "Waals radii is read by the program, and the resulting radii scaled",
 +        "by [TT]-scale[tt]. If radii are not found in the database, those",
 +        "atoms are assigned the (pre-scaled) distance [TT]-radius[tt].",
 +        "Note that the usefulness of those radii depends on the atom names,",
 +        "and thus varies widely with force field.",
 +        "",
 +        "A total of [TT]-nmol[tt] * [TT]-try[tt] insertion attempts are made",
 +        "before giving up. Increase [TT]-try[tt] if you have several small",
 +        "holes to fill. Option [TT]-rot[tt] specifies whether the insertion",
 +        "molecules are randomly oriented before insertion attempts.",
 +        "",
 +        "Alternatively, the molecules can be inserted only at positions defined in",
 +        "positions.dat ([TT]-ip[tt]). That file should have 3 columns (x,y,z),",
 +        "that give the displacements compared to the input molecule position",
 +        "([TT]-ci[tt]). Hence, if that file should contain the absolute",
 +        "positions, the molecule must be centered on (0,0,0) before using",
 +        "[THISMODULE] (e.g. from [gmx-editconf] [TT]-center[tt]).",
 +        "Comments in that file starting with # are ignored. Option [TT]-dr[tt]",
 +        "defines the maximally allowed displacements during insertial trials.",
 +        "[TT]-try[tt] and [TT]-rot[tt] work as in the default mode (see above)."
 +    };
 +
 +    settings->setHelpText(desc);
 +
 +    std::shared_ptr<SelectionOptionBehavior> selectionOptionBehavior(
 +            new SelectionOptionBehavior(&selections_, this));
 +    settings->addOptionsBehavior(selectionOptionBehavior);
 +
 +    // TODO: Replace use of legacyType.
 +    options->addOption(FileNameOption("f")
 +                           .legacyType(efSTX).inputFile()
 +                           .store(&inputConfFile_)
 +                           .defaultBasename("protein")
 +                           .description("Existing configuration to insert into"));
 +    options->addOption(FileNameOption("ci")
 +                           .legacyType(efSTX).inputFile().required()
 +                           .store(&insertConfFile_)
 +                           .defaultBasename("insert")
 +                           .description("Configuration to insert"));
 +    options->addOption(FileNameOption("ip")
 +                           .filetype(eftGenericData).inputFile()
 +                           .store(&positionFile_)
 +                           .defaultBasename("positions")
 +                           .description("Predefined insertion trial positions"));
 +    options->addOption(FileNameOption("o")
 +                           .legacyType(efSTO).outputFile().required()
 +                           .store(&outputConfFile_)
 +                           .defaultBasename("out")
 +                           .description("Output configuration after insertion"));
 +
 +    options->addOption(SelectionOption("replace").onlyAtoms()
 +                           .store(&replaceSel_)
 +                           .description("Atoms that can be removed if overlapping"));
 +    selectionOptionBehavior->initOptions(options);
 +
 +    options->addOption(RealOption("box").vector()
 +                           .store(newBox_).storeIsSet(&bBox_)
 +                           .description("Box size (in nm)"));
 +    options->addOption(IntegerOption("nmol")
 +                           .store(&nmolIns_)
 +                           .description("Number of extra molecules to insert"));
 +    options->addOption(IntegerOption("try")
 +                           .store(&nmolTry_)
 +                           .description("Try inserting [TT]-nmol[tt] times [TT]-try[tt] times"));
 +    options->addOption(IntegerOption("seed")
 +                           .store(&seed_)
 +                           .description("Random generator seed (0 means generate)"));
 +    options->addOption(RealOption("radius")
 +                           .store(&defaultDistance_)
 +                           .description("Default van der Waals distance"));
 +    options->addOption(RealOption("scale")
 +                           .store(&scaleFactor_)
 +                           .description("Scale factor to multiply Van der Waals radii from the database in share/gromacs/top/vdwradii.dat. The default value of 0.57 yields density close to 1000 g/l for proteins in water."));
 +    options->addOption(RealOption("dr").vector()
 +                           .store(deltaR_)
 +                           .description("Allowed displacement in x/y/z from positions in [TT]-ip[tt] file"));
 +    options->addOption(EnumOption<RotationType>("rot").enumValue(cRotationEnum)
 +                           .store(&enumRot_)
 +                           .description("Rotate inserted molecules randomly"));
 +}
 +
 +void InsertMolecules::optionsFinished()
 +{
 +    if (nmolIns_ <= 0 && positionFile_.empty())
 +    {
 +        GMX_THROW(InconsistentInputError("Either -nmol must be larger than 0, "
 +                                         "or positions must be given with -ip."));
 +    }
 +    if (inputConfFile_.empty() && !bBox_)
 +    {
 +        GMX_THROW(InconsistentInputError("When no solute (-f) is specified, "
 +                                         "a box size (-box) must be specified."));
 +    }
 +    if (replaceSel_.isValid() && inputConfFile_.empty())
 +    {
 +        GMX_THROW(InconsistentInputError("Replacement (-replace) only makes sense "
 +                                         "together with an existing configuration (-f)."));
 +    }
 +
 +    if (!inputConfFile_.empty())
 +    {
++        bool  bTprFileWasRead;
++        rvec *temporaryX = nullptr;
 +        fprintf(stderr, "Reading solute configuration\n");
-     const char       *outputTitle = topInfo_.name();
-     std::vector<RVec> xOutput;
-     matrix            box = {{ 0 }};
-     if (topInfo_.hasTopology())
-     {
-         xOutput = copyOf(topInfo_.x());
-         topInfo_.getBox(box);
-     }
-     else
-     {
-         xOutput.resize(topInfo_.mtop()->natoms);
-     }
-     auto          atomsSolute = topInfo_.copyAtoms();
++        readConfAndTopology(inputConfFile_.c_str(), &bTprFileWasRead, &top_,
++                            &ePBC_, &temporaryX, nullptr, box_);
++        x_.assign(temporaryX, temporaryX + top_.natoms);
++        sfree(temporaryX);
++        if (top_.natoms == 0)
 +        {
 +            fprintf(stderr, "Note: no atoms in %s\n", inputConfFile_.c_str());
 +        }
 +    }
 +}
 +
 +int InsertMolecules::run()
 +{
-         set_pbc(&pbc, topInfo_.ePBC(), box);
 +    std::set<int> removableAtoms;
 +    if (replaceSel_.isValid())
 +    {
 +        t_pbc       pbc;
-         fr->natoms = topInfo_.mtop()->natoms;
++        set_pbc(&pbc, ePBC_, box_);
 +        t_trxframe *fr;
 +        snew(fr, 1);
-         fr->x      = as_rvec_array(xOutput.data());
++        fr->natoms = x_.size();
 +        fr->bX     = TRUE;
-     int ePBCForOutput = topInfo_.ePBC();
++        fr->x      = as_rvec_array(x_.data());
 +        selections_.evaluate(fr, &pbc);
 +        sfree(fr);
 +        removableAtoms.insert(replaceSel_.atomIndices().begin(),
 +                              replaceSel_.atomIndices().end());
 +        // TODO: It could be nice to check that removableAtoms contains full
 +        // residues, since we anyways remove whole residues instead of
 +        // individual atoms.
 +    }
 +
-         clear_mat(box);
-         box[XX][XX] = newBox_[XX];
-         box[YY][YY] = newBox_[YY];
-         box[ZZ][ZZ] = newBox_[ZZ];
++    int ePBCForOutput = ePBC_;
 +    if (bBox_)
 +    {
 +        ePBCForOutput = epbcXYZ;
-     if (det(box) == 0)
++        clear_mat(box_);
++        box_[XX][XX] = newBox_[XX];
++        box_[YY][YY] = newBox_[YY];
++        box_[ZZ][ZZ] = newBox_[ZZ];
 +    }
-     fprintf(stderr, "Reading molecule configuration\n");
-     TopologyInformation      topInfoForInsertedMolecule;
-     topInfoForInsertedMolecule.fillFromInputFile(insertConfFile_);
-     auto                     atomsInserted = topInfoForInsertedMolecule.atoms();
-     std::vector<RVec>        xInserted     = copyOf(topInfoForInsertedMolecule.x());
-     if (topInfoForInsertedMolecule.mtop()->natoms == 0)
-     {
-         gmx_fatal(FARGS, "No molecule in %s, please check your input",
-                   insertConfFile_.c_str());
-     }
-     if (outputTitle == nullptr)
-     {
-         outputTitle = topInfoForInsertedMolecule.name();
-     }
-     if (positionFile_.empty())
++    if (det(box_) == 0)
 +    {
 +        gmx_fatal(FARGS, "Undefined solute box.\nCreate one with gmx editconf "
 +                  "or give explicit -box command line option");
 +    }
 +
-         center_molecule(xInserted);
++    gmx_mtop_t         topInserted;
++    t_atoms            atomsInserted;
++    std::vector<RVec>  xInserted;
 +    {
-     auto              symtabInserted = duplicateSymtab(&topInfo_.mtop()->symtab);
-     const sfree_guard symtabInsertedGuard(symtabInserted);
++        bool        bTprFileWasRead;
++        int         ePBC_dummy;
++        matrix      box_dummy;
++        rvec       *temporaryX;
++        readConfAndTopology(insertConfFile_.c_str(), &bTprFileWasRead, &topInserted,
++                            &ePBC_dummy, &temporaryX, nullptr, box_dummy);
++        xInserted.assign(temporaryX, temporaryX + topInserted.natoms);
++        sfree(temporaryX);
++        atomsInserted = gmx_mtop_global_atoms(&topInserted);
++        if (atomsInserted.nr == 0)
++        {
++            gmx_fatal(FARGS, "No molecule in %s, please check your input",
++                      insertConfFile_.c_str());
++        }
++        if (top_.name == nullptr)
++        {
++            top_.name = topInserted.name;
++        }
++        if (positionFile_.empty())
++        {
++            center_molecule(xInserted);
++        }
 +    }
 +
-                 atomsSolute.get(), symtabInserted, &xOutput, removableAtoms, *atomsInserted, xInserted,
-                 ePBCForOutput, box, positionFile_, deltaR_, enumRot_);
++    t_atoms atoms = gmx_mtop_global_atoms(&top_);
++
 +    /* add nmol_ins molecules of atoms_ins
 +       in random orientation at random place */
 +    insert_mols(nmolIns_, nmolTry_, seed_, defaultDistance_, scaleFactor_,
-     write_sto_conf(outputConfFile_.c_str(), outputTitle, atomsSolute.get(),
-                    as_rvec_array(xOutput.data()), nullptr, ePBCForOutput, box);
++                &atoms, &top_.symtab, &x_, removableAtoms, atomsInserted, xInserted,
++                ePBCForOutput, box_, positionFile_, deltaR_, enumRot_);
 +
 +    /* write new configuration to file confout */
 +    fprintf(stderr, "Writing generated configuration to %s\n",
 +            outputConfFile_.c_str());
-             atomsSolute->nr, atomsSolute->nres);
-     done_symtab(symtabInserted);
++    write_sto_conf(outputConfFile_.c_str(), *top_.name, &atoms,
++                   as_rvec_array(x_.data()), nullptr, ePBCForOutput, box_);
 +
 +    /* print size of generated configuration */
 +    fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
++            atoms.nr, atoms.nres);
++
++    done_atom(&atoms);
++    done_atom(&atomsInserted);
++
 +    return 0;
 +}
 +
 +}   // namespace
 +
 +
 +const char* InsertMoleculesInfo::name()
 +{
 +    static const char* name = "insert-molecules";
 +    return name;
 +}
 +
 +const char* InsertMoleculesInfo::shortDescription()
 +{
 +    static const char* shortDescription =
 +        "Insert molecules into existing vacancies";
 +    return shortDescription;
 +}
 +
 +ICommandLineOptionsModulePointer InsertMoleculesInfo::create()
 +{
 +    return ICommandLineOptionsModulePointer(new InsertMolecules());
 +}
 +
 +} // namespace gmx
Simple merge
index 60fa35e97cd941905fa394c0208f3cd9db5ec6b3,c6e6eba38e77f5d2db51b47add28790f379c4b3e..bf44bd395e803a013bc8a71a4191f255e7942e83
@@@ -638,11 -637,10 +637,10 @@@ static void removeExtraSolventMolecules
      remover.removeMarkedAtoms(atoms);
  }
  
- static void add_solv(const char *filename,
-                      const gmx::TopologyInformation &topInfo,
-                      t_atoms *atoms,
+ static void add_solv(const char *filename, t_atoms *atoms,
+                      t_symtab *symtab,
                       std::vector<RVec> *x, std::vector<RVec> *v,
-                      matrix box, AtomProperties *aps,
 -                     int ePBC, matrix box, gmx_atomprop_t aps,
++                     int ePBC, matrix box, AtomProperties *aps,
                       real defaultDistance, real scaleFactor,
                       real rshell, int max_sol)
  {
          done_atom(newatoms);
          sfree(newatoms);
      }
+     if (atomsSolvent)
+     {
+         done_atom(atomsSolvent);
+         sfree(atomsSolvent);
+     }
  }
  
 -static void update_top(t_atoms *atoms, int firstSolventResidueIndex, matrix box, int NFILE, t_filenm fnm[],
 -                       gmx_atomprop_t aps)
 +static void update_top(t_atoms        *atoms,
 +                       int             firstSolventResidueIndex,
 +                       matrix          box,
 +                       int             NFILE,
 +                       t_filenm        fnm[],
 +                       AtomProperties *aps)
  {
      FILE        *fpin, *fpout;
      char         buf[STRLEN*2], buf2[STRLEN], *temp;
@@@ -971,11 -976,15 +979,15 @@@ int gmx_solvate(int argc, char *argv[]
                    "a box size (-box) must be specified");
      }
  
 -    aps = gmx_atomprop_init();
 +    AtomProperties           aps;
  
-     gmx::TopologyInformation topInfo;
-     std::vector<RVec>        x, v;
-     matrix                   box = {{ 0 }};
+     /* solute configuration data */
+     gmx_mtop_t        top;
+     std::vector<RVec> x, v;
+     matrix            box  = {{ 0 }};
+     int               ePBC = -1;
+     t_atoms          *atoms;
+     snew(atoms, 1);
      if (bProt)
      {
          /* Generate a solute configuration */
                    "or give explicit -box command line option");
      }
  
-     add_solv(solventFileName, topInfo, atoms.get(), &x, &v, box,
+     add_solv(solventFileName, atoms, &top.symtab, &x, &v, ePBCForOutput, box,
 -             aps, defaultDistance, scaleFactor, r_shell, max_sol);
 +             &aps, defaultDistance, scaleFactor, r_shell, max_sol);
  
      /* write new configuration 1 to file confout */
      confout = ftp2fn(efSTO, NFILE, fnm);
      /* print size of generated configuration */
      fprintf(stderr, "\nOutput configuration contains %d atoms in %d residues\n",
              atoms->nr, atoms->nres);
-     update_top(atoms.get(), firstSolventResidueIndex, box, NFILE, fnm, &aps);
 -    update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, aps);
++    update_top(atoms, firstSolventResidueIndex, box, NFILE, fnm, &aps);
+     done_atom(atoms);
+     sfree(atoms);
 -    gmx_atomprop_destroy(aps);
      output_env_done(oenv);
  
      return 0;
Simple merge
Simple merge
Simple merge
index 6dedbb795b4a78c9bf5664e3dc175846d0ef0b43,4b0992916ab6f12b19a59b5ead139840cf39b156..3d5578a664b07f0849877da60a1a25f0b68985c1
@@@ -1202,8 -1290,8 +1202,8 @@@ static void do_force_cutsVERLET(FILE *f
          }
  
          /* launch local nonbonded work on GPU */
-         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -        do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
 +        do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
                       step, nrnb, wcycle);
          wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU);
              wallcycle_start(wcycle, ewcLAUNCH_GPU);
  
              /* launch non-local nonbonded tasks on GPU */
-             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 -            nbnxn_gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, eatNonlocal, ppForceWorkload->haveGpuBondedWork);
 +            Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
              wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
  
              if (ppForceWorkload->haveGpuBondedWork)
Simple merge
Simple merge
index 34b932394b97f81c62fa1710c6529d8729c7902a,0000000000000000000000000000000000000000..758e5ec55d16c6ea3799dda3c8844016c03e6e5f
mode 100644,000000..100644
--- /dev/null
@@@ -1,737 -1,0 +1,737 @@@
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +/*! \internal \file
 + *  \brief
 + *  Utility constant and function declaration for the CUDA non-bonded kernels.
 + *  This header should be included once at the top level, just before the
 + *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
 + *
 + *  \author Szilárd Páll <pall.szilard@gmail.com>
 + *  \ingroup module_nbnxm
 + */
 +#include <assert.h>
 +
 +/* Note that floating-point constants in CUDA code should be suffixed
 + * with f (e.g. 0.5f), to stop the compiler producing intermediate
 + * code that is in double precision.
 + */
 +
 +#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 +#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
 +#include "gromacs/gpu_utils/vectype_ops.cuh"
 +
 +#include "nbnxm_cuda_types.h"
 +
 +#ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
 +#define NBNXM_CUDA_KERNEL_UTILS_CUH
 +
 +/*! \brief Log of the i and j cluster size.
 + *  change this together with c_clSize !*/
 +static const int __device__          c_clSizeLog2  = 3;
 +/*! \brief Square of cluster size. */
 +static const int __device__          c_clSizeSq    = c_clSize*c_clSize;
 +/*! \brief j-cluster size after split (4 in the current implementation). */
 +static const int __device__          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
 +/*! \brief Stride in the force accumualation buffer */
 +static const int __device__          c_fbufStride  = c_clSizeSq;
 +/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
 +static const unsigned __device__     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
 +
 +static const float __device__        c_oneSixth    = 0.16666667f;
 +static const float __device__        c_oneTwelveth = 0.08333333f;
 +
 +
 +/*! Convert LJ sigma,epsilon parameters to C6,C12. */
 +static __forceinline__ __device__
 +void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 +                                     const float  epsilon,
 +                                     float       *c6,
 +                                     float       *c12)
 +{
 +    float sigma2, sigma6;
 +
 +    sigma2 = sigma * sigma;
 +    sigma6 = sigma2 *sigma2 * sigma2;
 +    *c6    = epsilon * sigma6;
 +    *c12   = *c6 * sigma6;
 +}
 +
 +/*! Apply force switch,  force + energy version. */
 +static __forceinline__ __device__
 +void calculate_force_switch_F(const  cu_nbparam_t nbparam,
 +                              float               c6,
 +                              float               c12,
 +                              float               inv_r,
 +                              float               r2,
 +                              float              *F_invr)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam.dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam.dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam.repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam.repulsion_shift.c3;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam.rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +}
 +
 +/*! Apply force switch, force-only version. */
 +static __forceinline__ __device__
 +void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 +                                float               c6,
 +                                float               c12,
 +                                float               inv_r,
 +                                float               r2,
 +                                float              *F_invr,
 +                                float              *E_lj)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam.dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam.dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam.repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam.repulsion_shift.c3;
 +
 +    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 +    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 +    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 +    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam.rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +    *E_lj    +=
 +        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 +        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 +}
 +
 +/*! Apply potential switch, force-only version. */
 +static __forceinline__ __device__
 +void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 +                                  float               inv_r,
 +                                  float               r2,
 +                                  float              *F_invr,
 +                                  float              *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam.vdw_switch.c3;
 +    float switch_V4 = nbparam.vdw_switch.c4;
 +    float switch_V5 = nbparam.vdw_switch.c5;
 +    float switch_F2 = 3*nbparam.vdw_switch.c3;
 +    float switch_F3 = 4*nbparam.vdw_switch.c4;
 +    float switch_F4 = 5*nbparam.vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam.rvdw_switch;
 +
 +    /* Unlike in the F+E kernel, conditional is faster here */
 +    if (r_switch > 0.0f)
 +    {
 +        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    }
 +}
 +
 +/*! Apply potential switch, force + energy version. */
 +static __forceinline__ __device__
 +void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 +                                    float               inv_r,
 +                                    float               r2,
 +                                    float              *F_invr,
 +                                    float              *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam.vdw_switch.c3;
 +    float switch_V4 = nbparam.vdw_switch.c4;
 +    float switch_V5 = nbparam.vdw_switch.c5;
 +    float switch_F2 = 3*nbparam.vdw_switch.c3;
 +    float switch_F3 = 4*nbparam.vdw_switch.c4;
 +    float switch_F4 = 5*nbparam.vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam.rvdw_switch;
 +    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    /* Unlike in the F-only kernel, masking is faster here */
 +    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    *E_lj   *= sw;
 +}
 +
 +
 +/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
 +                                int                typei,
 +                                int                typej)
 +{
 +#if DISABLE_CUDA_TEXTURES
 +    return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
 +#else
 +    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 +#endif /* DISABLE_CUDA_TEXTURES */
 +}
 +
 +
 +/*! Calculate LJ-PME grid force contribution with
 + *  geometric combination rule.
 + */
 +static __forceinline__ __device__
 +void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 +                                    int                typei,
 +                                    int                typej,
 +                                    float              r2,
 +                                    float              inv_r2,
 +                                    float              lje_coeff2,
 +                                    float              lje_coeff6_6,
 +                                    float             *F_invr)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +
 +    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = expf(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +}
 +
 +
 +/*! Calculate LJ-PME grid force + energy contribution with
 + *  geometric combination rule.
 + */
 +static __forceinline__ __device__
 +void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 +                                      int                typei,
 +                                      int                typej,
 +                                      float              r2,
 +                                      float              inv_r2,
 +                                      float              lje_coeff2,
 +                                      float              lje_coeff6_6,
 +                                      float              int_bit,
 +                                      float             *F_invr,
 +                                      float             *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 +
 +    c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = expf(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    /* Shift should be applied only to real LJ pairs */
 +    sh_mask   = nbparam.sh_lj_ewald*int_bit;
 +    *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +}
 +
 +/*! Fetch per-type LJ parameters.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
 +                              int                type)
 +{
 +    float2 c6c12;
 +#if DISABLE_CUDA_TEXTURES
 +    /* Force an 8-byte fetch to save a memory instruction. */
 +    float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
 +    c6c12 = LDG(&nbfp_comb[type]);
 +#else
 +    /* NOTE: as we always do 8-byte aligned loads, we could
 +       fetch float2 here too just as above. */
 +    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
 +    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
 +#endif /* DISABLE_CUDA_TEXTURES */
 +
 +    return c6c12;
 +}
 +
 +
 +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
 + *  Lorentz-Berthelot combination rule.
 + *  We use a single F+E kernel with conditional because the performance impact
 + *  of this is pretty small and LB on the CPU is anyway very slow.
 + */
 +static __forceinline__ __device__
 +void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 +                                    int                typei,
 +                                    int                typej,
 +                                    float              r2,
 +                                    float              inv_r2,
 +                                    float              lje_coeff2,
 +                                    float              lje_coeff6_6,
 +                                    float              int_bit,
 +                                    float             *F_invr,
 +                                    float             *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +    float sigma, sigma2, epsilon;
 +
 +    /* sigma and epsilon are scaled to give 6*C6 */
 +    float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
 +    float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
 +
 +    sigma   = c6c12_i.x + c6c12_j.x;
 +    epsilon = c6c12_i.y * c6c12_j.y;
 +
 +    sigma2  = sigma*sigma;
 +    c6grid  = epsilon*sigma2*sigma2*sigma2;
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = expf(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    if (E_lj != nullptr)
 +    {
 +        float sh_mask;
 +
 +        /* Shift should be applied only to real LJ pairs */
 +        sh_mask   = nbparam.sh_lj_ewald*int_bit;
 +        *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +    }
 +}
 +
 +
 +/*! Fetch two consecutive values from the Ewald correction F*r table.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
 +                             int                index)
 +{
 +    float2 d;
 +
 +#if DISABLE_CUDA_TEXTURES
 +    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
 +    d.x = LDG(&nbparam.coulomb_tab[index]);
 +    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
 +#else
 +    d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
 +    d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
 +#endif // DISABLE_CUDA_TEXTURES
 +
 +    return d;
 +}
 +
 +/*! Linear interpolation using exactly two FMA operations.
 + *
 + *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 + *  Note that CUDA does not have fnms, otherwise we'd use
 + *  fma(t, d1, fnms(t, d0, d0)
 + *  but input modifiers are designed for this and are fast.
 + */
 +template <typename T>
 +__forceinline__ __host__ __device__
 +T lerp(T d0, T d1, T t)
 +{
 +    return fma(t, d1, fma(-t, d0, d0));
 +}
 +
 +/*! Interpolate Ewald coulomb force correction using the F*r table.
 + */
 +static __forceinline__ __device__
 +float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
 +                                  float              r)
 +{
 +    float  normalized = nbparam.coulomb_tab_scale * r;
 +    int    index      = (int) normalized;
 +    float  fraction   = normalized - index;
 +
 +    float2 d01 = fetch_coulomb_force_r(nbparam, index);
 +
 +    return lerp(d01.x, d01.y, fraction);
 +}
 +
 +/*! Fetch C6 and C12 from the parameter table.
 + *
 + *  Depending on what is supported, it fetches parameters either
 + *  using direct load, texture objects, or texrefs.
 + */
 +static __forceinline__ __device__
 +void fetch_nbfp_c6_c12(float               &c6,
 +                       float               &c12,
 +                       const cu_nbparam_t   nbparam,
 +                       int                  baseIndex)
 +{
 +#if DISABLE_CUDA_TEXTURES
 +    /* Force an 8-byte fetch to save a memory instruction. */
 +    float2 *nbfp = (float2 *)nbparam.nbfp;
 +    float2  c6c12;
 +    c6c12 = LDG(&nbfp[baseIndex]);
 +    c6    = c6c12.x;
 +    c12   = c6c12.y;
 +#else
 +    /* NOTE: as we always do 8-byte aligned loads, we could
 +       fetch float2 here too just as above. */
 +    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
 +    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
 +#endif // DISABLE_CUDA_TEXTURES
 +}
 +
 +
 +/*! Calculate analytical Ewald correction term. */
 +static __forceinline__ __device__
 +float pmecorrF(float z2)
 +{
 +    const float FN6 = -1.7357322914161492954e-8f;
 +    const float FN5 = 1.4703624142580877519e-6f;
 +    const float FN4 = -0.000053401640219807709149f;
 +    const float FN3 = 0.0010054721316683106153f;
 +    const float FN2 = -0.019278317264888380590f;
 +    const float FN1 = 0.069670166153766424023f;
 +    const float FN0 = -0.75225204789749321333f;
 +
 +    const float FD4 = 0.0011193462567257629232f;
 +    const float FD3 = 0.014866955030185295499f;
 +    const float FD2 = 0.11583842382862377919f;
 +    const float FD1 = 0.50736591960530292870f;
 +    const float FD0 = 1.0f;
 +
 +    float       z4;
 +    float       polyFN0, polyFN1, polyFD0, polyFD1;
 +
 +    z4          = z2*z2;
 +
 +    polyFD0     = FD4*z4 + FD2;
 +    polyFD1     = FD3*z4 + FD1;
 +    polyFD0     = polyFD0*z4 + FD0;
 +    polyFD0     = polyFD1*z2 + polyFD0;
 +
 +    polyFD0     = 1.0f/polyFD0;
 +
 +    polyFN0     = FN6*z4 + FN4;
 +    polyFN1     = FN5*z4 + FN3;
 +    polyFN0     = polyFN0*z4 + FN2;
 +    polyFN1     = polyFN1*z4 + FN1;
 +    polyFN0     = polyFN0*z4 + FN0;
 +    polyFN0     = polyFN1*z2 + polyFN0;
 +
 +    return polyFN0*polyFD0;
 +}
 +
 +/*! Final j-force reduction; this generic implementation works with
 + *  arbitrary array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_j_generic(float *f_buf, float3 *fout,
 +                            int tidxi, int tidxj, int aidx)
 +{
 +    if (tidxi < 3)
 +    {
 +        float f = 0.0f;
 +        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
 +        {
 +            f += f_buf[c_fbufStride * tidxi + j];
 +        }
 +
 +        atomicAdd((&fout[aidx].x)+tidxi, f);
 +    }
 +}
 +
 +/*! Final j-force reduction; this implementation only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 +                              int tidxi, int aidx,
 +                              const unsigned int activemask)
 +{
 +    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
 +    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
 +    f.z += gmx_shfl_down_sync(activemask, f.z, 1);
 +
 +    if (tidxi & 1)
 +    {
 +        f.x = f.y;
 +    }
 +
 +    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
 +    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
 +
 +    if (tidxi & 2)
 +    {
 +        f.x = f.z;
 +    }
 +
 +    f.x += gmx_shfl_down_sync(activemask, f.x, 4);
 +
 +    if (tidxi < 3)
 +    {
 +        atomicAdd((&fout[aidx].x) + tidxi, f.x);
 +    }
 +}
 +
 +/*! Final i-force reduction; this generic implementation works with
 + *  arbitrary array sizes.
 + * TODO: add the tidxi < 3 trick
 + */
 +static __forceinline__ __device__
 +void reduce_force_i_generic(float *f_buf, float3 *fout,
 +                            float *fshift_buf, bool bCalcFshift,
 +                            int tidxi, int tidxj, int aidx)
 +{
 +    if (tidxj < 3)
 +    {
 +        float f = 0.0f;
 +        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
 +        {
 +            f += f_buf[tidxj * c_fbufStride + j];
 +        }
 +
 +        atomicAdd(&fout[aidx].x + tidxj, f);
 +
 +        if (bCalcFshift)
 +        {
 +            *fshift_buf += f;
 +        }
 +    }
 +}
 +
 +/*! Final i-force reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 +                         float *fshift_buf, bool bCalcFshift,
 +                         int tidxi, int tidxj, int aidx)
 +{
 +    int     i, j;
 +    float   f;
 +
 +    assert(c_clSize == 1 << c_clSizeLog2);
 +
 +    /* Reduce the initial c_clSize values for each i atom to half
 +     * every step by using c_clSize * i threads.
 +     * Can't just use i as loop variable because than nvcc refuses to unroll.
 +     */
 +    i = c_clSize/2;
 +#pragma unroll 5
 +    for (j = c_clSizeLog2 - 1; j > 0; j--)
 +    {
 +        if (tidxj < i)
 +        {
 +
 +            f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
 +            f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 +            f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 +        }
 +        i >>= 1;
 +    }
 +
 +    /* i == 1, last reduction step, writing to global mem */
 +    if (tidxj < 3)
 +    {
 +        /* tidxj*c_fbufStride selects x, y or z */
 +        f = f_buf[tidxj * c_fbufStride               + tidxi] +
 +            f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
 +
 +        atomicAdd(&(fout[aidx].x) + tidxj, f);
 +
 +        if (bCalcFshift)
 +        {
 +            *fshift_buf += f;
 +        }
 +    }
 +
 +}
 +
 +/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 + *  on whether the size of the array to be reduced is power of two or not.
 + */
 +static __forceinline__ __device__
 +void reduce_force_i(float *f_buf, float3 *f,
 +                    float *fshift_buf, bool bCalcFshift,
 +                    int tidxi, int tidxj, int ai)
 +{
 +    if ((c_clSize & (c_clSize - 1)))
 +    {
 +        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 +    }
 +    else
 +    {
 +        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 +    }
 +}
 +
 +/*! Final i-force reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 +                              float *fshift_buf, bool bCalcFshift,
 +                              int tidxj, int aidx,
 +                              const unsigned int activemask)
 +{
 +    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
 +    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
 +    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
 +
 +    if (tidxj & 1)
 +    {
 +        fin.x = fin.y;
 +    }
 +
 +    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
 +    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 +
 +    if (tidxj & 2)
 +    {
 +        fin.x = fin.z;
 +    }
 +
 +    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 +    if ((tidxj & 3) < 3)
 +    {
 +        atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
 +
 +        if (bCalcFshift)
 +        {
 +            *fshift_buf += fin.x;
 +        }
 +    }
 +}
 +
 +/*! Energy reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_energy_pow2(volatile float *buf,
 +                        float *e_lj, float *e_el,
 +                        unsigned int tidx)
 +{
 +    float        e1, e2;
 +
 +    unsigned int i = warp_size/2;
 +
 +    /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 +#pragma unroll 10
 +    for (int j = warp_size_log2 - 1; j > 0; j--)
 +    {
 +        if (tidx < i)
 +        {
 +            buf[               tidx] += buf[               tidx + i];
 +            buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
 +        }
 +        i >>= 1;
 +    }
 +
 +    // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 +    // memory locations to make sense
 +    /* last reduction step, writing to global mem */
 +    if (tidx == 0)
 +    {
 +        e1 = buf[               tidx] + buf[               tidx + i];
 +        e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
 +
 +        atomicAdd(e_lj, e1);
 +        atomicAdd(e_el, e2);
 +    }
 +}
 +
 +/*! Energy reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +static __forceinline__ __device__
 +void reduce_energy_warp_shfl(float E_lj, float E_el,
 +                             float *e_lj, float *e_el,
 +                             int tidx,
 +                             const unsigned int activemask)
 +{
 +    int i, sh;
 +
 +    sh = 1;
 +#pragma unroll 5
 +    for (i = 0; i < 5; i++)
 +    {
 +        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
 +        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
 +        sh   += sh;
 +    }
 +
 +    /* The first thread in the warp writes the reduced energies */
 +    if (tidx == 0 || tidx == warp_size)
 +    {
 +        atomicAdd(e_lj, E_lj);
 +        atomicAdd(e_el, E_el);
 +    }
 +}
 +
 +#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */
index 74fcc398ac8080973c369dd0727da5c987ff5fbb,0000000000000000000000000000000000000000..c63c10d9571919fe14c3f15d12c9990d4eb6f1a3
mode 100644,000000..100644
--- /dev/null
@@@ -1,913 -1,0 +1,913 @@@
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +#define GMX_DOUBLE 0
 +
 +#include "gromacs/gpu_utils/device_utils.clh"
 +#include "gromacs/gpu_utils/vectype_ops.clh"
 +#include "gromacs/nbnxm/constants.h"
 +#include "gromacs/pbcutil/ishift.h"
 +
 +#include "nbnxm_ocl_consts.h"
 +
 +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
 +#define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
 +
 +#define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
 +
 +#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
 +/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
 + * Note that this should precede the kernel_utils include.
 + */
 +#define USE_CJ_PREFETCH 1
 +#else
 +#define USE_CJ_PREFETCH 0
 +#endif
 +
 +#if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
 +#define HAVE_SUBGROUP 1
 +#else
 +#define HAVE_SUBGROUP 0
 +#endif
 +
 +#ifdef cl_intel_subgroups
 +#define HAVE_INTEL_SUBGROUP 1
 +#else
 +#define HAVE_INTEL_SUBGROUP 0
 +#endif
 +
 +#if defined _INTEL_SOURCE_
 +#define SUBGROUP_SIZE 8
 +#elif defined _AMD_SOURCE_
 +#define SUBGROUP_SIZE 64
 +#else
 +#define SUBGROUP_SIZE 32
 +#endif
 +
 +#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
 +#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
 +#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
 +
 +/* 1.0 / sqrt(M_PI) */
 +#define M_FLOAT_1_SQRTPI 0.564189583547756f
 +
 +//-------------------
 +
 +#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
 +#define NBNXN_OPENCL_KERNEL_UTILS_CLH
 +
 +#if CL_SIZE == 8
 +#define WARP_SIZE_LOG2  (5)
 +#define CL_SIZE_LOG2    (3)
 +#elif CL_SIZE == 4
 +#define WARP_SIZE_LOG2  (3)
 +#define CL_SIZE_LOG2    (2)
 +#else
 +#error unsupported CL_SIZE
 +#endif
 +
 +#define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
 +#define FBUF_STRIDE     (CL_SIZE_SQ)
 +
 +#define ONE_SIXTH_F     0.16666667f
 +#define ONE_TWELVETH_F  0.08333333f
 +
 +
 +#ifdef __GNUC__
 +/* GCC, clang, and some ICC pretending to be GCC */
 +#  define gmx_unused __attribute__ ((unused))
 +#else
 +#  define gmx_unused
 +#endif
 +
 +// Data structures shared between OpenCL device code and OpenCL host code
 +// TODO: review, improve
 +// Replaced real by float for now, to avoid including any other header
 +typedef struct {
 +    float c2;
 +    float c3;
 +    float cpot;
 +} shift_consts_t;
 +
 +/* Used with potential switching:
 + * rsw        = max(r - r_switch, 0)
 + * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 + * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 + * force      = force*dsw - potential*sw
 + * potential *= sw
 + */
 +typedef struct {
 +    float c3;
 +    float c4;
 +    float c5;
 +} switch_consts_t;
 +
 +// Data structure shared between the OpenCL device code and OpenCL host code
 +// Must not contain OpenCL objects (buffers)
 +typedef struct cl_nbparam_params
 +{
 +
 +    int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 +    int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 +
 +    float           epsfac;           /**< charge multiplication factor                      */
 +    float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 +    float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 +    float           ewald_beta;       /**< Ewald/PME parameter                               */
 +    float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 +    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 +    float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 +
 +    float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 +
 +    float           rvdw_sq;          /**< VdW cut-off squared                               */
 +    float           rvdw_switch;      /**< VdW switched cut-off                              */
 +    float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 +    float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 +
 +    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 +    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 +    switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 +
 +    /* Ewald Coulomb force table data - accessed through texture memory */
 +    float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 +}cl_nbparam_params_t;
 +
 +typedef struct {
 +    int sci;            /* i-super-cluster       */
 +    int shift;          /* Shift vector index plus possible flags */
 +    int cj4_ind_start;  /* Start index into cj4  */
 +    int cj4_ind_end;    /* End index into cj4    */
 +} nbnxn_sci_t;
 +
 +typedef struct {
 +    unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 +    int          excl_ind; /* Index into the exclusion array for 1 warp   */
 +} nbnxn_im_ei_t;
 +
 +typedef struct {
 +    int           cj[4];   /* The 4 j-clusters                            */
 +    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 +} nbnxn_cj4_t;
 +
 +
 +typedef struct {
 +    unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
 +                                           * each unsigned has bitS for 4*8 i clusters
 +                                           */
 +} nbnxn_excl_t;
 +
 +/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 +__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 +
 +gmx_opencl_inline
 +void preloadCj4Generic(__local int        *sm_cjPreload,
 +                       const __global int *gm_cj,
 +                       int                 tidxi,
 +                       int                 tidxj,
 +                       bool gmx_unused     iMaskCond)
 +{
 +    /* Pre-load cj into shared memory */
 +#if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 +    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 +    {
 +        sm_cjPreload[tidxi] = gm_cj[tidxi];
 +    }
 +#else
 +    const int c_clSize                   = CL_SIZE;
 +    const int c_nbnxnGpuClusterpairSplit = 2;
 +    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 +    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
 +    {
 +        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
 +    }
 +#endif
 +}
 +
 +
 +#if USE_SUBGROUP_PRELOAD
 +gmx_opencl_inline
 +int  preloadCj4Subgroup(const __global int *gm_cj)
 +{
 +    //loads subgroup-size # of elements (8) instead of the 4 required
 +    //equivalent to *cjs = *gm_cj
 +    return intel_sub_group_block_read((const __global uint *)gm_cj);
 +}
 +#endif //USE_SUBGROUP_PRELOAD
 +
 +#if USE_SUBGROUP_PRELOAD
 +typedef size_t CjType;
 +#else
 +typedef __local int* CjType;
 +#endif
 +
 +/*! \brief Preload cj4
 + *
 + * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 + * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 + * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 + *
 + * It is the caller's responsibility to make sure that data is consumed only when
 + * it's ready. This function does not call a barrier.
 + */
 +gmx_opencl_inline
 +void preloadCj4(CjType gmx_unused             *cjs,
 +                const __global int gmx_unused *gm_cj,
 +                int gmx_unused                 tidxi,
 +                int gmx_unused                 tidxj,
 +                bool gmx_unused                iMaskCond)
 +{
 +#if USE_SUBGROUP_PRELOAD
 +    *cjs = preloadCj4Subgroup(gm_cj);
 +#elif USE_CJ_PREFETCH
 +    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
 +#else
 +    //nothing to do
 +#endif
 +}
 +
 +gmx_opencl_inline
 +int loadCjPreload(__local int           *        sm_cjPreload,
 +                  int                            jm,
 +                  int gmx_unused                 tidxi,
 +                  int gmx_unused                 tidxj)
 +{
 +#if defined _AMD_SOURCE_
 +    int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 +#else
 +    const int c_clSize                   = CL_SIZE;
 +    const int c_nbnxnGpuClusterpairSplit = 2;
 +    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 +    int       warpLoadOffset             = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
 +#endif
 +    return sm_cjPreload[jm + warpLoadOffset];
 +}
 +
 +/* \brief Load a cj given a jm index.
 + *
 + * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 + */
 +gmx_opencl_inline
 +int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
 +           int jm, int gmx_unused tidxi, int gmx_unused tidxj)
 +{
 +#if USE_SUBGROUP_PRELOAD
 +    return sub_group_broadcast(cjs, jm);
 +#elif USE_CJ_PREFETCH
 +    return loadCjPreload(cjs, jm, tidxi, tidxj);
 +#else
 +    return gm_cj[jm];
 +#endif
 +}
 +
 +/*! Convert LJ sigma,epsilon parameters to C6,C12. */
 +gmx_opencl_inline
 +void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 +                                     const float  epsilon,
 +                                     float       *c6,
 +                                     float       *c12)
 +{
 +    float sigma2, sigma6;
 +
 +    sigma2 = sigma * sigma;
 +    sigma6 = sigma2 *sigma2 * sigma2;
 +    *c6    = epsilon * sigma6;
 +    *c12   = *c6 * sigma6;
 +}
 +
 +
 +/*! Apply force switch,  force + energy version. */
 +gmx_opencl_inline
 +void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
 +                              float                      c6,
 +                              float                      c12,
 +                              float                      inv_r,
 +                              float                      r2,
 +                              float                     *F_invr)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam->dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam->dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam->repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam->repulsion_shift.c3;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam->rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
-         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +}
 +
 +/*! Apply force switch, force-only version. */
 +gmx_opencl_inline
 +void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
 +                                float                      c6,
 +                                float                      c12,
 +                                float                      inv_r,
 +                                float                      r2,
 +                                float                     *F_invr,
 +                                float                     *E_lj)
 +{
 +    float r, r_switch;
 +
 +    /* force switch constants */
 +    float disp_shift_V2 = nbparam->dispersion_shift.c2;
 +    float disp_shift_V3 = nbparam->dispersion_shift.c3;
 +    float repu_shift_V2 = nbparam->repulsion_shift.c2;
 +    float repu_shift_V3 = nbparam->repulsion_shift.c3;
 +
 +    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 +    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 +    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 +    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 +
 +    r         = r2 * inv_r;
 +    r_switch  = r - nbparam->rvdw_switch;
 +    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    *F_invr  +=
 +        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
++        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 +    *E_lj    +=
 +        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 +        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 +}
 +
 +/*! Apply potential switch, force-only version. */
 +gmx_opencl_inline
 +void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
 +                                  float                      inv_r,
 +                                  float                      r2,
 +                                  float                     *F_invr,
 +                                  const float               *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam->vdw_switch.c3;
 +    float switch_V4 = nbparam->vdw_switch.c4;
 +    float switch_V5 = nbparam->vdw_switch.c5;
 +    float switch_F2 = nbparam->vdw_switch.c3;
 +    float switch_F3 = nbparam->vdw_switch.c4;
 +    float switch_F4 = nbparam->vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam->rvdw_switch;
 +
 +    /* Unlike in the F+E kernel, conditional is faster here */
 +    if (r_switch > 0.0f)
 +    {
 +        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    }
 +}
 +
 +/*! Apply potential switch, force + energy version. */
 +gmx_opencl_inline
 +void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
 +                                    float                      inv_r,
 +                                    float                      r2,
 +                                    float                     *F_invr,
 +                                    float                     *E_lj)
 +{
 +    float r, r_switch;
 +    float sw, dsw;
 +
 +    /* potential switch constants */
 +    float switch_V3 = nbparam->vdw_switch.c3;
 +    float switch_V4 = nbparam->vdw_switch.c4;
 +    float switch_V5 = nbparam->vdw_switch.c5;
 +    float switch_F2 = nbparam->vdw_switch.c3;
 +    float switch_F3 = nbparam->vdw_switch.c4;
 +    float switch_F4 = nbparam->vdw_switch.c5;
 +
 +    r        = r2 * inv_r;
 +    r_switch = r - nbparam->rvdw_switch;
 +    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 +
 +    /* Unlike in the F-only kernel, masking is faster here */
 +    sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 +    dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 +
 +    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 +    *E_lj   *= sw;
 +}
 +
 +/*! Calculate LJ-PME grid force contribution with
 + *  geometric combination rule.
 + */
 +gmx_opencl_inline
 +void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
 +                                    int                     typei,
 +                                    int                     typej,
 +                                    float                   r2,
 +                                    float                   inv_r2,
 +                                    float                   lje_coeff2,
 +                                    float                   lje_coeff6_6,
 +                                    float                  *F_invr)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +
 +    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = exp(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +}
 +
 +/*! Calculate LJ-PME grid force + energy contribution with
 + *  geometric combination rule.
 + */
 +gmx_opencl_inline
 +void calculate_lj_ewald_comb_geom_F_E(__constant const float    *nbfp_comb_climg2d,
 +                                      const cl_nbparam_params_t *nbparam,
 +                                      int                        typei,
 +                                      int                        typej,
 +                                      float                      r2,
 +                                      float                      inv_r2,
 +                                      float                      lje_coeff2,
 +                                      float                      lje_coeff6_6,
 +                                      float                      int_bit,
 +                                      float                     *F_invr,
 +                                      float                     *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 +
 +    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = exp(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    /* Shift should be applied only to real LJ pairs */
 +    sh_mask   = nbparam->sh_lj_ewald*int_bit;
 +    *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +}
 +
 +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 + *  Lorentz-Berthelot combination rule.
 + *  We use a single F+E kernel with conditional because the performance impact
 + *  of this is pretty small and LB on the CPU is anyway very slow.
 + */
 +gmx_opencl_inline
 +void calculate_lj_ewald_comb_LB_F_E(__constant const float    *nbfp_comb_climg2d,
 +                                    const cl_nbparam_params_t *nbparam,
 +                                    int                        typei,
 +                                    int                        typej,
 +                                    float                      r2,
 +                                    float                      inv_r2,
 +                                    float                      lje_coeff2,
 +                                    float                      lje_coeff6_6,
 +                                    float                      int_bit,
 +                                    bool                       with_E_lj,
 +                                    float                     *F_invr,
 +                                    float                     *E_lj)
 +{
 +    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 +    float sigma, sigma2, epsilon;
 +
 +    /* sigma and epsilon are scaled to give 6*C6 */
 +    sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 +
 +    epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 +
 +    sigma2  = sigma*sigma;
 +    c6grid  = epsilon*sigma2*sigma2*sigma2;
 +
 +    /* Recalculate inv_r6 without exclusion mask */
 +    inv_r6_nm = inv_r2*inv_r2*inv_r2;
 +    cr2       = lje_coeff2*r2;
 +    expmcr2   = exp(-cr2);
 +    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 +
 +    /* Subtract the grid force from the total LJ force */
 +    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 +
 +    if (with_E_lj)
 +    {
 +        float sh_mask;
 +
 +        /* Shift should be applied only to real LJ pairs */
 +        sh_mask   = nbparam->sh_lj_ewald*int_bit;
 +        *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 +    }
 +}
 +
 +/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 + *  Original idea: from the OpenMM project
 + */
 +gmx_opencl_inline float
 +interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
 +                            float                   r,
 +                            float                   scale)
 +{
 +    float   normalized = scale * r;
 +    int     index      = (int) normalized;
 +    float   fract2     = normalized - index;
 +    float   fract1     = 1.0f - fract2;
 +
 +    return fract1*coulomb_tab_climg2d[index] +
 +           fract2*coulomb_tab_climg2d[index + 1];
 +}
 +
 +/*! Calculate analytical Ewald correction term. */
 +gmx_opencl_inline
 +float pmecorrF(float z2)
 +{
 +    const float FN6 = -1.7357322914161492954e-8f;
 +    const float FN5 = 1.4703624142580877519e-6f;
 +    const float FN4 = -0.000053401640219807709149f;
 +    const float FN3 = 0.0010054721316683106153f;
 +    const float FN2 = -0.019278317264888380590f;
 +    const float FN1 = 0.069670166153766424023f;
 +    const float FN0 = -0.75225204789749321333f;
 +
 +    const float FD4 = 0.0011193462567257629232f;
 +    const float FD3 = 0.014866955030185295499f;
 +    const float FD2 = 0.11583842382862377919f;
 +    const float FD1 = 0.50736591960530292870f;
 +    const float FD0 = 1.0f;
 +
 +    float       z4;
 +    float       polyFN0, polyFN1, polyFD0, polyFD1;
 +
 +    z4          = z2*z2;
 +
 +    polyFD0     = FD4*z4 + FD2;
 +    polyFD1     = FD3*z4 + FD1;
 +    polyFD0     = polyFD0*z4 + FD0;
 +    polyFD0     = polyFD1*z2 + polyFD0;
 +
 +    polyFD0     = 1.0f/polyFD0;
 +
 +    polyFN0     = FN6*z4 + FN4;
 +    polyFN1     = FN5*z4 + FN3;
 +    polyFN0     = polyFN0*z4 + FN2;
 +    polyFN1     = polyFN1*z4 + FN1;
 +    polyFN0     = polyFN0*z4 + FN0;
 +    polyFN0     = polyFN1*z2 + polyFN0;
 +
 +    return polyFN0*polyFD0;
 +}
 +
 +#if REDUCE_SHUFFLE
 +gmx_opencl_inline
 +void reduce_force_j_shfl(float3 fin, __global float *fout,
 +                         int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
 +{
 +    /* Only does reduction over 4 elements in cluster. Needs to be changed
 +     * for CL_SIZE>4. See CUDA code for required code */
 +    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
 +    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
 +    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
 +    if ((tidxi & 1) == 1)
 +    {
 +        fin.x = fin.y;
 +    }
 +    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
 +    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
 +    if (tidxi == 2)
 +    {
 +        fin.x = fin.z;
 +    }
 +    if (tidxi < 3)
 +    {
 +        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
 +    }
 +}
 +#endif
 +
 +gmx_opencl_inline
 +void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
 +                            int tidxi, int tidxj, int aidx)
 +{
 +    int tidx = tidxi + tidxj*CL_SIZE;
 +    f_buf[                  tidx] = fcj_buf.x;
 +    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
 +    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
 +
 +    /* Split the reduction between the first 3 column threads
 +       Threads with column id 0 will do the reduction for (float3).x components
 +       Threads with column id 1 will do the reduction for (float3).y components
 +       Threads with column id 2 will do the reduction for (float3).z components.
 +       The reduction is performed for each line tidxj of f_buf. */
 +    if (tidxi < 3)
 +    {
 +        float f = 0.0f;
 +        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 +        {
 +            f += f_buf[FBUF_STRIDE * tidxi + j];
 +        }
 +
 +        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 +    }
 +}
 +
 +/*! Final j-force reduction
 + */
 +gmx_opencl_inline
 +void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
 +                    int tidxi, int tidxj, int aidx)
 +{
 +#if REDUCE_SHUFFLE
 +    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
 +#else
 +    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
 +#endif
 +}
 +
 +#if REDUCE_SHUFFLE
 +gmx_opencl_inline
 +void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
 +                                   bool bCalcFshift, int tidxi, int tidxj,
 +                                   int sci, int shift, __global float *fshift)
 +{
 +    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
 +     * for CL_SIZE>4.*/
 +    float2 fshift_buf = 0;
 +    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 +    {
 +        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 +        float3 fin  = fci_buf[ci_offset];
 +        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
 +        fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
 +        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
 +
 +        if (tidxj & 1)
 +        {
 +            fin.x = fin.y;
 +        }
 +        /* Threads 0,1 and 2,3 increment x,y for their warp */
 +        atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
 +        if (bCalcFshift)
 +        {
 +            fshift_buf[0] += fin.x;
 +        }
 +        /* Threads 0 and 2 increment z for their warp */
 +        if ((tidxj & 1) == 0)
 +        {
 +            atomicAdd_g_f(&fout[3*aidx+2], fin.z);
 +            if (bCalcFshift)
 +            {
 +                fshift_buf[1] += fin.z;
 +            }
 +        }
 +    }
 +    /* add up local shift forces into global mem */
 +    if (bCalcFshift)
 +    {
 +        //Threads 0,1 and 2,3 update x,y
 +        atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
 +        //Threads 0 and 2 update z
 +        if ((tidxj & 1) == 0)
 +        {
 +            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
 +        }
 +    }
 +}
 +#endif
 +
 +/*! Final i-force reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +gmx_opencl_inline
 +void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
 +                                   __global float *fout,
 +                                   bool bCalcFshift,
 +                                   int tidxi, int tidxj,
 +                                   int sci, int shift, __global float *fshift)
 +{
 +    float fshift_buf = 0;
 +    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 +    {
 +        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 +        int tidx = tidxi + tidxj*CL_SIZE;
 +        /* store i forces in shmem */
 +        f_buf[                  tidx] = fci_buf[ci_offset].x;
 +        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
 +        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
 +        barrier(CLK_LOCAL_MEM_FENCE);
 +
 +        int     i, j;
 +        /* Reduce the initial CL_SIZE values for each i atom to half
 +         * every step by using CL_SIZE * i threads.
 +         * Can't just use i as loop variable because than nvcc refuses to unroll.
 +         */
 +        i = CL_SIZE/2;
 +        for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 +        {
 +            if (tidxj < i)
 +            {
 +
 +                f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 +                f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 +                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 +            }
 +            i >>= 1;
 +        }
 +        /* needed because
 +         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 +         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 +         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
 +         */
 +        barrier(CLK_LOCAL_MEM_FENCE);
 +
 +        /* i == 1, last reduction step, writing to global mem */
 +        /* Split the reduction between the first 3 line threads
 +           Threads with line id 0 will do the reduction for (float3).x components
 +           Threads with line id 1 will do the reduction for (float3).y components
 +           Threads with line id 2 will do the reduction for (float3).z components. */
 +        if (tidxj < 3)
 +        {
 +            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 +
 +            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 +
 +            if (bCalcFshift)
 +            {
 +                fshift_buf += f;
 +            }
 +        }
 +    }
 +    /* add up local shift forces into global mem */
 +    if (bCalcFshift)
 +    {
 +        /* Only threads with tidxj < 3 will update fshift.
 +           The threads performing the update, must be the same as the threads
 +           storing the reduction result above.
 +         */
 +        if (tidxj < 3)
 +        {
 +            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
 +        }
 +    }
 +}
 +
 +/*! Final i-force reduction
 + */
 +gmx_opencl_inline
 +void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
 +                              bool bCalcFshift, int tidxi, int tidxj, int sci,
 +                              int shift, __global float *fshift)
 +{
 +#if REDUCE_SHUFFLE
 +    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
 +                                  sci, shift, fshift);
 +#else
 +    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
 +                                  sci, shift, fshift);
 +#endif
 +}
 +
 +
 +
 +#if REDUCE_SHUFFLE
 +gmx_opencl_inline
 +void reduce_energy_shfl(float E_lj, float E_el,
 +                        volatile __global float *e_lj,
 +                        volatile __global float *e_el,
 +                        unsigned int    tidx)
 +{
 +    E_lj = sub_group_reduce_add(E_lj);
 +    E_el = sub_group_reduce_add(E_el);
 +    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
 +     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
 +     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
 +     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
 +     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
 +    if (tidx == 0 || tidx == WARP_SIZE)
 +    {
 +        atomicAdd_g_f(e_lj, E_lj);
 +        atomicAdd_g_f(e_el, E_el);
 +    }
 +}
 +#endif
 +
 +/*! Energy reduction; this implementation works only with power of two
 + *  array sizes.
 + */
 +gmx_opencl_inline
 +void reduce_energy_pow2(volatile __local float  *buf,
 +                        volatile __global float *e_lj,
 +                        volatile __global float *e_el,
 +                        unsigned int             tidx)
 +{
 +    int          j;
 +
 +    unsigned int i = WARP_SIZE/2;
 +
 +    /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 +    for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 +    {
 +        if (tidx < i)
 +        {
 +            buf[              tidx] += buf[              tidx + i];
 +            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 +        }
 +        i >>= 1;
 +    }
 +
 +    /* last reduction step, writing to global mem */
 +    if (tidx == 0)
 +    {
 +        float e1 = buf[              tidx] + buf[              tidx + i];
 +        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 +
 +        atomicAdd_g_f(e_lj, e1);
 +        atomicAdd_g_f(e_el, e2);
 +    }
 +}
 +
 +gmx_opencl_inline
 +void reduce_energy(volatile __local float  gmx_unused *buf,
 +                   float E_lj, float E_el,
 +                   volatile __global float *e_lj,
 +                   volatile __global float *e_el,
 +                   unsigned int             tidx)
 +{
 +#if REDUCE_SHUFFLE
 +    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
 +#else
 +    /* flush the energies to shmem and reduce them */
 +    buf[              tidx] = E_lj;
 +    buf[FBUF_STRIDE + tidx] = E_el;
 +    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
 +#endif
 +}
 +
 +gmx_opencl_inline
 +bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
 +{
 +    if (pred)
 +    {
 +        warp_any[widx] = 1;
 +    }
 +
 +    bool ret = warp_any[widx];
 +
 +    warp_any[widx] = 0;
 +
 +    return ret;
 +}
 +
 +//! Returns a true if predicate is true for any work item in warp
 +gmx_opencl_inline
 +bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
 +{
 +#if USE_SUBGROUP_ANY
 +    return sub_group_any(pred);
 +#else
 +    return gmx_sub_group_any_localmem(warp_any, widx, pred);
 +#endif
 +}
 +
 +#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */