Merge branch release-2019 into master

author Paul Bauer <paul.bauer.q@gmail.com>

Mon, 23 Sep 2019 07:44:00 +0000 (09:44 +0200)

committer Paul Bauer <paul.bauer.q@gmail.com>

Tue, 24 Sep 2019 14:59:39 +0000 (16:59 +0200)
author Paul Bauer <paul.bauer.q@gmail.com>
Mon, 23 Sep 2019 07:44:00 +0000 (09:44 +0200)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 24 Sep 2019 14:59:39 +0000 (16:59 +0200)
diff --cc cmake/gmxManageMPI.cmake

index 78600ba7e34d96f8f3243364c0eacfe2e5e0c72c,cc58daeafb4e5029e0926cb12f838aa8c8cd8194..0a614a430cae1577554e43a1abf2ccbdccdb329e
--- 1/cmake/gmxManageMPI.cmake
--- 2/cmake/gmxManageMPI.cmake
+++ b/cmake/gmxManageMPI.cmake
@@@ -50,10 -50,22 +50,10 @@@ if(GMX_MPI
     if(NOT MPI_FOUND)
         find_package(MPI)
         if(MPI_C_FOUND)
-         set(MPI_COMPILE_FLAGS ${MPI_C_COMPILE_FLAGS})
-         set(MPI_LINKER_FLAGS ${MPI_C_LINK_FLAGS})
-         include_directories(SYSTEM ${MPI_C_INCLUDE_PATH})
-         list(APPEND GMX_COMMON_LIBRARIES ${MPI_C_LIBRARIES})
- -          # workaround for rhel bug #1749463, cmake bug #18349, fixed in cmake-3.12.3
- -          if(CMAKE_VERSION VERSION_LESS "3.12.3")
- -              set(MPI_COMPILE_FLAGS)
- -              foreach(_MPI_FLAG ${MPI_C_COMPILE_FLAGS})
- -                  set(MPI_COMPILE_FLAGS "${MPI_COMPILE_FLAGS} ${_MPI_FLAG}")
- -              endforeach()
- -              set(MPI_LINKER_FLAGS)
- -              foreach(_MPI_FLAG ${MPI_C_LINK_FLAGS})
- -                  set(MPI_LINKER_FLAGS "${MPI_LINKER_FLAGS} ${_MPI_FLAG}")
- -              endforeach()
- -          else()
- -              set(MPI_COMPILE_FLAGS ${MPI_C_COMPILE_FLAGS})
- -              set(MPI_LINKER_FLAGS ${MPI_C_LINK_FLAGS})
- -          endif()
++          set(MPI_COMPILE_FLAGS ${MPI_C_COMPILE_FLAGS})
++          set(MPI_LINKER_FLAGS ${MPI_C_LINK_FLAGS})
+           include_directories(SYSTEM ${MPI_C_INCLUDE_PATH})
+           list(APPEND GMX_COMMON_LIBRARIES ${MPI_C_LIBRARIES})
         endif()
         set(MPI_FOUND ${MPI_C_FOUND})
     else()
diff --cc src/gromacs/gmxpreprocess/topshake.cpp

index d640f6ca22176bfc4ef9f7239b86b0dc7d5ec6a8,d8ee49e80d77e68c0ad13dbc89158db13fb952e0..37c25fabd309368d6b19b884d18c7a5597dbfb3c
--- 1/src/gromacs/gmxpreprocess/topshake.cpp
--- 2/src/gromacs/gmxpreprocess/topshake.cpp
+++ b/src/gromacs/gmxpreprocess/topshake.cpp
@@@ -104,13 -137,13 +104,13 @@@ void make_shake(gmx::ArrayRef<Interacti
               /* Add all the angles with hydrogens to the shake list
                * and remove them from the bond list
                */
- -            for (ftype = 0; (ftype < F_NRE); ftype++)
+ +            for (int ftype = 0; (ftype < F_NRE); ftype++)
               {
-                 if (interaction_function[ftype].flags & IF_BTYPE)
+                 if (interaction_function[ftype].flags & IF_CHEMBOND)
                   {
- -                    bonds = &(plist[ftype]);
+ +                    InteractionsOfType *bonds = &(plist[ftype]);
   
- -                    for (ftype_a = 0; (bonds->nr > 0 && ftype_a < F_NRE); ftype_a++)
+ +                    for (int ftype_a = 0; (gmx::ssize(*bonds) > 0 && ftype_a < F_NRE); ftype_a++)
                       {
                           if (interaction_function[ftype_a].flags & IF_ATYPE)
                           {
@@@ -156,18 -193,24 +156,24 @@@
                                       }
                                       if (bFound)
                                       {
+ +                                        real              param = std::sqrt( b_ij*b_ij + b_jk*b_jk -
+ +                                                                             2.0*b_ij*b_jk*cos(DEG2RAD*ang->c0()));
+ +                                        std::vector<real> forceParm = {param, param};
+                                         if (ftype == F_CONNBONDS ||
+                                             ftype_a == F_CONNBONDS)
+                                         {
+                                             gmx_fatal(FARGS, "Can not constrain all angles when they involved bonds of type %s",
+                                                       interaction_function[F_CONNBONDS].longname);
+                                         }
                                           /* apply law of cosines */
- -                                        p.c0() = std::sqrt( b_ij*b_ij + b_jk*b_jk -
- -                                                            2.0*b_ij*b_jk*cos(DEG2RAD*ang->c0()) );
- -                                        p.c1() = p.c0();
   #ifdef DEBUG
- -                                        printf("p: %d, q: %d, dist: %12.5e\n", p.ai(), p.aj(), p.c0());
+ +                                        printf("p: %d, q: %d, dist: %12.5e\n", atomNumbers[0],
+ +                                               atomNumbers[1], forceParm[0]);
   #endif
- -                                        add_param_to_list (&(plist[F_CONSTR]), &p);
+ +                                        add_param_to_list (&(plist[F_CONSTR]), InteractionOfType(atomNumbers, forceParm));
                                           /* move the last bond to this position */
- -                                        copy_bond (pr, i, pr->nr-1);
- -                                        /* should free memory here!! */
- -                                        pr->nr--;
+ +                                        *parm = *(pr->interactionTypes.end() - 1);
+ +                                        pr->interactionTypes.erase(pr->interactionTypes.end() - 1);
                                       }
                                   }
                                   else
diff --cc src/gromacs/listed_forces/bonded.cpp

index 902453320fe0ae8e50c8342202cc65077adc6021,0000000000000000000000000000000000000000..fcb85f8854a3eb1c6d38770d41dd92ed90f8e24b

mode 100644,000000..100644
--- 1/src/gromacs/listed_forces/bonded.cpp
--- /dev/null
+++ b/src/gromacs/listed_forces/bonded.cpp
@@@ -1,3912 -1,0 +1,3912 @@@
-             rvec_inc(fshift[t21], f2_i);
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team.
+ + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +/*! \internal \file
+ + *
+ + * \brief This file defines low-level functions necessary for
+ + * computing energies and forces for listed interactions.
+ + *
+ + * \author Mark Abraham <mark.j.abraham@gmail.com>
+ + *
+ + * \ingroup module_listed_forces
+ + */
+ +#include "gmxpre.h"
+ +
+ +#include "bonded.h"
+ +
+ +#include "config.h"
+ +
+ +#include <cassert>
+ +#include <cmath>
+ +
+ +#include <algorithm>
+ +
+ +#include "gromacs/gmxlib/nrnb.h"
+ +#include "gromacs/listed_forces/disre.h"
+ +#include "gromacs/listed_forces/orires.h"
+ +#include "gromacs/math/functions.h"
+ +#include "gromacs/math/units.h"
+ +#include "gromacs/math/utilities.h"
+ +#include "gromacs/math/vec.h"
+ +#include "gromacs/mdtypes/fcdata.h"
+ +#include "gromacs/mdtypes/mdatom.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/pbcutil/mshift.h"
+ +#include "gromacs/pbcutil/pbc.h"
+ +#include "gromacs/pbcutil/pbc_simd.h"
+ +#include "gromacs/simd/simd.h"
+ +#include "gromacs/simd/simd_math.h"
+ +#include "gromacs/simd/vector_operations.h"
+ +#include "gromacs/utility/basedefinitions.h"
+ +#include "gromacs/utility/enumerationhelpers.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/real.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
+ +#include "listed_internal.h"
+ +#include "restcbt.h"
+ +
+ +using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
+ +
+ +namespace
+ +{
+ +
+ +//! Type of CPU function to compute a bonded interaction.
+ +using BondedFunction = real(*)(int nbonds, const t_iatom iatoms[],
+ +                               const t_iparams iparams[],
+ +                               const rvec x[], rvec4 f[], rvec fshift[],
+ +                               const t_pbc *pbc, const t_graph *g,
+ +                               real lambda, real *dvdlambda,
+ +                               const t_mdatoms *md, t_fcdata *fcd,
+ +                               int *ddgatindex);
+ +
+ +/*! \brief Mysterious CMAP coefficient matrix */
+ +const int cmap_coeff_matrix[] = {
+ +    1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  3,  0, -9,  6, -2,  0,  6, -4,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  9, -6,  0,  0, -6,  4,
+ +    0, 0,  3, -2, 0, 0,  0,  0,  0,  0, -9,  6,  0,  0,  6, -4,
+ +    0, 0,  0,  0, 1, 0, -3,  2, -2,  0,  6, -4,  1,  0, -3,  2,
+ +    0, 0,  0,  0, 0, 0,  0,  0, -1,  0,  3, -2,  1,  0, -3,  2,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  2,  0,  0,  3, -2,
+ +    0, 0,  0,  0, 0, 0,  3, -2,  0,  0, -6,  4,  0,  0,  3, -2,
+ +    0, 1, -2,  1, 0, 0,  0,  0,  0, -3,  6, -3,  0,  2, -4,  2,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  3, -6,  3,  0, -2,  4, -2,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  3,  0,  0,  2, -2,
+ +    0, 0, -1,  1, 0, 0,  0,  0,  0,  0,  3, -3,  0,  0, -2,  2,
+ +    0, 0,  0,  0, 0, 1, -2,  1,  0, -2,  4, -2,  0,  1, -2,  1,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0, -1,  2, -1,  0,  1, -2,  1,
+ +    0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  1, -1,  0,  0, -1,  1,
+ +    0, 0,  0,  0, 0, 0, -1,  1,  0,  0,  2, -2,  0,  0, -1,  1
+ +};
+ +
+ +
+ +/*! \brief Compute dx = xi - xj, modulo PBC if non-NULL
+ + *
+ + * \todo This kind of code appears in many places. Consolidate it */
+ +int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
+ +{
+ +    if (pbc)
+ +    {
+ +        return pbc_dx_aiuc(pbc, xi, xj, dx);
+ +    }
+ +    else
+ +    {
+ +        rvec_sub(xi, xj, dx);
+ +        return CENTRAL;
+ +    }
+ +}
+ +
+ +} // namespace
+ +
+ +//! \cond
+ +real bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ +                rvec r_ij, rvec r_kj, real *costh,
+ +                int *t1, int *t2)
+ +/* Return value is the angle between the bonds i-j and j-k */
+ +{
+ +    /* 41 FLOPS */
+ +    real th;
+ +
+ +    *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /*  3              */
+ +    *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /*  3              */
+ +
+ +    *costh = cos_angle(r_ij, r_kj);        /* 25              */
+ +    th     = std::acos(*costh);            /* 10              */
+ +    /* 41 TOTAL       */
+ +    return th;
+ +}
+ +
+ +real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
+ +               const t_pbc *pbc,
+ +               rvec r_ij, rvec r_kj, rvec r_kl, rvec m, rvec n,
+ +               int *t1, int *t2, int *t3)
+ +{
+ +    *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /*  3        */
+ +    *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /*  3              */
+ +    *t3 = pbc_rvec_sub(pbc, xk, xl, r_kl); /*  3              */
+ +
+ +    cprod(r_ij, r_kj, m);                  /*  9        */
+ +    cprod(r_kj, r_kl, n);                  /*  9              */
+ +    real phi  = gmx_angle(m, n);           /* 49 (assuming 25 for atan2) */
+ +    real ipr  = iprod(r_ij, n);            /*  5        */
+ +    real sign = (ipr < 0.0) ? -1.0 : 1.0;
+ +    phi       = sign*phi;                  /*  1              */
+ +    /* 82 TOTAL       */
+ +    return phi;
+ +}
+ +//! \endcond
+ +
+ +void make_dp_periodic(real *dp)  /* 1 flop? */
+ +{
+ +    /* dp cannot be outside (-pi,pi) */
+ +    if (*dp >= M_PI)
+ +    {
+ +        *dp -= 2*M_PI;
+ +    }
+ +    else if (*dp < -M_PI)
+ +    {
+ +        *dp += 2*M_PI;
+ +    }
+ +}
+ +
+ +namespace
+ +{
+ +
+ +/*! \brief Spreads and accumulates the bonded forces to the two atoms and adds the virial contribution when needed
+ + *
+ + * When \p g==nullptr, \p shiftIndex is used as the periodic shift.
+ + * When \p g!=nullptr, the graph is used to compute the periodic shift.
+ + */
+ +template <BondedKernelFlavor flavor>
+ +inline void spreadBondForces(const real     bondForce,
+ +                             const rvec     dx,
+ +                             const int      ai,
+ +                             const int      aj,
+ +                             rvec4         *f,
+ +                             int            shiftIndex,
+ +                             const t_graph *g,
+ +                             rvec          *fshift)
+ +{
+ +    if (computeVirial(flavor) && g)
+ +    {
+ +        ivec dt;
+ +        ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +        shiftIndex = IVEC2IS(dt);
+ +    }
+ +
+ +    for (int m = 0; m < DIM; m++)                    /*  15          */
+ +    {
+ +        const real fij             = bondForce*dx[m];
+ +        f[ai][m]                  += fij;
+ +        f[aj][m]                  -= fij;
+ +        if (computeVirial(flavor))
+ +        {
+ +            fshift[shiftIndex][m] += fij;
+ +            fshift[CENTRAL][m]    -= fij;
+ +        }
+ +    }
+ +}
+ +
+ +/*! \brief Morse potential bond
+ + *
+ + * By Frank Everdij. Three parameters needed:
+ + *
+ + * b0 = equilibrium distance in nm
+ + * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
+ + * cb = well depth in kJ/mol
+ + *
+ + * Note: the potential is referenced to be +cb at infinite separation
+ + *       and zero at the equilibrium distance!
+ + */
+ +template <BondedKernelFlavor flavor>
+ +real morse_bonds(int nbonds,
+ +                 const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                 const rvec x[], rvec4 f[], rvec fshift[],
+ +                 const t_pbc *pbc, const t_graph *g,
+ +                 real lambda, real *dvdlambda,
+ +                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                 int gmx_unused *global_atom_index)
+ +{
+ +    const real one = 1.0;
+ +    const real two = 2.0;
+ +    real       dr, dr2, temp, omtemp, cbomtemp, fbond, vbond, vtot;
+ +    real       b0, be, cb, b0A, beA, cbA, b0B, beB, cbB, L1;
+ +    rvec       dx;
+ +    int        i, ki, type, ai, aj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        b0A   = forceparams[type].morse.b0A;
+ +        beA   = forceparams[type].morse.betaA;
+ +        cbA   = forceparams[type].morse.cbA;
+ +
+ +        b0B   = forceparams[type].morse.b0B;
+ +        beB   = forceparams[type].morse.betaB;
+ +        cbB   = forceparams[type].morse.cbB;
+ +
+ +        L1 = one-lambda;                            /* 1 */
+ +        b0 = L1*b0A + lambda*b0B;                   /* 3 */
+ +        be = L1*beA + lambda*beB;                   /* 3 */
+ +        cb = L1*cbA + lambda*cbB;                   /* 3 */
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3          */
+ +        dr2  = iprod(dx, dx);                       /*   5          */
+ +        dr   = dr2*gmx::invsqrt(dr2);               /*  10          */
+ +        temp = std::exp(-be*(dr-b0));               /*  12          */
+ +
+ +        if (temp == one)
+ +        {
+ +            /* bonds are constrainted. This may _not_ include bond constraints if they are lambda dependent */
+ +            *dvdlambda += cbB-cbA;
+ +            continue;
+ +        }
+ +
+ +        omtemp    = one-temp;                                                                                        /*   1          */
+ +        cbomtemp  = cb*omtemp;                                                                                       /*   1          */
+ +        vbond     = cbomtemp*omtemp;                                                                                 /*   1          */
+ +        fbond     = -two*be*temp*cbomtemp*gmx::invsqrt(dr2);                                                         /*   9          */
+ +        vtot     += vbond;                                                                                           /*   1          */
+ +
+ +        *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift);                                               /* 15 */
+ +    }                                                                                                                /*  83 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +//! \cond
+ +template <BondedKernelFlavor flavor>
+ +real cubic_bonds(int nbonds,
+ +                 const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                 const rvec x[], rvec4 f[], rvec fshift[],
+ +                 const t_pbc *pbc, const t_graph *g,
+ +                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                 int gmx_unused *global_atom_index)
+ +{
+ +    const real three = 3.0;
+ +    const real two   = 2.0;
+ +    real       kb, b0, kcub;
+ +    real       dr, dr2, dist, kdist, kdist2, fbond, vbond, vtot;
+ +    rvec       dx;
+ +    int        i, ki, type, ai, aj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        b0   = forceparams[type].cubic.b0;
+ +        kb   = forceparams[type].cubic.kb;
+ +        kcub = forceparams[type].cubic.kcub;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx);     /*   3          */
+ +        dr2  = iprod(dx, dx);                           /*   5          */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        dr         = dr2*gmx::invsqrt(dr2);                  /*  10          */
+ +        dist       = dr-b0;
+ +        kdist      = kb*dist;
+ +        kdist2     = kdist*dist;
+ +
+ +        vbond      = kdist2 + kcub*kdist2*dist;
+ +        fbond      = -(two*kdist + three*kdist2*kcub)/dr;
+ +
+ +        vtot      += vbond;                                            /* 21 */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /*  54 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real FENE_bonds(int nbonds,
+ +                const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                const rvec x[], rvec4 f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                int *global_atom_index)
+ +{
+ +    const real half = 0.5;
+ +    const real one  = 1.0;
+ +    real       bm, kb;
+ +    real       dr2, bm2, omdr2obm2, fbond, vbond, vtot;
+ +    rvec       dx;
+ +    int        i, ki, type, ai, aj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        bm   = forceparams[type].fene.bm;
+ +        kb   = forceparams[type].fene.kb;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx);     /*   3          */
+ +        dr2  = iprod(dx, dx);                           /*   5          */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        bm2 = bm*bm;
+ +
+ +        if (dr2 >= bm2)
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d",
+ +                      dr2, bm2,
+ +                      glatnr(global_atom_index, ai),
+ +                      glatnr(global_atom_index, aj));
+ +        }
+ +
+ +        omdr2obm2  = one - dr2/bm2;
+ +
+ +        vbond      = -half*kb*bm2*std::log(omdr2obm2);
+ +        fbond      = -kb/omdr2obm2;
+ +
+ +        vtot      += vbond;                                            /* 35 */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /*  58 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +real harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ +              real *V, real *F)
+ +{
+ +    const real half = 0.5;
+ +    real       L1, kk, x0, dx, dx2;
+ +    real       v, f, dvdlambda;
+ +
+ +    L1    = 1.0-lambda;
+ +    kk    = L1*kA+lambda*kB;
+ +    x0    = L1*xA+lambda*xB;
+ +
+ +    dx    = x-x0;
+ +    dx2   = dx*dx;
+ +
+ +    f          = -kk*dx;
+ +    v          = half*kk*dx2;
+ +    dvdlambda  = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
+ +
+ +    *F    = f;
+ +    *V    = v;
+ +
+ +    return dvdlambda;
+ +
+ +    /* That was 19 flops */
+ +}
+ +
+ +
+ +template <BondedKernelFlavor flavor>
+ +real bonds(int nbonds,
+ +           const t_iatom forceatoms[], const t_iparams forceparams[],
+ +           const rvec x[], rvec4 f[], rvec fshift[],
+ +           const t_pbc *pbc, const t_graph *g,
+ +           real lambda, real *dvdlambda,
+ +           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +           int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ki, ai, aj, type;
+ +    real dr, dr2, fbond, vbond, vtot;
+ +    rvec dx;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +        dr   = std::sqrt(dr2);                      /*  10            */
+ +
+ +        *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ +                               forceparams[type].harmonic.krB,
+ +                               forceparams[type].harmonic.rA,
+ +                               forceparams[type].harmonic.rB,
+ +                               dr, lambda, &vbond, &fbond); /*  19  */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +
+ +        vtot  += vbond;                                                /* 1*/
+ +        fbond *= gmx::invsqrt(dr2);                                    /*   6         */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /* 59 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real restraint_bonds(int nbonds,
+ +                     const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                     const rvec x[], rvec4 f[], rvec fshift[],
+ +                     const t_pbc *pbc, const t_graph *g,
+ +                     real lambda, real *dvdlambda,
+ +                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                     int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ki, ai, aj, type;
+ +    real dr, dr2, fbond, vbond, vtot;
+ +    real L1;
+ +    real low, dlow, up1, dup1, up2, dup2, k, dk;
+ +    real drh, drh2;
+ +    rvec dx;
+ +
+ +    L1   = 1.0 - lambda;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +        dr   = dr2*gmx::invsqrt(dr2);               /*  10            */
+ +
+ +        low  = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB;
+ +        dlow =   -forceparams[type].restraint.lowA +        forceparams[type].restraint.lowB;
+ +        up1  = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B;
+ +        dup1 =   -forceparams[type].restraint.up1A +        forceparams[type].restraint.up1B;
+ +        up2  = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B;
+ +        dup2 =   -forceparams[type].restraint.up2A +        forceparams[type].restraint.up2B;
+ +        k    = L1*forceparams[type].restraint.kA   + lambda*forceparams[type].restraint.kB;
+ +        dk   =   -forceparams[type].restraint.kA   +        forceparams[type].restraint.kB;
+ +        /* 24 */
+ +
+ +        if (dr < low)
+ +        {
+ +            drh         = dr - low;
+ +            drh2        = drh*drh;
+ +            vbond       = 0.5*k*drh2;
+ +            fbond       = -k*drh;
+ +            *dvdlambda += 0.5*dk*drh2 - k*dlow*drh;
+ +        }   /* 11 */
+ +        else if (dr <= up1)
+ +        {
+ +            vbond = 0;
+ +            fbond = 0;
+ +        }
+ +        else if (dr <= up2)
+ +        {
+ +            drh         = dr - up1;
+ +            drh2        = drh*drh;
+ +            vbond       = 0.5*k*drh2;
+ +            fbond       = -k*drh;
+ +            *dvdlambda += 0.5*dk*drh2 - k*dup1*drh;
+ +        }   /* 11     */
+ +        else
+ +        {
+ +            drh         = dr - up2;
+ +            vbond       = k*(up2 - up1)*(0.5*(up2 - up1) + drh);
+ +            fbond       = -k*(up2 - up1);
+ +            *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh)
+ +                + k*(dup2 - dup1)*(up2 - up1 + drh)
+ +                - k*(up2 - up1)*dup2;
+ +        }
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        vtot  += vbond;                                                /* 1*/
+ +        fbond *= gmx::invsqrt(dr2);                                    /*   6         */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /* 59 TOTAL    */
+ +
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real polarize(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec4 f[], rvec fshift[],
+ +              const t_pbc *pbc, const t_graph *g,
+ +              real lambda, real *dvdlambda,
+ +              const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ +              int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ki, ai, aj, type;
+ +    real dr, dr2, fbond, vbond, vtot, ksh;
+ +    rvec dx;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ksh  = gmx::square(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].polarize.alpha;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx);                         /*   3      */
+ +        dr2  = iprod(dx, dx);                                               /*   5            */
+ +        dr   = std::sqrt(dr2);                                              /*  10            */
+ +
+ +        *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /*  19  */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        vtot  += vbond;                                                /* 1*/
+ +        fbond *= gmx::invsqrt(dr2);                                    /*   6         */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /* 59 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real anharm_polarize(int nbonds,
+ +                     const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                     const rvec x[], rvec4 f[], rvec fshift[],
+ +                     const t_pbc *pbc, const t_graph *g,
+ +                     real lambda, real *dvdlambda,
+ +                     const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ +                     int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ki, ai, aj, type;
+ +    real dr, dr2, fbond, vbond, vtot, ksh, khyp, drcut, ddr, ddr3;
+ +    rvec dx;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type  = forceatoms[i++];
+ +        ai    = forceatoms[i++];
+ +        aj    = forceatoms[i++];
+ +        ksh   = gmx::square(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].anharm_polarize.alpha; /* 7*/
+ +        khyp  = forceparams[type].anharm_polarize.khyp;
+ +        drcut = forceparams[type].anharm_polarize.drcut;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx);                         /*   3      */
+ +        dr2  = iprod(dx, dx);                                               /*   5            */
+ +        dr   = dr2*gmx::invsqrt(dr2);                                       /*  10            */
+ +
+ +        *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /*  19  */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        if (dr > drcut)
+ +        {
+ +            ddr    = dr-drcut;
+ +            ddr3   = ddr*ddr*ddr;
+ +            vbond += khyp*ddr*ddr3;
+ +            fbond -= 4*khyp*ddr3;
+ +        }
+ +        fbond *= gmx::invsqrt(dr2);                                    /*   6         */
+ +        vtot  += vbond;                                                /* 1*/
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /* 72 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real water_pol(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ +               const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ +               real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    /* This routine implements anisotropic polarizibility for water, through
+ +     * a shell connected to a dummy with spring constant that differ in the
+ +     * three spatial dimensions in the molecular frame.
+ +     */
+ +    int  i, m, aO, aH1, aH2, aD, aS, type, type0, ki;
+ +    ivec dt;
+ +    rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
+ +    real vtot, fij, r_HH, r_OD, r_nW, tx, ty, tz, qS;
+ +
+ +    vtot = 0.0;
+ +    if (nbonds > 0)
+ +    {
+ +        type0  = forceatoms[0];
+ +        aS     = forceatoms[5];
+ +        qS     = md->chargeA[aS];
+ +        kk[XX] = gmx::square(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x;
+ +        kk[YY] = gmx::square(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
+ +        kk[ZZ] = gmx::square(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
+ +        r_HH   = 1.0/forceparams[type0].wpol.rHH;
+ +        for (i = 0; (i < nbonds); i += 6)
+ +        {
+ +            type = forceatoms[i];
+ +            if (type != type0)
+ +            {
+ +                gmx_fatal(FARGS, "Sorry, type = %d, type0 = %d, file = %s, line = %d",
+ +                          type, type0, __FILE__, __LINE__);
+ +            }
+ +            aO   = forceatoms[i+1];
+ +            aH1  = forceatoms[i+2];
+ +            aH2  = forceatoms[i+3];
+ +            aD   = forceatoms[i+4];
+ +            aS   = forceatoms[i+5];
+ +
+ +            /* Compute vectors describing the water frame */
+ +            pbc_rvec_sub(pbc, x[aH1], x[aO], dOH1);
+ +            pbc_rvec_sub(pbc, x[aH2], x[aO], dOH2);
+ +            pbc_rvec_sub(pbc, x[aH2], x[aH1], dHH);
+ +            pbc_rvec_sub(pbc, x[aD], x[aO], dOD);
+ +            ki = pbc_rvec_sub(pbc, x[aS], x[aD], dDS);
+ +            cprod(dOH1, dOH2, nW);
+ +
+ +            /* Compute inverse length of normal vector
+ +             * (this one could be precomputed, but I'm too lazy now)
+ +             */
+ +            r_nW = gmx::invsqrt(iprod(nW, nW));
+ +            /* This is for precision, but does not make a big difference,
+ +             * it can go later.
+ +             */
+ +            r_OD = gmx::invsqrt(iprod(dOD, dOD));
+ +
+ +            /* Normalize the vectors in the water frame */
+ +            svmul(r_nW, nW, nW);
+ +            svmul(r_HH, dHH, dHH);
+ +            svmul(r_OD, dOD, dOD);
+ +
+ +            /* Compute displacement of shell along components of the vector */
+ +            dx[ZZ] = iprod(dDS, dOD);
+ +            /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                proj[m] = dDS[m]-dx[ZZ]*dOD[m];
+ +            }
+ +
+ +            /*dx[XX] = iprod(dDS,nW);
+ +               dx[YY] = iprod(dDS,dHH);*/
+ +            dx[XX] = iprod(proj, nW);
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                proj[m] -= dx[XX]*nW[m];
+ +            }
+ +            dx[YY] = iprod(proj, dHH);
+ +            /* Now compute the forces and energy */
+ +            kdx[XX] = kk[XX]*dx[XX];
+ +            kdx[YY] = kk[YY]*dx[YY];
+ +            kdx[ZZ] = kk[ZZ]*dx[ZZ];
+ +            vtot   += iprod(dx, kdx);
+ +
+ +            if (computeVirial(flavor) && g)
+ +            {
+ +                ivec_sub(SHIFT_IVEC(g, aS), SHIFT_IVEC(g, aD), dt);
+ +                ki = IVEC2IS(dt);
+ +            }
+ +
+ +            for (m = 0; (m < DIM); m++)
+ +            {
+ +                /* This is a tensor operation but written out for speed */
+ +                tx                  =  nW[m]*kdx[XX];
+ +                ty                  = dHH[m]*kdx[YY];
+ +                tz                  = dOD[m]*kdx[ZZ];
+ +                fij                 = -tx-ty-tz;
+ +                f[aS][m]           += fij;
+ +                f[aD][m]           -= fij;
+ +                if (computeVirial(flavor))
+ +                {
+ +                    fshift[ki][m]      += fij;
+ +                    fshift[CENTRAL][m] -= fij;
+ +                }
+ +            }
+ +        }
+ +    }
+ +    return 0.5*vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj,
+ +                       const t_pbc *pbc, real qq,
+ +                       rvec fshift[], real afac)
+ +{
+ +    rvec r12;
+ +    real r12sq, r12_1, r12bar, v0, v1, fscal, ebar, fff;
+ +    int  m, t;
+ +
+ +    t      = pbc_rvec_sub(pbc, xi, xj, r12);                      /*  3 */
+ +
+ +    r12sq  = iprod(r12, r12);                                     /*  5 */
+ +    r12_1  = gmx::invsqrt(r12sq);                                 /*  5 */
+ +    r12bar = afac/r12_1;                                          /*  5 */
+ +    v0     = qq*ONE_4PI_EPS0*r12_1;                               /*  2 */
+ +    ebar   = std::exp(-r12bar);                                   /*  5 */
+ +    v1     = (1-(1+0.5*r12bar)*ebar);                             /*  4 */
+ +    fscal  = ((v0*r12_1)*v1 - v0*0.5*afac*ebar*(r12bar+1))*r12_1; /* 9 */
+ +
+ +    for (m = 0; (m < DIM); m++)
+ +    {
+ +        fff                 = fscal*r12[m];
+ +        fi[m]              += fff;
+ +        fj[m]              -= fff;
+ +        if (computeVirial(flavor))
+ +        {
+ +            fshift[t][m]       += fff;
+ +            fshift[CENTRAL][m] -= fff;
+ +        }
+ +    }             /* 15 */
+ +
+ +    return v0*v1; /* 1 */
+ +    /* 54 */
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real thole_pol(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec4 f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph gmx_unused *g,
+ +               real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +               const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    /* Interaction between two pairs of particles with opposite charge */
+ +    int        i, type, a1, da1, a2, da2;
+ +    real       q1, q2, qq, a, al1, al2, afac;
+ +    real       V             = 0;
+ +
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type  = forceatoms[i++];
+ +        a1    = forceatoms[i++];
+ +        da1   = forceatoms[i++];
+ +        a2    = forceatoms[i++];
+ +        da2   = forceatoms[i++];
+ +        q1    = md->chargeA[da1];
+ +        q2    = md->chargeA[da2];
+ +        a     = forceparams[type].thole.a;
+ +        al1   = forceparams[type].thole.alpha1;
+ +        al2   = forceparams[type].thole.alpha2;
+ +        qq    = q1*q2;
+ +        afac  = a*gmx::invsixthroot(al1*al2);
+ +        V    += do_1_thole<flavor>(x[a1], x[a2], f[a1], f[a2], pbc, qq, fshift, afac);
+ +        V    += do_1_thole<flavor>(x[da1], x[a2], f[da1], f[a2], pbc, -qq, fshift, afac);
+ +        V    += do_1_thole<flavor>(x[a1], x[da2], f[a1], f[da2], pbc, -qq, fshift, afac);
+ +        V    += do_1_thole<flavor>(x[da1], x[da2], f[da1], f[da2], pbc, qq, fshift, afac);
+ +    }
+ +    /* 290 flops */
+ +    return V;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+ +angles(int nbonds,
+ +       const t_iatom forceatoms[], const t_iparams forceparams[],
+ +       const rvec x[], rvec4 f[], rvec fshift[],
+ +       const t_pbc *pbc, const t_graph *g,
+ +       real lambda, real *dvdlambda,
+ +       const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +       int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, t1, t2, type;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; i < nbonds; )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2);  /*  41                */
+ +
+ +        *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ +                               forceparams[type].harmonic.krB,
+ +                               forceparams[type].harmonic.rA*DEG2RAD,
+ +                               forceparams[type].harmonic.rB*DEG2RAD,
+ +                               theta, lambda, &va, &dVdt);  /*  21  */
+ +        vtot += va;
+ +
+ +        cos_theta2 = gmx::square(cos_theta);
+ +        if (cos_theta2 < 1)
+ +        {
+ +            int  m;
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            real nrkj_1, nrij_1;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st    = dVdt*gmx::invsqrt(1 - cos_theta2); /*  12         */
+ +            sth   = st*cos_theta;                      /*   1         */
+ +            nrij2 = iprod(r_ij, r_ij);                 /*   5         */
+ +            nrkj2 = iprod(r_kj, r_kj);                 /*   5         */
+ +
+ +            nrij_1 = gmx::invsqrt(nrij2);              /*  10         */
+ +            nrkj_1 = gmx::invsqrt(nrkj2);              /*  10         */
+ +
+ +            cik = st*nrij_1*nrkj_1;                    /*   2         */
+ +            cii = sth*nrij_1*nrij_1;                   /*   2         */
+ +            ckk = sth*nrkj_1*nrkj_1;                   /*   2         */
+ +
+ +            for (m = 0; m < DIM; m++)
+ +            {           /*  39                */
+ +                f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m] - f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +            if (computeVirial(flavor))
+ +            {
+ +                if (g != nullptr)
+ +                {
+ +                    copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                    t1 = IVEC2IS(dt_ij);
+ +                    t2 = IVEC2IS(dt_kj);
+ +                }
+ +                rvec_inc(fshift[t1], f_i);
+ +                rvec_inc(fshift[CENTRAL], f_j);
+ +                rvec_inc(fshift[t2], f_k);
+ +            }
+ +        }                                           /* 161 TOTAL      */
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +#if GMX_SIMD_HAVE_REAL
+ +
+ +/* As angles, but using SIMD to calculate many angles at once.
+ + * This routines does not calculate energies and shift forces.
+ + */
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+ +angles(int nbonds,
+ +       const t_iatom forceatoms[], const t_iparams forceparams[],
+ +       const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ +       const t_pbc *pbc, const t_graph gmx_unused *g,
+ +       real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +       const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +       int gmx_unused *global_atom_index)
+ +{
+ +    const int            nfa1 = 4;
+ +    int                  i, iu, s;
+ +    int                  type;
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    ai[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    aj[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    ak[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) real            coeff[2*GMX_SIMD_REAL_WIDTH];
+ +    SimdReal             deg2rad_S(DEG2RAD);
+ +    SimdReal             xi_S, yi_S, zi_S;
+ +    SimdReal             xj_S, yj_S, zj_S;
+ +    SimdReal             xk_S, yk_S, zk_S;
+ +    SimdReal             k_S, theta0_S;
+ +    SimdReal             rijx_S, rijy_S, rijz_S;
+ +    SimdReal             rkjx_S, rkjy_S, rkjz_S;
+ +    SimdReal             one_S(1.0);
+ +    SimdReal             min_one_plus_eps_S(-1.0 + 2.0*GMX_REAL_EPS); // Smallest number > -1
+ +
+ +    SimdReal             rij_rkj_S;
+ +    SimdReal             nrij2_S, nrij_1_S;
+ +    SimdReal             nrkj2_S, nrkj_1_S;
+ +    SimdReal             cos_S, invsin_S;
+ +    SimdReal             theta_S;
+ +    SimdReal             st_S, sth_S;
+ +    SimdReal             cik_S, cii_S, ckk_S;
+ +    SimdReal             f_ix_S, f_iy_S, f_iz_S;
+ +    SimdReal             f_kx_S, f_ky_S, f_kz_S;
+ +    alignas(GMX_SIMD_ALIGNMENT) real    pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+ +
+ +    set_pbc_simd(pbc, pbc_simd);
+ +
+ +    /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+ +    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ +    {
+ +        /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
+ +         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ +         */
+ +        iu = i;
+ +        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ +        {
+ +            type  = forceatoms[iu];
+ +            ai[s] = forceatoms[iu+1];
+ +            aj[s] = forceatoms[iu+2];
+ +            ak[s] = forceatoms[iu+3];
+ +
+ +            /* At the end fill the arrays with the last atoms and 0 params */
+ +            if (i + s*nfa1 < nbonds)
+ +            {
+ +                coeff[s]                     = forceparams[type].harmonic.krA;
+ +                coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA;
+ +
+ +                if (iu + nfa1 < nbonds)
+ +                {
+ +                    iu += nfa1;
+ +                }
+ +            }
+ +            else
+ +            {
+ +                coeff[s]                     = 0;
+ +                coeff[GMX_SIMD_REAL_WIDTH+s] = 0;
+ +            }
+ +        }
+ +
+ +        /* Store the non PBC corrected distances packed and aligned */
+ +        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ai, &xi_S, &yi_S, &zi_S);
+ +        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), aj, &xj_S, &yj_S, &zj_S);
+ +        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ak, &xk_S, &yk_S, &zk_S);
+ +        rijx_S = xi_S - xj_S;
+ +        rijy_S = yi_S - yj_S;
+ +        rijz_S = zi_S - zj_S;
+ +        rkjx_S = xk_S - xj_S;
+ +        rkjy_S = yk_S - yj_S;
+ +        rkjz_S = zk_S - zj_S;
+ +
+ +        k_S       = load<SimdReal>(coeff);
+ +        theta0_S  = load<SimdReal>(coeff+GMX_SIMD_REAL_WIDTH) * deg2rad_S;
+ +
+ +        pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd);
+ +        pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd);
+ +
+ +        rij_rkj_S = iprod(rijx_S, rijy_S, rijz_S,
+ +                          rkjx_S, rkjy_S, rkjz_S);
+ +
+ +        nrij2_S   = norm2(rijx_S, rijy_S, rijz_S);
+ +        nrkj2_S   = norm2(rkjx_S, rkjy_S, rkjz_S);
+ +
+ +        nrij_1_S  = invsqrt(nrij2_S);
+ +        nrkj_1_S  = invsqrt(nrkj2_S);
+ +
+ +        cos_S     = rij_rkj_S * nrij_1_S * nrkj_1_S;
+ +
+ +        /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
+ +         * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
+ +         * This also ensures that rounding errors would cause the argument
+ +         * of simdAcos to be < -1.
+ +         * Note that we do not take precautions for cos(0)=1, so the outer
+ +         * atoms in an angle should not be on top of each other.
+ +         */
+ +        cos_S     = max(cos_S, min_one_plus_eps_S);
+ +
+ +        theta_S   = acos(cos_S);
+ +
+ +        invsin_S  = invsqrt( one_S - cos_S * cos_S );
+ +
+ +        st_S      = k_S * (theta0_S - theta_S) * invsin_S;
+ +        sth_S     = st_S * cos_S;
+ +
+ +        cik_S     = st_S  * nrij_1_S * nrkj_1_S;
+ +        cii_S     = sth_S * nrij_1_S * nrij_1_S;
+ +        ckk_S     = sth_S * nrkj_1_S * nrkj_1_S;
+ +
+ +        f_ix_S    = cii_S * rijx_S;
+ +        f_ix_S    = fnma(cik_S, rkjx_S, f_ix_S);
+ +        f_iy_S    = cii_S * rijy_S;
+ +        f_iy_S    = fnma(cik_S, rkjy_S, f_iy_S);
+ +        f_iz_S    = cii_S * rijz_S;
+ +        f_iz_S    = fnma(cik_S, rkjz_S, f_iz_S);
+ +        f_kx_S    = ckk_S * rkjx_S;
+ +        f_kx_S    = fnma(cik_S, rijx_S, f_kx_S);
+ +        f_ky_S    = ckk_S * rkjy_S;
+ +        f_ky_S    = fnma(cik_S, rijy_S, f_ky_S);
+ +        f_kz_S    = ckk_S * rkjz_S;
+ +        f_kz_S    = fnma(cik_S, rijz_S, f_kz_S);
+ +
+ +        transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ai, f_ix_S, f_iy_S, f_iz_S);
+ +        transposeScatterDecrU<4>(reinterpret_cast<real *>(f), aj, f_ix_S + f_kx_S, f_iy_S + f_ky_S, f_iz_S + f_kz_S);
+ +        transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ak, f_kx_S, f_ky_S, f_kz_S);
+ +    }
+ +
+ +    return 0;
+ +}
+ +
+ +#endif // GMX_SIMD_HAVE_REAL
+ +
+ +template <BondedKernelFlavor flavor>
+ +real linear_angles(int nbonds,
+ +                   const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                   const rvec x[], rvec4 f[], rvec fshift[],
+ +                   const t_pbc *pbc, const t_graph *g,
+ +                   real lambda, real *dvdlambda,
+ +                   const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                   int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ai, aj, ak, t1, t2, type;
+ +    rvec f_i, f_j, f_k;
+ +    real L1, kA, kB, aA, aB, dr, dr2, va, vtot, a, b, klin;
+ +    ivec jt, dt_ij, dt_kj;
+ +    rvec r_ij, r_kj, r_ik, dx;
+ +
+ +    L1   = 1-lambda;
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        kA   = forceparams[type].linangle.klinA;
+ +        kB   = forceparams[type].linangle.klinB;
+ +        klin = L1*kA + lambda*kB;
+ +
+ +        aA   = forceparams[type].linangle.aA;
+ +        aB   = forceparams[type].linangle.aB;
+ +        a    = L1*aA+lambda*aB;
+ +        b    = 1-a;
+ +
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ +        rvec_sub(r_ij, r_kj, r_ik);
+ +
+ +        dr2 = 0;
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            dr        = -a * r_ij[m] - b * r_kj[m];
+ +            dr2      += dr*dr;
+ +            dx[m]     = dr;
+ +            f_i[m]    = a*klin*dr;
+ +            f_k[m]    = b*klin*dr;
+ +            f_j[m]    = -(f_i[m]+f_k[m]);
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +        va          = 0.5*klin*dr2;
+ +        *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx, r_ik);
+ +
+ +        vtot += va;
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +        }
+ +    }                                         /* 57 TOTAL     */
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+ +urey_bradley(int nbonds,
+ +             const t_iatom forceatoms[], const t_iparams forceparams[],
+ +             const rvec x[], rvec4 f[], rvec fshift[],
+ +             const t_pbc *pbc, const t_graph *g,
+ +             real lambda, real *dvdlambda,
+ +             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +             int gmx_unused *global_atom_index)
+ +{
+ +    int  i, m, ai, aj, ak, t1, t2, type, ki;
+ +    rvec r_ij, r_kj, r_ik;
+ +    real cos_theta, cos_theta2, theta;
+ +    real dVdt, va, vtot, dr, dr2, vbond, fbond, fik;
+ +    real kthA, th0A, kUBA, r13A, kthB, th0B, kUBB, r13B;
+ +    ivec jt, dt_ij, dt_kj, dt_ik;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type  = forceatoms[i++];
+ +        ai    = forceatoms[i++];
+ +        aj    = forceatoms[i++];
+ +        ak    = forceatoms[i++];
+ +        th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
+ +        kthA  = forceparams[type].u_b.kthetaA;
+ +        r13A  = forceparams[type].u_b.r13A;
+ +        kUBA  = forceparams[type].u_b.kUBA;
+ +        th0B  = forceparams[type].u_b.thetaB*DEG2RAD;
+ +        kthB  = forceparams[type].u_b.kthetaB;
+ +        r13B  = forceparams[type].u_b.r13B;
+ +        kUBB  = forceparams[type].u_b.kUBB;
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2);                     /*  41             */
+ +
+ +        *dvdlambda += harmonic(kthA, kthB, th0A, th0B, theta, lambda, &va, &dVdt); /*  21  */
+ +        vtot       += va;
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);                               /*   3      */
+ +        dr2  = iprod(r_ik, r_ik);                                                   /*   5            */
+ +        dr   = dr2*gmx::invsqrt(dr2);                                               /*  10            */
+ +
+ +        *dvdlambda += harmonic(kUBA, kUBB, r13A, r13B, dr, lambda, &vbond, &fbond); /*  19  */
+ +
+ +        cos_theta2 = gmx::square(cos_theta);                                        /*   1            */
+ +        if (cos_theta2 < 1)
+ +        {
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st    = dVdt*gmx::invsqrt(1 - cos_theta2); /*  12         */
+ +            sth   = st*cos_theta;                      /*   1         */
+ +            nrkj2 = iprod(r_kj, r_kj);                 /*   5         */
+ +            nrij2 = iprod(r_ij, r_ij);
+ +
+ +            cik = st*gmx::invsqrt(nrkj2*nrij2); /*  12                */
+ +            cii = sth/nrij2;                    /*  10                */
+ +            ckk = sth/nrkj2;                    /*  10                */
+ +
+ +            for (m = 0; (m < DIM); m++)         /*  39                */
+ +            {
+ +                f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m]-f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +            if (computeVirial(flavor))
+ +            {
+ +                if (g)
+ +                {
+ +                    copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                    t1 = IVEC2IS(dt_ij);
+ +                    t2 = IVEC2IS(dt_kj);
+ +                }
+ +                rvec_inc(fshift[t1], f_i);
+ +                rvec_inc(fshift[CENTRAL], f_j);
+ +                rvec_inc(fshift[t2], f_k);
+ +            }
+ +        }                                       /* 161 TOTAL  */
+ +        /* Time for the bond calculations */
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +        vtot  += vbond;             /* 1*/
+ +        fbond *= gmx::invsqrt(dr2); /*   6            */
+ +
+ +        if (computeVirial(flavor) && g)
+ +        {
+ +            ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), dt_ik);
+ +            ki = IVEC2IS(dt_ik);
+ +        }
+ +        for (m = 0; (m < DIM); m++)     /*  15                */
+ +        {
+ +            fik                 = fbond*r_ik[m];
+ +            f[ai][m]           += fik;
+ +            f[ak][m]           -= fik;
+ +            if (computeVirial(flavor))
+ +            {
+ +                fshift[ki][m]      += fik;
+ +                fshift[CENTRAL][m] -= fik;
+ +            }
+ +        }
+ +    }
+ +    return vtot;
+ +}
+ +
+ +#if GMX_SIMD_HAVE_REAL
+ +
+ +/* As urey_bradley, but using SIMD to calculate many potentials at once.
+ + * This routines does not calculate energies and shift forces.
+ + */
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+ +urey_bradley(int nbonds,
+ +             const t_iatom forceatoms[], const t_iparams forceparams[],
+ +             const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ +             const t_pbc *pbc, const t_graph gmx_unused *g,
+ +             real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +             int gmx_unused *global_atom_index)
+ +{
+ +    constexpr int            nfa1 = 4;
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    ai[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    aj[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    ak[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) real            coeff[4*GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) real            pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+ +
+ +    set_pbc_simd(pbc, pbc_simd);
+ +
+ +    /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+ +    for (int i = 0; i < nbonds; i += GMX_SIMD_REAL_WIDTH*nfa1)
+ +    {
+ +        /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
+ +         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ +         */
+ +        int iu = i;
+ +        for (int s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ +        {
+ +            const int type                     = forceatoms[iu];
+ +            ai[s] = forceatoms[iu+1];
+ +            aj[s] = forceatoms[iu+2];
+ +            ak[s] = forceatoms[iu+3];
+ +
+ +            /* At the end fill the arrays with the last atoms and 0 params */
+ +            if (i + s*nfa1 < nbonds)
+ +            {
+ +                coeff[s]                       = forceparams[type].u_b.kthetaA;
+ +                coeff[GMX_SIMD_REAL_WIDTH+s]   = forceparams[type].u_b.thetaA;
+ +                coeff[GMX_SIMD_REAL_WIDTH*2+s] = forceparams[type].u_b.kUBA;
+ +                coeff[GMX_SIMD_REAL_WIDTH*3+s] = forceparams[type].u_b.r13A;
+ +
+ +                if (iu + nfa1 < nbonds)
+ +                {
+ +                    iu += nfa1;
+ +                }
+ +            }
+ +            else
+ +            {
+ +                coeff[s]                       = 0;
+ +                coeff[GMX_SIMD_REAL_WIDTH+s]   = 0;
+ +                coeff[GMX_SIMD_REAL_WIDTH*2+s] = 0;
+ +                coeff[GMX_SIMD_REAL_WIDTH*3+s] = 0;
+ +            }
+ +        }
+ +
+ +        SimdReal xi_S, yi_S, zi_S;
+ +        SimdReal xj_S, yj_S, zj_S;
+ +        SimdReal xk_S, yk_S, zk_S;
+ +
+ +        /* Store the non PBC corrected distances packed and aligned */
+ +        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ai, &xi_S, &yi_S, &zi_S);
+ +        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), aj, &xj_S, &yj_S, &zj_S);
+ +        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ak, &xk_S, &yk_S, &zk_S);
+ +        SimdReal       rijx_S = xi_S - xj_S;
+ +        SimdReal       rijy_S = yi_S - yj_S;
+ +        SimdReal       rijz_S = zi_S - zj_S;
+ +        SimdReal       rkjx_S = xk_S - xj_S;
+ +        SimdReal       rkjy_S = yk_S - yj_S;
+ +        SimdReal       rkjz_S = zk_S - zj_S;
+ +        SimdReal       rikx_S = xi_S - xk_S;
+ +        SimdReal       riky_S = yi_S - yk_S;
+ +        SimdReal       rikz_S = zi_S - zk_S;
+ +
+ +        const SimdReal ktheta_S = load<SimdReal>(coeff);
+ +        const SimdReal theta0_S = load<SimdReal>(coeff+GMX_SIMD_REAL_WIDTH) * DEG2RAD;
+ +        const SimdReal kUB_S    = load<SimdReal>(coeff+2*GMX_SIMD_REAL_WIDTH);
+ +        const SimdReal r13_S    = load<SimdReal>(coeff+3*GMX_SIMD_REAL_WIDTH);
+ +
+ +        pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd);
+ +        pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd);
+ +        pbc_correct_dx_simd(&rikx_S, &riky_S, &rikz_S, pbc_simd);
+ +
+ +        const SimdReal rij_rkj_S = iprod(rijx_S, rijy_S, rijz_S,
+ +                                         rkjx_S, rkjy_S, rkjz_S);
+ +
+ +        const SimdReal dr2_S     = iprod(rikx_S, riky_S, rikz_S,
+ +                                         rikx_S, riky_S, rikz_S);
+ +
+ +        const SimdReal nrij2_S   = norm2(rijx_S, rijy_S, rijz_S);
+ +        const SimdReal nrkj2_S   = norm2(rkjx_S, rkjy_S, rkjz_S);
+ +
+ +        const SimdReal nrij_1_S  = invsqrt(nrij2_S);
+ +        const SimdReal nrkj_1_S  = invsqrt(nrkj2_S);
+ +        const SimdReal invdr2_S  = invsqrt(dr2_S);
+ +        const SimdReal dr_S      = dr2_S*invdr2_S;
+ +
+ +        constexpr real min_one_plus_eps = -1.0 + 2.0*GMX_REAL_EPS; // Smallest number > -1
+ +
+ +        /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
+ +         * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
+ +         * This also ensures that rounding errors would cause the argument
+ +         * of simdAcos to be < -1.
+ +         * Note that we do not take precautions for cos(0)=1, so the bonds
+ +         * in an angle should not align at an angle of 0 degrees.
+ +         */
+ +        const SimdReal cos_S     = max(rij_rkj_S * nrij_1_S * nrkj_1_S, min_one_plus_eps);
+ +
+ +        const SimdReal theta_S   = acos(cos_S);
+ +        const SimdReal invsin_S  = invsqrt( 1.0 - cos_S * cos_S );
+ +        const SimdReal st_S      = ktheta_S * (theta0_S - theta_S) * invsin_S;
+ +        const SimdReal sth_S     = st_S * cos_S;
+ +
+ +        const SimdReal cik_S     = st_S  * nrij_1_S * nrkj_1_S;
+ +        const SimdReal cii_S     = sth_S * nrij_1_S * nrij_1_S;
+ +        const SimdReal ckk_S     = sth_S * nrkj_1_S * nrkj_1_S;
+ +
+ +        const SimdReal sUB_S     = kUB_S * (r13_S - dr_S) * invdr2_S;
+ +
+ +        const SimdReal f_ikx_S   = sUB_S * rikx_S;
+ +        const SimdReal f_iky_S   = sUB_S * riky_S;
+ +        const SimdReal f_ikz_S   = sUB_S * rikz_S;
+ +
+ +        const SimdReal f_ix_S    = fnma(cik_S, rkjx_S, cii_S * rijx_S) + f_ikx_S;
+ +        const SimdReal f_iy_S    = fnma(cik_S, rkjy_S, cii_S * rijy_S) + f_iky_S;
+ +        const SimdReal f_iz_S    = fnma(cik_S, rkjz_S, cii_S * rijz_S) + f_ikz_S;
+ +        const SimdReal f_kx_S    = fnma(cik_S, rijx_S, ckk_S * rkjx_S) - f_ikx_S;
+ +        const SimdReal f_ky_S    = fnma(cik_S, rijy_S, ckk_S * rkjy_S) - f_iky_S;
+ +        const SimdReal f_kz_S    = fnma(cik_S, rijz_S, ckk_S * rkjz_S) - f_ikz_S;
+ +
+ +        transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ai, f_ix_S, f_iy_S, f_iz_S);
+ +        transposeScatterDecrU<4>(reinterpret_cast<real *>(f), aj, f_ix_S + f_kx_S, f_iy_S + f_ky_S, f_iz_S + f_kz_S);
+ +        transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ak, f_kx_S, f_ky_S, f_kz_S);
+ +    }
+ +
+ +    return 0;
+ +}
+ +
+ +#endif // GMX_SIMD_HAVE_REAL
+ +
+ +template <BondedKernelFlavor flavor>
+ +real quartic_angles(int nbonds,
+ +                    const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                    const rvec x[], rvec4 f[], rvec fshift[],
+ +                    const t_pbc *pbc, const t_graph *g,
+ +                    real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                    const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                    int gmx_unused *global_atom_index)
+ +{
+ +    int  i, j, ai, aj, ak, t1, t2, type;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, cos_theta2, theta, dt, dVdt, va, dtp, c, vtot;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2); /*  41         */
+ +
+ +        dt = theta - forceparams[type].qangle.theta*DEG2RAD;   /* 2          */
+ +
+ +        dVdt = 0;
+ +        va   = forceparams[type].qangle.c[0];
+ +        dtp  = 1.0;
+ +        for (j = 1; j <= 4; j++)
+ +        {
+ +            c     = forceparams[type].qangle.c[j];
+ +            dVdt -= j*c*dtp;
+ +            dtp  *= dt;
+ +            va   += c*dtp;
+ +        }
+ +        /* 20 */
+ +
+ +        vtot += va;
+ +
+ +        cos_theta2 = gmx::square(cos_theta);            /*   1                */
+ +        if (cos_theta2 < 1)
+ +        {
+ +            int  m;
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st    = dVdt*gmx::invsqrt(1 - cos_theta2); /*  12         */
+ +            sth   = st*cos_theta;                      /*   1         */
+ +            nrkj2 = iprod(r_kj, r_kj);                 /*   5         */
+ +            nrij2 = iprod(r_ij, r_ij);
+ +
+ +            cik = st*gmx::invsqrt(nrkj2*nrij2); /*  12                */
+ +            cii = sth/nrij2;                    /*  10                */
+ +            ckk = sth/nrkj2;                    /*  10                */
+ +
+ +            for (m = 0; (m < DIM); m++)         /*  39                */
+ +            {
+ +                f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m]-f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +
+ +            if (computeVirial(flavor))
+ +            {
+ +                if (g)
+ +                {
+ +                    copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                    t1 = IVEC2IS(dt_ij);
+ +                    t2 = IVEC2IS(dt_kj);
+ +                }
+ +                rvec_inc(fshift[t1], f_i);
+ +                rvec_inc(fshift[CENTRAL], f_j);
+ +                rvec_inc(fshift[t2], f_k);
+ +            }
+ +        }                                       /* 153 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +
+ +#if GMX_SIMD_HAVE_REAL
+ +
+ +/* As dih_angle above, but calculates 4 dihedral angles at once using SIMD,
+ + * also calculates the pre-factor required for the dihedral force update.
+ + * Note that bv and buf should be register aligned.
+ + */
+ +inline void
+ +dih_angle_simd(const rvec *x,
+ +               const int *ai, const int *aj, const int *ak, const int *al,
+ +               const real *pbc_simd,
+ +               SimdReal *phi_S,
+ +               SimdReal *mx_S, SimdReal *my_S, SimdReal *mz_S,
+ +               SimdReal *nx_S, SimdReal *ny_S, SimdReal *nz_S,
+ +               SimdReal *nrkj_m2_S,
+ +               SimdReal *nrkj_n2_S,
+ +               SimdReal *p_S,
+ +               SimdReal *q_S)
+ +{
+ +    SimdReal xi_S, yi_S, zi_S;
+ +    SimdReal xj_S, yj_S, zj_S;
+ +    SimdReal xk_S, yk_S, zk_S;
+ +    SimdReal xl_S, yl_S, zl_S;
+ +    SimdReal rijx_S, rijy_S, rijz_S;
+ +    SimdReal rkjx_S, rkjy_S, rkjz_S;
+ +    SimdReal rklx_S, rkly_S, rklz_S;
+ +    SimdReal cx_S, cy_S, cz_S;
+ +    SimdReal cn_S;
+ +    SimdReal s_S;
+ +    SimdReal ipr_S;
+ +    SimdReal iprm_S, iprn_S;
+ +    SimdReal nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+ +    SimdReal toler_S;
+ +    SimdReal nrkj2_min_S;
+ +    SimdReal real_eps_S;
+ +
+ +    /* Used to avoid division by zero.
+ +     * We take into acount that we multiply the result by real_eps_S.
+ +     */
+ +    nrkj2_min_S = SimdReal(GMX_REAL_MIN/(2*GMX_REAL_EPS));
+ +
+ +    /* The value of the last significant bit (GMX_REAL_EPS is half of that) */
+ +    real_eps_S  = SimdReal(2*GMX_REAL_EPS);
+ +
+ +    /* Store the non PBC corrected distances packed and aligned */
+ +    gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ai, &xi_S, &yi_S, &zi_S);
+ +    gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), aj, &xj_S, &yj_S, &zj_S);
+ +    gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ak, &xk_S, &yk_S, &zk_S);
+ +    gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), al, &xl_S, &yl_S, &zl_S);
+ +    rijx_S = xi_S - xj_S;
+ +    rijy_S = yi_S - yj_S;
+ +    rijz_S = zi_S - zj_S;
+ +    rkjx_S = xk_S - xj_S;
+ +    rkjy_S = yk_S - yj_S;
+ +    rkjz_S = zk_S - zj_S;
+ +    rklx_S = xk_S - xl_S;
+ +    rkly_S = yk_S - yl_S;
+ +    rklz_S = zk_S - zl_S;
+ +
+ +    pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd);
+ +    pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd);
+ +    pbc_correct_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc_simd);
+ +
+ +    cprod(rijx_S, rijy_S, rijz_S,
+ +          rkjx_S, rkjy_S, rkjz_S,
+ +          mx_S, my_S, mz_S);
+ +
+ +    cprod(rkjx_S, rkjy_S, rkjz_S,
+ +          rklx_S, rkly_S, rklz_S,
+ +          nx_S, ny_S, nz_S);
+ +
+ +    cprod(*mx_S, *my_S, *mz_S,
+ +          *nx_S, *ny_S, *nz_S,
+ +          &cx_S, &cy_S, &cz_S);
+ +
+ +    cn_S       = sqrt(norm2(cx_S, cy_S, cz_S));
+ +
+ +    s_S        = iprod(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
+ +
+ +    /* Determine the dihedral angle, the sign might need correction */
+ +    *phi_S     = atan2(cn_S, s_S);
+ +
+ +    ipr_S      = iprod(rijx_S, rijy_S, rijz_S,
+ +                       *nx_S, *ny_S, *nz_S);
+ +
+ +    iprm_S     = norm2(*mx_S, *my_S, *mz_S);
+ +    iprn_S     = norm2(*nx_S, *ny_S, *nz_S);
+ +
+ +    nrkj2_S    = norm2(rkjx_S, rkjy_S, rkjz_S);
+ +
+ +    /* Avoid division by zero. When zero, the result is multiplied by 0
+ +     * anyhow, so the 3 max below do not affect the final result.
+ +     */
+ +    nrkj2_S    = max(nrkj2_S, nrkj2_min_S);
+ +    nrkj_1_S   = invsqrt(nrkj2_S);
+ +    nrkj_2_S   = nrkj_1_S * nrkj_1_S;
+ +    nrkj_S     = nrkj2_S * nrkj_1_S;
+ +
+ +    toler_S    = nrkj2_S * real_eps_S;
+ +
+ +    /* Here the plain-C code uses a conditional, but we can't do that in SIMD.
+ +     * So we take a max with the tolerance instead. Since we multiply with
+ +     * m or n later, the max does not affect the results.
+ +     */
+ +    iprm_S     = max(iprm_S, toler_S);
+ +    iprn_S     = max(iprn_S, toler_S);
+ +    *nrkj_m2_S = nrkj_S * inv(iprm_S);
+ +    *nrkj_n2_S = nrkj_S * inv(iprn_S);
+ +
+ +    /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
+ +    *phi_S     = copysign(*phi_S, ipr_S);
+ +    *p_S       = iprod(rijx_S, rijy_S, rijz_S, rkjx_S, rkjy_S, rkjz_S);
+ +    *p_S       = *p_S * nrkj_2_S;
+ +
+ +    *q_S       = iprod(rklx_S, rkly_S, rklz_S, rkjx_S, rkjy_S, rkjz_S);
+ +    *q_S       = *q_S * nrkj_2_S;
+ +}
+ +
+ +#endif // GMX_SIMD_HAVE_REAL
+ +
+ +}      // namespace
+ +
+ +template <BondedKernelFlavor flavor>
+ +void do_dih_fup(int i, int j, int k, int l, real ddphi,
+ +                rvec r_ij, rvec r_kj, rvec r_kl,
+ +                rvec m, rvec n, rvec4 f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                const rvec x[], int t1, int t2, int t3)
+ +{
+ +    /* 143 FLOPS */
+ +    rvec f_i, f_j, f_k, f_l;
+ +    rvec uvec, vvec, svec, dx_jl;
+ +    real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
+ +    real a, b, p, q, toler;
+ +    ivec jt, dt_ij, dt_kj, dt_lj;
+ +
+ +    iprm  = iprod(m, m);       /*  5    */
+ +    iprn  = iprod(n, n);       /*  5  */
+ +    nrkj2 = iprod(r_kj, r_kj); /*  5  */
+ +    toler = nrkj2*GMX_REAL_EPS;
+ +    if ((iprm > toler) && (iprn > toler))
+ +    {
+ +        nrkj_1 = gmx::invsqrt(nrkj2); /* 10   */
+ +        nrkj_2 = nrkj_1*nrkj_1;       /*  1   */
+ +        nrkj   = nrkj2*nrkj_1;        /*  1   */
+ +        a      = -ddphi*nrkj/iprm;    /* 11   */
+ +        svmul(a, m, f_i);             /*  3   */
+ +        b     = ddphi*nrkj/iprn;      /* 11   */
+ +        svmul(b, n, f_l);             /*  3  */
+ +        p     = iprod(r_ij, r_kj);    /*  5   */
+ +        p    *= nrkj_2;               /*  1   */
+ +        q     = iprod(r_kl, r_kj);    /*  5   */
+ +        q    *= nrkj_2;               /*  1   */
+ +        svmul(p, f_i, uvec);          /*  3   */
+ +        svmul(q, f_l, vvec);          /*  3   */
+ +        rvec_sub(uvec, vvec, svec);   /*  3   */
+ +        rvec_sub(f_i, svec, f_j);     /*  3   */
+ +        rvec_add(f_l, svec, f_k);     /*  3   */
+ +        rvec_inc(f[i], f_i);          /*  3   */
+ +        rvec_dec(f[j], f_j);          /*  3   */
+ +        rvec_dec(f[k], f_k);          /*  3   */
+ +        rvec_inc(f[l], f_l);          /*  3   */
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, j), jt);
+ +                ivec_sub(SHIFT_IVEC(g, i), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, k), jt, dt_kj);
+ +                ivec_sub(SHIFT_IVEC(g, l), jt, dt_lj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +                t3 = IVEC2IS(dt_lj);
+ +            }
+ +            else if (pbc)
+ +            {
+ +                t3 = pbc_rvec_sub(pbc, x[l], x[j], dx_jl);
+ +            }
+ +            else
+ +            {
+ +                t3 = CENTRAL;
+ +            }
+ +
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_dec(fshift[CENTRAL], f_j);
+ +            rvec_dec(fshift[t2], f_k);
+ +            rvec_inc(fshift[t3], f_l);
+ +        }
+ +    }
+ +    /* 112 TOTAL    */
+ +}
+ +
+ +namespace
+ +{
+ +
+ +#if GMX_SIMD_HAVE_REAL
+ +/* As do_dih_fup_noshiftf above, but with SIMD and pre-calculated pre-factors */
+ +inline void gmx_simdcall
+ +do_dih_fup_noshiftf_simd(const int *ai, const int *aj, const int *ak, const int *al,
+ +                         SimdReal p, SimdReal q,
+ +                         SimdReal f_i_x,  SimdReal f_i_y,  SimdReal f_i_z,
+ +                         SimdReal mf_l_x, SimdReal mf_l_y, SimdReal mf_l_z,
+ +                         rvec4 f[])
+ +{
+ +    SimdReal sx    = p * f_i_x + q * mf_l_x;
+ +    SimdReal sy    = p * f_i_y + q * mf_l_y;
+ +    SimdReal sz    = p * f_i_z + q * mf_l_z;
+ +    SimdReal f_j_x = f_i_x - sx;
+ +    SimdReal f_j_y = f_i_y - sy;
+ +    SimdReal f_j_z = f_i_z - sz;
+ +    SimdReal f_k_x = mf_l_x - sx;
+ +    SimdReal f_k_y = mf_l_y - sy;
+ +    SimdReal f_k_z = mf_l_z - sz;
+ +    transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ai, f_i_x, f_i_y, f_i_z);
+ +    transposeScatterDecrU<4>(reinterpret_cast<real *>(f), aj, f_j_x, f_j_y, f_j_z);
+ +    transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ak, f_k_x, f_k_y, f_k_z);
+ +    transposeScatterDecrU<4>(reinterpret_cast<real *>(f), al, mf_l_x, mf_l_y, mf_l_z);
+ +}
+ +#endif // GMX_SIMD_HAVE_REAL
+ +
+ +/*! \brief Computes and returns the proper dihedral force
+ + *
+ + * With the appropriate kernel flavor, also computes and accumulates
+ + * the energy and dV/dlambda.
+ + */
+ +template <BondedKernelFlavor flavor>
+ +real dopdihs(real cpA, real cpB, real phiA, real phiB, int mult,
+ +             real phi, real lambda, real *V, real *dvdlambda)
+ +{
+ +    const real L1    = 1.0 - lambda;
+ +    const real ph0   = (L1*phiA + lambda*phiB)*DEG2RAD;
+ +    const real dph0  = (phiB - phiA)*DEG2RAD;
+ +    const real cp    = L1*cpA + lambda*cpB;
+ +
+ +    const real mdphi =  mult*phi - ph0;
+ +    const real sdphi = std::sin(mdphi);
+ +    const real ddphi = -cp*mult*sdphi;
+ +    if (computeEnergy(flavor))
+ +    {
+ +        const real v1  = 1 + std::cos(mdphi);
+ +        *V            += cp*v1;
+ +
+ +        *dvdlambda    += (cpB - cpA)*v1 + cp*dph0*sdphi;
+ +    }
+ +
+ +    return ddphi;
+ +    /* That was 40 flops */
+ +}
+ +
+ +/*! \brief Similar to \p dopdihs(), except for a minus sign and a different treatment of mult/phi0 */
+ +real dopdihs_min(real cpA, real cpB, real phiA, real phiB, int mult,
+ +                 real phi, real lambda, real *V, real *F)
+ +{
+ +    real v, dvdlambda, mdphi, v1, sdphi, ddphi;
+ +    real L1   = 1.0 - lambda;
+ +    real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
+ +    real dph0 = (phiB - phiA)*DEG2RAD;
+ +    real cp   = L1*cpA + lambda*cpB;
+ +
+ +    mdphi = mult*(phi-ph0);
+ +    sdphi = std::sin(mdphi);
+ +    ddphi = cp*mult*sdphi;
+ +    v1    = 1.0-std::cos(mdphi);
+ +    v     = cp*v1;
+ +
+ +    dvdlambda  = (cpB-cpA)*v1 + cp*dph0*sdphi;
+ +
+ +    *V = v;
+ +    *F = ddphi;
+ +
+ +    return dvdlambda;
+ +
+ +    /* That was 40 flops */
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+ +pdihs(int nbonds,
+ +      const t_iatom forceatoms[], const t_iparams forceparams[],
+ +      const rvec x[], rvec4 f[], rvec fshift[],
+ +      const t_pbc *pbc, const t_graph *g,
+ +      real lambda, real *dvdlambda,
+ +      const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +      int gmx_unused *global_atom_index)
+ +{
+ +    int  t1, t2, t3;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +
+ +    real vtot = 0.0;
+ +
+ +    for (int i = 0; i < nbonds; )
+ +    {
+ +        const int  ai  = forceatoms[i + 1];
+ +        const int  aj  = forceatoms[i + 2];
+ +        const int  ak  = forceatoms[i + 3];
+ +        const int  al  = forceatoms[i + 4];
+ +
+ +        const real phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                                   &t1, &t2, &t3); /*  84      */
+ +
+ +        /* Loop over dihedrals working on the same atoms,
+ +         * so we avoid recalculating angles and distributing forces.
+ +         */
+ +        real ddphi_tot = 0;
+ +        do
+ +        {
+ +            const int type = forceatoms[i];
+ +            ddphi_tot +=
+ +                dopdihs<flavor>(forceparams[type].pdihs.cpA,
+ +                                forceparams[type].pdihs.cpB,
+ +                                forceparams[type].pdihs.phiA,
+ +                                forceparams[type].pdihs.phiB,
+ +                                forceparams[type].pdihs.mult,
+ +                                phi, lambda, &vtot, dvdlambda);
+ +
+ +            i += 5;
+ +        }
+ +        while (i < nbonds &&
+ +               forceatoms[i + 1] == ai &&
+ +               forceatoms[i + 2] == aj &&
+ +               forceatoms[i + 3] == ak &&
+ +               forceatoms[i + 4] == al);
+ +
+ +        do_dih_fup<flavor>(ai, aj, ak, al, ddphi_tot, r_ij, r_kj, r_kl, m, n,
+ +                           f, fshift, pbc, g, x, t1, t2, t3); /* 112          */
+ +    }                                                         /* 223 TOTAL  */
+ +
+ +    return vtot;
+ +}
+ +
+ +#if GMX_SIMD_HAVE_REAL
+ +
+ +/* As pdihs above, but using SIMD to calculate multiple dihedrals at once */
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+ +pdihs(int nbonds,
+ +      const t_iatom forceatoms[], const t_iparams forceparams[],
+ +      const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ +      const t_pbc *pbc, const t_graph gmx_unused *g,
+ +      real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +      const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +      int gmx_unused *global_atom_index)
+ +{
+ +    const int             nfa1 = 5;
+ +    int                   i, iu, s;
+ +    int                   type;
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    ai[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    aj[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    ak[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t    al[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) real            buf[3*GMX_SIMD_REAL_WIDTH];
+ +    real                 *cp, *phi0, *mult;
+ +    SimdReal              deg2rad_S(DEG2RAD);
+ +    SimdReal              p_S, q_S;
+ +    SimdReal              phi0_S, phi_S;
+ +    SimdReal              mx_S, my_S, mz_S;
+ +    SimdReal              nx_S, ny_S, nz_S;
+ +    SimdReal              nrkj_m2_S, nrkj_n2_S;
+ +    SimdReal              cp_S, mdphi_S, mult_S;
+ +    SimdReal              sin_S, cos_S;
+ +    SimdReal              mddphi_S;
+ +    SimdReal              sf_i_S, msf_l_S;
+ +    alignas(GMX_SIMD_ALIGNMENT) real            pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+ +
+ +    /* Extract aligned pointer for parameters and variables */
+ +    cp    = buf + 0*GMX_SIMD_REAL_WIDTH;
+ +    phi0  = buf + 1*GMX_SIMD_REAL_WIDTH;
+ +    mult  = buf + 2*GMX_SIMD_REAL_WIDTH;
+ +
+ +    set_pbc_simd(pbc, pbc_simd);
+ +
+ +    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+ +    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ +    {
+ +        /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+ +         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ +         */
+ +        iu = i;
+ +        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ +        {
+ +            type  = forceatoms[iu];
+ +            ai[s] = forceatoms[iu+1];
+ +            aj[s] = forceatoms[iu+2];
+ +            ak[s] = forceatoms[iu+3];
+ +            al[s] = forceatoms[iu+4];
+ +
+ +            /* At the end fill the arrays with the last atoms and 0 params */
+ +            if (i + s*nfa1 < nbonds)
+ +            {
+ +                cp[s]   = forceparams[type].pdihs.cpA;
+ +                phi0[s] = forceparams[type].pdihs.phiA;
+ +                mult[s] = forceparams[type].pdihs.mult;
+ +
+ +                if (iu + nfa1 < nbonds)
+ +                {
+ +                    iu += nfa1;
+ +                }
+ +            }
+ +            else
+ +            {
+ +                cp[s]   = 0;
+ +                phi0[s] = 0;
+ +                mult[s] = 0;
+ +            }
+ +        }
+ +
+ +        /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+ +        dih_angle_simd(x, ai, aj, ak, al, pbc_simd,
+ +                       &phi_S,
+ +                       &mx_S, &my_S, &mz_S,
+ +                       &nx_S, &ny_S, &nz_S,
+ +                       &nrkj_m2_S,
+ +                       &nrkj_n2_S,
+ +                       &p_S, &q_S);
+ +
+ +        cp_S     = load<SimdReal>(cp);
+ +        phi0_S   = load<SimdReal>(phi0) * deg2rad_S;
+ +        mult_S   = load<SimdReal>(mult);
+ +
+ +        mdphi_S  = fms(mult_S, phi_S, phi0_S);
+ +
+ +        /* Calculate GMX_SIMD_REAL_WIDTH sines at once */
+ +        sincos(mdphi_S, &sin_S, &cos_S);
+ +        mddphi_S = cp_S * mult_S * sin_S;
+ +        sf_i_S   = mddphi_S * nrkj_m2_S;
+ +        msf_l_S  = mddphi_S * nrkj_n2_S;
+ +
+ +        /* After this m?_S will contain f[i] */
+ +        mx_S     = sf_i_S * mx_S;
+ +        my_S     = sf_i_S * my_S;
+ +        mz_S     = sf_i_S * mz_S;
+ +
+ +        /* After this m?_S will contain -f[l] */
+ +        nx_S     = msf_l_S * nx_S;
+ +        ny_S     = msf_l_S * ny_S;
+ +        nz_S     = msf_l_S * nz_S;
+ +
+ +        do_dih_fup_noshiftf_simd(ai, aj, ak, al,
+ +                                 p_S, q_S,
+ +                                 mx_S, my_S, mz_S,
+ +                                 nx_S, ny_S, nz_S,
+ +                                 f);
+ +    }
+ +
+ +    return 0;
+ +}
+ +
+ +/* This is mostly a copy of the SIMD flavor of pdihs above, but with using
+ + * the RB potential instead of a harmonic potential.
+ + * This function can replace rbdihs() when no energy and virial are needed.
+ + */
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+ +rbdihs(int nbonds,
+ +       const t_iatom forceatoms[], const t_iparams forceparams[],
+ +       const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ +       const t_pbc *pbc, const t_graph gmx_unused *g,
+ +       real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +       const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +       int gmx_unused *global_atom_index)
+ +{
+ +    const int             nfa1 = 5;
+ +    int                   i, iu, s, j;
+ +    int                   type;
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  ai[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  aj[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  ak[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) std::int32_t  al[GMX_SIMD_REAL_WIDTH];
+ +    alignas(GMX_SIMD_ALIGNMENT) real          parm[NR_RBDIHS*GMX_SIMD_REAL_WIDTH];
+ +
+ +    SimdReal              p_S, q_S;
+ +    SimdReal              phi_S;
+ +    SimdReal              ddphi_S, cosfac_S;
+ +    SimdReal              mx_S, my_S, mz_S;
+ +    SimdReal              nx_S, ny_S, nz_S;
+ +    SimdReal              nrkj_m2_S, nrkj_n2_S;
+ +    SimdReal              parm_S, c_S;
+ +    SimdReal              sin_S, cos_S;
+ +    SimdReal              sf_i_S, msf_l_S;
+ +    alignas(GMX_SIMD_ALIGNMENT) real          pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+ +
+ +    SimdReal              pi_S(M_PI);
+ +    SimdReal              one_S(1.0);
+ +
+ +    set_pbc_simd(pbc, pbc_simd);
+ +
+ +    /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+ +    for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ +    {
+ +        /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+ +         * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ +         */
+ +        iu = i;
+ +        for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ +        {
+ +            type  = forceatoms[iu];
+ +            ai[s] = forceatoms[iu+1];
+ +            aj[s] = forceatoms[iu+2];
+ +            ak[s] = forceatoms[iu+3];
+ +            al[s] = forceatoms[iu+4];
+ +
+ +            /* At the end fill the arrays with the last atoms and 0 params */
+ +            if (i + s*nfa1 < nbonds)
+ +            {
+ +                /* We don't need the first parameter, since that's a constant
+ +                 * which only affects the energies, not the forces.
+ +                 */
+ +                for (j = 1; j < NR_RBDIHS; j++)
+ +                {
+ +                    parm[j*GMX_SIMD_REAL_WIDTH + s] =
+ +                        forceparams[type].rbdihs.rbcA[j];
+ +                }
+ +
+ +                if (iu + nfa1 < nbonds)
+ +                {
+ +                    iu += nfa1;
+ +                }
+ +            }
+ +            else
+ +            {
+ +                for (j = 1; j < NR_RBDIHS; j++)
+ +                {
+ +                    parm[j*GMX_SIMD_REAL_WIDTH + s] = 0;
+ +                }
+ +            }
+ +        }
+ +
+ +        /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+ +        dih_angle_simd(x, ai, aj, ak, al, pbc_simd,
+ +                       &phi_S,
+ +                       &mx_S, &my_S, &mz_S,
+ +                       &nx_S, &ny_S, &nz_S,
+ +                       &nrkj_m2_S,
+ +                       &nrkj_n2_S,
+ +                       &p_S, &q_S);
+ +
+ +        /* Change to polymer convention */
+ +        phi_S = phi_S - pi_S;
+ +
+ +        sincos(phi_S, &sin_S, &cos_S);
+ +
+ +        ddphi_S   = setZero();
+ +        c_S       = one_S;
+ +        cosfac_S  = one_S;
+ +        for (j = 1; j < NR_RBDIHS; j++)
+ +        {
+ +            parm_S   = load<SimdReal>(parm + j*GMX_SIMD_REAL_WIDTH);
+ +            ddphi_S  = fma(c_S * parm_S, cosfac_S, ddphi_S);
+ +            cosfac_S = cosfac_S * cos_S;
+ +            c_S      = c_S + one_S;
+ +        }
+ +
+ +        /* Note that here we do not use the minus sign which is present
+ +         * in the normal RB code. This is corrected for through (m)sf below.
+ +         */
+ +        ddphi_S  = ddphi_S * sin_S;
+ +
+ +        sf_i_S   = ddphi_S * nrkj_m2_S;
+ +        msf_l_S  = ddphi_S * nrkj_n2_S;
+ +
+ +        /* After this m?_S will contain f[i] */
+ +        mx_S     = sf_i_S * mx_S;
+ +        my_S     = sf_i_S * my_S;
+ +        mz_S     = sf_i_S * mz_S;
+ +
+ +        /* After this m?_S will contain -f[l] */
+ +        nx_S     = msf_l_S * nx_S;
+ +        ny_S     = msf_l_S * ny_S;
+ +        nz_S     = msf_l_S * nz_S;
+ +
+ +        do_dih_fup_noshiftf_simd(ai, aj, ak, al,
+ +                                 p_S, q_S,
+ +                                 mx_S, my_S, mz_S,
+ +                                 nx_S, ny_S, nz_S,
+ +                                 f);
+ +    }
+ +
+ +    return 0;
+ +}
+ +
+ +#endif // GMX_SIMD_HAVE_REAL
+ +
+ +
+ +template <BondedKernelFlavor flavor>
+ +real idihs(int nbonds,
+ +           const t_iatom forceatoms[], const t_iparams forceparams[],
+ +           const rvec x[], rvec4 f[], rvec fshift[],
+ +           const t_pbc *pbc, const t_graph *g,
+ +           real lambda, real *dvdlambda,
+ +           const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +           int gmx_unused *global_atom_index)
+ +{
+ +    int  i, type, ai, aj, ak, al;
+ +    int  t1, t2, t3;
+ +    real phi, phi0, dphi0, ddphi, vtot;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +    real L1, kk, dp, dp2, kA, kB, pA, pB, dvdl_term;
+ +
+ +    L1        = 1.0-lambda;
+ +    dvdl_term = 0;
+ +    vtot      = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &t1, &t2, &t3);  /*  84               */
+ +
+ +        /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ +         * force changes if we just apply a normal harmonic.
+ +         * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ +         * This means we will never have the periodicity problem, unless
+ +         * the dihedral is Pi away from phiO, which is very unlikely due to
+ +         * the potential.
+ +         */
+ +        kA = forceparams[type].harmonic.krA;
+ +        kB = forceparams[type].harmonic.krB;
+ +        pA = forceparams[type].harmonic.rA;
+ +        pB = forceparams[type].harmonic.rB;
+ +
+ +        kk    = L1*kA + lambda*kB;
+ +        phi0  = (L1*pA + lambda*pB)*DEG2RAD;
+ +        dphi0 = (pB - pA)*DEG2RAD;
+ +
+ +        dp = phi-phi0;
+ +
+ +        make_dp_periodic(&dp);
+ +
+ +        dp2 = dp*dp;
+ +
+ +        vtot += 0.5*kk*dp2;
+ +        ddphi = -kk*dp;
+ +
+ +        dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
+ +
+ +        do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
+ +                           f, fshift, pbc, g, x, t1, t2, t3); /* 112          */
+ +        /* 218 TOTAL  */
+ +    }
+ +
+ +    *dvdlambda += dvdl_term;
+ +    return vtot;
+ +}
+ +
+ +/*! \brief Computes angle restraints of two different types */
+ +template <BondedKernelFlavor flavor>
+ +real low_angres(int nbonds,
+ +                const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                const rvec x[], rvec4 f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                real lambda, real *dvdlambda,
+ +                gmx_bool bZAxis)
+ +{
+ +    int  i, m, type, ai, aj, ak, al;
+ +    int  t1, t2;
+ +    real phi, cos_phi, cos_phi2, vid, vtot, dVdphi;
+ +    rvec r_ij, r_kl, f_i, f_k = {0, 0, 0};
+ +    real st, sth, nrij2, nrkl2, c, cij, ckl;
+ +
+ +    ivec dt;
+ +    t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */
+ +
+ +    vtot = 0.0;
+ +    ak   = al = 0; /* to avoid warnings */
+ +    for (i = 0; i < nbonds; )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        t1   = pbc_rvec_sub(pbc, x[aj], x[ai], r_ij);       /*  3             */
+ +        if (!bZAxis)
+ +        {
+ +            ak   = forceatoms[i++];
+ +            al   = forceatoms[i++];
+ +            t2   = pbc_rvec_sub(pbc, x[al], x[ak], r_kl);  /*  3              */
+ +        }
+ +        else
+ +        {
+ +            r_kl[XX] = 0;
+ +            r_kl[YY] = 0;
+ +            r_kl[ZZ] = 1;
+ +        }
+ +
+ +        cos_phi = cos_angle(r_ij, r_kl); /* 25                */
+ +        phi     = std::acos(cos_phi);    /* 10           */
+ +
+ +        *dvdlambda += dopdihs_min(forceparams[type].pdihs.cpA,
+ +                                  forceparams[type].pdihs.cpB,
+ +                                  forceparams[type].pdihs.phiA,
+ +                                  forceparams[type].pdihs.phiB,
+ +                                  forceparams[type].pdihs.mult,
+ +                                  phi, lambda, &vid, &dVdphi); /*  40  */
+ +
+ +        vtot += vid;
+ +
+ +        cos_phi2 = gmx::square(cos_phi);                /*   1                */
+ +        if (cos_phi2 < 1)
+ +        {
+ +            st    = -dVdphi*gmx::invsqrt(1 - cos_phi2); /*  12                */
+ +            sth   = st*cos_phi;                         /*   1                */
+ +            nrij2 = iprod(r_ij, r_ij);                  /*   5                */
+ +            nrkl2 = iprod(r_kl, r_kl);                  /*   5          */
+ +
+ +            c   = st*gmx::invsqrt(nrij2*nrkl2);         /*  11                */
+ +            cij = sth/nrij2;                            /*  10                */
+ +            ckl = sth/nrkl2;                            /*  10                */
+ +
+ +            for (m = 0; m < DIM; m++)                   /*  18+18       */
+ +            {
+ +                f_i[m]    = (c*r_kl[m]-cij*r_ij[m]);
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] -= f_i[m];
+ +                if (!bZAxis)
+ +                {
+ +                    f_k[m]    = (c*r_ij[m]-ckl*r_kl[m]);
+ +                    f[ak][m] += f_k[m];
+ +                    f[al][m] -= f_k[m];
+ +                }
+ +            }
+ +
+ +            if (computeVirial(flavor))
+ +            {
+ +                if (g)
+ +                {
+ +                    ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ +                    t1 = IVEC2IS(dt);
+ +                }
+ +                rvec_inc(fshift[t1], f_i);
+ +                rvec_dec(fshift[CENTRAL], f_i);
+ +                if (!bZAxis)
+ +                {
+ +                    if (g)
+ +                    {
+ +                        ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, al), dt);
+ +                        t2 = IVEC2IS(dt);
+ +                    }
+ +                    rvec_inc(fshift[t2], f_k);
+ +                    rvec_dec(fshift[CENTRAL], f_k);
+ +                }
+ +            }
+ +        }
+ +    }
+ +
+ +    return vtot; /*  184 / 157 (bZAxis)  total  */
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real angres(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec4 f[], rvec fshift[],
+ +            const t_pbc *pbc, const t_graph *g,
+ +            real lambda, real *dvdlambda,
+ +            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +            int gmx_unused *global_atom_index)
+ +{
+ +    return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
+ +                              lambda, dvdlambda, FALSE);
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real angresz(int nbonds,
+ +             const t_iatom forceatoms[], const t_iparams forceparams[],
+ +             const rvec x[], rvec4 f[], rvec fshift[],
+ +             const t_pbc *pbc, const t_graph *g,
+ +             real lambda, real *dvdlambda,
+ +             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +             int gmx_unused *global_atom_index)
+ +{
+ +    return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
+ +                              lambda, dvdlambda, TRUE);
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real dihres(int nbonds,
+ +            const t_iatom forceatoms[], const t_iparams forceparams[],
+ +            const rvec x[], rvec4 f[], rvec fshift[],
+ +            const t_pbc *pbc, const t_graph *g,
+ +            real lambda, real *dvdlambda,
+ +            const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +            int gmx_unused  *global_atom_index)
+ +{
+ +    real vtot = 0;
+ +    int  ai, aj, ak, al, i, type, t1, t2, t3;
+ +    real phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac;
+ +    real phi, ddphi, ddp, ddp2, dp, d2r, L1;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +
+ +    L1 = 1.0-lambda;
+ +
+ +    d2r = DEG2RAD;
+ +
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi0A  = forceparams[type].dihres.phiA*d2r;
+ +        dphiA  = forceparams[type].dihres.dphiA*d2r;
+ +        kfacA  = forceparams[type].dihres.kfacA;
+ +
+ +        phi0B  = forceparams[type].dihres.phiB*d2r;
+ +        dphiB  = forceparams[type].dihres.dphiB*d2r;
+ +        kfacB  = forceparams[type].dihres.kfacB;
+ +
+ +        phi0  = L1*phi0A + lambda*phi0B;
+ +        dphi  = L1*dphiA + lambda*dphiB;
+ +        kfac  = L1*kfacA + lambda*kfacB;
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &t1, &t2, &t3);
+ +        /* 84 flops */
+ +
+ +        /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ +         * force changes if we just apply a normal harmonic.
+ +         * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ +         * This means we will never have the periodicity problem, unless
+ +         * the dihedral is Pi away from phiO, which is very unlikely due to
+ +         * the potential.
+ +         */
+ +        dp = phi-phi0;
+ +        make_dp_periodic(&dp);
+ +
+ +        if (dp > dphi)
+ +        {
+ +            ddp = dp-dphi;
+ +        }
+ +        else if (dp < -dphi)
+ +        {
+ +            ddp = dp+dphi;
+ +        }
+ +        else
+ +        {
+ +            ddp = 0;
+ +        }
+ +
+ +        if (ddp != 0.0)
+ +        {
+ +            ddp2  = ddp*ddp;
+ +            vtot += 0.5*kfac*ddp2;
+ +            ddphi = kfac*ddp;
+ +
+ +            *dvdlambda += 0.5*(kfacB - kfacA)*ddp2;
+ +            /* lambda dependence from changing restraint distances */
+ +            if (ddp > 0)
+ +            {
+ +                *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A));
+ +            }
+ +            else if (ddp < 0)
+ +            {
+ +                *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A));
+ +            }
+ +            do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ +                               f, fshift, pbc, g, x, t1, t2, t3);      /* 112         */
+ +        }
+ +    }
+ +    return vtot;
+ +}
+ +
+ +
+ +real unimplemented(int gmx_unused nbonds,
+ +                   const t_iatom gmx_unused forceatoms[], const t_iparams gmx_unused forceparams[],
+ +                   const rvec gmx_unused x[], rvec4 gmx_unused f[], rvec gmx_unused fshift[],
+ +                   const t_pbc gmx_unused *pbc, const t_graph  gmx_unused *g,
+ +                   real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                   const t_mdatoms  gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                   int gmx_unused *global_atom_index)
+ +{
+ +    gmx_impl("*** you are using a not implemented function");
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real restrangles(int nbonds,
+ +                 const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                 const rvec x[], rvec4 f[], rvec fshift[],
+ +                 const t_pbc *pbc, const t_graph *g,
+ +                 real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                 const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                 int gmx_unused *global_atom_index)
+ +{
+ +    int    i, d, ai, aj, ak, type, m;
+ +    int    t1, t2;
+ +    real   v, vtot;
+ +    ivec   jt, dt_ij, dt_kj;
+ +    rvec   f_i, f_j, f_k;
+ +    double prefactor, ratio_ante, ratio_post;
+ +    rvec   delta_ante, delta_post, vec_temp;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+ +        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_post);
+ +
+ +
+ +        /* This function computes factors needed for restricted angle potential.
+ +         * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+ +         * when three particles align and the dihedral angle and dihedral potential
+ +         * cannot be calculated. This potential is calculated using the formula:
+ +           real restrangles(int nbonds,
+ +            const t_iatom forceatoms[],const t_iparams forceparams[],
+ +            const rvec x[],rvec4 f[],rvec fshift[],
+ +            const t_pbc *pbc,const t_graph *g,
+ +            real gmx_unused lambda,real gmx_unused *dvdlambda,
+ +            const t_mdatoms gmx_unused *md,t_fcdata gmx_unused *fcd,
+ +            int gmx_unused *global_atom_index)
+ +           {
+ +           int  i, d, ai, aj, ak, type, m;
+ +           int t1, t2;
+ +           rvec r_ij,r_kj;
+ +           real v, vtot;
+ +           ivec jt,dt_ij,dt_kj;
+ +           rvec f_i, f_j, f_k;
+ +           real prefactor, ratio_ante, ratio_post;
+ +           rvec delta_ante, delta_post, vec_temp;
+ +
+ +           vtot = 0.0;
+ +           for(i=0; (i<nbonds); )
+ +           {
+ +           type = forceatoms[i++];
+ +           ai   = forceatoms[i++];
+ +           aj   = forceatoms[i++];
+ +           ak   = forceatoms[i++];
+ +
+ +         * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta} \frac{(\cos\theta_i - \cos\theta_0)^2}
+ +         * {\sin^2\theta_i}\f] ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+ +         * For more explanations see comments file "restcbt.h". */
+ +
+ +        compute_factors_restangles(type, forceparams,  delta_ante, delta_post,
+ +                                   &prefactor, &ratio_ante, &ratio_post, &v);
+ +
+ +        /*   Forces are computed per component */
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            f_i[d] = prefactor * (ratio_ante * delta_ante[d] - delta_post[d]);
+ +            f_j[d] = prefactor * ((ratio_post + 1.0) * delta_post[d] - (ratio_ante + 1.0) * delta_ante[d]);
+ +            f_k[d] = prefactor * (delta_ante[d] - ratio_post * delta_post[d]);
+ +        }
+ +
+ +        /*   Computation of potential energy   */
+ +
+ +        vtot += v;
+ +
+ +        /*   Update forces */
+ +
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +        }
+ +    }
+ +    return vtot;
+ +}
+ +
+ +
+ +template <BondedKernelFlavor flavor>
+ +real restrdihs(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec4 f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph *g,
+ +               real gmx_unused lambda, real gmx_unused *dvlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    int  i, d, type, ai, aj, ak, al;
+ +    rvec f_i, f_j, f_k, f_l;
+ +    rvec dx_jl;
+ +    ivec jt, dt_ij, dt_kj, dt_lj;
+ +    int  t1, t2, t3;
+ +    real v, vtot;
+ +    rvec delta_ante,  delta_crnt, delta_post, vec_temp;
+ +    real factor_phi_ai_ante, factor_phi_ai_crnt, factor_phi_ai_post;
+ +    real factor_phi_aj_ante, factor_phi_aj_crnt, factor_phi_aj_post;
+ +    real factor_phi_ak_ante, factor_phi_ak_crnt, factor_phi_ak_post;
+ +    real factor_phi_al_ante, factor_phi_al_crnt, factor_phi_al_post;
+ +    real prefactor_phi;
+ +
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+ +        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+ +        pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+ +        pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+ +
+ +        /* This function computes factors needed for restricted angle potential.
+ +         * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+ +         * when three particles align and the dihedral angle and dihedral potential cannot be calculated.
+ +         * This potential is calculated using the formula:
+ +         * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta}
+ +         * \frac{(\cos\theta_i - \cos\theta_0)^2}{\sin^2\theta_i}\f]
+ +         * ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+ +         * For more explanations see comments file "restcbt.h" */
+ +
+ +        compute_factors_restrdihs(type, forceparams,
+ +                                  delta_ante, delta_crnt, delta_post,
+ +                                  &factor_phi_ai_ante, &factor_phi_ai_crnt, &factor_phi_ai_post,
+ +                                  &factor_phi_aj_ante, &factor_phi_aj_crnt, &factor_phi_aj_post,
+ +                                  &factor_phi_ak_ante, &factor_phi_ak_crnt, &factor_phi_ak_post,
+ +                                  &factor_phi_al_ante, &factor_phi_al_crnt, &factor_phi_al_post,
+ +                                  &prefactor_phi, &v);
+ +
+ +
+ +        /*      Computation of forces per component */
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            f_i[d] = prefactor_phi * (factor_phi_ai_ante * delta_ante[d] + factor_phi_ai_crnt * delta_crnt[d] + factor_phi_ai_post * delta_post[d]);
+ +            f_j[d] = prefactor_phi * (factor_phi_aj_ante * delta_ante[d] + factor_phi_aj_crnt * delta_crnt[d] + factor_phi_aj_post * delta_post[d]);
+ +            f_k[d] = prefactor_phi * (factor_phi_ak_ante * delta_ante[d] + factor_phi_ak_crnt * delta_crnt[d] + factor_phi_ak_post * delta_post[d]);
+ +            f_l[d] = prefactor_phi * (factor_phi_al_ante * delta_ante[d] + factor_phi_al_crnt * delta_crnt[d] + factor_phi_al_post * delta_post[d]);
+ +        }
+ +        /*      Computation of the energy */
+ +
+ +        vtot += v;
+ +
+ +
+ +
+ +        /*    Updating the forces */
+ +
+ +        rvec_inc(f[ai], f_i);
+ +        rvec_inc(f[aj], f_j);
+ +        rvec_inc(f[ak], f_k);
+ +        rvec_inc(f[al], f_l);
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +                t3 = IVEC2IS(dt_lj);
+ +            }
+ +            else if (pbc)
+ +            {
+ +                t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+ +            }
+ +            else
+ +            {
+ +                t3 = CENTRAL;
+ +            }
+ +
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +            rvec_inc(fshift[t3], f_l);
+ +        }
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +
+ +template <BondedKernelFlavor flavor>
+ +real cbtdihs(int nbonds,
+ +             const t_iatom forceatoms[], const t_iparams forceparams[],
+ +             const rvec x[], rvec4 f[], rvec fshift[],
+ +             const t_pbc *pbc, const t_graph *g,
+ +             real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +             const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +             int gmx_unused *global_atom_index)
+ +{
+ +    int  type, ai, aj, ak, al, i, d;
+ +    int  t1, t2, t3;
+ +    real v, vtot;
+ +    rvec vec_temp;
+ +    rvec f_i, f_j, f_k, f_l;
+ +    ivec jt, dt_ij, dt_kj, dt_lj;
+ +    rvec dx_jl;
+ +    rvec delta_ante, delta_crnt, delta_post;
+ +    rvec f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al;
+ +    rvec f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak;
+ +    rvec f_theta_post_aj, f_theta_post_ak, f_theta_post_al;
+ +
+ +
+ +
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+ +        pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], vec_temp);
+ +        pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+ +        pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+ +        pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+ +
+ +        /* \brief Compute factors for CBT potential
+ +         * The combined bending-torsion potential goes to zero in a very smooth manner, eliminating the numerical
+ +         * instabilities, when three coarse-grained particles align and the dihedral angle and standard
+ +         * dihedral potentials cannot be calculated. The CBT potential is calculated using the formula:
+ +         * \f[V_{\rm CBT}(\theta_{i-1}, \theta_i, \phi_i) = k_{\phi} \sin^3\theta_{i-1} \sin^3\theta_{i}
+ +         * \sum_{n=0}^4 { a_n \cos^n\phi_i}\f] ({eq:CBT} and ref \cite{MonicaGoga2013} from the manual).
+ +         * It contains in its expression not only the dihedral angle \f$\phi\f$
+ +         * but also \f[\theta_{i-1}\f] (theta_ante bellow) and \f[\theta_{i}\f] (theta_post bellow)
+ +         * --- the adjacent bending angles.
+ +         * For more explanations see comments file "restcbt.h". */
+ +
+ +        compute_factors_cbtdihs(type, forceparams, delta_ante, delta_crnt, delta_post,
+ +                                f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al,
+ +                                f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak,
+ +                                f_theta_post_aj, f_theta_post_ak, f_theta_post_al,
+ +                                &v);
+ +
+ +
+ +        /*      Acumulate the resuts per beads */
+ +        for (d = 0; d < DIM; d++)
+ +        {
+ +            f_i[d] = f_phi_ai[d] + f_theta_ante_ai[d];
+ +            f_j[d] = f_phi_aj[d] + f_theta_ante_aj[d] + f_theta_post_aj[d];
+ +            f_k[d] = f_phi_ak[d] + f_theta_ante_ak[d] + f_theta_post_ak[d];
+ +            f_l[d] = f_phi_al[d] + f_theta_post_al[d];
+ +        }
+ +
+ +        /*      Compute the potential energy */
+ +
+ +        vtot += v;
+ +
+ +
+ +        /*  Updating the forces */
+ +        rvec_inc(f[ai], f_i);
+ +        rvec_inc(f[aj], f_j);
+ +        rvec_inc(f[ak], f_k);
+ +        rvec_inc(f[al], f_l);
+ +
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +                t3 = IVEC2IS(dt_lj);
+ +            }
+ +            else if (pbc)
+ +            {
+ +                t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+ +            }
+ +            else
+ +            {
+ +                t3 = CENTRAL;
+ +            }
+ +
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);
+ +            rvec_inc(fshift[t3], f_l);
+ +        }
+ +    }
+ +
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+ +rbdihs(int nbonds,
+ +       const t_iatom forceatoms[], const t_iparams forceparams[],
+ +       const rvec x[], rvec4 f[], rvec fshift[],
+ +       const t_pbc *pbc, const t_graph *g,
+ +       real lambda, real *dvdlambda,
+ +       const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +       int gmx_unused *global_atom_index)
+ +{
+ +    const real c0 = 0.0, c1 = 1.0, c2 = 2.0, c3 = 3.0, c4 = 4.0, c5 = 5.0;
+ +    int        type, ai, aj, ak, al, i, j;
+ +    int        t1, t2, t3;
+ +    rvec       r_ij, r_kj, r_kl, m, n;
+ +    real       parmA[NR_RBDIHS];
+ +    real       parmB[NR_RBDIHS];
+ +    real       parm[NR_RBDIHS];
+ +    real       cos_phi, phi, rbp, rbpBA;
+ +    real       v, ddphi, sin_phi;
+ +    real       cosfac, vtot;
+ +    real       L1        = 1.0-lambda;
+ +    real       dvdl_term = 0;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &t1, &t2, &t3);  /*  84               */
+ +
+ +        /* Change to polymer convention */
+ +        if (phi < c0)
+ +        {
+ +            phi += M_PI;
+ +        }
+ +        else
+ +        {
+ +            phi -= M_PI;    /*   1            */
+ +
+ +        }
+ +        cos_phi = std::cos(phi);
+ +        /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
+ +        sin_phi = std::sin(phi);
+ +
+ +        for (j = 0; (j < NR_RBDIHS); j++)
+ +        {
+ +            parmA[j] = forceparams[type].rbdihs.rbcA[j];
+ +            parmB[j] = forceparams[type].rbdihs.rbcB[j];
+ +            parm[j]  = L1*parmA[j]+lambda*parmB[j];
+ +        }
+ +        /* Calculate cosine powers */
+ +        /* Calculate the energy */
+ +        /* Calculate the derivative */
+ +
+ +        v            = parm[0];
+ +        dvdl_term   += (parmB[0]-parmA[0]);
+ +        ddphi        = c0;
+ +        cosfac       = c1;
+ +
+ +        rbp          = parm[1];
+ +        rbpBA        = parmB[1]-parmA[1];
+ +        ddphi       += rbp*cosfac;
+ +        cosfac      *= cos_phi;
+ +        v           += cosfac*rbp;
+ +        dvdl_term   += cosfac*rbpBA;
+ +        rbp          = parm[2];
+ +        rbpBA        = parmB[2]-parmA[2];
+ +        ddphi       += c2*rbp*cosfac;
+ +        cosfac      *= cos_phi;
+ +        v           += cosfac*rbp;
+ +        dvdl_term   += cosfac*rbpBA;
+ +        rbp          = parm[3];
+ +        rbpBA        = parmB[3]-parmA[3];
+ +        ddphi       += c3*rbp*cosfac;
+ +        cosfac      *= cos_phi;
+ +        v           += cosfac*rbp;
+ +        dvdl_term   += cosfac*rbpBA;
+ +        rbp          = parm[4];
+ +        rbpBA        = parmB[4]-parmA[4];
+ +        ddphi       += c4*rbp*cosfac;
+ +        cosfac      *= cos_phi;
+ +        v           += cosfac*rbp;
+ +        dvdl_term   += cosfac*rbpBA;
+ +        rbp          = parm[5];
+ +        rbpBA        = parmB[5]-parmA[5];
+ +        ddphi       += c5*rbp*cosfac;
+ +        cosfac      *= cos_phi;
+ +        v           += cosfac*rbp;
+ +        dvdl_term   += cosfac*rbpBA;
+ +
+ +        ddphi = -ddphi*sin_phi;         /*  11                */
+ +
+ +        do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ +                           f, fshift, pbc, g, x, t1, t2, t3); /* 112          */
+ +        vtot += v;
+ +    }
+ +    *dvdlambda += dvdl_term;
+ +
+ +    return vtot;
+ +}
+ +
+ +//! \endcond
+ +
+ +/*! \brief Mysterious undocumented function */
+ +int
+ +cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
+ +{
+ +    int im1, ip1, ip2;
+ +
+ +    if (ip < 0)
+ +    {
+ +        ip = ip + grid_spacing - 1;
+ +    }
+ +    else if (ip > grid_spacing)
+ +    {
+ +        ip = ip - grid_spacing - 1;
+ +    }
+ +
+ +    im1 = ip - 1;
+ +    ip1 = ip + 1;
+ +    ip2 = ip + 2;
+ +
+ +    if (ip == 0)
+ +    {
+ +        im1 = grid_spacing - 1;
+ +    }
+ +    else if (ip == grid_spacing-2)
+ +    {
+ +        ip2 = 0;
+ +    }
+ +    else if (ip == grid_spacing-1)
+ +    {
+ +        ip1 = 0;
+ +        ip2 = 1;
+ +    }
+ +
+ +    *ipm1 = im1;
+ +    *ipp1 = ip1;
+ +    *ipp2 = ip2;
+ +
+ +    return ip;
+ +
+ +}
+ +
+ +} // namespace
+ +
+ +real
+ +cmap_dihs(int nbonds,
+ +          const t_iatom forceatoms[], const t_iparams forceparams[],
+ +          const gmx_cmap_t *cmap_grid,
+ +          const rvec x[], rvec4 f[], rvec fshift[],
+ +          const struct t_pbc *pbc, const struct t_graph *g,
+ +          real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +          const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +          int  gmx_unused *global_atom_index)
+ +{
+ +    int         i, n;
+ +    int         ai, aj, ak, al, am;
+ +    int         a1i, a1j, a1k, a1l, a2i, a2j, a2k, a2l;
+ +    int         type;
+ +    int         t11, t21, t31, t12, t22, t32;
+ +    int         iphi1, ip1m1, ip1p1, ip1p2;
+ +    int         iphi2, ip2m1, ip2p1, ip2p2;
+ +    int         l1, l2, l3;
+ +    int         pos1, pos2, pos3, pos4;
+ +
+ +    real        ty[4], ty1[4], ty2[4], ty12[4], tx[16];
+ +    real        phi1, cos_phi1, sin_phi1, xphi1;
+ +    real        phi2, cos_phi2, sin_phi2, xphi2;
+ +    real        dx, tt, tu, e, df1, df2, vtot;
+ +    real        ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1;
+ +    real        ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2;
+ +    real        fg1, hg1, fga1, hgb1, gaa1, gbb1;
+ +    real        fg2, hg2, fga2, hgb2, gaa2, gbb2;
+ +    real        fac;
+ +
+ +    rvec        r1_ij, r1_kj, r1_kl, m1, n1;
+ +    rvec        r2_ij, r2_kj, r2_kl, m2, n2;
+ +    rvec        f1_i, f1_j, f1_k, f1_l;
+ +    rvec        f2_i, f2_j, f2_k, f2_l;
+ +    rvec        a1, b1, a2, b2;
+ +    rvec        f1, g1, h1, f2, g2, h2;
+ +    rvec        dtf1, dtg1, dth1, dtf2, dtg2, dth2;
+ +    ivec        jt1, dt1_ij, dt1_kj, dt1_lj;
+ +    ivec        jt2, dt2_ij, dt2_kj, dt2_lj;
+ +
+ +    int         loop_index[4][4] = {
+ +        {0, 4, 8, 12},
+ +        {1, 5, 9, 13},
+ +        {2, 6, 10, 14},
+ +        {3, 7, 11, 15}
+ +    };
+ +
+ +    /* Total CMAP energy */
+ +    vtot = 0;
+ +
+ +    for (n = 0; n < nbonds; )
+ +    {
+ +        /* Five atoms are involved in the two torsions */
+ +        type   = forceatoms[n++];
+ +        ai     = forceatoms[n++];
+ +        aj     = forceatoms[n++];
+ +        ak     = forceatoms[n++];
+ +        al     = forceatoms[n++];
+ +        am     = forceatoms[n++];
+ +
+ +        /* Which CMAP type is this */
+ +        const int   cmapA = forceparams[type].cmap.cmapA;
+ +        const real *cmapd = cmap_grid->cmapdata[cmapA].cmap.data();
+ +
+ +        /* First torsion */
+ +        a1i   = ai;
+ +        a1j   = aj;
+ +        a1k   = ak;
+ +        a1l   = al;
+ +
+ +        phi1  = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1,
+ +                          &t11, &t21, &t31);  /* 84 */
+ +
+ +        cos_phi1 = std::cos(phi1);
+ +
+ +        a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1];
+ +        a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2];
+ +        a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */
+ +
+ +        b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1];
+ +        b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
+ +        b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
+ +
+ +        pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
+ +
+ +        ra21  = iprod(a1, a1);       /* 5 */
+ +        rb21  = iprod(b1, b1);       /* 5 */
+ +        rg21  = iprod(r1_kj, r1_kj); /* 5 */
+ +        rg1   = sqrt(rg21);
+ +
+ +        rgr1  = 1.0/rg1;
+ +        ra2r1 = 1.0/ra21;
+ +        rb2r1 = 1.0/rb21;
+ +        rabr1 = sqrt(ra2r1*rb2r1);
+ +
+ +        sin_phi1 = rg1 * rabr1 * iprod(a1, h1) * (-1);
+ +
+ +        if (cos_phi1 < -0.5 || cos_phi1 > 0.5)
+ +        {
+ +            phi1 = std::asin(sin_phi1);
+ +
+ +            if (cos_phi1 < 0)
+ +            {
+ +                if (phi1 > 0)
+ +                {
+ +                    phi1 = M_PI - phi1;
+ +                }
+ +                else
+ +                {
+ +                    phi1 = -M_PI - phi1;
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            phi1 = std::acos(cos_phi1);
+ +
+ +            if (sin_phi1 < 0)
+ +            {
+ +                phi1 = -phi1;
+ +            }
+ +        }
+ +
+ +        xphi1 = phi1 + M_PI; /* 1 */
+ +
+ +        /* Second torsion */
+ +        a2i   = aj;
+ +        a2j   = ak;
+ +        a2k   = al;
+ +        a2l   = am;
+ +
+ +        phi2  = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2,
+ +                          &t12, &t22, &t32); /* 84 */
+ +
+ +        cos_phi2 = std::cos(phi2);
+ +
+ +        a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1];
+ +        a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2];
+ +        a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */
+ +
+ +        b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1];
+ +        b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
+ +        b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
+ +
+ +        pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
+ +
+ +        ra22  = iprod(a2, a2);         /* 5 */
+ +        rb22  = iprod(b2, b2);         /* 5 */
+ +        rg22  = iprod(r2_kj, r2_kj);   /* 5 */
+ +        rg2   = sqrt(rg22);
+ +
+ +        rgr2  = 1.0/rg2;
+ +        ra2r2 = 1.0/ra22;
+ +        rb2r2 = 1.0/rb22;
+ +        rabr2 = sqrt(ra2r2*rb2r2);
+ +
+ +        sin_phi2 = rg2 * rabr2 * iprod(a2, h2) * (-1);
+ +
+ +        if (cos_phi2 < -0.5 || cos_phi2 > 0.5)
+ +        {
+ +            phi2 = std::asin(sin_phi2);
+ +
+ +            if (cos_phi2 < 0)
+ +            {
+ +                if (phi2 > 0)
+ +                {
+ +                    phi2 = M_PI - phi2;
+ +                }
+ +                else
+ +                {
+ +                    phi2 = -M_PI - phi2;
+ +                }
+ +            }
+ +        }
+ +        else
+ +        {
+ +            phi2 = std::acos(cos_phi2);
+ +
+ +            if (sin_phi2 < 0)
+ +            {
+ +                phi2 = -phi2;
+ +            }
+ +        }
+ +
+ +        xphi2 = phi2 + M_PI; /* 1 */
+ +
+ +        /* Range mangling */
+ +        if (xphi1 < 0)
+ +        {
+ +            xphi1 = xphi1 + 2*M_PI;
+ +        }
+ +        else if (xphi1 >= 2*M_PI)
+ +        {
+ +            xphi1 = xphi1 - 2*M_PI;
+ +        }
+ +
+ +        if (xphi2 < 0)
+ +        {
+ +            xphi2 = xphi2 + 2*M_PI;
+ +        }
+ +        else if (xphi2 >= 2*M_PI)
+ +        {
+ +            xphi2 = xphi2 - 2*M_PI;
+ +        }
+ +
+ +        /* Number of grid points */
+ +        dx = 2*M_PI / cmap_grid->grid_spacing;
+ +
+ +        /* Where on the grid are we */
+ +        iphi1 = static_cast<int>(xphi1/dx);
+ +        iphi2 = static_cast<int>(xphi2/dx);
+ +
+ +        iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2);
+ +        iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2);
+ +
+ +        pos1    = iphi1*cmap_grid->grid_spacing+iphi2;
+ +        pos2    = ip1p1*cmap_grid->grid_spacing+iphi2;
+ +        pos3    = ip1p1*cmap_grid->grid_spacing+ip2p1;
+ +        pos4    = iphi1*cmap_grid->grid_spacing+ip2p1;
+ +
+ +        ty[0]   = cmapd[pos1*4];
+ +        ty[1]   = cmapd[pos2*4];
+ +        ty[2]   = cmapd[pos3*4];
+ +        ty[3]   = cmapd[pos4*4];
+ +
+ +        ty1[0]   = cmapd[pos1*4+1];
+ +        ty1[1]   = cmapd[pos2*4+1];
+ +        ty1[2]   = cmapd[pos3*4+1];
+ +        ty1[3]   = cmapd[pos4*4+1];
+ +
+ +        ty2[0]   = cmapd[pos1*4+2];
+ +        ty2[1]   = cmapd[pos2*4+2];
+ +        ty2[2]   = cmapd[pos3*4+2];
+ +        ty2[3]   = cmapd[pos4*4+2];
+ +
+ +        ty12[0]   = cmapd[pos1*4+3];
+ +        ty12[1]   = cmapd[pos2*4+3];
+ +        ty12[2]   = cmapd[pos3*4+3];
+ +        ty12[3]   = cmapd[pos4*4+3];
+ +
+ +        /* Switch to degrees */
+ +        dx    = 360.0 / cmap_grid->grid_spacing;
+ +        xphi1 = xphi1 * RAD2DEG;
+ +        xphi2 = xphi2 * RAD2DEG;
+ +
+ +        for (i = 0; i < 4; i++) /* 16 */
+ +        {
+ +            tx[i]    = ty[i];
+ +            tx[i+4]  = ty1[i]*dx;
+ +            tx[i+8]  = ty2[i]*dx;
+ +            tx[i+12] = ty12[i]*dx*dx;
+ +        }
+ +
+ +        real tc[16] = {0};
+ +        for (int idx = 0; idx < 16; idx++) /* 1056 */
+ +        {
+ +            for (int k = 0; k < 16; k++)
+ +            {
+ +                tc[idx] += cmap_coeff_matrix[k*16+idx]*tx[k];
+ +            }
+ +        }
+ +
+ +        tt    = (xphi1-iphi1*dx)/dx;
+ +        tu    = (xphi2-iphi2*dx)/dx;
+ +
+ +        e     = 0;
+ +        df1   = 0;
+ +        df2   = 0;
+ +
+ +        for (i = 3; i >= 0; i--)
+ +        {
+ +            l1 = loop_index[i][3];
+ +            l2 = loop_index[i][2];
+ +            l3 = loop_index[i][1];
+ +
+ +            e     = tt * e    + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
+ +            df1   = tu * df1  + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
+ +            df2   = tt * df2  + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
+ +        }
+ +
+ +        fac     = RAD2DEG/dx;
+ +        df1     = df1   * fac;
+ +        df2     = df2   * fac;
+ +
+ +        /* CMAP energy */
+ +        vtot += e;
+ +
+ +        /* Do forces - first torsion */
+ +        fg1       = iprod(r1_ij, r1_kj);
+ +        hg1       = iprod(r1_kl, r1_kj);
+ +        fga1      = fg1*ra2r1*rgr1;
+ +        hgb1      = hg1*rb2r1*rgr1;
+ +        gaa1      = -ra2r1*rg1;
+ +        gbb1      = rb2r1*rg1;
+ +
+ +        for (i = 0; i < DIM; i++)
+ +        {
+ +            dtf1[i]   = gaa1 * a1[i];
+ +            dtg1[i]   = fga1 * a1[i] - hgb1 * b1[i];
+ +            dth1[i]   = gbb1 * b1[i];
+ +
+ +            f1[i]     = df1  * dtf1[i];
+ +            g1[i]     = df1  * dtg1[i];
+ +            h1[i]     = df1  * dth1[i];
+ +
+ +            f1_i[i]   =  f1[i];
+ +            f1_j[i]   = -f1[i] - g1[i];
+ +            f1_k[i]   =  h1[i] + g1[i];
+ +            f1_l[i]   = -h1[i];
+ +
+ +            f[a1i][i] = f[a1i][i] + f1_i[i];
+ +            f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */
+ +            f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */
+ +            f[a1l][i] = f[a1l][i] + f1_l[i]; /* h1[i] */
+ +        }
+ +
+ +        /* Do forces - second torsion */
+ +        fg2       = iprod(r2_ij, r2_kj);
+ +        hg2       = iprod(r2_kl, r2_kj);
+ +        fga2      = fg2*ra2r2*rgr2;
+ +        hgb2      = hg2*rb2r2*rgr2;
+ +        gaa2      = -ra2r2*rg2;
+ +        gbb2      = rb2r2*rg2;
+ +
+ +        for (i = 0; i < DIM; i++)
+ +        {
+ +            dtf2[i]   = gaa2 * a2[i];
+ +            dtg2[i]   = fga2 * a2[i] - hgb2 * b2[i];
+ +            dth2[i]   = gbb2 * b2[i];
+ +
+ +            f2[i]     = df2  * dtf2[i];
+ +            g2[i]     = df2  * dtg2[i];
+ +            h2[i]     = df2  * dth2[i];
+ +
+ +            f2_i[i]   =  f2[i];
+ +            f2_j[i]   = -f2[i] - g2[i];
+ +            f2_k[i]   =  h2[i] + g2[i];
+ +            f2_l[i]   = -h2[i];
+ +
+ +            f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */
+ +            f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */
+ +            f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */
+ +            f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */
+ +        }
+ +
+ +        /* Shift forces */
+ +        if (fshift != nullptr)
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, a1j), jt1);
+ +                ivec_sub(SHIFT_IVEC(g, a1i),  jt1, dt1_ij);
+ +                ivec_sub(SHIFT_IVEC(g, a1k),  jt1, dt1_kj);
+ +                ivec_sub(SHIFT_IVEC(g, a1l),  jt1, dt1_lj);
+ +                t11 = IVEC2IS(dt1_ij);
+ +                t21 = IVEC2IS(dt1_kj);
+ +                t31 = IVEC2IS(dt1_lj);
+ +
+ +                copy_ivec(SHIFT_IVEC(g, a2j), jt2);
+ +                ivec_sub(SHIFT_IVEC(g, a2i),  jt2, dt2_ij);
+ +                ivec_sub(SHIFT_IVEC(g, a2k),  jt2, dt2_kj);
+ +                ivec_sub(SHIFT_IVEC(g, a2l),  jt2, dt2_lj);
+ +                t12 = IVEC2IS(dt2_ij);
+ +                t22 = IVEC2IS(dt2_kj);
+ +                t32 = IVEC2IS(dt2_lj);
+ +            }
+ +            else if (pbc)
+ +            {
+ +                t31 = pbc_rvec_sub(pbc, x[a1l], x[a1j], h1);
+ +                t32 = pbc_rvec_sub(pbc, x[a2l], x[a2j], h2);
+ +            }
+ +            else
+ +            {
+ +                t31 = CENTRAL;
+ +                t32 = CENTRAL;
+ +            }
+ +
+ +            rvec_inc(fshift[t11], f1_i);
+ +            rvec_inc(fshift[CENTRAL], f1_j);
+ +            rvec_inc(fshift[t21], f1_k);
+ +            rvec_inc(fshift[t31], f1_l);
+ +
++            rvec_inc(fshift[t12], f2_i);
+ +            rvec_inc(fshift[CENTRAL], f2_j);
+ +            rvec_inc(fshift[t22], f2_k);
+ +            rvec_inc(fshift[t32], f2_l);
+ +        }
+ +    }
+ +    return vtot;
+ +}
+ +
+ +namespace
+ +{
+ +
+ +//! \cond
+ +/***********************************************************
+ + *
+ + *   G R O M O S  9 6   F U N C T I O N S
+ + *
+ + ***********************************************************/
+ +real g96harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ +                 real *V, real *F)
+ +{
+ +    const real half = 0.5;
+ +    real       L1, kk, x0, dx, dx2;
+ +    real       v, f, dvdlambda;
+ +
+ +    L1    = 1.0-lambda;
+ +    kk    = L1*kA+lambda*kB;
+ +    x0    = L1*xA+lambda*xB;
+ +
+ +    dx    = x-x0;
+ +    dx2   = dx*dx;
+ +
+ +    f          = -kk*dx;
+ +    v          = half*kk*dx2;
+ +    dvdlambda  = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
+ +
+ +    *F    = f;
+ +    *V    = v;
+ +
+ +    return dvdlambda;
+ +
+ +    /* That was 21 flops */
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real g96bonds(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec4 f[], rvec fshift[],
+ +              const t_pbc *pbc, const t_graph *g,
+ +              real lambda, real *dvdlambda,
+ +              const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +              int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ki, ai, aj, type;
+ +    real dr2, fbond, vbond, vtot;
+ +    rvec dx;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +
+ +        *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ +                                  forceparams[type].harmonic.krB,
+ +                                  forceparams[type].harmonic.rA,
+ +                                  forceparams[type].harmonic.rB,
+ +                                  dr2, lambda, &vbond, &fbond);
+ +
+ +        vtot  += 0.5*vbond;                                            /* 1*/
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /* 44 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +real g96bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ +                   rvec r_ij, rvec r_kj,
+ +                   int *t1, int *t2)
+ +/* Return value is the angle between the bonds i-j and j-k */
+ +{
+ +    real costh;
+ +
+ +    *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /*  3              */
+ +    *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /*  3              */
+ +
+ +    costh = cos_angle(r_ij, r_kj);         /* 25              */
+ +    /* 41 TOTAL       */
+ +    return costh;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real g96angles(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec4 f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph *g,
+ +               real lambda, real *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +               int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, type, m, t1, t2;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, dVdt, va, vtot;
+ +    real rij_1, rij_2, rkj_1, rkj_2, rijrkj_1;
+ +    rvec f_i, f_j, f_k;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        cos_theta  = g96bond_angle(x[ai], x[aj], x[ak], pbc, r_ij, r_kj, &t1, &t2);
+ +
+ +        *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ +                                  forceparams[type].harmonic.krB,
+ +                                  forceparams[type].harmonic.rA,
+ +                                  forceparams[type].harmonic.rB,
+ +                                  cos_theta, lambda, &va, &dVdt);
+ +        vtot    += va;
+ +
+ +        rij_1    = gmx::invsqrt(iprod(r_ij, r_ij));
+ +        rkj_1    = gmx::invsqrt(iprod(r_kj, r_kj));
+ +        rij_2    = rij_1*rij_1;
+ +        rkj_2    = rkj_1*rkj_1;
+ +        rijrkj_1 = rij_1*rkj_1;         /* 23 */
+ +
+ +        for (m = 0; (m < DIM); m++)     /*  42        */
+ +        {
+ +            f_i[m]    = dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta);
+ +            f_k[m]    = dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta);
+ +            f_j[m]    = -f_i[m]-f_k[m];
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);          /* 9 */
+ +        }
+ +        /* 163 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real cross_bond_bond(int nbonds,
+ +                     const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                     const rvec x[], rvec4 f[], rvec fshift[],
+ +                     const t_pbc *pbc, const t_graph *g,
+ +                     real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                     const t_mdatoms gmx_unused *md, t_fcdata gmx_unused  *fcd,
+ +                     int gmx_unused *global_atom_index)
+ +{
+ +    /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
+ +     * pp. 842-847
+ +     */
+ +    int  i, ai, aj, ak, type, m, t1, t2;
+ +    rvec r_ij, r_kj;
+ +    real vtot, vrr, s1, s2, r1, r2, r1e, r2e, krr;
+ +    rvec f_i, f_j, f_k;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        r1e  = forceparams[type].cross_bb.r1e;
+ +        r2e  = forceparams[type].cross_bb.r2e;
+ +        krr  = forceparams[type].cross_bb.krr;
+ +
+ +        /* Compute distance vectors ... */
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ +
+ +        /* ... and their lengths */
+ +        r1 = norm(r_ij);
+ +        r2 = norm(r_kj);
+ +
+ +        /* Deviations from ideality */
+ +        s1 = r1-r1e;
+ +        s2 = r2-r2e;
+ +
+ +        /* Energy (can be negative!) */
+ +        vrr   = krr*s1*s2;
+ +        vtot += vrr;
+ +
+ +        /* Forces */
+ +        svmul(-krr*s2/r1, r_ij, f_i);
+ +        svmul(-krr*s1/r2, r_kj, f_k);
+ +
+ +        for (m = 0; (m < DIM); m++)     /*  12        */
+ +        {
+ +            f_j[m]    = -f_i[m] - f_k[m];
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);          /* 9 */
+ +        }
+ +        /* 163 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real cross_bond_angle(int nbonds,
+ +                      const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                      const rvec x[], rvec4 f[], rvec fshift[],
+ +                      const t_pbc *pbc, const t_graph *g,
+ +                      real gmx_unused lambda, real gmx_unused *dvdlambda,
+ +                      const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ +                      int gmx_unused *global_atom_index)
+ +{
+ +    /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
+ +     * pp. 842-847
+ +     */
+ +    int  i, ai, aj, ak, type, m, t1, t2;
+ +    rvec r_ij, r_kj, r_ik;
+ +    real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3;
+ +    rvec f_i, f_j, f_k;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        r1e  = forceparams[type].cross_ba.r1e;
+ +        r2e  = forceparams[type].cross_ba.r2e;
+ +        r3e  = forceparams[type].cross_ba.r3e;
+ +        krt  = forceparams[type].cross_ba.krt;
+ +
+ +        /* Compute distance vectors ... */
+ +        t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ +        t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ +        pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
+ +
+ +        /* ... and their lengths */
+ +        r1 = norm(r_ij);
+ +        r2 = norm(r_kj);
+ +        r3 = norm(r_ik);
+ +
+ +        /* Deviations from ideality */
+ +        s1 = r1-r1e;
+ +        s2 = r2-r2e;
+ +        s3 = r3-r3e;
+ +
+ +        /* Energy (can be negative!) */
+ +        vrt   = krt*s3*(s1+s2);
+ +        vtot += vrt;
+ +
+ +        /* Forces */
+ +        k1 = -krt*(s3/r1);
+ +        k2 = -krt*(s3/r2);
+ +        k3 = -krt*(s1+s2)/r3;
+ +        for (m = 0; (m < DIM); m++)
+ +        {
+ +            f_i[m] = k1*r_ij[m] + k3*r_ik[m];
+ +            f_k[m] = k2*r_kj[m] - k3*r_ik[m];
+ +            f_j[m] = -f_i[m] - f_k[m];
+ +        }
+ +
+ +        for (m = 0; (m < DIM); m++)     /*  12        */
+ +        {
+ +            f[ai][m] += f_i[m];
+ +            f[aj][m] += f_j[m];
+ +            f[ak][m] += f_k[m];
+ +        }
+ +
+ +        if (computeVirial(flavor))
+ +        {
+ +            if (g)
+ +            {
+ +                copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                t1 = IVEC2IS(dt_ij);
+ +                t2 = IVEC2IS(dt_kj);
+ +            }
+ +            rvec_inc(fshift[t1], f_i);
+ +            rvec_inc(fshift[CENTRAL], f_j);
+ +            rvec_inc(fshift[t2], f_k);          /* 9 */
+ +        }
+ +        /* 163 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +/*! \brief Computes the potential and force for a tabulated potential */
+ +real bonded_tab(const char *type, int table_nr,
+ +                const bondedtable_t *table, real kA, real kB, real r,
+ +                real lambda, real *V, real *F)
+ +{
+ +    real k, tabscale, *VFtab, rt, eps, eps2, Yt, Ft, Geps, Heps2, Fp, VV, FF;
+ +    int  n0, nnn;
+ +    real dvdlambda;
+ +
+ +    k = (1.0 - lambda)*kA + lambda*kB;
+ +
+ +    tabscale = table->scale;
+ +    VFtab    = table->data;
+ +
+ +    rt    = r*tabscale;
+ +    n0    = static_cast<int>(rt);
+ +    if (n0 >= table->n)
+ +    {
+ +        gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
+ +                  type, table_nr, r, n0, n0+1, table->n);
+ +    }
+ +    eps   = rt - n0;
+ +    eps2  = eps*eps;
+ +    nnn   = 4*n0;
+ +    Yt    = VFtab[nnn];
+ +    Ft    = VFtab[nnn+1];
+ +    Geps  = VFtab[nnn+2]*eps;
+ +    Heps2 = VFtab[nnn+3]*eps2;
+ +    Fp    = Ft + Geps + Heps2;
+ +    VV    = Yt + Fp*eps;
+ +    FF    = Fp + Geps + 2.0*Heps2;
+ +
+ +    *F         = -k*FF*tabscale;
+ +    *V         = k*VV;
+ +    dvdlambda  = (kB - kA)*VV;
+ +
+ +    return dvdlambda;
+ +
+ +    /* That was 22 flops */
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real tab_bonds(int nbonds,
+ +               const t_iatom forceatoms[], const t_iparams forceparams[],
+ +               const rvec x[], rvec4 f[], rvec fshift[],
+ +               const t_pbc *pbc, const t_graph *g,
+ +               real lambda, real *dvdlambda,
+ +               const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ +               int gmx_unused  *global_atom_index)
+ +{
+ +    int  i, ki, ai, aj, type, table;
+ +    real dr, dr2, fbond, vbond, vtot;
+ +    rvec dx;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +
+ +        ki   = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /*   3      */
+ +        dr2  = iprod(dx, dx);                       /*   5            */
+ +        dr   = dr2*gmx::invsqrt(dr2);               /*  10            */
+ +
+ +        table = forceparams[type].tab.table;
+ +
+ +        *dvdlambda += bonded_tab("bond", table,
+ +                                 &fcd->bondtab[table],
+ +                                 forceparams[type].tab.kA,
+ +                                 forceparams[type].tab.kB,
+ +                                 dr, lambda, &vbond, &fbond); /*  22 */
+ +
+ +        if (dr2 == 0.0)
+ +        {
+ +            continue;
+ +        }
+ +
+ +
+ +        vtot  += vbond;                                                /* 1*/
+ +        fbond *= gmx::invsqrt(dr2);                                    /*   6         */
+ +
+ +        spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ +    }                                                                  /* 62 TOTAL    */
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real tab_angles(int nbonds,
+ +                const t_iatom forceatoms[], const t_iparams forceparams[],
+ +                const rvec x[], rvec4 f[], rvec fshift[],
+ +                const t_pbc *pbc, const t_graph *g,
+ +                real lambda, real *dvdlambda,
+ +                const t_mdatoms gmx_unused  *md, t_fcdata *fcd,
+ +                int gmx_unused *global_atom_index)
+ +{
+ +    int  i, ai, aj, ak, t1, t2, type, table;
+ +    rvec r_ij, r_kj;
+ +    real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ +    ivec jt, dt_ij, dt_kj;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +
+ +        theta  = bond_angle(x[ai], x[aj], x[ak], pbc,
+ +                            r_ij, r_kj, &cos_theta, &t1, &t2); /*  41         */
+ +
+ +        table = forceparams[type].tab.table;
+ +
+ +        *dvdlambda += bonded_tab("angle", table,
+ +                                 &fcd->angletab[table],
+ +                                 forceparams[type].tab.kA,
+ +                                 forceparams[type].tab.kB,
+ +                                 theta, lambda, &va, &dVdt); /*  22  */
+ +        vtot += va;
+ +
+ +        cos_theta2 = gmx::square(cos_theta);            /*   1                */
+ +        if (cos_theta2 < 1)
+ +        {
+ +            int  m;
+ +            real st, sth;
+ +            real cik, cii, ckk;
+ +            real nrkj2, nrij2;
+ +            rvec f_i, f_j, f_k;
+ +
+ +            st    = dVdt*gmx::invsqrt(1 - cos_theta2); /*  12         */
+ +            sth   = st*cos_theta;                      /*   1         */
+ +            nrkj2 = iprod(r_kj, r_kj);                 /*   5         */
+ +            nrij2 = iprod(r_ij, r_ij);
+ +
+ +            cik = st*gmx::invsqrt(nrkj2*nrij2); /*  12                */
+ +            cii = sth/nrij2;                    /*  10                */
+ +            ckk = sth/nrkj2;                    /*  10                */
+ +
+ +            for (m = 0; (m < DIM); m++)         /*  39                */
+ +            {
+ +                f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
+ +                f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
+ +                f_j[m]    = -f_i[m]-f_k[m];
+ +                f[ai][m] += f_i[m];
+ +                f[aj][m] += f_j[m];
+ +                f[ak][m] += f_k[m];
+ +            }
+ +
+ +            if (computeVirial(flavor))
+ +            {
+ +                if (g)
+ +                {
+ +                    copy_ivec(SHIFT_IVEC(g, aj), jt);
+ +
+ +                    ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ +                    ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ +                    t1 = IVEC2IS(dt_ij);
+ +                    t2 = IVEC2IS(dt_kj);
+ +                }
+ +                rvec_inc(fshift[t1], f_i);
+ +                rvec_inc(fshift[CENTRAL], f_j);
+ +                rvec_inc(fshift[t2], f_k);
+ +            }
+ +        }                                       /* 169 TOTAL  */
+ +    }
+ +    return vtot;
+ +}
+ +
+ +template <BondedKernelFlavor flavor>
+ +real tab_dihs(int nbonds,
+ +              const t_iatom forceatoms[], const t_iparams forceparams[],
+ +              const rvec x[], rvec4 f[], rvec fshift[],
+ +              const t_pbc *pbc, const t_graph *g,
+ +              real lambda, real *dvdlambda,
+ +              const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ +              int gmx_unused *global_atom_index)
+ +{
+ +    int  i, type, ai, aj, ak, al, table;
+ +    int  t1, t2, t3;
+ +    rvec r_ij, r_kj, r_kl, m, n;
+ +    real phi, ddphi, vpd, vtot;
+ +
+ +    vtot = 0.0;
+ +    for (i = 0; (i < nbonds); )
+ +    {
+ +        type = forceatoms[i++];
+ +        ai   = forceatoms[i++];
+ +        aj   = forceatoms[i++];
+ +        ak   = forceatoms[i++];
+ +        al   = forceatoms[i++];
+ +
+ +        phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ +                        &t1, &t2, &t3);  /*  84  */
+ +
+ +        table = forceparams[type].tab.table;
+ +
+ +        /* Hopefully phi+M_PI never results in values < 0 */
+ +        *dvdlambda += bonded_tab("dihedral", table,
+ +                                 &fcd->dihtab[table],
+ +                                 forceparams[type].tab.kA,
+ +                                 forceparams[type].tab.kB,
+ +                                 phi+M_PI, lambda, &vpd, &ddphi);
+ +
+ +        vtot += vpd;
+ +        do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
+ +                           f, fshift, pbc, g, x, t1, t2, t3); /* 112  */
+ +
+ +    }                                                         /* 227 TOTAL  */
+ +
+ +    return vtot;
+ +}
+ +
+ +struct BondedInteractions
+ +{
+ +    BondedFunction function;
+ +    int            nrnbIndex;
+ +};
+ +
+ +/*! \brief Lookup table of bonded interaction functions
+ + *
+ + * This must have as many entries as interaction_function in ifunc.cpp */
+ +template<BondedKernelFlavor flavor>
+ +const std::array<BondedInteractions, F_NRE> c_bondedInteractionFunctions
+ +    = {
+ +    BondedInteractions {bonds<flavor>, eNR_BONDS },                       // F_BONDS
+ +    BondedInteractions {g96bonds<flavor>, eNR_BONDS },                    // F_G96BONDS
+ +    BondedInteractions {morse_bonds<flavor>, eNR_MORSE },                 // F_MORSE
+ +    BondedInteractions {cubic_bonds<flavor>, eNR_CUBICBONDS },            // F_CUBICBONDS
+ +    BondedInteractions {unimplemented, -1 },                              // F_CONNBONDS
+ +    BondedInteractions {bonds<flavor>, eNR_BONDS },                       // F_HARMONIC
+ +    BondedInteractions {FENE_bonds<flavor>, eNR_FENEBONDS },              // F_FENEBONDS
+ +    BondedInteractions {tab_bonds<flavor>, eNR_TABBONDS },                // F_TABBONDS
+ +    BondedInteractions {tab_bonds<flavor>, eNR_TABBONDS },                // F_TABBONDSNC
+ +    BondedInteractions {restraint_bonds<flavor>, eNR_RESTRBONDS },        // F_RESTRBONDS
+ +    BondedInteractions {angles<flavor>, eNR_ANGLES },                     // F_ANGLES
+ +    BondedInteractions {g96angles<flavor>, eNR_ANGLES },                  // F_G96ANGLES
+ +    BondedInteractions {restrangles<flavor>, eNR_ANGLES },                // F_RESTRANGLES
+ +    BondedInteractions {linear_angles<flavor>, eNR_ANGLES },              // F_LINEAR_ANGLES
+ +    BondedInteractions {cross_bond_bond<flavor>, eNR_CROSS_BOND_BOND },   // F_CROSS_BOND_BONDS
+ +    BondedInteractions {cross_bond_angle<flavor>, eNR_CROSS_BOND_ANGLE }, // F_CROSS_BOND_ANGLES
+ +    BondedInteractions {urey_bradley<flavor>, eNR_UREY_BRADLEY },         // F_UREY_BRADLEY
+ +    BondedInteractions {quartic_angles<flavor>, eNR_QANGLES },            // F_QUARTIC_ANGLES
+ +    BondedInteractions {tab_angles<flavor>, eNR_TABANGLES },              // F_TABANGLES
+ +    BondedInteractions {pdihs<flavor>, eNR_PROPER },                      // F_PDIHS
+ +    BondedInteractions {rbdihs<flavor>, eNR_RB },                         // F_RBDIHS
+ +    BondedInteractions {restrdihs<flavor>, eNR_PROPER },                  // F_RESTRDIHS
+ +    BondedInteractions {cbtdihs<flavor>, eNR_RB },                        // F_CBTDIHS
+ +    BondedInteractions {rbdihs<flavor>, eNR_FOURDIH },                    // F_FOURDIHS
+ +    BondedInteractions {idihs<flavor>, eNR_IMPROPER },                    // F_IDIHS
+ +    BondedInteractions {pdihs<flavor>, eNR_IMPROPER },                    // F_PIDIHS
+ +    BondedInteractions {tab_dihs<flavor>, eNR_TABDIHS },                  // F_TABDIHS
+ +    BondedInteractions {unimplemented, eNR_CMAP },                        // F_CMAP
+ +    BondedInteractions {unimplemented, -1 },                              // F_GB12_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_GB13_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_GB14_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_GBPOL_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_NPSOLVATION_NOLONGERUSED
+ +    BondedInteractions {unimplemented, eNR_NB14 },                        // F_LJ14
+ +    BondedInteractions {unimplemented, -1 },                              // F_COUL14
+ +    BondedInteractions {unimplemented, eNR_NB14 },                        // F_LJC14_Q
+ +    BondedInteractions {unimplemented, eNR_NB14 },                        // F_LJC_PAIRS_NB
+ +    BondedInteractions {unimplemented, -1 },                              // F_LJ
+ +    BondedInteractions {unimplemented, -1 },                              // F_BHAM
+ +    BondedInteractions {unimplemented, -1 },                              // F_LJ_LR_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_BHAM_LR_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_DISPCORR
+ +    BondedInteractions {unimplemented, -1 },                              // F_COUL_SR
+ +    BondedInteractions {unimplemented, -1 },                              // F_COUL_LR_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_RF_EXCL
+ +    BondedInteractions {unimplemented, -1 },                              // F_COUL_RECIP
+ +    BondedInteractions {unimplemented, -1 },                              // F_LJ_RECIP
+ +    BondedInteractions {unimplemented, -1 },                              // F_DPD
+ +    BondedInteractions {polarize<flavor>, eNR_POLARIZE },                 // F_POLARIZATION
+ +    BondedInteractions {water_pol<flavor>, eNR_WPOL },                    // F_WATER_POL
+ +    BondedInteractions {thole_pol<flavor>, eNR_THOLE },                   // F_THOLE_POL
+ +    BondedInteractions {anharm_polarize<flavor>, eNR_ANHARM_POL },        // F_ANHARM_POL
+ +    BondedInteractions {unimplemented, -1 },                              // F_POSRES
+ +    BondedInteractions {unimplemented, -1 },                              // F_FBPOSRES
+ +    BondedInteractions {ta_disres, eNR_DISRES },                          // F_DISRES
+ +    BondedInteractions {unimplemented, -1 },                              // F_DISRESVIOL
+ +    BondedInteractions {orires, eNR_ORIRES },                             // F_ORIRES
+ +    BondedInteractions {unimplemented, -1 },                              // F_ORIRESDEV
+ +    BondedInteractions {angres<flavor>, eNR_ANGRES },                     // F_ANGRES
+ +    BondedInteractions {angresz<flavor>, eNR_ANGRESZ },                   // F_ANGRESZ
+ +    BondedInteractions {dihres<flavor>, eNR_DIHRES },                     // F_DIHRES
+ +    BondedInteractions {unimplemented, -1 },                              // F_DIHRESVIOL
+ +    BondedInteractions {unimplemented, -1 },                              // F_CONSTR
+ +    BondedInteractions {unimplemented, -1 },                              // F_CONSTRNC
+ +    BondedInteractions {unimplemented, -1 },                              // F_SETTLE
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE2
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE3
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE3FD
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE3FAD
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE3OUT
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE4FD
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITE4FDN
+ +    BondedInteractions {unimplemented, -1 },                              // F_VSITEN
+ +    BondedInteractions {unimplemented, -1 },                              // F_COM_PULL
+ +    BondedInteractions {unimplemented, -1 },                              // F_DENSITYFITTING
+ +    BondedInteractions {unimplemented, -1 },                              // F_EQM
+ +    BondedInteractions {unimplemented, -1 },                              // F_EPOT
+ +    BondedInteractions {unimplemented, -1 },                              // F_EKIN
+ +    BondedInteractions {unimplemented, -1 },                              // F_ETOT
+ +    BondedInteractions {unimplemented, -1 },                              // F_ECONSERVED
+ +    BondedInteractions {unimplemented, -1 },                              // F_TEMP
+ +    BondedInteractions {unimplemented, -1 },                              // F_VTEMP_NOLONGERUSED
+ +    BondedInteractions {unimplemented, -1 },                              // F_PDISPCORR
+ +    BondedInteractions {unimplemented, -1 },                              // F_PRES
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL_CONSTR
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL
+ +    BondedInteractions {unimplemented, -1 },                              // F_DKDL
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL_COUL
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL_VDW
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL_BONDED
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL_RESTRAINT
+ +    BondedInteractions {unimplemented, -1 },                              // F_DVDL_TEMPERATURE
+ +    };
+ +
+ +/*! \brief List of instantiated BondedInteractions list */
+ +const gmx::EnumerationArray < BondedKernelFlavor, std::array < BondedInteractions, F_NRE>> c_bondedInteractionFunctionsPerFlavor =
+ +{
+ +    c_bondedInteractionFunctions<BondedKernelFlavor::ForcesSimdWhenAvailable>,
+ +    c_bondedInteractionFunctions<BondedKernelFlavor::ForcesNoSimd>,
+ +    c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndVirialAndEnergy>,
+ +    c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndEnergy>
+ +};
+ +
+ +//! \endcond
+ +
+ +} // namespace
+ +
+ +real calculateSimpleBond(const int ftype,
+ +                         const int numForceatoms, const t_iatom forceatoms[],
+ +                         const t_iparams forceparams[],
+ +                         const rvec x[], rvec4 f[], rvec fshift[],
+ +                         const struct t_pbc *pbc,
+ +                         const struct t_graph gmx_unused *g,
+ +                         const real lambda, real *dvdlambda,
+ +                         const t_mdatoms *md, t_fcdata *fcd,
+ +                         int gmx_unused *global_atom_index,
+ +                         const BondedKernelFlavor bondedKernelFlavor)
+ +{
+ +    const BondedInteractions &bonded =
+ +        c_bondedInteractionFunctionsPerFlavor[bondedKernelFlavor][ftype];
+ +
+ +    real v = bonded.function(numForceatoms, forceatoms,
+ +                             forceparams,
+ +                             x, f, fshift,
+ +                             pbc, g, lambda, dvdlambda,
+ +                             md, fcd, global_atom_index);
+ +
+ +    return v;
+ +}
+ +
+ +int nrnbIndex(int ftype)
+ +{
+ +    return c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndVirialAndEnergy>[ftype].nrnbIndex;
+ +}
diff --cc src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index bdeefcbb660965d271ad34ed41f5c3e477e041dd,0000000000000000000000000000000000000000..1df63f01db93d40e2a8a87bce1c542fb3c8b2d93

mode 100644,000000..100644
--- 1/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
--- /dev/null
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@@ -1,991 -1,0 +1,991 @@@
- static int pick_ewald_kernel_type(bool                     bTwinCut)
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +/*! \file
+ + *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
+ + *
+ + *  \author Szilard Pall <pall.szilard@gmail.com>
+ + */
+ +#include "gmxpre.h"
+ +
+ +#include <assert.h>
+ +#include <stdarg.h>
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +
+ +// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+ +//      is currently declared means this has to be before gpu_types.h
+ +#include "nbnxm_cuda_types.h"
+ +
+ +// TODO Remove this comment when the above order issue is resolved
+ +#include "gromacs/gpu_utils/cudautils.cuh"
+ +#include "gromacs/gpu_utils/gpu_utils.h"
+ +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+ +#include "gromacs/gpu_utils/pmalloc_cuda.h"
+ +#include "gromacs/hardware/gpu_hw_info.h"
+ +#include "gromacs/math/vectypes.h"
+ +#include "gromacs/mdlib/force_flags.h"
+ +#include "gromacs/mdtypes/interaction_const.h"
+ +#include "gromacs/mdtypes/md_enums.h"
+ +#include "gromacs/nbnxm/atomdata.h"
+ +#include "gromacs/nbnxm/gpu_data_mgmt.h"
+ +#include "gromacs/nbnxm/gridset.h"
+ +#include "gromacs/nbnxm/nbnxm.h"
+ +#include "gromacs/nbnxm/nbnxm_gpu.h"
+ +#include "gromacs/nbnxm/pairlistsets.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/timing/gpu_timing.h"
+ +#include "gromacs/utility/basedefinitions.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/real.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
+ +#include "nbnxm_cuda.h"
+ +
+ +namespace Nbnxm
+ +{
+ +
+ +/* This is a heuristically determined parameter for the Kepler
+ + * and Maxwell architectures for the minimum size of ci lists by multiplying
+ + * this constant with the # of multiprocessors on the current device.
+ + * Since the maximum number of blocks per multiprocessor is 16, the ideal
+ + * count for small systems is 32 or 48 blocks per multiprocessor. Because
+ + * there is a bit of fluctuations in the generated block counts, we use
+ + * a target of 44 instead of the ideal value of 48.
+ + */
+ +static unsigned int gpu_min_ci_balanced_factor = 44;
+ +
+ +/* Fw. decl. */
+ +static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
+ +
+ +/* Fw. decl, */
+ +static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam);
+ +
+ +/*! \brief Return whether combination rules are used.
+ + *
+ + * \param[in]   pointer to nonbonded paramter struct
+ + * \return      true if combination rules are used in this run, false otherwise
+ + */
+ +static inline bool useLjCombRule(const cu_nbparam_t  *nbparam)
+ +{
+ +    return (nbparam->vdwtype == evdwCuCUTCOMBGEOM ||
+ +            nbparam->vdwtype == evdwCuCUTCOMBLB);
+ +}
+ +
+ +/*! \brief Initialized the Ewald Coulomb correction GPU table.
+ +
+ +    Tabulates the Ewald Coulomb force and initializes the size/scale
+ +    and the table GPU array. If called with an already allocated table,
+ +    it just re-uploads the table.
+ + */
+ +static void init_ewald_coulomb_force_table(const EwaldCorrectionTables &tables,
+ +                                           cu_nbparam_t                *nbp)
+ +{
+ +    if (nbp->coulomb_tab != nullptr)
+ +    {
+ +        nbnxn_cuda_free_nbparam_table(nbp);
+ +    }
+ +
+ +    nbp->coulomb_tab_scale = tables.scale;
+ +    initParamLookupTable(nbp->coulomb_tab, nbp->coulomb_tab_texobj,
+ +                         tables.tableF.data(), tables.tableF.size());
+ +}
+ +
+ +
+ +/*! Initializes the atomdata structure first time, it only gets filled at
+ +    pair-search. */
+ +static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
+ +{
+ +    cudaError_t stat;
+ +
+ +    ad->ntypes  = ntypes;
+ +    stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
+ +    ad->bShiftVecUploaded = false;
+ +
+ +    stat = cudaMalloc((void**)&ad->fshift, SHIFTS*sizeof(*ad->fshift));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->fshift");
+ +
+ +    stat = cudaMalloc((void**)&ad->e_lj, sizeof(*ad->e_lj));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_lj");
+ +    stat = cudaMalloc((void**)&ad->e_el, sizeof(*ad->e_el));
+ +    CU_RET_ERR(stat, "cudaMalloc failed on ad->e_el");
+ +
+ +    /* initialize to nullptr poiters to data that is not allocated here and will
+ +       need reallocation in nbnxn_cuda_init_atomdata */
+ +    ad->xq = nullptr;
+ +    ad->f  = nullptr;
+ +
+ +    /* size -1 indicates that the respective array hasn't been initialized yet */
+ +    ad->natoms = -1;
+ +    ad->nalloc = -1;
+ +}
+ +
+ +/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
+ +    earlier GPUs, single or twin cut-off. */
-         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
-         nbp->eeltype = pick_ewald_kernel_type(false);
++static int pick_ewald_kernel_type(const interaction_const_t &ic)
+ +{
++    bool bTwinCut = (ic.rcoulomb != ic.rvdw);
+ +    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+ +    int  kernel_type;
+ +
+ +    /* Benchmarking/development environment variables to force the use of
+ +       analytical or tabulated Ewald kernel. */
+ +    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr);
+ +    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr);
+ +
+ +    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+ +    {
+ +        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
+ +                   "requested through environment variables.");
+ +    }
+ +
+ +    /* By default use analytical Ewald. */
+ +    bUseAnalyticalEwald = true;
+ +    if (bForceAnalyticalEwald)
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
+ +        }
+ +    }
+ +    else if (bForceTabulatedEwald)
+ +    {
+ +        bUseAnalyticalEwald = false;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
+ +        }
+ +    }
+ +
+ +    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+ +       forces it (use it for debugging/benchmarking only). */
+ +    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == nullptr))
+ +    {
+ +        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
+ +    }
+ +    else
+ +    {
+ +        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
+ +    }
+ +
+ +    return kernel_type;
+ +}
+ +
+ +/*! Copies all parameters related to the cut-off from ic to nbp */
+ +static void set_cutoff_parameters(cu_nbparam_t              *nbp,
+ +                                  const interaction_const_t *ic,
+ +                                  const PairlistParams      &listParams)
+ +{
+ +    nbp->ewald_beta        = ic->ewaldcoeff_q;
+ +    nbp->sh_ewald          = ic->sh_ewald;
+ +    nbp->epsfac            = ic->epsfac;
+ +    nbp->two_k_rf          = 2.0 * ic->k_rf;
+ +    nbp->c_rf              = ic->c_rf;
+ +    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
+ +    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
+ +    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
+ +    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
+ +    nbp->useDynamicPruning = listParams.useDynamicPruning;
+ +
+ +    nbp->sh_lj_ewald       = ic->sh_lj_ewald;
+ +    nbp->ewaldcoeff_lj     = ic->ewaldcoeff_lj;
+ +
+ +    nbp->rvdw_switch       = ic->rvdw_switch;
+ +    nbp->dispersion_shift  = ic->dispersion_shift;
+ +    nbp->repulsion_shift   = ic->repulsion_shift;
+ +    nbp->vdw_switch        = ic->vdw_switch;
+ +}
+ +
+ +/*! Initializes the nonbonded parameter data structure. */
+ +static void init_nbparam(cu_nbparam_t                   *nbp,
+ +                         const interaction_const_t      *ic,
+ +                         const PairlistParams           &listParams,
+ +                         const nbnxn_atomdata_t::Params &nbatParams)
+ +{
+ +    int         ntypes;
+ +
+ +    ntypes  = nbatParams.numTypes;
+ +
+ +    set_cutoff_parameters(nbp, ic, listParams);
+ +
+ +    /* The kernel code supports LJ combination rules (geometric and LB) for
+ +     * all kernel types, but we only generate useful combination rule kernels.
+ +     * We currently only use LJ combination rule (geometric and LB) kernels
+ +     * for plain cut-off LJ. On Maxwell the force only kernels speed up 15%
+ +     * with PME and 20% with RF, the other kernels speed up about half as much.
+ +     * For LJ force-switch the geometric rule would give 7% speed-up, but this
+ +     * combination is rarely used. LJ force-switch with LB rule is more common,
+ +     * but gives only 1% speed-up.
+ +     */
+ +    if (ic->vdwtype == evdwCUT)
+ +    {
+ +        switch (ic->vdw_modifier)
+ +        {
+ +            case eintmodNONE:
+ +            case eintmodPOTSHIFT:
+ +                switch (nbatParams.comb_rule)
+ +                {
+ +                    case ljcrNONE:
+ +                        nbp->vdwtype = evdwCuCUT;
+ +                        break;
+ +                    case ljcrGEOM:
+ +                        nbp->vdwtype = evdwCuCUTCOMBGEOM;
+ +                        break;
+ +                    case ljcrLB:
+ +                        nbp->vdwtype = evdwCuCUTCOMBLB;
+ +                        break;
+ +                    default:
+ +                        gmx_incons("The requested LJ combination rule is not implemented in the CUDA GPU accelerated kernels!");
+ +                }
+ +                break;
+ +            case eintmodFORCESWITCH:
+ +                nbp->vdwtype = evdwCuFSWITCH;
+ +                break;
+ +            case eintmodPOTSWITCH:
+ +                nbp->vdwtype = evdwCuPSWITCH;
+ +                break;
+ +            default:
+ +                gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
+ +        }
+ +    }
+ +    else if (ic->vdwtype == evdwPME)
+ +    {
+ +        if (ic->ljpme_comb_rule == ljcrGEOM)
+ +        {
+ +            assert(nbatParams.comb_rule == ljcrGEOM);
+ +            nbp->vdwtype = evdwCuEWALDGEOM;
+ +        }
+ +        else
+ +        {
+ +            assert(nbatParams.comb_rule == ljcrLB);
+ +            nbp->vdwtype = evdwCuEWALDLB;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
+ +    }
+ +
+ +    if (ic->eeltype == eelCUT)
+ +    {
+ +        nbp->eeltype = eelCuCUT;
+ +    }
+ +    else if (EEL_RF(ic->eeltype))
+ +    {
+ +        nbp->eeltype = eelCuRF;
+ +    }
+ +    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
+ +    {
-     nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
++        nbp->eeltype = pick_ewald_kernel_type(*ic);
+ +    }
+ +    else
+ +    {
+ +        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+ +        gmx_incons("The requested electrostatics type is not implemented in the CUDA GPU accelerated kernels!");
+ +    }
+ +
+ +    /* generate table for PME */
+ +    nbp->coulomb_tab = nullptr;
+ +    if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
+ +    {
+ +        GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ +        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp);
+ +    }
+ +
+ +    /* set up LJ parameter lookup table */
+ +    if (!useLjCombRule(nbp))
+ +    {
+ +        initParamLookupTable(nbp->nbfp, nbp->nbfp_texobj,
+ +                             nbatParams.nbfp.data(), 2*ntypes*ntypes);
+ +    }
+ +
+ +    /* set up LJ-PME parameter lookup table */
+ +    if (ic->vdwtype == evdwPME)
+ +    {
+ +        initParamLookupTable(nbp->nbfp_comb, nbp->nbfp_comb_texobj,
+ +                             nbatParams.nbfp_comb.data(), 2*ntypes);
+ +    }
+ +}
+ +
+ +/*! Re-generate the GPU Ewald force table, resets rlist, and update the
+ + *  electrostatic type switching to twin cut-off (or back) if needed. */
+ +void gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+ +                                  const interaction_const_t   *ic)
+ +{
+ +    if (!nbv || !nbv->useGpu())
+ +    {
+ +        return;
+ +    }
+ +    cu_nbparam_t *nbp   = nbv->gpu_nbv->nbparam;
+ +
+ +    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
+ +
++    nbp->eeltype        = pick_ewald_kernel_type(*ic);
+ +
+ +    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ +    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp);
+ +}
+ +
+ +/*! Initializes the pair list data structure. */
+ +static void init_plist(cu_plist_t *pl)
+ +{
+ +    /* initialize to nullptr pointers to data that is not allocated here and will
+ +       need reallocation in nbnxn_gpu_init_pairlist */
+ +    pl->sci      = nullptr;
+ +    pl->cj4      = nullptr;
+ +    pl->imask    = nullptr;
+ +    pl->excl     = nullptr;
+ +
+ +    /* size -1 indicates that the respective array hasn't been initialized yet */
+ +    pl->na_c           = -1;
+ +    pl->nsci           = -1;
+ +    pl->sci_nalloc     = -1;
+ +    pl->ncj4           = -1;
+ +    pl->cj4_nalloc     = -1;
+ +    pl->nimask         = -1;
+ +    pl->imask_nalloc   = -1;
+ +    pl->nexcl          = -1;
+ +    pl->excl_nalloc    = -1;
+ +    pl->haveFreshList  = false;
+ +}
+ +
+ +/*! Initializes the timings data structure. */
+ +static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
+ +{
+ +    int i, j;
+ +
+ +    t->nb_h2d_t = 0.0;
+ +    t->nb_d2h_t = 0.0;
+ +    t->nb_c     = 0;
+ +    t->pl_h2d_t = 0.0;
+ +    t->pl_h2d_c = 0;
+ +    for (i = 0; i < 2; i++)
+ +    {
+ +        for (j = 0; j < 2; j++)
+ +        {
+ +            t->ktime[i][j].t = 0.0;
+ +            t->ktime[i][j].c = 0;
+ +        }
+ +    }
+ +    t->pruneTime.c        = 0;
+ +    t->pruneTime.t        = 0.0;
+ +    t->dynamicPruneTime.c = 0;
+ +    t->dynamicPruneTime.t = 0.0;
+ +}
+ +
+ +/*! Initializes simulation constant data. */
+ +static void cuda_init_const(gmx_nbnxn_cuda_t               *nb,
+ +                            const interaction_const_t      *ic,
+ +                            const PairlistParams           &listParams,
+ +                            const nbnxn_atomdata_t::Params &nbatParams)
+ +{
+ +    init_atomdata_first(nb->atdat, nbatParams.numTypes);
+ +    init_nbparam(nb->nbparam, ic, listParams, nbatParams);
+ +
+ +    /* clear energy and shift force outputs */
+ +    nbnxn_cuda_clear_e_fshift(nb);
+ +}
+ +
+ +gmx_nbnxn_cuda_t *
+ +gpu_init(const gmx_device_info_t   *deviceInfo,
+ +         const interaction_const_t *ic,
+ +         const PairlistParams      &listParams,
+ +         const nbnxn_atomdata_t    *nbat,
+ +         int                        /*rank*/,
+ +         gmx_bool                   bLocalAndNonlocal)
+ +{
+ +    cudaError_t       stat;
+ +
+ +    gmx_nbnxn_cuda_t *nb;
+ +    snew(nb, 1);
+ +    snew(nb->atdat, 1);
+ +    snew(nb->nbparam, 1);
+ +    snew(nb->plist[InteractionLocality::Local], 1);
+ +    if (bLocalAndNonlocal)
+ +    {
+ +        snew(nb->plist[InteractionLocality::NonLocal], 1);
+ +    }
+ +
+ +    nb->bUseTwoStreams = bLocalAndNonlocal;
+ +
+ +    nb->timers = new cu_timers_t();
+ +    snew(nb->timings, 1);
+ +
+ +    /* init nbst */
+ +    pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj));
+ +    pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el));
+ +    pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift));
+ +
+ +    init_plist(nb->plist[InteractionLocality::Local]);
+ +
+ +    /* set device info, just point it to the right GPU among the detected ones */
+ +    nb->dev_info = deviceInfo;
+ +
+ +    /* local/non-local GPU streams */
+ +    stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
+ +    CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        init_plist(nb->plist[InteractionLocality::NonLocal]);
+ +
+ +        /* Note that the device we're running on does not have to support
+ +         * priorities, because we are querying the priority range which in this
+ +         * case will be a single value.
+ +         */
+ +        int highest_priority;
+ +        stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
+ +        CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
+ +
+ +        stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
+ +                                            cudaStreamDefault,
+ +                                            highest_priority);
+ +        CU_RET_ERR(stat, "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
+ +    }
+ +
+ +    /* init events for sychronization (timing disabled for performance reasons!) */
+ +    stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
+ +    CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
+ +    stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
+ +    CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
+ +
+ +    nb->xAvailableOnDevice   = new GpuEventSynchronizer();
+ +    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+ +
+ +    /* WARNING: CUDA timings are incorrect with multiple streams.
+ +     *          This is the main reason why they are disabled by default.
+ +     */
+ +    // TODO: Consider turning on by default when we can detect nr of streams.
+ +    nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ +
+ +    if (nb->bDoTime)
+ +    {
+ +        init_timings(nb->timings);
+ +    }
+ +
+ +    /* set the kernel type for the current GPU */
+ +    /* pick L1 cache configuration */
+ +    cuda_set_cacheconfig();
+ +
+ +    cuda_init_const(nb, ic, listParams, nbat->params());
+ +
+ +    nb->natoms                   = 0;
+ +    nb->natoms_alloc             = 0;
+ +    nb->atomIndicesSize          = 0;
+ +    nb->atomIndicesSize_alloc    = 0;
+ +    nb->ncxy_na                  = 0;
+ +    nb->ncxy_na_alloc            = 0;
+ +    nb->ncxy_ind                 = 0;
+ +    nb->ncxy_ind_alloc           = 0;
+ +    nb->nfrvec                   = 0;
+ +    nb->nfrvec_alloc             = 0;
+ +    nb->ncell                    = 0;
+ +    nb->ncell_alloc              = 0;
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Initialized CUDA data structures.\n");
+ +    }
+ +
+ +    return nb;
+ +}
+ +
+ +void gpu_init_pairlist(gmx_nbnxn_cuda_t          *nb,
+ +                       const NbnxnPairlistGpu    *h_plist,
+ +                       const InteractionLocality  iloc)
+ +{
+ +    char          sbuf[STRLEN];
+ +    bool          bDoTime    =  (nb->bDoTime && !h_plist->sci.empty());
+ +    cudaStream_t  stream     = nb->stream[iloc];
+ +    cu_plist_t   *d_plist    = nb->plist[iloc];
+ +
+ +    if (d_plist->na_c < 0)
+ +    {
+ +        d_plist->na_c = h_plist->na_ci;
+ +    }
+ +    else
+ +    {
+ +        if (d_plist->na_c != h_plist->na_ci)
+ +        {
+ +            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+ +                    d_plist->na_c, h_plist->na_ci);
+ +            gmx_incons(sbuf);
+ +        }
+ +    }
+ +
+ +    gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+ +
+ +    if (bDoTime)
+ +    {
+ +        iTimers.pl_h2d.openTimingRegion(stream);
+ +        iTimers.didPairlistH2D = true;
+ +    }
+ +
+ +    Context context = nullptr;
+ +
+ +    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
+ +                           &d_plist->nsci, &d_plist->sci_nalloc, context);
+ +    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
+ +                       stream, GpuApiCallBehavior::Async,
+ +                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ +
+ +    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
+ +                           &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+ +    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
+ +                       stream, GpuApiCallBehavior::Async,
+ +                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ +
+ +    reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
+ +                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+ +
+ +    reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
+ +                           &d_plist->nexcl, &d_plist->excl_nalloc, context);
+ +    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
+ +                       stream, GpuApiCallBehavior::Async,
+ +                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ +
+ +    if (bDoTime)
+ +    {
+ +        iTimers.pl_h2d.closeTimingRegion(stream);
+ +    }
+ +
+ +    /* the next use of thist list we be the first one, so we need to prune */
+ +    d_plist->haveFreshList = true;
+ +}
+ +
+ +void gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
+ +                         const nbnxn_atomdata_t *nbatom)
+ +{
+ +    cu_atomdata_t *adat  = nb->atdat;
+ +    cudaStream_t   ls    = nb->stream[InteractionLocality::Local];
+ +
+ +    /* only if we have a dynamic box */
+ +    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+ +    {
+ +        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(),
+ +                          SHIFTS * sizeof(*adat->shift_vec), ls);
+ +        adat->bShiftVecUploaded = true;
+ +    }
+ +}
+ +
+ +/*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
+ +static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
+ +{
+ +    cudaError_t    stat;
+ +    cu_atomdata_t *adat  = nb->atdat;
+ +    cudaStream_t   ls    = nb->stream[InteractionLocality::Local];
+ +
+ +    stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
+ +}
+ +
+ +/*! Clears nonbonded shift force output array and energy outputs on the GPU. */
+ +static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
+ +{
+ +    cudaError_t    stat;
+ +    cu_atomdata_t *adat  = nb->atdat;
+ +    cudaStream_t   ls    = nb->stream[InteractionLocality::Local];
+ +
+ +    stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
+ +    stat = cudaMemsetAsync(adat->e_lj, 0, sizeof(*adat->e_lj), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on e_lj falied");
+ +    stat = cudaMemsetAsync(adat->e_el, 0, sizeof(*adat->e_el), ls);
+ +    CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
+ +}
+ +
+ +void gpu_clear_outputs(gmx_nbnxn_cuda_t *nb,
+ +                       bool              computeVirial)
+ +{
+ +    nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
+ +    /* clear shift force array and energies if the outputs were
+ +       used in the current step */
+ +    if (computeVirial)
+ +    {
+ +        nbnxn_cuda_clear_e_fshift(nb);
+ +    }
+ +}
+ +
+ +void gpu_init_atomdata(gmx_nbnxn_cuda_t       *nb,
+ +                       const nbnxn_atomdata_t *nbat)
+ +{
+ +    cudaError_t    stat;
+ +    int            nalloc, natoms;
+ +    bool           realloced;
+ +    bool           bDoTime   = nb->bDoTime;
+ +    cu_timers_t   *timers    = nb->timers;
+ +    cu_atomdata_t *d_atdat   = nb->atdat;
+ +    cudaStream_t   ls        = nb->stream[InteractionLocality::Local];
+ +
+ +    natoms    = nbat->numAtoms();
+ +    realloced = false;
+ +
+ +    if (bDoTime)
+ +    {
+ +        /* time async copy */
+ +        timers->atdat.openTimingRegion(ls);
+ +    }
+ +
+ +    /* need to reallocate if we have to copy more atoms than the amount of space
+ +       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+ +    if (natoms > d_atdat->nalloc)
+ +    {
+ +        nalloc = over_alloc_small(natoms);
+ +
+ +        /* free up first if the arrays have already been initialized */
+ +        if (d_atdat->nalloc != -1)
+ +        {
+ +            freeDeviceBuffer(&d_atdat->f);
+ +            freeDeviceBuffer(&d_atdat->xq);
+ +            freeDeviceBuffer(&d_atdat->atom_types);
+ +            freeDeviceBuffer(&d_atdat->lj_comb);
+ +        }
+ +
+ +        stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
+ +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
+ +        stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
+ +        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
+ +        if (useLjCombRule(nb->nbparam))
+ +        {
+ +            stat = cudaMalloc((void **)&d_atdat->lj_comb, nalloc*sizeof(*d_atdat->lj_comb));
+ +            CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->lj_comb");
+ +        }
+ +        else
+ +        {
+ +            stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
+ +            CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
+ +        }
+ +
+ +        d_atdat->nalloc = nalloc;
+ +        realloced       = true;
+ +    }
+ +
+ +    d_atdat->natoms       = natoms;
+ +    d_atdat->natoms_local = nbat->natoms_local;
+ +
+ +    /* need to clear GPU f output if realloc happened */
+ +    if (realloced)
+ +    {
+ +        nbnxn_cuda_clear_f(nb, nalloc);
+ +    }
+ +
+ +    if (useLjCombRule(nb->nbparam))
+ +    {
+ +        cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
+ +                          natoms*sizeof(*d_atdat->lj_comb), ls);
+ +    }
+ +    else
+ +    {
+ +        cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
+ +                          natoms*sizeof(*d_atdat->atom_types), ls);
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        timers->atdat.closeTimingRegion(ls);
+ +    }
+ +}
+ +
+ +static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam)
+ +{
+ +    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
+ +    {
+ +        destroyParamLookupTable(nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+ +    }
+ +}
+ +
+ +void gpu_free(gmx_nbnxn_cuda_t *nb)
+ +{
+ +    cudaError_t      stat;
+ +    cu_atomdata_t   *atdat;
+ +    cu_nbparam_t    *nbparam;
+ +
+ +    if (nb == nullptr)
+ +    {
+ +        return;
+ +    }
+ +
+ +    atdat       = nb->atdat;
+ +    nbparam     = nb->nbparam;
+ +
+ +    nbnxn_cuda_free_nbparam_table(nbparam);
+ +
+ +    stat = cudaEventDestroy(nb->nonlocal_done);
+ +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
+ +    stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
+ +    CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
+ +
+ +    delete nb->timers;
+ +    if (nb->bDoTime)
+ +    {
+ +        /* The non-local counters/stream (second in the array) are needed only with DD. */
+ +        for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
+ +        {
+ +            stat = cudaStreamDestroy(nb->stream[i]);
+ +            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
+ +        }
+ +    }
+ +
+ +    if (!useLjCombRule(nb->nbparam))
+ +    {
+ +        destroyParamLookupTable(nbparam->nbfp, nbparam->nbfp_texobj);
+ +
+ +    }
+ +
+ +    if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
+ +    {
+ +        destroyParamLookupTable(nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+ +    }
+ +
+ +    stat = cudaFree(atdat->shift_vec);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
+ +    stat = cudaFree(atdat->fshift);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->fshift");
+ +
+ +    stat = cudaFree(atdat->e_lj);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_lj");
+ +    stat = cudaFree(atdat->e_el);
+ +    CU_RET_ERR(stat, "cudaFree failed on atdat->e_el");
+ +
+ +    freeDeviceBuffer(&atdat->f);
+ +    freeDeviceBuffer(&atdat->xq);
+ +    freeDeviceBuffer(&atdat->atom_types);
+ +    freeDeviceBuffer(&atdat->lj_comb);
+ +
+ +    /* Free plist */
+ +    auto *plist = nb->plist[InteractionLocality::Local];
+ +    freeDeviceBuffer(&plist->sci);
+ +    freeDeviceBuffer(&plist->cj4);
+ +    freeDeviceBuffer(&plist->imask);
+ +    freeDeviceBuffer(&plist->excl);
+ +    sfree(plist);
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
+ +        freeDeviceBuffer(&plist_nl->sci);
+ +        freeDeviceBuffer(&plist_nl->cj4);
+ +        freeDeviceBuffer(&plist_nl->imask);
+ +        freeDeviceBuffer(&plist_nl->excl);
+ +        sfree(plist_nl);
+ +    }
+ +
+ +    /* Free nbst */
+ +    pfree(nb->nbst.e_lj);
+ +    nb->nbst.e_lj = nullptr;
+ +
+ +    pfree(nb->nbst.e_el);
+ +    nb->nbst.e_el = nullptr;
+ +
+ +    pfree(nb->nbst.fshift);
+ +    nb->nbst.fshift = nullptr;
+ +
+ +    sfree(atdat);
+ +    sfree(nbparam);
+ +    sfree(nb->timings);
+ +    sfree(nb);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Cleaned up CUDA data structures.\n");
+ +    }
+ +}
+ +
+ +//! This function is documented in the header file
+ +gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_cuda_t *nb)
+ +{
+ +    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
+ +}
+ +
+ +void gpu_reset_timings(nonbonded_verlet_t* nbv)
+ +{
+ +    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
+ +    {
+ +        init_timings(nbv->gpu_nbv->timings);
+ +    }
+ +}
+ +
+ +int gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
+ +{
+ +    return nb != nullptr ?
+ +           gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
+ +
+ +}
+ +
+ +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
+ +{
+ +    return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
+ +            (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
+ +}
+ +
+ +void *gpu_get_command_stream(gmx_nbnxn_gpu_t           *nb,
+ +                             const InteractionLocality  iloc)
+ +{
+ +    assert(nb);
+ +
+ +    return static_cast<void *>(&nb->stream[iloc]);
+ +}
+ +
+ +void *gpu_get_xq(gmx_nbnxn_gpu_t *nb)
+ +{
+ +    assert(nb);
+ +
+ +    return static_cast<void *>(nb->atdat->xq);
+ +}
+ +
+ +void *gpu_get_f(gmx_nbnxn_gpu_t *nb)
+ +{
+ +    assert(nb);
+ +
+ +    return static_cast<void *>(nb->atdat->f);
+ +}
+ +
+ +rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
+ +{
+ +    assert(nb);
+ +
+ +    return reinterpret_cast<rvec *>(nb->atdat->fshift);
+ +}
+ +
+ +/* Initialization for X buffer operations on GPU. */
+ +/* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
+ +void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
+ +                                gmx_nbnxn_gpu_t                 *gpu_nbv)
+ +{
+ +    cudaStream_t                     stream    = gpu_nbv->stream[InteractionLocality::Local];
+ +    bool                             bDoTime   = gpu_nbv->bDoTime;
+ +    const int maxNumColumns                    = gridSet.numColumnsMax();
+ +
+ +    reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+ +                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+ +    reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+ +                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+ +
+ +    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
+ +    {
+ +
+ +        const Nbnxm::Grid  &grid       = gridSet.grids()[g];
+ +
+ +        const int           numColumns        = grid.numColumns();
+ +        const int          *atomIndices       = gridSet.atomIndices().data();
+ +        const int           atomIndicesSize   = gridSet.atomIndices().size();
+ +        const int          *cxy_na            = grid.cxy_na().data();
+ +        const int          *cxy_ind           = grid.cxy_ind().data();
+ +        // TODO Should be done once per gridset
+ +        const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
+ +
+ +        reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+ +        reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+ +
+ +        if (atomIndicesSize > 0)
+ +        {
+ +
+ +            if (bDoTime)
+ +            {
+ +                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ +            }
+ +
+ +            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+ +
+ +            if (bDoTime)
+ +            {
+ +                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ +            }
+ +
+ +        }
+ +
+ +        if (numColumns > 0)
+ +        {
+ +            if (bDoTime)
+ +            {
+ +                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ +            }
+ +
+ +            int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+ +            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ +
+ +            if (bDoTime)
+ +            {
+ +                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ +            }
+ +
+ +            if (bDoTime)
+ +            {
+ +                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ +            }
+ +
+ +            destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+ +            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ +
+ +            if (bDoTime)
+ +            {
+ +                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ +            }
+ +
+ +        }
+ +    }
+ +
+ +    // The above data is transferred on the local stream but is a
+ +    // dependency of the nonlocal stream (specifically the nonlocal X
+ +    // buf ops kernel).  We therefore set a dependency to ensure
+ +    // that the nonlocal stream waits on the local stream here.
+ +    // This call records an event in the local stream:
+ +    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+ +    // ...and this call instructs the nonlocal stream to wait on that event:
+ +    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+ +
+ +    return;
+ +}
+ +
+ +/* Initialization for F buffer operations on GPU. */
+ +void nbnxn_gpu_init_add_nbat_f_to_f(const int                *cell,
+ +                                    gmx_nbnxn_gpu_t          *gpu_nbv,
+ +                                    int                       natoms_total)
+ +{
+ +
+ +    cudaStream_t         stream  = gpu_nbv->stream[InteractionLocality::Local];
+ +
+ +    reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
+ +
+ +    if (natoms_total > 0)
+ +    {
+ +        reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
+ +        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+ +    }
+ +
+ +    return;
+ +}
+ +
+ +} // namespace Nbnxm
diff --cc src/gromacs/nbnxm/nbnxm_gpu.h

index fef2e749bc6d21d235dae9fced165031d4ee7d91,0000000000000000000000000000000000000000..f30028180270048a24667127a8387a7b2deaff1b

mode 100644,000000..100644
--- 1/src/gromacs/nbnxm/nbnxm_gpu.h
--- /dev/null
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@@ -1,427 -1,0 +1,428 @@@
- int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1);
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2015,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +/*! \libinternal \file
+ + *  \brief Declare interface for GPU execution for NBNXN module
+ + *
+ + *  \author Szilard Pall <pall.szilard@gmail.com>
+ + *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ + *  \ingroup module_nbnxm
+ + */
+ +
+ +#ifndef GMX_NBNXM_NBNXM_GPU_H
+ +#define GMX_NBNXM_NBNXM_GPU_H
+ +
+ +#include "gromacs/gpu_utils/gpu_macros.h"
+ +#include "gromacs/math/vectypes.h"
+ +#include "gromacs/nbnxm/atomdata.h"
+ +#include "gromacs/utility/basedefinitions.h"
+ +#include "gromacs/utility/real.h"
+ +
+ +#include "gpu_types.h"
+ +#include "locality.h"
+ +
++struct interaction_const_t;
+ +struct nbnxn_atomdata_t;
+ +struct gmx_wallcycle;
+ +enum class GpuTaskCompletion;
+ +
+ +namespace gmx
+ +{
+ +class GpuBonded;
+ +class StepWorkload;
+ +}
+ +
+ +namespace Nbnxm
+ +{
+ +
+ +class Grid;
+ +
+ +/*! \brief
+ + * Launch asynchronously the xq buffer host to device copy.
+ + *
+ + * The nonlocal copy is skipped if there is no dependent work to do,
+ + * neither non-local nonbonded interactions nor bonded GPU work.
+ + *
+ + * \param [in]    nb        GPU nonbonded data.
+ + * \param [in]    nbdata    Host-side atom data structure.
+ + * \param [in]    aloc      Atom locality flag.
+ + */
+ +GPU_FUNC_QUALIFIER
+ +void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
+ +                        const struct nbnxn_atomdata_t gmx_unused *nbdata,
+ +                        AtomLocality gmx_unused                   aloc) GPU_FUNC_TERM;
+ +
+ +/*! \brief
+ + * Launch asynchronously the nonbonded force calculations.
+ + *
+ + *  Also launches the initial pruning of a fresh list after search.
+ + *
+ + *  The local and non-local interaction calculations are launched in two
+ + *  separate streams. If there is no work (i.e. empty pair list), the
+ + *  force kernel launch is omitted.
+ + *
+ + */
+ +GPU_FUNC_QUALIFIER
+ +void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused         *nb,
+ +                       const gmx::StepWorkload gmx_unused &stepWork,
+ +                       InteractionLocality gmx_unused      iloc) GPU_FUNC_TERM;
+ +
+ +/*! \brief
+ + * Launch asynchronously the nonbonded prune-only kernel.
+ + *
+ + *  The local and non-local list pruning are launched in their separate streams.
+ + *
+ + *  Notes for future scheduling tuning:
+ + *  Currently we schedule the dynamic pruning between two MD steps *after* both local and
+ + *  nonlocal force D2H transfers completed. We could launch already after the cpyback
+ + *  is launched, but we want to avoid prune kernels (especially in the non-local
+ + *  high prio-stream) competing with nonbonded work.
+ + *
+ + *  However, this is not ideal as this schedule does not expose the available
+ + *  concurrency. The dynamic pruning kernel:
+ + *    - should be allowed to overlap with any task other than force compute, including
+ + *      transfers (F D2H and the next step's x H2D as well as force clearing).
+ + *    - we'd prefer to avoid competition with non-bonded force kernels belonging
+ + *      to the same rank and ideally other ranks too.
+ + *
+ + *  In the most general case, the former would require scheduling pruning in a separate
+ + *  stream and adding additional event sync points to ensure that force kernels read
+ + *  consistent pair list data. This would lead to some overhead (due to extra
+ + *  cudaStreamWaitEvent calls, 3-5 us/call) which we might be able to live with.
+ + *  The gains from additional overlap might not be significant as long as
+ + *  update+constraints anyway takes longer than pruning, but there will still
+ + *  be use-cases where more overlap may help (e.g. multiple ranks per GPU,
+ + *  no/hbonds only constraints).
+ + *  The above second point is harder to address given that multiple ranks will often
+ + *  share a GPU. Ranks that complete their nonbondeds sooner can schedule pruning earlier
+ + *  and without a third priority level it is difficult to avoid some interference of
+ + *  prune kernels with force tasks (in particular preemption of low-prio local force task).
+ + *
+ + * \param [inout] nb        GPU nonbonded data.
+ + * \param [in]    iloc      Interaction locality flag.
+ + * \param [in]    numParts  Number of parts the pair list is split into in the rolling kernel.
+ + */
+ +GPU_FUNC_QUALIFIER
+ +void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused     *nb,
+ +                                 InteractionLocality gmx_unused  iloc,
+ +                                 int gmx_unused                  numParts) GPU_FUNC_TERM;
+ +
+ +/*! \brief
+ + * Launch asynchronously the download of short-range forces from the GPU
+ + * (and energies/shift forces if required).
+ + */
+ +GPU_FUNC_QUALIFIER
+ +void gpu_launch_cpyback(gmx_nbnxn_gpu_t         gmx_unused *nb,
+ +                        nbnxn_atomdata_t        gmx_unused *nbatom,
+ +                        const gmx::StepWorkload gmx_unused &stepWork,
+ +                        AtomLocality            gmx_unused aloc,
+ +                        bool                    gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+ +
+ +/*! \brief Attempts to complete nonbonded GPU task.
+ + *
+ + *  This function attempts to complete the nonbonded task (both GPU and CPU auxiliary work).
+ + *  Success, i.e. that the tasks completed and results are ready to be consumed, is signaled
+ + *  by the return value (always true if blocking wait mode requested).
+ + *
+ + *  The \p completionKind parameter controls whether the behavior is non-blocking
+ + *  (achieved by passing GpuTaskCompletion::Check) or blocking wait until the results
+ + *  are ready (when GpuTaskCompletion::Wait is passed).
+ + *  As the "Check" mode the function will return immediately if the GPU stream
+ + *  still contain tasks that have not completed, it allows more flexible overlapping
+ + *  of work on the CPU with GPU execution.
+ + *
+ + *  Note that it is only safe to use the results, and to continue to the next MD
+ + *  step when this function has returned true which indicates successful completion of
+ + *  - All nonbonded GPU tasks: both compute and device transfer(s)
+ + *  - auxiliary tasks: updating the internal module state (timing accumulation, list pruning states) and
+ + *  - internal staging reduction of (\p fshift, \p e_el, \p e_lj).
+ + *
+ + * In GpuTaskCompletion::Check mode this function does the timing and keeps correct count
+ + * for the nonbonded task (incrementing only once per taks), in the GpuTaskCompletion::Wait mode
+ + * timing is expected to be done in the caller.
+ + *
+ + *  TODO: improve the handling of outputs e.g. by ensuring that this function explcitly returns the
+ + *  force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
+ + *  the energy and Fshift contributions for some external/centralized reduction.
+ + *
+ + * \param[in]  nb             The nonbonded data GPU structure
+ + * \param[in]  stepWork       Step schedule flags
+ + * \param[in]  aloc           Atom locality identifier
+ + * \param[out] e_lj           Pointer to the LJ energy output to accumulate into
+ + * \param[out] e_el           Pointer to the electrostatics energy output to accumulate into
+ + * \param[out] shiftForces    Shift forces buffer to accumulate into
+ + * \param[in]  completionKind Indicates whether nnbonded task completion should only be checked rather than waited for
+ + * \param[out] wcycle         Pointer to wallcycle data structure
+ + * \returns                   True if the nonbonded tasks associated with \p aloc locality have completed
+ + */
+ +GPU_FUNC_QUALIFIER
+ +bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused          *nb,
+ +                         const gmx::StepWorkload gmx_unused  &stepWork,
+ +                         AtomLocality    gmx_unused           aloc,
+ +                         real            gmx_unused          *e_lj,
+ +                         real            gmx_unused          *e_el,
+ +                         gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces,
+ +                         GpuTaskCompletion gmx_unused         completionKind,
+ +                         gmx_wallcycle    gmx_unused         *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
+ +
+ +/*! \brief  Completes the nonbonded GPU task blocking until GPU tasks and data
+ + * transfers to finish.
+ + *
+ + * Also does timing accounting and reduction of the internal staging buffers.
+ + * As this is called at the end of the step, it also resets the pair list and
+ + * pruning flags.
+ + *
+ + * \param[in] nb The nonbonded data GPU structure
+ + * \param[in]  stepWork        Step schedule flags
+ + * \param[in] aloc Atom locality identifier
+ + * \param[out] e_lj Pointer to the LJ energy output to accumulate into
+ + * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
+ + * \param[out] shiftForces Shift forces buffer to accumulate into
+ + */
+ +GPU_FUNC_QUALIFIER
+ +float gpu_wait_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
+ +                           const gmx::StepWorkload  gmx_unused &stepWork,
+ +                           AtomLocality             gmx_unused  aloc,
+ +                           real                     gmx_unused *e_lj,
+ +                           real                     gmx_unused *e_el,
+ +                           gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces,
+ +                           gmx_wallcycle            gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
+ +
+ +/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
+ +GPU_FUNC_QUALIFIER
++int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t gmx_unused &ic) GPU_FUNC_TERM_WITH_RETURN(-1);
+ +
+ +/*! \brief Initialization for X buffer operations on GPU.
+ + * Called on the NS step and performs (re-)allocations and memory copies. !*/
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
+ +                                gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Copy coordinates from host to device memory.
+ + *
+ + * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+ + *
+ + * \param[in]     grid             Grid to be copied.
+ + * \param[in,out] gpu_nbv          The nonbonded data GPU structure.
+ + * \param[in]     locality         Copy coordinates for local or non-local atoms.
+ + * \param[in]     coordinatesHost  Host-side coordinates in plain rvec format.
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid   gmx_unused &grid,
+ +                             gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
+ +                             Nbnxm::AtomLocality gmx_unused  locality,
+ +                             const rvec          gmx_unused *coordinatesHost) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Getter for the device coordinates buffer.
+ + *
+ + * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+ + *
+ + * \param[in]  gpu_nbv  The nonbonded data GPU structure.
+ + *
+ + * \returns Device coordinates buffer in plain rvec format.
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+ +
+ +
+ +/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
+ + *
+ + * \param[in]     grid               Grid to be converted.
+ + * \param[in]     setFillerCoords    If the filler coordinates are used.
+ + * \param[in,out] gpu_nbv            The nonbonded data GPU structure.
+ + * \param[in]     coordinatesDevice  Device-side coordinates in plain rvec format.
+ + * \param[in]     locality           Copy coordinates for local or non-local atoms.
+ + * \param[in]     gridId             Index of the grid being converted.
+ + * \param[in]     numColumnsMax      Maximum number of columns in the grid.
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid   gmx_unused &grid,
+ +                           bool                gmx_unused  setFillerCoords,
+ +                           gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
+ +                           DeviceBuffer<float> gmx_unused  coordinatesDevice,
+ +                           Nbnxm::AtomLocality gmx_unused  locality,
+ +                           int                 gmx_unused  gridId,
+ +                           int                 gmx_unused  numColumnsMax) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + * \param[in] interactionLocality  Local or NonLocal sync point
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused    *nb,
+ +                                      InteractionLocality   gmx_unused interactionLocality) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Set up internal flags that indicate what type of short-range work there is.
+ + *
+ + * As nonbondeds and bondeds share input/output buffers and GPU queues,
+ + * both are considered when checking for work in the current domain.
+ + *
+ + * This function is expected to be called every time the work-distribution
+ + * can change (i.e. at search/domain decomposition steps).
+ + *
+ + * \param[inout]  nb         Pointer to the nonbonded GPU data structure
+ + * \param[in]     gpuBonded  Pointer to the GPU bonded data structure
+ + * \param[in]     iLocality  Interaction locality identifier
+ + */
+ +GPU_FUNC_QUALIFIER
+ +void setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  gmx_unused *nb,
+ +                            const gmx::GpuBonded             gmx_unused *gpuBonded,
+ +                            Nbnxm::InteractionLocality       gmx_unused  iLocality) GPU_FUNC_TERM;
+ +
+ +/*! \brief Returns true if there is GPU short-range work for the given atom locality.
+ + *
+ + * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
+ + * and therefore if there are GPU offloaded bonded interactions, this function will return
+ + * true for both local and nonlocal atom range.
+ + *
+ + * \param[inout]  nb        Pointer to the nonbonded GPU data structure
+ + * \param[in]     aLocality Atom locality identifier
+ + */
+ +GPU_FUNC_QUALIFIER
+ +bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     gmx_unused *nb,
+ +                           Nbnxm::AtomLocality       gmx_unused  aLocality) GPU_FUNC_TERM_WITH_RETURN(false);
+ +
+ +/*! \brief Initialization for F buffer operations on GPU */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
+ +                                    gmx_nbnxn_gpu_t         gmx_unused *gpu_nbv,
+ +                                    int                     gmx_unused  natoms_total) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Force buffer operations on GPU.
+ + *
+ + * Transforms non-bonded forces into plain rvec format and add all the force components to the total
+ + * force buffer
+ + *
+ + * \param[in]   atomLocality         If the reduction should be performed on local or non-local atoms.
+ + * \param[in]   totalForcesDevice    Device buffer to accumulate resulting force.
+ + * \param[in]   gpu_nbv              The NBNXM GPU data structure.
+ + * \param[in]   pmeForcesDevice      Device buffer with PME forces.
+ + * \param[in]   pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ + * \param[in]   atomStart            Index of the first atom to reduce forces for.
+ + * \param[in]   numAtoms             Number of atoms to reduce forces for.
+ + * \param[in]   useGpuFPmeReduction  Whether PME forces should be added.
+ + * \param[in]   accumulateForce      Whether there are usefull data already in the total force buffer.
+ + *
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                 gmx_unused  atomLocality,
+ +                               DeviceBuffer<float>          gmx_unused  totalForcesDevice,
+ +                               gmx_nbnxn_gpu_t              gmx_unused *gpu_nbv,
+ +                               void                         gmx_unused *pmeForcesDevice,
+ +                               GpuEventSynchronizer         gmx_unused *pmeForcesReady,
+ +                               int                          gmx_unused  atomStart,
+ +                               int                          gmx_unused  numAtoms,
+ +                               bool                         gmx_unused  useGpuFPmeReduction,
+ +                               bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Getter for the device coordinates buffer.
+ + *
+ + * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+ + *
+ + * \param[in]  gpu_nbv  The nonbonded data GPU structure.
+ + *
+ + * \returns Device coordinates buffer in plain rvec format.
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+ +
+ +/*! \brief Copy force buffer from CPU to GPU */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_launch_copy_f_to_gpu(AtomLocality            gmx_unused  atomLocality,
+ +                                const Nbnxm::GridSet    gmx_unused &gridSet,
+ +                                gmx_nbnxn_gpu_t         gmx_unused *nb,
+ +                                rvec                    gmx_unused *f) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Copy force buffer from GPU to CPU */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_launch_copy_f_from_gpu(AtomLocality            gmx_unused  atomLocality,
+ +                                  const Nbnxm::GridSet    gmx_unused &gridSet,
+ +                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
+ +                                  rvec                    gmx_unused *f) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
+ + * \param[in]  atomLocality  Locality for data trasnfer
+ + * \param[in]  gridSet       The Grid Set data object
+ + * \param[in]  nb            The nonbonded data GPU structure
+ + * \param[out] x             Coordinate buffer on CPU
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_launch_copy_x_from_gpu(AtomLocality            gmx_unused  atomLocality,
+ +                                  const Nbnxm::GridSet    gmx_unused &gridSet,
+ +                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
+ +                                  rvec                    gmx_unused *x) CUDA_FUNC_TERM;
+ +
+ +/*! \brief Wait for GPU stream to complete */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
+ +                                        gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM;
+ +
+ +/*! \brief sync CPU thread on coordinate copy to device
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
+ +
+ +/*! \brief return pointer to event recorded when coordinates have been copied to device
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused    *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+ +
+ +/*! \brief return GPU pointer to x in rvec format
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+ +
+ +/*! \brief Wait for non-local copy of coordinate buffer from device to host
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
+ +
+ +/*! \brief return GPU pointer to f in rvec format
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+ +
+ +/*! \brief Ensure local stream waits for non-local stream
+ + * \param[in] nb                   The nonbonded data GPU structure
+ + */
+ +CUDA_FUNC_QUALIFIER
+ +void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
+ +
+ +} // namespace Nbnxm
+ +#endif
diff --cc src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 634801dc444b5930127888e7e9393676ded9e649,0000000000000000000000000000000000000000..2ce49b2eb995736ec74de1d52799fd071af94d36

mode 100644,000000..100644
--- 1/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
--- /dev/null
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@@ -1,887 -1,0 +1,888 @@@
- int gpu_pick_ewald_kernel_type(const bool bTwinCut)
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +/*! \internal \file
+ + *  \brief Define OpenCL implementation of nbnxm_gpu.h
+ + *
+ + *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ + *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ + *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ + *  \author Szilárd Páll <pall.szilard@gmail.com>
+ + *  \ingroup module_nbnxm
+ + *
+ + *  TODO (psz):
+ + *  - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
+ + *  - Rework the copying of OCL data structures done before every invocation of both
+ + *    nb and prune kernels (using fillin_ocl_structures); also consider at the same
+ + *    time calling clSetKernelArg only on the updated parameters (if tracking changed
+ + *    parameters is feasible);
+ + *  - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
+ + *    dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
+ + *    on the misc_ops_and_local_H2D_done event could be better expressed this way.
+ + *
+ + *  - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g:
+ + *    - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
+ + *      is identical in the two implementations, so a 3-way split might allow sharing
+ + *      code;
+ + *    -
+ + *
+ + */
+ +#include "gmxpre.h"
+ +
+ +#include <assert.h>
+ +#include <stdlib.h>
+ +
+ +#if defined(_MSVC)
+ +#include <limits>
+ +#endif
+ +
+ +#include "thread_mpi/atomic.h"
+ +
+ +#include "gromacs/gpu_utils/gputraits_ocl.h"
+ +#include "gromacs/gpu_utils/oclutils.h"
+ +#include "gromacs/hardware/hw_info.h"
+ +#include "gromacs/mdtypes/simulation_workload.h"
+ +#include "gromacs/nbnxm/atomdata.h"
+ +#include "gromacs/nbnxm/gpu_common.h"
+ +#include "gromacs/nbnxm/gpu_common_utils.h"
+ +#include "gromacs/nbnxm/gpu_data_mgmt.h"
+ +#include "gromacs/nbnxm/nbnxm.h"
+ +#include "gromacs/nbnxm/nbnxm_gpu.h"
+ +#include "gromacs/nbnxm/pairlist.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/timing/gpu_timing.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/gmxassert.h"
+ +
+ +#include "nbnxm_ocl_internal.h"
+ +#include "nbnxm_ocl_types.h"
+ +
+ +namespace Nbnxm
+ +{
+ +
+ +/*! \brief Convenience constants */
+ +//@{
+ +static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
+ +static const int c_clSize          = c_nbnxnGpuClusterSize;
+ +//@}
+ +
+ +
+ +/*! \brief Validates the input global work size parameter.
+ + */
+ +static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo)
+ +{
+ +    cl_uint device_size_t_size_bits;
+ +    cl_uint host_size_t_size_bits;
+ +
+ +    assert(dinfo);
+ +
+ +    size_t global_work_size[3];
+ +    GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
+ +    for (int i = 0; i < work_dim; i++)
+ +    {
+ +        global_work_size[i] = config.blockSize[i] * config.gridSize[i];
+ +    }
+ +
+ +    /* Each component of a global_work_size must not exceed the range given by the
+ +       sizeof(device size_t) for the device on which the kernel execution will
+ +       be enqueued. See:
+ +       https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
+ +     */
+ +    device_size_t_size_bits = dinfo->adress_bits;
+ +    host_size_t_size_bits   = static_cast<cl_uint>(sizeof(size_t) * 8);
+ +
+ +    /* If sizeof(host size_t) <= sizeof(device size_t)
+ +            => global_work_size components will always be valid
+ +       else
+ +            => get device limit for global work size and
+ +            compare it against each component of global_work_size.
+ +     */
+ +    if (host_size_t_size_bits > device_size_t_size_bits)
+ +    {
+ +        size_t device_limit;
+ +
+ +        device_limit = (1ULL << device_size_t_size_bits) - 1;
+ +
+ +        for (int i = 0; i < work_dim; i++)
+ +        {
+ +            if (global_work_size[i] > device_limit)
+ +            {
+ +                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+ +                          "The number of nonbonded work units (=number of super-clusters) exceeds the"
+ +                          "device capabilities. Global work size limit exceeded (%zu > %zu)!",
+ +                          global_work_size[i], device_limit);
+ +            }
+ +        }
+ +    }
+ +}
+ +
+ +/* Constant arrays listing non-bonded kernel function names. The arrays are
+ + * organized in 2-dim arrays by: electrostatics and VDW type.
+ + *
+ + *  Note that the row- and column-order of function pointers has to match the
+ + *  order of corresponding enumerated electrostatics and vdw types, resp.,
+ + *  defined in nbnxm_ocl_types.h.
+ + */
+ +
+ +/*! \brief Force-only kernel function names. */
+ +static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
+ +{
+ +    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
+ +    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
+ +    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
+ +    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+ +    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
+ +    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
+ +};
+ +
+ +/*! \brief Force + energy kernel function pointers. */
+ +static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
+ +{
+ +    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"            },
+ +    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"             },
+ +    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"        },
+ +    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
+ +    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"             },
+ +    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"      }
+ +};
+ +
+ +/*! \brief Force + pruning kernel function pointers. */
+ +static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
+ +{
+ +    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"             },
+ +    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"              },
+ +    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"         },
+ +    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl"  },
+ +    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"              },
+ +    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"       }
+ +};
+ +
+ +/*! \brief Force + energy + pruning kernel function pointers. */
+ +static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
+ +{
+ +    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
+ +    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
+ +    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
+ +    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+ +    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
+ +    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
+ +};
+ +
+ +/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
+ + *
+ + * \param[in] kernel_pruneonly  array of prune kernel objects
+ + * \param[in] firstPrunePass    true if the first pruning pass is being executed
+ + */
+ +static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
+ +                                          bool      firstPrunePass)
+ +{
+ +    cl_kernel  *kernelPtr;
+ +
+ +    if (firstPrunePass)
+ +    {
+ +        kernelPtr = &(kernel_pruneonly[epruneFirst]);
+ +    }
+ +    else
+ +    {
+ +        kernelPtr = &(kernel_pruneonly[epruneRolling]);
+ +    }
+ +    // TODO: consider creating the prune kernel object here to avoid a
+ +    // clCreateKernel for the rolling prune kernel if this is not needed.
+ +    return *kernelPtr;
+ +}
+ +
+ +/*! \brief Return a pointer to the kernel version to be executed at the current step.
+ + *  OpenCL kernel objects are cached in nb. If the requested kernel is not
+ + *  found in the cache, it will be created and the cache will be updated.
+ + */
+ +static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
+ +                                            int                eeltype,
+ +                                            int                evdwtype,
+ +                                            bool               bDoEne,
+ +                                            bool               bDoPrune)
+ +{
+ +    const char* kernel_name_to_run;
+ +    cl_kernel  *kernel_ptr;
+ +    cl_int      cl_error;
+ +
+ +    assert(eeltype  < eelOclNR);
+ +    assert(evdwtype < evdwOclNR);
+ +
+ +    if (bDoEne)
+ +    {
+ +        if (bDoPrune)
+ +        {
+ +            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+ +            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+ +        }
+ +        else
+ +        {
+ +            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+ +            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+ +        }
+ +    }
+ +    else
+ +    {
+ +        if (bDoPrune)
+ +        {
+ +            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+ +            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+ +        }
+ +        else
+ +        {
+ +            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+ +            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+ +        }
+ +    }
+ +
+ +    if (nullptr == kernel_ptr[0])
+ +    {
+ +        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
+ +        assert(cl_error == CL_SUCCESS);
+ +    }
+ +    // TODO: handle errors
+ +
+ +    return *kernel_ptr;
+ +}
+ +
+ +/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
+ + */
+ +static inline int calc_shmem_required_nonbonded(int  vdwType,
+ +                                                bool bPrefetchLjParam)
+ +{
+ +    int shmem;
+ +
+ +    /* size of shmem (force-buffers/xq/atom type preloading) */
+ +    /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+ +    /* i-atom x+q in shared memory */
+ +    shmem  = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+ +    /* cj in shared memory, for both warps separately
+ +     * TODO: in the "nowarp kernels we load cj only once  so the factor 2 is not needed.
+ +     */
+ +    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int);           /* cjs  */
+ +    if (bPrefetchLjParam)
+ +    {
+ +        if (useLjCombRule(vdwType))
+ +        {
+ +            /* i-atom LJ combination parameters in shared memory */
+ +            shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
+ +        }
+ +        else
+ +        {
+ +            /* i-atom types in shared memory */
+ +            shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+ +        }
+ +    }
+ +    /* force reduction buffers in shared memory */
+ +    shmem += c_clSize * c_clSize * 3 * sizeof(float);    /* f_buf */
+ +    /* Warp vote. In fact it must be * number of warps in block.. */
+ +    shmem += sizeof(cl_uint) * 2;                        /* warp_any */
+ +    return shmem;
+ +}
+ +
+ +/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
+ + *
+ + *  The device can't use the same data structures as the host for two main reasons:
+ + *  - OpenCL restrictions (pointers are not accepted inside data structures)
+ + *  - some host side fields are not needed for the OpenCL kernels.
+ + *
+ + *  This function is called before the launch of both nbnxn and prune kernels.
+ + */
+ +static void fillin_ocl_structures(cl_nbparam_t        *nbp,
+ +                                  cl_nbparam_params_t *nbparams_params)
+ +{
+ +    nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
+ +    nbparams_params->c_rf              = nbp->c_rf;
+ +    nbparams_params->dispersion_shift  = nbp->dispersion_shift;
+ +    nbparams_params->eeltype           = nbp->eeltype;
+ +    nbparams_params->epsfac            = nbp->epsfac;
+ +    nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
+ +    nbparams_params->ewald_beta        = nbp->ewald_beta;
+ +    nbparams_params->rcoulomb_sq       = nbp->rcoulomb_sq;
+ +    nbparams_params->repulsion_shift   = nbp->repulsion_shift;
+ +    nbparams_params->rlistOuter_sq     = nbp->rlistOuter_sq;
+ +    nbparams_params->rvdw_sq           = nbp->rvdw_sq;
+ +    nbparams_params->rlistInner_sq     = nbp->rlistInner_sq;
+ +    nbparams_params->rvdw_switch       = nbp->rvdw_switch;
+ +    nbparams_params->sh_ewald          = nbp->sh_ewald;
+ +    nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
+ +    nbparams_params->two_k_rf          = nbp->two_k_rf;
+ +    nbparams_params->vdwtype           = nbp->vdwtype;
+ +    nbparams_params->vdw_switch        = nbp->vdw_switch;
+ +}
+ +
+ +/*! \brief Enqueues a wait for event completion.
+ + *
+ + * Then it releases the event and sets it to 0.
+ + * Don't use this function when more than one wait will be issued for the event.
+ + * Equivalent to Cuda Stream Sync. */
+ +static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
+ +{
+ +    cl_int gmx_unused cl_error;
+ +
+ +    /* Enqueue wait */
+ +    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
+ +    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
+ +
+ +    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
+ +    cl_error = clReleaseEvent(*ocl_event);
+ +    assert(CL_SUCCESS == cl_error);
+ +    *ocl_event = nullptr;
+ +}
+ +
+ +/*! \brief Launch asynchronously the xq buffer host to device copy. */
+ +void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
+ +                        const nbnxn_atomdata_t *nbatom,
+ +                        const AtomLocality      atomLocality)
+ +{
+ +    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ +
+ +    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+ +
+ +    /* local/nonlocal offset and length used for xq and f */
+ +    int                  adat_begin, adat_len;
+ +
+ +    cl_atomdata_t       *adat    = nb->atdat;
+ +    cl_plist_t          *plist   = nb->plist[iloc];
+ +    cl_timers_t         *t       = nb->timers;
+ +    cl_command_queue     stream  = nb->stream[iloc];
+ +
+ +    bool                 bDoTime = (nb->bDoTime) != 0;
+ +
+ +    /* Don't launch the non-local H2D copy if there is no dependent
+ +       work to do: neither non-local nor other (e.g. bonded) work
+ +       to do that has as input the nbnxn coordinates.
+ +       Doing the same for the local kernel is more complicated, since the
+ +       local part of the force array also depends on the non-local kernel.
+ +       So to avoid complicating the code and to reduce the risk of bugs,
+ +       we always call the local local x+q copy (and the rest of the local
+ +       work in nbnxn_gpu_launch_kernel().
+ +     */
+ +    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ +    {
+ +        plist->haveFreshList = false;
+ +
+ +        return;
+ +    }
+ +
+ +    /* calculate the atom data index range based on locality */
+ +    if (atomLocality == AtomLocality::Local)
+ +    {
+ +        adat_begin  = 0;
+ +        adat_len    = adat->natoms_local;
+ +    }
+ +    else
+ +    {
+ +        adat_begin  = adat->natoms_local;
+ +        adat_len    = adat->natoms - adat->natoms_local;
+ +    }
+ +
+ +    /* beginning of timed HtoD section */
+ +    if (bDoTime)
+ +    {
+ +        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+ +    }
+ +
+ +    /* HtoD x, q */
+ +    ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4,
+ +                       adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
+ +
+ +    if (bDoTime)
+ +    {
+ +        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+ +    }
+ +
+ +    /* When we get here all misc operations issues in the local stream as well as
+ +       the local xq H2D are done,
+ +       so we record that in the local stream and wait for it in the nonlocal one. */
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        if (iloc == InteractionLocality::Local)
+ +        {
+ +            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
+ +            assert(CL_SUCCESS == cl_error);
+ +
+ +            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+ +             * in the local stream in order to be able to sync with the above event
+ +             * from the non-local stream.
+ +             */
+ +            cl_error = clFlush(stream);
+ +            assert(CL_SUCCESS == cl_error);
+ +        }
+ +        else
+ +        {
+ +            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
+ +        }
+ +    }
+ +}
+ +
+ +
+ +/*! \brief Launch GPU kernel
+ +
+ +   As we execute nonbonded workload in separate queues, before launching
+ +   the kernel we need to make sure that he following operations have completed:
+ +   - atomdata allocation and related H2D transfers (every nstlist step);
+ +   - pair list H2D transfer (every nstlist step);
+ +   - shift vector H2D transfer (every nstlist step);
+ +   - force (+shift force and energy) output clearing (every step).
+ +
+ +   These operations are issued in the local queue at the beginning of the step
+ +   and therefore always complete before the local kernel launch. The non-local
+ +   kernel is launched after the local on the same device/context, so this is
+ +   inherently scheduled after the operations in the local stream (including the
+ +   above "misc_ops").
+ +   However, for the sake of having a future-proof implementation, we use the
+ +   misc_ops_done event to record the point in time when the above  operations
+ +   are finished and synchronize with this event in the non-local stream.
+ + */
+ +void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
+ +                       const gmx::StepWorkload          &stepWork,
+ +                       const Nbnxm::InteractionLocality  iloc)
+ +{
+ +    cl_atomdata_t       *adat    = nb->atdat;
+ +    cl_nbparam_t        *nbp     = nb->nbparam;
+ +    cl_plist_t          *plist   = nb->plist[iloc];
+ +    cl_timers_t         *t       = nb->timers;
+ +    cl_command_queue     stream  = nb->stream[iloc];
+ +
+ +    bool                 bDoTime     = (nb->bDoTime) != 0;
+ +
+ +    cl_nbparam_params_t  nbparams_params;
+ +
+ +    /* Don't launch the non-local kernel if there is no work to do.
+ +       Doing the same for the local kernel is more complicated, since the
+ +       local part of the force array also depends on the non-local kernel.
+ +       So to avoid complicating the code and to reduce the risk of bugs,
+ +       we always call the local kernel and later (not in
+ +       this function) the stream wait, local f copyback and the f buffer
+ +       clearing. All these operations, except for the local interaction kernel,
+ +       are needed for the non-local interactions. The skip of the local kernel
+ +       call is taken care of later in this function. */
+ +    if (canSkipNonbondedWork(*nb, iloc))
+ +    {
+ +        plist->haveFreshList = false;
+ +
+ +        return;
+ +    }
+ +
+ +    if (nbp->useDynamicPruning && plist->haveFreshList)
+ +    {
+ +        /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
+ +           (that's the way the timing accounting can distinguish between
+ +           separate prune kernel and combined force+prune).
+ +         */
+ +        Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
+ +    }
+ +
+ +    if (plist->nsci == 0)
+ +    {
+ +        /* Don't launch an empty local kernel (is not allowed with OpenCL).
+ +         */
+ +        return;
+ +    }
+ +
+ +    /* beginning of timed nonbonded calculation section */
+ +    if (bDoTime)
+ +    {
+ +        t->interaction[iloc].nb_k.openTimingRegion(stream);
+ +    }
+ +
+ +    /* kernel launch config */
+ +
+ +    KernelLaunchConfig config;
+ +    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
+ +    config.stream           = stream;
+ +    config.blockSize[0]     = c_clSize;
+ +    config.blockSize[1]     = c_clSize;
+ +    config.gridSize[0]      = plist->nsci;
+ +
+ +    validate_global_work_size(config, 3, nb->dev_info);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+ +                "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ +                config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ +                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
+ +                c_numClPerSupercl, plist->na_c);
+ +    }
+ +
+ +    fillin_ocl_structures(nbp, &nbparams_params);
+ +
+ +    auto          *timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ +    constexpr char kernelName[] = "k_calc_nb";
+ +    const auto     kernel       = select_nbnxn_kernel(nb,
+ +                                                      nbp->eeltype,
+ +                                                      nbp->vdwtype,
+ +                                                      stepWork.computeEnergy,
+ +                                                      (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
+ +
+ +
+ +    // The OpenCL kernel takes int as second to last argument because bool is
+ +    // not supported as a kernel argument type (sizeof(bool) is implementation defined).
+ +    const int computeFshift = static_cast<int>(stepWork.computeVirial);
+ +    if (useLjCombRule(nb->nbparam->vdwtype))
+ +    {
+ +        const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
+ +                                                          &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
+ +                                                          &adat->lj_comb,
+ +                                                          &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
+ +                                                          &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
+ +
+ +        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ +    }
+ +    else
+ +    {
+ +        const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
+ +                                                          &adat->ntypes,
+ +                                                          &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
+ +                                                          &adat->atom_types,
+ +                                                          &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
+ +                                                          &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
+ +        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        t->interaction[iloc].nb_k.closeTimingRegion(stream);
+ +    }
+ +}
+ +
+ +
+ +/*! \brief Calculates the amount of shared memory required by the prune kernel.
+ + *
+ + *  Note that for the sake of simplicity we use the CUDA terminology "shared memory"
+ + *  for OpenCL local memory.
+ + *
+ + * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd dimension.
+ + * \returns   the amount of local memory in bytes required by the pruning kernel
+ + */
+ +static inline int calc_shmem_required_prune(const int num_threads_z)
+ +{
+ +    int shmem;
+ +
+ +    /* i-atom x in shared memory (for convenience we load all 4 components including q) */
+ +    shmem  = c_numClPerSupercl * c_clSize * sizeof(float)*4;
+ +    /* cj in shared memory, for each warp separately
+ +     * Note: only need to load once per wavefront, but to keep the code simple,
+ +     * for now we load twice on AMD.
+ +     */
+ +    shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
+ +    /* Warp vote, requires one uint per warp/32 threads per block. */
+ +    shmem += sizeof(cl_uint) * 2*num_threads_z;
+ +
+ +    return shmem;
+ +}
+ +
+ +void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
+ +                                 const InteractionLocality  iloc,
+ +                                 const int                  numParts)
+ +{
+ +    cl_atomdata_t       *adat    = nb->atdat;
+ +    cl_nbparam_t        *nbp     = nb->nbparam;
+ +    cl_plist_t          *plist   = nb->plist[iloc];
+ +    cl_timers_t         *t       = nb->timers;
+ +    cl_command_queue     stream  = nb->stream[iloc];
+ +    bool                 bDoTime = nb->bDoTime == CL_TRUE;
+ +
+ +    if (plist->haveFreshList)
+ +    {
+ +        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
+ +
+ +        /* Set rollingPruningNumParts to signal that it is not set */
+ +        plist->rollingPruningNumParts = 0;
+ +        plist->rollingPruningPart     = 0;
+ +    }
+ +    else
+ +    {
+ +        if (plist->rollingPruningNumParts == 0)
+ +        {
+ +            plist->rollingPruningNumParts = numParts;
+ +        }
+ +        else
+ +        {
+ +            GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
+ +        }
+ +    }
+ +
+ +    /* Use a local variable for part and update in plist, so we can return here
+ +     * without duplicating the part increment code.
+ +     */
+ +    int part = plist->rollingPruningPart;
+ +
+ +    plist->rollingPruningPart++;
+ +    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
+ +    {
+ +        plist->rollingPruningPart = 0;
+ +    }
+ +
+ +    /* Compute the number of list entries to prune in this pass */
+ +    int numSciInPart = (plist->nsci - part)/numParts;
+ +
+ +    /* Don't launch the kernel if there is no work to do. */
+ +    if (numSciInPart <= 0)
+ +    {
+ +        plist->haveFreshList = false;
+ +
+ +        return;
+ +    }
+ +
+ +    GpuRegionTimer *timer = nullptr;
+ +    if (bDoTime)
+ +    {
+ +        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ +    }
+ +
+ +    /* beginning of timed prune calculation section */
+ +    if (bDoTime)
+ +    {
+ +        timer->openTimingRegion(stream);
+ +    }
+ +
+ +    /* Kernel launch config:
+ +     * - The thread block dimensions match the size of i-clusters, j-clusters,
+ +     *   and j-cluster concurrency, in x, y, and z, respectively.
+ +     * - The 1D block-grid contains as many blocks as super-clusters.
+ +     */
+ +    int       num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
+ +
+ +    /* kernel launch config */
+ +    KernelLaunchConfig config;
+ +    config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
+ +    config.stream           = stream;
+ +    config.blockSize[0]     = c_clSize;
+ +    config.blockSize[1]     = c_clSize;
+ +    config.blockSize[2]     = num_threads_z;
+ +    config.gridSize[0]      = numSciInPart;
+ +
+ +    validate_global_work_size(config, 3, nb->dev_info);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+ +                "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ +                "\tShMem: %zu\n",
+ +                config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ +                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
+ +                c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
+ +    }
+ +
+ +    cl_nbparam_params_t  nbparams_params;
+ +    fillin_ocl_structures(nbp, &nbparams_params);
+ +
+ +    auto          *timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
+ +    constexpr char kernelName[] = "k_pruneonly";
+ +    const auto     pruneKernel  = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
+ +    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel, config,
+ +                                                            &nbparams_params, &adat->xq, &adat->shift_vec,
+ +                                                            &plist->sci, &plist->cj4, &plist->imask, &numParts, &part);
+ +    launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+ +
+ +    if (plist->haveFreshList)
+ +    {
+ +        plist->haveFreshList         = false;
+ +        /* Mark that pruning has been done */
+ +        nb->timers->interaction[iloc].didPrune = true;
+ +    }
+ +    else
+ +    {
+ +        /* Mark that rolling pruning has been done */
+ +        nb->timers->interaction[iloc].didRollingPrune = true;
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        timer->closeTimingRegion(stream);
+ +    }
+ +}
+ +
+ +/*! \brief
+ + * Launch asynchronously the download of nonbonded forces from the GPU
+ + * (and energies/shift forces if required).
+ + */
+ +void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
+ +                        struct nbnxn_atomdata_t                  *nbatom,
+ +                        const gmx::StepWorkload                  &stepWork,
+ +                        const AtomLocality                        aloc,
+ +                        const bool                     gmx_unused copyBackNbForce)
+ +{
+ +    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ +
+ +    cl_int gmx_unused cl_error;
+ +    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ +
+ +    /* determine interaction locality from atom locality */
+ +    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
+ +
+ +    cl_atomdata_t            *adat    = nb->atdat;
+ +    cl_timers_t              *t       = nb->timers;
+ +    bool                      bDoTime = nb->bDoTime == CL_TRUE;
+ +    cl_command_queue          stream  = nb->stream[iloc];
+ +
+ +    /* don't launch non-local copy-back if there was no non-local work to do */
+ +    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ +    {
+ +        /* TODO An alternative way to signal that non-local work is
+ +           complete is to use a clEnqueueMarker+clEnqueueBarrier
+ +           pair. However, the use of bNonLocalStreamActive has the
+ +           advantage of being local to the host, so probably minimizes
+ +           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
+ +           test case, overall simulation performance was higher with
+ +           the API calls, but this has not been tested on AMD OpenCL,
+ +           so could be worth considering in future. */
+ +        nb->bNonLocalStreamActive = CL_FALSE;
+ +        return;
+ +    }
+ +
+ +    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
+ +
+ +    /* beginning of timed D2H section */
+ +    if (bDoTime)
+ +    {
+ +        t->xf[aloc].nb_d2h.openTimingRegion(stream);
+ +    }
+ +
+ +    /* With DD the local D2H transfer can only start after the non-local
+ +       has been launched. */
+ +    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+ +    {
+ +        sync_ocl_event(stream, &(nb->nonlocal_done));
+ +    }
+ +
+ +    /* DtoH f */
+ +    ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
+ +                       (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ +
+ +    /* kick off work */
+ +    cl_error = clFlush(stream);
+ +    assert(CL_SUCCESS == cl_error);
+ +
+ +    /* After the non-local D2H is launched the nonlocal_done event can be
+ +       recorded which signals that the local D2H can proceed. This event is not
+ +       placed after the non-local kernel because we first need the non-local
+ +       data back first. */
+ +    if (iloc == InteractionLocality::NonLocal)
+ +    {
+ +        cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
+ +        assert(CL_SUCCESS == cl_error);
+ +        nb->bNonLocalStreamActive = CL_TRUE;
+ +    }
+ +
+ +    /* only transfer energies in the local stream */
+ +    if (iloc == InteractionLocality::Local)
+ +    {
+ +        /* DtoH fshift when virial is needed */
+ +        if (stepWork.computeVirial)
+ +        {
+ +            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+ +                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ +        }
+ +
+ +        /* DtoH energies */
+ +        if (stepWork.computeEnergy)
+ +        {
+ +            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
+ +                               sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ +
+ +            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
+ +                               sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ +        }
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        t->xf[aloc].nb_d2h.closeTimingRegion(stream);
+ +    }
+ +}
+ +
+ +
+ +/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
++int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t &ic)
+ +{
++    bool bTwinCut = (ic.rcoulomb != ic.rvdw);
+ +    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+ +    int  kernel_type;
+ +
+ +    /* Benchmarking/development environment variables to force the use of
+ +       analytical or tabulated Ewald kernel. */
+ +    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
+ +    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
+ +
+ +    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+ +    {
+ +        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
+ +                   "requested through environment variables.");
+ +    }
+ +
+ +    /* OpenCL: By default, use analytical Ewald
+ +     * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
+ +     *
+ +     * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
+ +     *
+ +     */
+ +    /* By default use analytical Ewald. */
+ +    bUseAnalyticalEwald = true;
+ +    if (bForceAnalyticalEwald)
+ +    {
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
+ +        }
+ +    }
+ +    else if (bForceTabulatedEwald)
+ +    {
+ +        bUseAnalyticalEwald = false;
+ +
+ +        if (debug)
+ +        {
+ +            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
+ +        }
+ +    }
+ +
+ +    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+ +       forces it (use it for debugging/benchmarking only). */
+ +    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == nullptr))
+ +    {
+ +        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
+ +    }
+ +    else
+ +    {
+ +        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
+ +    }
+ +
+ +    return kernel_type;
+ +}
+ +
+ +} // namespace Nbnxm
diff --cc src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index a531330459199caeaa30288c2c4392a23a1ebdb9,0000000000000000000000000000000000000000..a26ab41c990e8f6f643860f02baf1dceadac1dc8

mode 100644,000000..100644
--- 1/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
--- /dev/null
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@@ -1,1163 -1,0 +1,1162 @@@
-         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
-         *gpu_eeltype = gpu_pick_ewald_kernel_type(false);
+ +/*
+ + * This file is part of the GROMACS molecular simulation package.
+ + *
+ + * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ + * and including many others, as listed in the AUTHORS file in the
+ + * top-level source directory and at http://www.gromacs.org.
+ + *
+ + * GROMACS is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU Lesser General Public License
+ + * as published by the Free Software Foundation; either version 2.1
+ + * of the License, or (at your option) any later version.
+ + *
+ + * GROMACS is distributed in the hope that it will be useful,
+ + * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ + * Lesser General Public License for more details.
+ + *
+ + * You should have received a copy of the GNU Lesser General Public
+ + * License along with GROMACS; if not, see
+ + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ + *
+ + * If you want to redistribute modifications to GROMACS, please
+ + * consider that scientific software is very special. Version
+ + * control is crucial - bugs must be traceable. We will be happy to
+ + * consider code for inclusion in the official distribution, but
+ + * derived work must not be called official GROMACS. Details are found
+ + * in the README & COPYING files - if they are missing, get the
+ + * official version at http://www.gromacs.org.
+ + *
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the research papers on the package. Check out http://www.gromacs.org.
+ + */
+ +/*! \internal \file
+ + *  \brief Define OpenCL implementation of nbnxm_gpu_data_mgmt.h
+ + *
+ + *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ + *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ + *  \author Teemu Virolainen <teemu@streamcomputing.eu>
+ + *  \author Szilárd Páll <pall.szilard@gmail.com>
+ + */
+ +#include "gmxpre.h"
+ +
+ +#include <assert.h>
+ +#include <stdarg.h>
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +#include <string.h>
+ +
+ +#include <cmath>
+ +
+ +// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+ +//      is currently declared means this has to be before gpu_types.h
+ +#include "nbnxm_ocl_types.h"
+ +
+ +// TODO Remove this comment when the above order issue is resolved
+ +#include "gromacs/gpu_utils/gpu_utils.h"
+ +#include "gromacs/gpu_utils/oclutils.h"
+ +#include "gromacs/hardware/gpu_hw_info.h"
+ +#include "gromacs/math/vectypes.h"
+ +#include "gromacs/mdlib/force_flags.h"
+ +#include "gromacs/mdtypes/interaction_const.h"
+ +#include "gromacs/mdtypes/md_enums.h"
+ +#include "gromacs/nbnxm/atomdata.h"
+ +#include "gromacs/nbnxm/gpu_data_mgmt.h"
+ +#include "gromacs/nbnxm/gpu_jit_support.h"
+ +#include "gromacs/nbnxm/nbnxm.h"
+ +#include "gromacs/nbnxm/nbnxm_gpu.h"
+ +#include "gromacs/nbnxm/pairlistsets.h"
+ +#include "gromacs/pbcutil/ishift.h"
+ +#include "gromacs/timing/gpu_timing.h"
+ +#include "gromacs/utility/cstringutil.h"
+ +#include "gromacs/utility/fatalerror.h"
+ +#include "gromacs/utility/gmxassert.h"
+ +#include "gromacs/utility/real.h"
+ +#include "gromacs/utility/smalloc.h"
+ +
+ +#include "nbnxm_ocl_internal.h"
+ +
+ +namespace Nbnxm
+ +{
+ +
+ +/*! \brief Copies of values from cl_driver_diagnostics_intel.h,
+ + * which isn't guaranteed to be available. */
+ +/**@{*/
+ +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL           0x4106
+ +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL     0x1
+ +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL      0x2
+ +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL  0x4
+ +/**@}*/
+ +
+ +/*! \brief This parameter should be determined heuristically from the
+ + * kernel execution times
+ + *
+ + * This value is best for small systems on a single AMD Radeon R9 290X
+ + * (and about 5% faster than 40, which is the default for CUDA
+ + * devices). Larger simulation systems were quite insensitive to the
+ + * value of this parameter.
+ + */
+ +static unsigned int gpu_min_ci_balanced_factor = 50;
+ +
+ +
+ +/*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
+ + *
+ + * Full doc in nbnxn_ocl_internal.h */
+ +bool useLjCombRule(int vdwType)
+ +{
+ +    return (vdwType == evdwOclCUTCOMBGEOM ||
+ +            vdwType == evdwOclCUTCOMBLB);
+ +}
+ +
+ +/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
+ + * and the table GPU array.
+ + *
+ + * If called with an already allocated table, it just re-uploads the
+ + * table.
+ + */
+ +static void init_ewald_coulomb_force_table(const EwaldCorrectionTables     &tables,
+ +                                           cl_nbparam_t                    *nbp,
+ +                                           const gmx_device_runtime_data_t *runData)
+ +{
+ +    cl_mem       coul_tab;
+ +
+ +    cl_int       cl_error;
+ +
+ +    if (nbp->coulomb_tab_climg2d != nullptr)
+ +    {
+ +        freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
+ +    }
+ +
+ +    /* Switched from using textures to using buffers */
+ +    // TODO: decide which alternative is most efficient - textures or buffers.
+ +    /*
+ +       cl_image_format array_format;
+ +
+ +       array_format.image_channel_data_type = CL_FLOAT;
+ +       array_format.image_channel_order     = CL_R;
+ +
+ +       coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ +       &array_format, tabsize, 1, 0, ftmp, &cl_error);
+ +     */
+ +
+ +    coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ +                              tables.tableF.size()*sizeof(cl_float), const_cast<real *>(tables.tableF.data()), &cl_error);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                       ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +    nbp->coulomb_tab_climg2d  = coul_tab;
+ +    nbp->coulomb_tab_scale    = tables.scale;
+ +}
+ +
+ +
+ +/*! \brief Initializes the atomdata structure first time, it only gets filled at
+ +    pair-search.
+ + */
+ +static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
+ +{
+ +    cl_int cl_error;
+ +
+ +    ad->ntypes  = ntypes;
+ +
+ +    /* An element of the shift_vec device buffer has the same size as one element
+ +       of the host side shift_vec buffer. */
+ +    ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec.data());
+ +
+ +    ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ +                                   SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                       ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +    ad->bShiftVecUploaded = CL_FALSE;
+ +
+ +    /* An element of the fshift device buffer has the same size as one element
+ +       of the host side fshift buffer. */
+ +    ad->fshift_elem_size = sizeof(*cl_nb_staging_t::fshift);
+ +
+ +    ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ +                                SHIFTS * ad->fshift_elem_size, nullptr, &cl_error);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                       ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +    ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ +                              sizeof(float), nullptr, &cl_error);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                       ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +    ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ +                              sizeof(float), nullptr, &cl_error);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                       ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +    /* initialize to nullptr pointers to data that is not allocated here and will
+ +       need reallocation in nbnxn_gpu_init_atomdata */
+ +    ad->xq = nullptr;
+ +    ad->f  = nullptr;
+ +
+ +    /* size -1 indicates that the respective array hasn't been initialized yet */
+ +    ad->natoms = -1;
+ +    ad->nalloc = -1;
+ +}
+ +
+ +/*! \brief Copies all parameters related to the cut-off from ic to nbp
+ + */
+ +static void set_cutoff_parameters(cl_nbparam_t              *nbp,
+ +                                  const interaction_const_t *ic,
+ +                                  const PairlistParams      &listParams)
+ +{
+ +    nbp->ewald_beta        = ic->ewaldcoeff_q;
+ +    nbp->sh_ewald          = ic->sh_ewald;
+ +    nbp->epsfac            = ic->epsfac;
+ +    nbp->two_k_rf          = 2.0 * ic->k_rf;
+ +    nbp->c_rf              = ic->c_rf;
+ +    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
+ +    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
+ +    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
+ +    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
+ +    nbp->useDynamicPruning = listParams.useDynamicPruning;
+ +
+ +    nbp->sh_lj_ewald       = ic->sh_lj_ewald;
+ +    nbp->ewaldcoeff_lj     = ic->ewaldcoeff_lj;
+ +
+ +    nbp->rvdw_switch       = ic->rvdw_switch;
+ +    nbp->dispersion_shift  = ic->dispersion_shift;
+ +    nbp->repulsion_shift   = ic->repulsion_shift;
+ +    nbp->vdw_switch        = ic->vdw_switch;
+ +}
+ +
+ +/*! \brief Returns the kinds of electrostatics and Vdw OpenCL
+ + *  kernels that will be used.
+ + *
+ + * Respectively, these values are from enum eelOcl and enum
+ + * evdwOcl. */
+ +static void
+ +map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
+ +                                            int                        combRule,
+ +                                            int                       *gpu_eeltype,
+ +                                            int                       *gpu_vdwtype)
+ +{
+ +    if (ic->vdwtype == evdwCUT)
+ +    {
+ +        switch (ic->vdw_modifier)
+ +        {
+ +            case eintmodNONE:
+ +            case eintmodPOTSHIFT:
+ +                switch (combRule)
+ +                {
+ +                    case ljcrNONE:
+ +                        *gpu_vdwtype = evdwOclCUT;
+ +                        break;
+ +                    case ljcrGEOM:
+ +                        *gpu_vdwtype = evdwOclCUTCOMBGEOM;
+ +                        break;
+ +                    case ljcrLB:
+ +                        *gpu_vdwtype = evdwOclCUTCOMBLB;
+ +                        break;
+ +                    default:
+ +                        gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
+ +                }
+ +                break;
+ +            case eintmodFORCESWITCH:
+ +                *gpu_vdwtype = evdwOclFSWITCH;
+ +                break;
+ +            case eintmodPOTSWITCH:
+ +                *gpu_vdwtype = evdwOclPSWITCH;
+ +                break;
+ +            default:
+ +                gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
+ +        }
+ +    }
+ +    else if (ic->vdwtype == evdwPME)
+ +    {
+ +        if (ic->ljpme_comb_rule == ljcrGEOM)
+ +        {
+ +            *gpu_vdwtype = evdwOclEWALDGEOM;
+ +        }
+ +        else
+ +        {
+ +            *gpu_vdwtype = evdwOclEWALDLB;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
+ +    }
+ +
+ +    if (ic->eeltype == eelCUT)
+ +    {
+ +        *gpu_eeltype = eelOclCUT;
+ +    }
+ +    else if (EEL_RF(ic->eeltype))
+ +    {
+ +        *gpu_eeltype = eelOclRF;
+ +    }
+ +    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
+ +    {
-     nbp->eeltype = gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
++        *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
+ +    }
+ +    else
+ +    {
+ +        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+ +        gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
+ +    }
+ +}
+ +
+ +/*! \brief Initializes the nonbonded parameter data structure.
+ + */
+ +static void init_nbparam(cl_nbparam_t                    *nbp,
+ +                         const interaction_const_t       *ic,
+ +                         const PairlistParams            &listParams,
+ +                         const nbnxn_atomdata_t::Params  &nbatParams,
+ +                         const gmx_device_runtime_data_t *runData)
+ +{
+ +    cl_int cl_error;
+ +
+ +    set_cutoff_parameters(nbp, ic, listParams);
+ +
+ +    map_interaction_types_to_gpu_kernel_flavors(ic,
+ +                                                nbatParams.comb_rule,
+ +                                                &(nbp->eeltype),
+ +                                                &(nbp->vdwtype));
+ +
+ +    if (ic->vdwtype == evdwPME)
+ +    {
+ +        if (ic->ljpme_comb_rule == ljcrGEOM)
+ +        {
+ +            GMX_ASSERT(nbatParams.comb_rule == ljcrGEOM, "Combination rule mismatch!");
+ +        }
+ +        else
+ +        {
+ +            GMX_ASSERT(nbatParams.comb_rule == ljcrLB, "Combination rule mismatch!");
+ +        }
+ +    }
+ +    /* generate table for PME */
+ +    nbp->coulomb_tab_climg2d = nullptr;
+ +    if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
+ +    {
+ +        GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ +        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, runData);
+ +    }
+ +    else
+ +    // TODO: improvement needed.
+ +    // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
+ +    // don't accept nullptr values for image2D parameters.
+ +    {
+ +        /* Switched from using textures to using buffers */
+ +        // TODO: decide which alternative is most efficient - textures or buffers.
+ +        /*
+ +           cl_image_format array_format;
+ +
+ +           array_format.image_channel_data_type = CL_FLOAT;
+ +           array_format.image_channel_order     = CL_R;
+ +
+ +           nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
+ +            &array_format, 1, 1, 0, nullptr, &cl_error);
+ +         */
+ +
+ +        nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +    }
+ +
+ +    const int nnbfp      = 2*nbatParams.numTypes*nbatParams.numTypes;
+ +    const int nnbfp_comb = 2*nbatParams.numTypes;
+ +
+ +    {
+ +        /* Switched from using textures to using buffers */
+ +        // TODO: decide which alternative is most efficient - textures or buffers.
+ +        /*
+ +           cl_image_format array_format;
+ +
+ +           array_format.image_channel_data_type = CL_FLOAT;
+ +           array_format.image_channel_order     = CL_R;
+ +
+ +           nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ +            &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
+ +         */
+ +
+ +        nbp->nbfp_climg2d =
+ +            clCreateBuffer(runData->context,
+ +                           CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ +                           nnbfp*sizeof(cl_float),
+ +                           const_cast<float *>(nbatParams.nbfp.data()),
+ +                           &cl_error);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +        if (ic->vdwtype == evdwPME)
+ +        {
+ +            /* Switched from using textures to using buffers */
+ +            // TODO: decide which alternative is most efficient - textures or buffers.
+ +            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ +                &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
+ +            nbp->nbfp_comb_climg2d =
+ +                clCreateBuffer(runData->context,
+ +                               CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ +                               nnbfp_comb*sizeof(cl_float),
+ +                               const_cast<float *>(nbatParams.nbfp_comb.data()),
+ +                               &cl_error);
+ +            GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                               ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +        }
+ +        else
+ +        {
+ +            // TODO: improvement needed.
+ +            // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
+ +            // don't accept nullptr values for image2D parameters.
+ +            /* Switched from using textures to using buffers */
+ +            // TODO: decide which alternative is most efficient - textures or buffers.
+ +            /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
+ +                &array_format, 1, 1, 0, nullptr, &cl_error);*/
+ +            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
+ +            GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                               ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +        }
+ +    }
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+ +                                  const interaction_const_t   *ic)
+ +{
+ +    if (!nbv || !nbv->useGpu())
+ +    {
+ +        return;
+ +    }
+ +    gmx_nbnxn_ocl_t    *nb  = nbv->gpu_nbv;
+ +    cl_nbparam_t       *nbp = nb->nbparam;
+ +
+ +    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
+ +
++    nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
+ +
+ +    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ +    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, nb->dev_rundata);
+ +}
+ +
+ +/*! \brief Initializes the pair list data structure.
+ + */
+ +static void init_plist(cl_plist_t *pl)
+ +{
+ +    /* initialize to nullptr pointers to data that is not allocated here and will
+ +       need reallocation in nbnxn_gpu_init_pairlist */
+ +    pl->sci     = nullptr;
+ +    pl->cj4     = nullptr;
+ +    pl->imask   = nullptr;
+ +    pl->excl    = nullptr;
+ +
+ +    /* size -1 indicates that the respective array hasn't been initialized yet */
+ +    pl->na_c           = -1;
+ +    pl->nsci           = -1;
+ +    pl->sci_nalloc     = -1;
+ +    pl->ncj4           = -1;
+ +    pl->cj4_nalloc     = -1;
+ +    pl->nimask         = -1;
+ +    pl->imask_nalloc   = -1;
+ +    pl->nexcl          = -1;
+ +    pl->excl_nalloc    = -1;
+ +    pl->haveFreshList  = false;
+ +}
+ +
+ +/*! \brief Initializes the timings data structure.
+ + */
+ +static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
+ +{
+ +    int i, j;
+ +
+ +    t->nb_h2d_t = 0.0;
+ +    t->nb_d2h_t = 0.0;
+ +    t->nb_c     = 0;
+ +    t->pl_h2d_t = 0.0;
+ +    t->pl_h2d_c = 0;
+ +    for (i = 0; i < 2; i++)
+ +    {
+ +        for (j = 0; j < 2; j++)
+ +        {
+ +            t->ktime[i][j].t = 0.0;
+ +            t->ktime[i][j].c = 0;
+ +        }
+ +    }
+ +
+ +    t->pruneTime.c        = 0;
+ +    t->pruneTime.t        = 0.0;
+ +    t->dynamicPruneTime.c = 0;
+ +    t->dynamicPruneTime.t = 0.0;
+ +}
+ +
+ +
+ +//! OpenCL notification callback function
+ +static void CL_CALLBACK
+ +ocl_notify_fn(const char *pErrInfo, const void gmx_unused *private_info, size_t gmx_unused cb, void gmx_unused *user_data)
+ +{
+ +    if (pErrInfo != nullptr)
+ +    {
+ +        printf("%s\n", pErrInfo ); // Print error/hint
+ +    }
+ +}
+ +
+ +/*! \brief Creates context for OpenCL GPU given by \p mygpu
+ + *
+ + * A fatal error results if creation fails.
+ + *
+ + * \param[inout] runtimeData runtime data including program and context
+ + * \param[in]    devInfo     device info struct
+ + * \param[in]    rank        MPI rank (for error reporting)
+ + */
+ +static void
+ +nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
+ +                         const gmx_device_info_t   *devInfo,
+ +                         int                        rank)
+ +{
+ +    cl_context_properties     context_properties[5];
+ +    cl_platform_id            platform_id;
+ +    cl_device_id              device_id;
+ +    cl_context                context;
+ +    cl_int                    cl_error;
+ +
+ +    assert(runtimeData != nullptr);
+ +    assert(devInfo != nullptr);
+ +
+ +    platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
+ +    device_id        = devInfo->ocl_gpu_id.ocl_device_id;
+ +
+ +    int i = 0;
+ +    context_properties[i++] = CL_CONTEXT_PLATFORM;
+ +    context_properties[i++] = reinterpret_cast<cl_context_properties>(platform_id);
+ +    if (getenv("GMX_OCL_SHOW_DIAGNOSTICS"))
+ +    {
+ +        context_properties[i++] = CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL;
+ +        context_properties[i++] = CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL;
+ +    }
+ +    context_properties[i++] = 0; /* Terminates the list of properties */
+ +
+ +    context = clCreateContext(context_properties, 1, &device_id, ocl_notify_fn, nullptr, &cl_error);
+ +    if (CL_SUCCESS != cl_error)
+ +    {
+ +        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
+ +                  rank,
+ +                  devInfo->device_name,
+ +                  cl_error, ocl_get_error_string(cl_error).c_str());
+ +    }
+ +
+ +    runtimeData->context = context;
+ +}
+ +
+ +/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
+ +static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
+ +                                         const char      *kernel_name)
+ +{
+ +    cl_kernel kernel;
+ +    cl_int    cl_error;
+ +
+ +    kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
+ +    if (CL_SUCCESS != cl_error)
+ +    {
+ +        gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
+ +                  kernel_name,
+ +                  nb->dev_info->device_name,
+ +                  cl_error);
+ +    }
+ +
+ +    return kernel;
+ +}
+ +
+ +/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
+ + */
+ +static void
+ +nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
+ +{
+ +
+ +    cl_int               cl_error;
+ +    cl_atomdata_t *      adat     = nb->atdat;
+ +    cl_command_queue     ls       = nb->stream[InteractionLocality::Local];
+ +
+ +    size_t               local_work_size[3]   = {1, 1, 1};
+ +    size_t               global_work_size[3]  = {1, 1, 1};
+ +
+ +    cl_int               shifts   = SHIFTS*3;
+ +
+ +    cl_int               arg_no;
+ +
+ +    cl_kernel            zero_e_fshift = nb->kernel_zero_e_fshift;
+ +
+ +    local_work_size[0]   = 64;
+ +    // Round the total number of threads up from the array size
+ +    global_work_size[0]  = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
+ +
+ +    arg_no    = 0;
+ +    cl_error  = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
+ +    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+ +    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
+ +    cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
+ +    GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
+ +
+ +    cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
+ +    GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
+ +}
+ +
+ +/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
+ +static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
+ +{
+ +    /* Init to 0 main kernel arrays */
+ +    /* They will be later on initialized in select_nbnxn_kernel */
+ +    // TODO: consider always creating all variants of the kernels here so that there is no
+ +    // need for late call to clCreateKernel -- if that gives any advantage?
+ +    memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
+ +    memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
+ +    memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
+ +    memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
+ +
+ +    /* Init pruning kernels
+ +     *
+ +     * TODO: we could avoid creating kernels if dynamic pruning is turned off,
+ +     * but ATM that depends on force flags not passed into the initialization.
+ +     */
+ +    nb->kernel_pruneonly[epruneFirst]   = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
+ +    nb->kernel_pruneonly[epruneRolling] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
+ +
+ +    /* Init auxiliary kernels */
+ +    nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
+ +}
+ +
+ +/*! \brief Initializes simulation constant data.
+ + *
+ + *  Initializes members of the atomdata and nbparam structs and
+ + *  clears e/fshift output buffers.
+ + */
+ +static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
+ +                                 const interaction_const_t      *ic,
+ +                                 const PairlistParams           &listParams,
+ +                                 const nbnxn_atomdata_t::Params &nbatParams)
+ +{
+ +    init_atomdata_first(nb->atdat, nbatParams.numTypes, nb->dev_rundata);
+ +    init_nbparam(nb->nbparam, ic, listParams, nbatParams, nb->dev_rundata);
+ +}
+ +
+ +
+ +//! This function is documented in the header file
+ +gmx_nbnxn_ocl_t *
+ +gpu_init(const gmx_device_info_t   *deviceInfo,
+ +         const interaction_const_t *ic,
+ +         const PairlistParams      &listParams,
+ +         const nbnxn_atomdata_t    *nbat,
+ +         const int                  rank,
+ +         const gmx_bool             bLocalAndNonlocal)
+ +{
+ +    gmx_nbnxn_ocl_t            *nb;
+ +    cl_int                      cl_error;
+ +    cl_command_queue_properties queue_properties;
+ +
+ +    assert(ic);
+ +
+ +    snew(nb, 1);
+ +    snew(nb->atdat, 1);
+ +    snew(nb->nbparam, 1);
+ +    snew(nb->plist[InteractionLocality::Local], 1);
+ +    if (bLocalAndNonlocal)
+ +    {
+ +        snew(nb->plist[InteractionLocality::NonLocal], 1);
+ +    }
+ +
+ +    nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
+ +
+ +    nb->timers = new cl_timers_t();
+ +    snew(nb->timings, 1);
+ +
+ +    /* set device info, just point it to the right GPU among the detected ones */
+ +    nb->dev_info = deviceInfo;
+ +    snew(nb->dev_rundata, 1);
+ +
+ +    /* init nbst */
+ +    pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
+ +    pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
+ +    pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
+ +
+ +    init_plist(nb->plist[InteractionLocality::Local]);
+ +
+ +    /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
+ +    nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+ +
+ +    /* Create queues only after bDoTime has been initialized */
+ +    if (nb->bDoTime)
+ +    {
+ +        queue_properties = CL_QUEUE_PROFILING_ENABLE;
+ +    }
+ +    else
+ +    {
+ +        queue_properties = 0;
+ +    }
+ +
+ +    nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
+ +
+ +    /* local/non-local GPU streams */
+ +    nb->stream[InteractionLocality::Local] =
+ +        clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+ +    if (CL_SUCCESS != cl_error)
+ +    {
+ +        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+ +                  rank,
+ +                  nb->dev_info->device_name,
+ +                  cl_error);
+ +    }
+ +
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        init_plist(nb->plist[InteractionLocality::NonLocal]);
+ +
+ +        nb->stream[InteractionLocality::NonLocal] =
+ +            clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+ +        if (CL_SUCCESS != cl_error)
+ +        {
+ +            gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+ +                      rank,
+ +                      nb->dev_info->device_name,
+ +                      cl_error);
+ +        }
+ +    }
+ +
+ +    if (nb->bDoTime)
+ +    {
+ +        init_timings(nb->timings);
+ +    }
+ +
+ +    nbnxn_ocl_init_const(nb, ic, listParams, nbat->params());
+ +
+ +    /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
+ +     * TODO: decide about NVIDIA
+ +     */
+ +    nb->bPrefetchLjParam =
+ +        (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr) &&
+ +        ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (nb->dev_info->vendor_e == OCL_VENDOR_INTEL)
+ +         || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
+ +
+ +    /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
+ +     * but sadly this is not supported in OpenCL (yet?). Consider adding it if
+ +     * it becomes supported.
+ +     */
+ +    nbnxn_gpu_compile_kernels(nb);
+ +    nbnxn_gpu_init_kernels(nb);
+ +
+ +    /* clear energy and shift force outputs */
+ +    nbnxn_ocl_clear_e_fshift(nb);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Initialized OpenCL data structures.\n");
+ +    }
+ +
+ +    return nb;
+ +}
+ +
+ +/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
+ + */
+ +static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
+ +{
+ +    if (natoms_clear == 0)
+ +    {
+ +        return;
+ +    }
+ +
+ +    cl_int gmx_used_in_debug cl_error;
+ +
+ +    cl_atomdata_t           *atomData = nb->atdat;
+ +    cl_command_queue         ls       = nb->stream[InteractionLocality::Local];
+ +    cl_float                 value    = 0.0F;
+ +
+ +    cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
+ +                                   0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueFillBuffer failed: " +
+ +                                                ocl_get_error_string(cl_error)).c_str());
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_clear_outputs(gmx_nbnxn_ocl_t *nb,
+ +                       bool             computeVirial)
+ +{
+ +    nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
+ +    /* clear shift force array and energies if the outputs were
+ +       used in the current step */
+ +    if (computeVirial)
+ +    {
+ +        nbnxn_ocl_clear_e_fshift(nb);
+ +    }
+ +
+ +    /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
+ +    cl_int gmx_unused cl_error;
+ +    cl_error = clFlush(nb->stream[InteractionLocality::Local]);
+ +    assert(CL_SUCCESS == cl_error);
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_init_pairlist(gmx_nbnxn_ocl_t           *nb,
+ +                       const NbnxnPairlistGpu    *h_plist,
+ +                       const InteractionLocality  iloc)
+ +{
+ +    char             sbuf[STRLEN];
+ +    // Timing accumulation should happen only if there was work to do
+ +    // because getLastRangeTime() gets skipped with empty lists later
+ +    // which leads to the counter not being reset.
+ +    bool             bDoTime    = ((nb->bDoTime == CL_TRUE) && !h_plist->sci.empty());
+ +    cl_command_queue stream     = nb->stream[iloc];
+ +    cl_plist_t      *d_plist    = nb->plist[iloc];
+ +
+ +    if (d_plist->na_c < 0)
+ +    {
+ +        d_plist->na_c = h_plist->na_ci;
+ +    }
+ +    else
+ +    {
+ +        if (d_plist->na_c != h_plist->na_ci)
+ +        {
+ +            sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+ +                    d_plist->na_c, h_plist->na_ci);
+ +            gmx_incons(sbuf);
+ +        }
+ +    }
+ +
+ +    gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+ +
+ +    if (bDoTime)
+ +    {
+ +        iTimers.pl_h2d.openTimingRegion(stream);
+ +        iTimers.didPairlistH2D = true;
+ +    }
+ +
+ +    // TODO most of this function is same in CUDA and OpenCL, move into the header
+ +    Context context = nb->dev_rundata->context;
+ +
+ +    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
+ +                           &d_plist->nsci, &d_plist->sci_nalloc, context);
+ +    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
+ +                       stream, GpuApiCallBehavior::Async,
+ +                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ +
+ +    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
+ +                           &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+ +    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
+ +                       stream, GpuApiCallBehavior::Async,
+ +                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ +
+ +    reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
+ +                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+ +
+ +    reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
+ +                           &d_plist->nexcl, &d_plist->excl_nalloc, context);
+ +    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
+ +                       stream, GpuApiCallBehavior::Async,
+ +                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ +
+ +    if (bDoTime)
+ +    {
+ +        iTimers.pl_h2d.closeTimingRegion(stream);
+ +    }
+ +
+ +    /* need to prune the pair list during the next step */
+ +    d_plist->haveFreshList = true;
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_upload_shiftvec(gmx_nbnxn_ocl_t        *nb,
+ +                         const nbnxn_atomdata_t *nbatom)
+ +{
+ +    cl_atomdata_t   *adat  = nb->atdat;
+ +    cl_command_queue ls    = nb->stream[InteractionLocality::Local];
+ +
+ +    /* only if we have a dynamic box */
+ +    if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+ +    {
+ +        ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
+ +                           SHIFTS * adat->shift_vec_elem_size, ls, nullptr);
+ +        adat->bShiftVecUploaded = CL_TRUE;
+ +    }
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_init_atomdata(gmx_nbnxn_ocl_t        *nb,
+ +                       const nbnxn_atomdata_t *nbat)
+ +{
+ +    cl_int           cl_error;
+ +    int              nalloc, natoms;
+ +    bool             realloced;
+ +    bool             bDoTime = nb->bDoTime == CL_TRUE;
+ +    cl_timers_t     *timers  = nb->timers;
+ +    cl_atomdata_t   *d_atdat = nb->atdat;
+ +    cl_command_queue ls      = nb->stream[InteractionLocality::Local];
+ +
+ +    natoms    = nbat->numAtoms();
+ +    realloced = false;
+ +
+ +    if (bDoTime)
+ +    {
+ +        /* time async copy */
+ +        timers->atdat.openTimingRegion(ls);
+ +    }
+ +
+ +    /* need to reallocate if we have to copy more atoms than the amount of space
+ +       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+ +    if (natoms > d_atdat->nalloc)
+ +    {
+ +        nalloc = over_alloc_small(natoms);
+ +
+ +        /* free up first if the arrays have already been initialized */
+ +        if (d_atdat->nalloc != -1)
+ +        {
+ +            freeDeviceBuffer(&d_atdat->f);
+ +            freeDeviceBuffer(&d_atdat->xq);
+ +            freeDeviceBuffer(&d_atdat->lj_comb);
+ +            freeDeviceBuffer(&d_atdat->atom_types);
+ +        }
+ +
+ +        d_atdat->f_elem_size = sizeof(rvec);
+ +
+ +        d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ +                                    nalloc * d_atdat->f_elem_size, nullptr, &cl_error);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +        d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ +                                     nalloc * sizeof(cl_float4), nullptr, &cl_error);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                           ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +        if (useLjCombRule(nb->nbparam->vdwtype))
+ +        {
+ +            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ +                                              nalloc * sizeof(cl_float2), nullptr, &cl_error);
+ +            GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                               ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +        }
+ +        else
+ +        {
+ +            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ +                                                 nalloc * sizeof(int), nullptr, &cl_error);
+ +            GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                               ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ +        }
+ +
+ +        d_atdat->nalloc = nalloc;
+ +        realloced       = true;
+ +    }
+ +
+ +    d_atdat->natoms       = natoms;
+ +    d_atdat->natoms_local = nbat->natoms_local;
+ +
+ +    /* need to clear GPU f output if realloc happened */
+ +    if (realloced)
+ +    {
+ +        nbnxn_ocl_clear_f(nb, nalloc);
+ +    }
+ +
+ +    if (useLjCombRule(nb->nbparam->vdwtype))
+ +    {
+ +        ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0,
+ +                           natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ +    }
+ +    else
+ +    {
+ +        ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0,
+ +                           natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ +
+ +    }
+ +
+ +    if (bDoTime)
+ +    {
+ +        timers->atdat.closeTimingRegion(ls);
+ +    }
+ +
+ +    /* kick off the tasks enqueued above to ensure concurrency with the search */
+ +    cl_error = clFlush(ls);
+ +    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ +                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
+ +}
+ +
+ +/*! \brief Releases an OpenCL kernel pointer */
+ +static void free_kernel(cl_kernel *kernel_ptr)
+ +{
+ +    cl_int gmx_unused cl_error;
+ +
+ +    assert(nullptr != kernel_ptr);
+ +
+ +    if (*kernel_ptr)
+ +    {
+ +        cl_error = clReleaseKernel(*kernel_ptr);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseKernel failed: " + ocl_get_error_string(cl_error)).c_str());
+ +
+ +        *kernel_ptr = nullptr;
+ +    }
+ +}
+ +
+ +/*! \brief Releases a list of OpenCL kernel pointers */
+ +static void free_kernels(cl_kernel *kernels, int count)
+ +{
+ +    int i;
+ +
+ +    for (i = 0; i < count; i++)
+ +    {
+ +        free_kernel(kernels + i);
+ +    }
+ +}
+ +
+ +/*! \brief Free the OpenCL runtime data (context and program).
+ + *
+ + *  The function releases the OpenCL context and program assuciated with the
+ + *  device that the calling PP rank is running on.
+ + *
+ + *  \param runData [in]  porinter to the structure with runtime data.
+ + */
+ +static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
+ +{
+ +    if (runData == nullptr)
+ +    {
+ +        return;
+ +    }
+ +
+ +    cl_int gmx_unused cl_error;
+ +
+ +    if (runData->context)
+ +    {
+ +        cl_error         = clReleaseContext(runData->context);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
+ +        runData->context = nullptr;
+ +    }
+ +
+ +    if (runData->program)
+ +    {
+ +        cl_error         = clReleaseProgram(runData->program);
+ +        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
+ +        runData->program = nullptr;
+ +    }
+ +
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_free(gmx_nbnxn_ocl_t *nb)
+ +{
+ +    if (nb == nullptr)
+ +    {
+ +        return;
+ +    }
+ +
+ +    /* Free kernels */
+ +    int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
+ +    free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);
+ +
+ +    kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
+ +    free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);
+ +
+ +    kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
+ +    free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);
+ +
+ +    kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
+ +    free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
+ +
+ +    free_kernel(&(nb->kernel_zero_e_fshift));
+ +
+ +    /* Free atdat */
+ +    freeDeviceBuffer(&(nb->atdat->xq));
+ +    freeDeviceBuffer(&(nb->atdat->f));
+ +    freeDeviceBuffer(&(nb->atdat->e_lj));
+ +    freeDeviceBuffer(&(nb->atdat->e_el));
+ +    freeDeviceBuffer(&(nb->atdat->fshift));
+ +    freeDeviceBuffer(&(nb->atdat->lj_comb));
+ +    freeDeviceBuffer(&(nb->atdat->atom_types));
+ +    freeDeviceBuffer(&(nb->atdat->shift_vec));
+ +    sfree(nb->atdat);
+ +
+ +    /* Free nbparam */
+ +    freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
+ +    freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
+ +    freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
+ +    sfree(nb->nbparam);
+ +
+ +    /* Free plist */
+ +    auto *plist = nb->plist[InteractionLocality::Local];
+ +    freeDeviceBuffer(&plist->sci);
+ +    freeDeviceBuffer(&plist->cj4);
+ +    freeDeviceBuffer(&plist->imask);
+ +    freeDeviceBuffer(&plist->excl);
+ +    sfree(plist);
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
+ +        freeDeviceBuffer(&plist_nl->sci);
+ +        freeDeviceBuffer(&plist_nl->cj4);
+ +        freeDeviceBuffer(&plist_nl->imask);
+ +        freeDeviceBuffer(&plist_nl->excl);
+ +        sfree(plist_nl);
+ +    }
+ +
+ +    /* Free nbst */
+ +    pfree(nb->nbst.e_lj);
+ +    nb->nbst.e_lj = nullptr;
+ +
+ +    pfree(nb->nbst.e_el);
+ +    nb->nbst.e_el = nullptr;
+ +
+ +    pfree(nb->nbst.fshift);
+ +    nb->nbst.fshift = nullptr;
+ +
+ +    /* Free command queues */
+ +    clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
+ +    nb->stream[InteractionLocality::Local] = nullptr;
+ +    if (nb->bUseTwoStreams)
+ +    {
+ +        clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
+ +        nb->stream[InteractionLocality::NonLocal] = nullptr;
+ +    }
+ +    /* Free other events */
+ +    if (nb->nonlocal_done)
+ +    {
+ +        clReleaseEvent(nb->nonlocal_done);
+ +        nb->nonlocal_done = nullptr;
+ +    }
+ +    if (nb->misc_ops_and_local_H2D_done)
+ +    {
+ +        clReleaseEvent(nb->misc_ops_and_local_H2D_done);
+ +        nb->misc_ops_and_local_H2D_done = nullptr;
+ +    }
+ +
+ +    free_gpu_device_runtime_data(nb->dev_rundata);
+ +    sfree(nb->dev_rundata);
+ +
+ +    /* Free timers and timings */
+ +    delete nb->timers;
+ +    sfree(nb->timings);
+ +    sfree(nb);
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug, "Cleaned up OpenCL data structures.\n");
+ +    }
+ +}
+ +
+ +//! This function is documented in the header file
+ +gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+ +{
+ +    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
+ +}
+ +
+ +//! This function is documented in the header file
+ +void gpu_reset_timings(nonbonded_verlet_t* nbv)
+ +{
+ +    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
+ +    {
+ +        init_timings(nbv->gpu_nbv->timings);
+ +    }
+ +}
+ +
+ +//! This function is documented in the header file
+ +int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+ +{
+ +    return nb != nullptr ?
+ +           gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
+ +}
+ +
+ +//! This function is documented in the header file
+ +gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+ +{
+ +    return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
+ +            (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
+ +}
+ +
+ +} // namespace Nbnxm
diff --cc src/gromacs/simd/support.cpp

index 8b334bb0b0fdc094696eef6726c028f53f983ec1,d58c4286ffd8f451b4b3edb21c169ed11bea0e41..220b782cdb6cfa3c53563015371689313f068873
--- 1/src/gromacs/simd/support.cpp
--- 2/src/gromacs/simd/support.cpp
+++ b/src/gromacs/simd/support.cpp
@@@ -127,13 -151,14 +151,15 @@@ simdSuggested(const CpuInfo &c
                   }
                   break;
               case CpuInfo::Vendor::Amd:
+ +            case CpuInfo::Vendor::Hygon:
                   if (c.feature(CpuInfo::Feature::X86_Avx2))
                   {
-                     // AMD Ryzen supports 256-bit AVX2, but performs better with 128-bit
+                     // AMD Zen supports 256-bit AVX2, but Zen1 performs better with 128-bit
                       // since it can execute two independent such instructions per cycle,
                       // and wider SIMD has slightly lower efficiency in GROMACS.
-                     suggested = SimdType::X86_Avx2_128;
+                     // However... Zen2 supports full-width execution of 256-bit AVX2,
+                     // so we only want to apply this hack to Zen/Zen+.
+                     suggested = cpuIsAmdZen1(c) ? SimdType::X86_Avx2_128 : SimdType::X86_Avx2;
                   }
                   else if (c.feature(CpuInfo::Feature::X86_Avx))
                   {
author	Paul Bauer <paul.bauer.q@gmail.com>
	Mon, 23 Sep 2019 07:44:00 +0000 (09:44 +0200)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Tue, 24 Sep 2019 14:59:39 +0000 (16:59 +0200)
		1	2
cmake/gmxManageMPI.cmake	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxpreprocess/topshake.cpp	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/listed_forces/bonded.cpp	patch \|	diff1 \|	\|	blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu	patch \|	diff1 \|	\|	blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h	patch \|	diff1 \|	\|	blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp	patch \|	diff1 \|	\|	blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp	patch \|	diff1 \|	\|	blob \| history
src/gromacs/simd/support.cpp	patch \|	diff1 \|	diff2 \|	blob \| history