--- /dev/null
- rvec_inc(fshift[t21], f2_i);
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines low-level functions necessary for
+ * computing energies and forces for listed interactions.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * \ingroup module_listed_forces
+ */
+#include "gmxpre.h"
+
+#include "bonded.h"
+
+#include "config.h"
+
+#include <cassert>
+#include <cmath>
+
+#include <algorithm>
+
+#include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/listed_forces/disre.h"
+#include "gromacs/listed_forces/orires.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pbcutil/pbc_simd.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#include "gromacs/simd/vector_operations.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/enumerationhelpers.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "listed_internal.h"
+#include "restcbt.h"
+
+using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
+
+namespace
+{
+
+//! Type of CPU function to compute a bonded interaction.
+using BondedFunction = real(*)(int nbonds, const t_iatom iatoms[],
+ const t_iparams iparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms *md, t_fcdata *fcd,
+ int *ddgatindex);
+
+/*! \brief Mysterious CMAP coefficient matrix */
+const int cmap_coeff_matrix[] = {
+ 1, 0, -3, 2, 0, 0, 0, 0, -3, 0, 9, -6, 2, 0, -6, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, -9, 6, -2, 0, 6, -4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, -6, 0, 0, -6, 4,
+ 0, 0, 3, -2, 0, 0, 0, 0, 0, 0, -9, 6, 0, 0, 6, -4,
+ 0, 0, 0, 0, 1, 0, -3, 2, -2, 0, 6, -4, 1, 0, -3, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 3, -2, 1, 0, -3, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 2, 0, 0, 3, -2,
+ 0, 0, 0, 0, 0, 0, 3, -2, 0, 0, -6, 4, 0, 0, 3, -2,
+ 0, 1, -2, 1, 0, 0, 0, 0, 0, -3, 6, -3, 0, 2, -4, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -6, 3, 0, -2, 4, -2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, 3, 0, 0, 2, -2,
+ 0, 0, -1, 1, 0, 0, 0, 0, 0, 0, 3, -3, 0, 0, -2, 2,
+ 0, 0, 0, 0, 0, 1, -2, 1, 0, -2, 4, -2, 0, 1, -2, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 2, -1, 0, 1, -2, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 1,
+ 0, 0, 0, 0, 0, 0, -1, 1, 0, 0, 2, -2, 0, 0, -1, 1
+};
+
+
+/*! \brief Compute dx = xi - xj, modulo PBC if non-NULL
+ *
+ * \todo This kind of code appears in many places. Consolidate it */
+int pbc_rvec_sub(const t_pbc *pbc, const rvec xi, const rvec xj, rvec dx)
+{
+ if (pbc)
+ {
+ return pbc_dx_aiuc(pbc, xi, xj, dx);
+ }
+ else
+ {
+ rvec_sub(xi, xj, dx);
+ return CENTRAL;
+ }
+}
+
+} // namespace
+
+//! \cond
+real bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ rvec r_ij, rvec r_kj, real *costh,
+ int *t1, int *t2)
+/* Return value is the angle between the bonds i-j and j-k */
+{
+ /* 41 FLOPS */
+ real th;
+
+ *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */
+ *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */
+
+ *costh = cos_angle(r_ij, r_kj); /* 25 */
+ th = std::acos(*costh); /* 10 */
+ /* 41 TOTAL */
+ return th;
+}
+
+real dih_angle(const rvec xi, const rvec xj, const rvec xk, const rvec xl,
+ const t_pbc *pbc,
+ rvec r_ij, rvec r_kj, rvec r_kl, rvec m, rvec n,
+ int *t1, int *t2, int *t3)
+{
+ *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */
+ *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */
+ *t3 = pbc_rvec_sub(pbc, xk, xl, r_kl); /* 3 */
+
+ cprod(r_ij, r_kj, m); /* 9 */
+ cprod(r_kj, r_kl, n); /* 9 */
+ real phi = gmx_angle(m, n); /* 49 (assuming 25 for atan2) */
+ real ipr = iprod(r_ij, n); /* 5 */
+ real sign = (ipr < 0.0) ? -1.0 : 1.0;
+ phi = sign*phi; /* 1 */
+ /* 82 TOTAL */
+ return phi;
+}
+//! \endcond
+
+void make_dp_periodic(real *dp) /* 1 flop? */
+{
+ /* dp cannot be outside (-pi,pi) */
+ if (*dp >= M_PI)
+ {
+ *dp -= 2*M_PI;
+ }
+ else if (*dp < -M_PI)
+ {
+ *dp += 2*M_PI;
+ }
+}
+
+namespace
+{
+
+/*! \brief Spreads and accumulates the bonded forces to the two atoms and adds the virial contribution when needed
+ *
+ * When \p g==nullptr, \p shiftIndex is used as the periodic shift.
+ * When \p g!=nullptr, the graph is used to compute the periodic shift.
+ */
+template <BondedKernelFlavor flavor>
+inline void spreadBondForces(const real bondForce,
+ const rvec dx,
+ const int ai,
+ const int aj,
+ rvec4 *f,
+ int shiftIndex,
+ const t_graph *g,
+ rvec *fshift)
+{
+ if (computeVirial(flavor) && g)
+ {
+ ivec dt;
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ shiftIndex = IVEC2IS(dt);
+ }
+
+ for (int m = 0; m < DIM; m++) /* 15 */
+ {
+ const real fij = bondForce*dx[m];
+ f[ai][m] += fij;
+ f[aj][m] -= fij;
+ if (computeVirial(flavor))
+ {
+ fshift[shiftIndex][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ }
+}
+
+/*! \brief Morse potential bond
+ *
+ * By Frank Everdij. Three parameters needed:
+ *
+ * b0 = equilibrium distance in nm
+ * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
+ * cb = well depth in kJ/mol
+ *
+ * Note: the potential is referenced to be +cb at infinite separation
+ * and zero at the equilibrium distance!
+ */
+template <BondedKernelFlavor flavor>
+real morse_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const real one = 1.0;
+ const real two = 2.0;
+ real dr, dr2, temp, omtemp, cbomtemp, fbond, vbond, vtot;
+ real b0, be, cb, b0A, beA, cbA, b0B, beB, cbB, L1;
+ rvec dx;
+ int i, ki, type, ai, aj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ b0A = forceparams[type].morse.b0A;
+ beA = forceparams[type].morse.betaA;
+ cbA = forceparams[type].morse.cbA;
+
+ b0B = forceparams[type].morse.b0B;
+ beB = forceparams[type].morse.betaB;
+ cbB = forceparams[type].morse.cbB;
+
+ L1 = one-lambda; /* 1 */
+ b0 = L1*b0A + lambda*b0B; /* 3 */
+ be = L1*beA + lambda*beB; /* 3 */
+ cb = L1*cbA + lambda*cbB; /* 3 */
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx::invsqrt(dr2); /* 10 */
+ temp = std::exp(-be*(dr-b0)); /* 12 */
+
+ if (temp == one)
+ {
+ /* bonds are constrainted. This may _not_ include bond constraints if they are lambda dependent */
+ *dvdlambda += cbB-cbA;
+ continue;
+ }
+
+ omtemp = one-temp; /* 1 */
+ cbomtemp = cb*omtemp; /* 1 */
+ vbond = cbomtemp*omtemp; /* 1 */
+ fbond = -two*be*temp*cbomtemp*gmx::invsqrt(dr2); /* 9 */
+ vtot += vbond; /* 1 */
+
+ *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 83 TOTAL */
+ return vtot;
+}
+
+//! \cond
+template <BondedKernelFlavor flavor>
+real cubic_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const real three = 3.0;
+ const real two = 2.0;
+ real kb, b0, kcub;
+ real dr, dr2, dist, kdist, kdist2, fbond, vbond, vtot;
+ rvec dx;
+ int i, ki, type, ai, aj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ b0 = forceparams[type].cubic.b0;
+ kb = forceparams[type].cubic.kb;
+ kcub = forceparams[type].cubic.kcub;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ dr = dr2*gmx::invsqrt(dr2); /* 10 */
+ dist = dr-b0;
+ kdist = kb*dist;
+ kdist2 = kdist*dist;
+
+ vbond = kdist2 + kcub*kdist2*dist;
+ fbond = -(two*kdist + three*kdist2*kcub)/dr;
+
+ vtot += vbond; /* 21 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 54 TOTAL */
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real FENE_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int *global_atom_index)
+{
+ const real half = 0.5;
+ const real one = 1.0;
+ real bm, kb;
+ real dr2, bm2, omdr2obm2, fbond, vbond, vtot;
+ rvec dx;
+ int i, ki, type, ai, aj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ bm = forceparams[type].fene.bm;
+ kb = forceparams[type].fene.kb;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ bm2 = bm*bm;
+
+ if (dr2 >= bm2)
+ {
+ gmx_fatal(FARGS,
+ "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d",
+ dr2, bm2,
+ glatnr(global_atom_index, ai),
+ glatnr(global_atom_index, aj));
+ }
+
+ omdr2obm2 = one - dr2/bm2;
+
+ vbond = -half*kb*bm2*std::log(omdr2obm2);
+ fbond = -kb/omdr2obm2;
+
+ vtot += vbond; /* 35 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 58 TOTAL */
+ return vtot;
+}
+
+real harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ real *V, real *F)
+{
+ const real half = 0.5;
+ real L1, kk, x0, dx, dx2;
+ real v, f, dvdlambda;
+
+ L1 = 1.0-lambda;
+ kk = L1*kA+lambda*kB;
+ x0 = L1*xA+lambda*xB;
+
+ dx = x-x0;
+ dx2 = dx*dx;
+
+ f = -kk*dx;
+ v = half*kk*dx2;
+ dvdlambda = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
+
+ *F = f;
+ *V = v;
+
+ return dvdlambda;
+
+ /* That was 19 flops */
+}
+
+
+template <BondedKernelFlavor flavor>
+real bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, vtot;
+ rvec dx;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = std::sqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA,
+ forceparams[type].harmonic.rB,
+ dr, lambda, &vbond, &fbond); /* 19 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx::invsqrt(dr2); /* 6 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 59 TOTAL */
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real restraint_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, vtot;
+ real L1;
+ real low, dlow, up1, dup1, up2, dup2, k, dk;
+ real drh, drh2;
+ rvec dx;
+
+ L1 = 1.0 - lambda;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx::invsqrt(dr2); /* 10 */
+
+ low = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB;
+ dlow = -forceparams[type].restraint.lowA + forceparams[type].restraint.lowB;
+ up1 = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B;
+ dup1 = -forceparams[type].restraint.up1A + forceparams[type].restraint.up1B;
+ up2 = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B;
+ dup2 = -forceparams[type].restraint.up2A + forceparams[type].restraint.up2B;
+ k = L1*forceparams[type].restraint.kA + lambda*forceparams[type].restraint.kB;
+ dk = -forceparams[type].restraint.kA + forceparams[type].restraint.kB;
+ /* 24 */
+
+ if (dr < low)
+ {
+ drh = dr - low;
+ drh2 = drh*drh;
+ vbond = 0.5*k*drh2;
+ fbond = -k*drh;
+ *dvdlambda += 0.5*dk*drh2 - k*dlow*drh;
+ } /* 11 */
+ else if (dr <= up1)
+ {
+ vbond = 0;
+ fbond = 0;
+ }
+ else if (dr <= up2)
+ {
+ drh = dr - up1;
+ drh2 = drh*drh;
+ vbond = 0.5*k*drh2;
+ fbond = -k*drh;
+ *dvdlambda += 0.5*dk*drh2 - k*dup1*drh;
+ } /* 11 */
+ else
+ {
+ drh = dr - up2;
+ vbond = k*(up2 - up1)*(0.5*(up2 - up1) + drh);
+ fbond = -k*(up2 - up1);
+ *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh)
+ + k*(dup2 - dup1)*(up2 - up1 + drh)
+ - k*(up2 - up1)*dup2;
+ }
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx::invsqrt(dr2); /* 6 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 59 TOTAL */
+
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real polarize(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, vtot, ksh;
+ rvec dx;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ksh = gmx::square(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].polarize.alpha;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = std::sqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /* 19 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx::invsqrt(dr2); /* 6 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 59 TOTAL */
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real anharm_polarize(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ki, ai, aj, type;
+ real dr, dr2, fbond, vbond, vtot, ksh, khyp, drcut, ddr, ddr3;
+ rvec dx;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ksh = gmx::square(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].anharm_polarize.alpha; /* 7*/
+ khyp = forceparams[type].anharm_polarize.khyp;
+ drcut = forceparams[type].anharm_polarize.drcut;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx::invsqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(ksh, ksh, 0, 0, dr, lambda, &vbond, &fbond); /* 19 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ if (dr > drcut)
+ {
+ ddr = dr-drcut;
+ ddr3 = ddr*ddr*ddr;
+ vbond += khyp*ddr*ddr3;
+ fbond -= 4*khyp*ddr3;
+ }
+ fbond *= gmx::invsqrt(dr2); /* 6 */
+ vtot += vbond; /* 1*/
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 72 TOTAL */
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real water_pol(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* This routine implements anisotropic polarizibility for water, through
+ * a shell connected to a dummy with spring constant that differ in the
+ * three spatial dimensions in the molecular frame.
+ */
+ int i, m, aO, aH1, aH2, aD, aS, type, type0, ki;
+ ivec dt;
+ rvec dOH1, dOH2, dHH, dOD, dDS, nW, kk, dx, kdx, proj;
+ real vtot, fij, r_HH, r_OD, r_nW, tx, ty, tz, qS;
+
+ vtot = 0.0;
+ if (nbonds > 0)
+ {
+ type0 = forceatoms[0];
+ aS = forceatoms[5];
+ qS = md->chargeA[aS];
+ kk[XX] = gmx::square(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x;
+ kk[YY] = gmx::square(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
+ kk[ZZ] = gmx::square(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
+ r_HH = 1.0/forceparams[type0].wpol.rHH;
+ for (i = 0; (i < nbonds); i += 6)
+ {
+ type = forceatoms[i];
+ if (type != type0)
+ {
+ gmx_fatal(FARGS, "Sorry, type = %d, type0 = %d, file = %s, line = %d",
+ type, type0, __FILE__, __LINE__);
+ }
+ aO = forceatoms[i+1];
+ aH1 = forceatoms[i+2];
+ aH2 = forceatoms[i+3];
+ aD = forceatoms[i+4];
+ aS = forceatoms[i+5];
+
+ /* Compute vectors describing the water frame */
+ pbc_rvec_sub(pbc, x[aH1], x[aO], dOH1);
+ pbc_rvec_sub(pbc, x[aH2], x[aO], dOH2);
+ pbc_rvec_sub(pbc, x[aH2], x[aH1], dHH);
+ pbc_rvec_sub(pbc, x[aD], x[aO], dOD);
+ ki = pbc_rvec_sub(pbc, x[aS], x[aD], dDS);
+ cprod(dOH1, dOH2, nW);
+
+ /* Compute inverse length of normal vector
+ * (this one could be precomputed, but I'm too lazy now)
+ */
+ r_nW = gmx::invsqrt(iprod(nW, nW));
+ /* This is for precision, but does not make a big difference,
+ * it can go later.
+ */
+ r_OD = gmx::invsqrt(iprod(dOD, dOD));
+
+ /* Normalize the vectors in the water frame */
+ svmul(r_nW, nW, nW);
+ svmul(r_HH, dHH, dHH);
+ svmul(r_OD, dOD, dOD);
+
+ /* Compute displacement of shell along components of the vector */
+ dx[ZZ] = iprod(dDS, dOD);
+ /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */
+ for (m = 0; (m < DIM); m++)
+ {
+ proj[m] = dDS[m]-dx[ZZ]*dOD[m];
+ }
+
+ /*dx[XX] = iprod(dDS,nW);
+ dx[YY] = iprod(dDS,dHH);*/
+ dx[XX] = iprod(proj, nW);
+ for (m = 0; (m < DIM); m++)
+ {
+ proj[m] -= dx[XX]*nW[m];
+ }
+ dx[YY] = iprod(proj, dHH);
+ /* Now compute the forces and energy */
+ kdx[XX] = kk[XX]*dx[XX];
+ kdx[YY] = kk[YY]*dx[YY];
+ kdx[ZZ] = kk[ZZ]*dx[ZZ];
+ vtot += iprod(dx, kdx);
+
+ if (computeVirial(flavor) && g)
+ {
+ ivec_sub(SHIFT_IVEC(g, aS), SHIFT_IVEC(g, aD), dt);
+ ki = IVEC2IS(dt);
+ }
+
+ for (m = 0; (m < DIM); m++)
+ {
+ /* This is a tensor operation but written out for speed */
+ tx = nW[m]*kdx[XX];
+ ty = dHH[m]*kdx[YY];
+ tz = dOD[m]*kdx[ZZ];
+ fij = -tx-ty-tz;
+ f[aS][m] += fij;
+ f[aD][m] -= fij;
+ if (computeVirial(flavor))
+ {
+ fshift[ki][m] += fij;
+ fshift[CENTRAL][m] -= fij;
+ }
+ }
+ }
+ }
+ return 0.5*vtot;
+}
+
+template <BondedKernelFlavor flavor>
+static real do_1_thole(const rvec xi, const rvec xj, rvec fi, rvec fj,
+ const t_pbc *pbc, real qq,
+ rvec fshift[], real afac)
+{
+ rvec r12;
+ real r12sq, r12_1, r12bar, v0, v1, fscal, ebar, fff;
+ int m, t;
+
+ t = pbc_rvec_sub(pbc, xi, xj, r12); /* 3 */
+
+ r12sq = iprod(r12, r12); /* 5 */
+ r12_1 = gmx::invsqrt(r12sq); /* 5 */
+ r12bar = afac/r12_1; /* 5 */
+ v0 = qq*ONE_4PI_EPS0*r12_1; /* 2 */
+ ebar = std::exp(-r12bar); /* 5 */
+ v1 = (1-(1+0.5*r12bar)*ebar); /* 4 */
+ fscal = ((v0*r12_1)*v1 - v0*0.5*afac*ebar*(r12bar+1))*r12_1; /* 9 */
+
+ for (m = 0; (m < DIM); m++)
+ {
+ fff = fscal*r12[m];
+ fi[m] += fff;
+ fj[m] -= fff;
+ if (computeVirial(flavor))
+ {
+ fshift[t][m] += fff;
+ fshift[CENTRAL][m] -= fff;
+ }
+ } /* 15 */
+
+ return v0*v1; /* 1 */
+ /* 54 */
+}
+
+template <BondedKernelFlavor flavor>
+real thole_pol(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* Interaction between two pairs of particles with opposite charge */
+ int i, type, a1, da1, a2, da2;
+ real q1, q2, qq, a, al1, al2, afac;
+ real V = 0;
+
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ a1 = forceatoms[i++];
+ da1 = forceatoms[i++];
+ a2 = forceatoms[i++];
+ da2 = forceatoms[i++];
+ q1 = md->chargeA[da1];
+ q2 = md->chargeA[da2];
+ a = forceparams[type].thole.a;
+ al1 = forceparams[type].thole.alpha1;
+ al2 = forceparams[type].thole.alpha2;
+ qq = q1*q2;
+ afac = a*gmx::invsixthroot(al1*al2);
+ V += do_1_thole<flavor>(x[a1], x[a2], f[a1], f[a2], pbc, qq, fshift, afac);
+ V += do_1_thole<flavor>(x[da1], x[a2], f[da1], f[a2], pbc, -qq, fshift, afac);
+ V += do_1_thole<flavor>(x[a1], x[da2], f[a1], f[da2], pbc, -qq, fshift, afac);
+ V += do_1_thole<flavor>(x[da1], x[da2], f[da1], f[da2], pbc, qq, fshift, afac);
+ }
+ /* 290 flops */
+ return V;
+}
+
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ai, aj, ak, t1, t2, type;
+ rvec r_ij, r_kj;
+ real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; i < nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ *dvdlambda += harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA*DEG2RAD,
+ forceparams[type].harmonic.rB*DEG2RAD,
+ theta, lambda, &va, &dVdt); /* 21 */
+ vtot += va;
+
+ cos_theta2 = gmx::square(cos_theta);
+ if (cos_theta2 < 1)
+ {
+ int m;
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ real nrkj_1, nrij_1;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx::invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+ nrij2 = iprod(r_ij, r_ij); /* 5 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+
+ nrij_1 = gmx::invsqrt(nrij2); /* 10 */
+ nrkj_1 = gmx::invsqrt(nrkj2); /* 10 */
+
+ cik = st*nrij_1*nrkj_1; /* 2 */
+ cii = sth*nrij_1*nrij_1; /* 2 */
+ ckk = sth*nrkj_1*nrkj_1; /* 2 */
+
+ for (m = 0; m < DIM; m++)
+ { /* 39 */
+ f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
+ f_j[m] = -f_i[m] - f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ if (computeVirial(flavor))
+ {
+ if (g != nullptr)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ }
+ } /* 161 TOTAL */
+ }
+
+ return vtot;
+}
+
+#if GMX_SIMD_HAVE_REAL
+
+/* As angles, but using SIMD to calculate many angles at once.
+ * This routines does not calculate energies and shift forces.
+ */
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const int nfa1 = 4;
+ int i, iu, s;
+ int type;
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ai[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t aj[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ak[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) real coeff[2*GMX_SIMD_REAL_WIDTH];
+ SimdReal deg2rad_S(DEG2RAD);
+ SimdReal xi_S, yi_S, zi_S;
+ SimdReal xj_S, yj_S, zj_S;
+ SimdReal xk_S, yk_S, zk_S;
+ SimdReal k_S, theta0_S;
+ SimdReal rijx_S, rijy_S, rijz_S;
+ SimdReal rkjx_S, rkjy_S, rkjz_S;
+ SimdReal one_S(1.0);
+ SimdReal min_one_plus_eps_S(-1.0 + 2.0*GMX_REAL_EPS); // Smallest number > -1
+
+ SimdReal rij_rkj_S;
+ SimdReal nrij2_S, nrij_1_S;
+ SimdReal nrkj2_S, nrkj_1_S;
+ SimdReal cos_S, invsin_S;
+ SimdReal theta_S;
+ SimdReal st_S, sth_S;
+ SimdReal cik_S, cii_S, ckk_S;
+ SimdReal f_ix_S, f_iy_S, f_iz_S;
+ SimdReal f_kx_S, f_ky_S, f_kz_S;
+ alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+
+ set_pbc_simd(pbc, pbc_simd);
+
+ /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+ for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ {
+ /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
+ * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+
+ /* At the end fill the arrays with the last atoms and 0 params */
+ if (i + s*nfa1 < nbonds)
+ {
+ coeff[s] = forceparams[type].harmonic.krA;
+ coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].harmonic.rA;
+
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+ else
+ {
+ coeff[s] = 0;
+ coeff[GMX_SIMD_REAL_WIDTH+s] = 0;
+ }
+ }
+
+ /* Store the non PBC corrected distances packed and aligned */
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ai, &xi_S, &yi_S, &zi_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), aj, &xj_S, &yj_S, &zj_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ak, &xk_S, &yk_S, &zk_S);
+ rijx_S = xi_S - xj_S;
+ rijy_S = yi_S - yj_S;
+ rijz_S = zi_S - zj_S;
+ rkjx_S = xk_S - xj_S;
+ rkjy_S = yk_S - yj_S;
+ rkjz_S = zk_S - zj_S;
+
+ k_S = load<SimdReal>(coeff);
+ theta0_S = load<SimdReal>(coeff+GMX_SIMD_REAL_WIDTH) * deg2rad_S;
+
+ pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd);
+ pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd);
+
+ rij_rkj_S = iprod(rijx_S, rijy_S, rijz_S,
+ rkjx_S, rkjy_S, rkjz_S);
+
+ nrij2_S = norm2(rijx_S, rijy_S, rijz_S);
+ nrkj2_S = norm2(rkjx_S, rkjy_S, rkjz_S);
+
+ nrij_1_S = invsqrt(nrij2_S);
+ nrkj_1_S = invsqrt(nrkj2_S);
+
+ cos_S = rij_rkj_S * nrij_1_S * nrkj_1_S;
+
+ /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
+ * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
+ * This also ensures that rounding errors would cause the argument
+ * of simdAcos to be < -1.
+ * Note that we do not take precautions for cos(0)=1, so the outer
+ * atoms in an angle should not be on top of each other.
+ */
+ cos_S = max(cos_S, min_one_plus_eps_S);
+
+ theta_S = acos(cos_S);
+
+ invsin_S = invsqrt( one_S - cos_S * cos_S );
+
+ st_S = k_S * (theta0_S - theta_S) * invsin_S;
+ sth_S = st_S * cos_S;
+
+ cik_S = st_S * nrij_1_S * nrkj_1_S;
+ cii_S = sth_S * nrij_1_S * nrij_1_S;
+ ckk_S = sth_S * nrkj_1_S * nrkj_1_S;
+
+ f_ix_S = cii_S * rijx_S;
+ f_ix_S = fnma(cik_S, rkjx_S, f_ix_S);
+ f_iy_S = cii_S * rijy_S;
+ f_iy_S = fnma(cik_S, rkjy_S, f_iy_S);
+ f_iz_S = cii_S * rijz_S;
+ f_iz_S = fnma(cik_S, rkjz_S, f_iz_S);
+ f_kx_S = ckk_S * rkjx_S;
+ f_kx_S = fnma(cik_S, rijx_S, f_kx_S);
+ f_ky_S = ckk_S * rkjy_S;
+ f_ky_S = fnma(cik_S, rijy_S, f_ky_S);
+ f_kz_S = ckk_S * rkjz_S;
+ f_kz_S = fnma(cik_S, rijz_S, f_kz_S);
+
+ transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ai, f_ix_S, f_iy_S, f_iz_S);
+ transposeScatterDecrU<4>(reinterpret_cast<real *>(f), aj, f_ix_S + f_kx_S, f_iy_S + f_ky_S, f_iz_S + f_kz_S);
+ transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ak, f_kx_S, f_ky_S, f_kz_S);
+ }
+
+ return 0;
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+template <BondedKernelFlavor flavor>
+real linear_angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ai, aj, ak, t1, t2, type;
+ rvec f_i, f_j, f_k;
+ real L1, kA, kB, aA, aB, dr, dr2, va, vtot, a, b, klin;
+ ivec jt, dt_ij, dt_kj;
+ rvec r_ij, r_kj, r_ik, dx;
+
+ L1 = 1-lambda;
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ kA = forceparams[type].linangle.klinA;
+ kB = forceparams[type].linangle.klinB;
+ klin = L1*kA + lambda*kB;
+
+ aA = forceparams[type].linangle.aA;
+ aB = forceparams[type].linangle.aB;
+ a = L1*aA+lambda*aB;
+ b = 1-a;
+
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ rvec_sub(r_ij, r_kj, r_ik);
+
+ dr2 = 0;
+ for (m = 0; (m < DIM); m++)
+ {
+ dr = -a * r_ij[m] - b * r_kj[m];
+ dr2 += dr*dr;
+ dx[m] = dr;
+ f_i[m] = a*klin*dr;
+ f_k[m] = b*klin*dr;
+ f_j[m] = -(f_i[m]+f_k[m]);
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ va = 0.5*klin*dr2;
+ *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx, r_ik);
+
+ vtot += va;
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ }
+ } /* 57 TOTAL */
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+urey_bradley(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, m, ai, aj, ak, t1, t2, type, ki;
+ rvec r_ij, r_kj, r_ik;
+ real cos_theta, cos_theta2, theta;
+ real dVdt, va, vtot, dr, dr2, vbond, fbond, fik;
+ real kthA, th0A, kUBA, r13A, kthB, th0B, kUBB, r13B;
+ ivec jt, dt_ij, dt_kj, dt_ik;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ th0A = forceparams[type].u_b.thetaA*DEG2RAD;
+ kthA = forceparams[type].u_b.kthetaA;
+ r13A = forceparams[type].u_b.r13A;
+ kUBA = forceparams[type].u_b.kUBA;
+ th0B = forceparams[type].u_b.thetaB*DEG2RAD;
+ kthB = forceparams[type].u_b.kthetaB;
+ r13B = forceparams[type].u_b.r13B;
+ kUBB = forceparams[type].u_b.kUBB;
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ *dvdlambda += harmonic(kthA, kthB, th0A, th0B, theta, lambda, &va, &dVdt); /* 21 */
+ vtot += va;
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[ak], r_ik); /* 3 */
+ dr2 = iprod(r_ik, r_ik); /* 5 */
+ dr = dr2*gmx::invsqrt(dr2); /* 10 */
+
+ *dvdlambda += harmonic(kUBA, kUBB, r13A, r13B, dr, lambda, &vbond, &fbond); /* 19 */
+
+ cos_theta2 = gmx::square(cos_theta); /* 1 */
+ if (cos_theta2 < 1)
+ {
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx::invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ nrij2 = iprod(r_ij, r_ij);
+
+ cik = st*gmx::invsqrt(nrkj2*nrij2); /* 12 */
+ cii = sth/nrij2; /* 10 */
+ ckk = sth/nrkj2; /* 10 */
+
+ for (m = 0; (m < DIM); m++) /* 39 */
+ {
+ f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ }
+ } /* 161 TOTAL */
+ /* Time for the bond calculations */
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx::invsqrt(dr2); /* 6 */
+
+ if (computeVirial(flavor) && g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, ak), dt_ik);
+ ki = IVEC2IS(dt_ik);
+ }
+ for (m = 0; (m < DIM); m++) /* 15 */
+ {
+ fik = fbond*r_ik[m];
+ f[ai][m] += fik;
+ f[ak][m] -= fik;
+ if (computeVirial(flavor))
+ {
+ fshift[ki][m] += fik;
+ fshift[CENTRAL][m] -= fik;
+ }
+ }
+ }
+ return vtot;
+}
+
+#if GMX_SIMD_HAVE_REAL
+
+/* As urey_bradley, but using SIMD to calculate many potentials at once.
+ * This routines does not calculate energies and shift forces.
+ */
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+urey_bradley(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ constexpr int nfa1 = 4;
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ai[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t aj[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ak[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) real coeff[4*GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+
+ set_pbc_simd(pbc, pbc_simd);
+
+ /* nbonds is the number of angles times nfa1, here we step GMX_SIMD_REAL_WIDTH angles */
+ for (int i = 0; i < nbonds; i += GMX_SIMD_REAL_WIDTH*nfa1)
+ {
+ /* Collect atoms for GMX_SIMD_REAL_WIDTH angles.
+ * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ */
+ int iu = i;
+ for (int s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ {
+ const int type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+
+ /* At the end fill the arrays with the last atoms and 0 params */
+ if (i + s*nfa1 < nbonds)
+ {
+ coeff[s] = forceparams[type].u_b.kthetaA;
+ coeff[GMX_SIMD_REAL_WIDTH+s] = forceparams[type].u_b.thetaA;
+ coeff[GMX_SIMD_REAL_WIDTH*2+s] = forceparams[type].u_b.kUBA;
+ coeff[GMX_SIMD_REAL_WIDTH*3+s] = forceparams[type].u_b.r13A;
+
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+ else
+ {
+ coeff[s] = 0;
+ coeff[GMX_SIMD_REAL_WIDTH+s] = 0;
+ coeff[GMX_SIMD_REAL_WIDTH*2+s] = 0;
+ coeff[GMX_SIMD_REAL_WIDTH*3+s] = 0;
+ }
+ }
+
+ SimdReal xi_S, yi_S, zi_S;
+ SimdReal xj_S, yj_S, zj_S;
+ SimdReal xk_S, yk_S, zk_S;
+
+ /* Store the non PBC corrected distances packed and aligned */
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ai, &xi_S, &yi_S, &zi_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), aj, &xj_S, &yj_S, &zj_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ak, &xk_S, &yk_S, &zk_S);
+ SimdReal rijx_S = xi_S - xj_S;
+ SimdReal rijy_S = yi_S - yj_S;
+ SimdReal rijz_S = zi_S - zj_S;
+ SimdReal rkjx_S = xk_S - xj_S;
+ SimdReal rkjy_S = yk_S - yj_S;
+ SimdReal rkjz_S = zk_S - zj_S;
+ SimdReal rikx_S = xi_S - xk_S;
+ SimdReal riky_S = yi_S - yk_S;
+ SimdReal rikz_S = zi_S - zk_S;
+
+ const SimdReal ktheta_S = load<SimdReal>(coeff);
+ const SimdReal theta0_S = load<SimdReal>(coeff+GMX_SIMD_REAL_WIDTH) * DEG2RAD;
+ const SimdReal kUB_S = load<SimdReal>(coeff+2*GMX_SIMD_REAL_WIDTH);
+ const SimdReal r13_S = load<SimdReal>(coeff+3*GMX_SIMD_REAL_WIDTH);
+
+ pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd);
+ pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd);
+ pbc_correct_dx_simd(&rikx_S, &riky_S, &rikz_S, pbc_simd);
+
+ const SimdReal rij_rkj_S = iprod(rijx_S, rijy_S, rijz_S,
+ rkjx_S, rkjy_S, rkjz_S);
+
+ const SimdReal dr2_S = iprod(rikx_S, riky_S, rikz_S,
+ rikx_S, riky_S, rikz_S);
+
+ const SimdReal nrij2_S = norm2(rijx_S, rijy_S, rijz_S);
+ const SimdReal nrkj2_S = norm2(rkjx_S, rkjy_S, rkjz_S);
+
+ const SimdReal nrij_1_S = invsqrt(nrij2_S);
+ const SimdReal nrkj_1_S = invsqrt(nrkj2_S);
+ const SimdReal invdr2_S = invsqrt(dr2_S);
+ const SimdReal dr_S = dr2_S*invdr2_S;
+
+ constexpr real min_one_plus_eps = -1.0 + 2.0*GMX_REAL_EPS; // Smallest number > -1
+
+ /* To allow for 180 degrees, we take the max of cos and -1 + 1bit,
+ * so we can safely get the 1/sin from 1/sqrt(1 - cos^2).
+ * This also ensures that rounding errors would cause the argument
+ * of simdAcos to be < -1.
+ * Note that we do not take precautions for cos(0)=1, so the bonds
+ * in an angle should not align at an angle of 0 degrees.
+ */
+ const SimdReal cos_S = max(rij_rkj_S * nrij_1_S * nrkj_1_S, min_one_plus_eps);
+
+ const SimdReal theta_S = acos(cos_S);
+ const SimdReal invsin_S = invsqrt( 1.0 - cos_S * cos_S );
+ const SimdReal st_S = ktheta_S * (theta0_S - theta_S) * invsin_S;
+ const SimdReal sth_S = st_S * cos_S;
+
+ const SimdReal cik_S = st_S * nrij_1_S * nrkj_1_S;
+ const SimdReal cii_S = sth_S * nrij_1_S * nrij_1_S;
+ const SimdReal ckk_S = sth_S * nrkj_1_S * nrkj_1_S;
+
+ const SimdReal sUB_S = kUB_S * (r13_S - dr_S) * invdr2_S;
+
+ const SimdReal f_ikx_S = sUB_S * rikx_S;
+ const SimdReal f_iky_S = sUB_S * riky_S;
+ const SimdReal f_ikz_S = sUB_S * rikz_S;
+
+ const SimdReal f_ix_S = fnma(cik_S, rkjx_S, cii_S * rijx_S) + f_ikx_S;
+ const SimdReal f_iy_S = fnma(cik_S, rkjy_S, cii_S * rijy_S) + f_iky_S;
+ const SimdReal f_iz_S = fnma(cik_S, rkjz_S, cii_S * rijz_S) + f_ikz_S;
+ const SimdReal f_kx_S = fnma(cik_S, rijx_S, ckk_S * rkjx_S) - f_ikx_S;
+ const SimdReal f_ky_S = fnma(cik_S, rijy_S, ckk_S * rkjy_S) - f_iky_S;
+ const SimdReal f_kz_S = fnma(cik_S, rijz_S, ckk_S * rkjz_S) - f_ikz_S;
+
+ transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ai, f_ix_S, f_iy_S, f_iz_S);
+ transposeScatterDecrU<4>(reinterpret_cast<real *>(f), aj, f_ix_S + f_kx_S, f_iy_S + f_ky_S, f_iz_S + f_kz_S);
+ transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ak, f_kx_S, f_ky_S, f_kz_S);
+ }
+
+ return 0;
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+template <BondedKernelFlavor flavor>
+real quartic_angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, j, ai, aj, ak, t1, t2, type;
+ rvec r_ij, r_kj;
+ real cos_theta, cos_theta2, theta, dt, dVdt, va, dtp, c, vtot;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ dt = theta - forceparams[type].qangle.theta*DEG2RAD; /* 2 */
+
+ dVdt = 0;
+ va = forceparams[type].qangle.c[0];
+ dtp = 1.0;
+ for (j = 1; j <= 4; j++)
+ {
+ c = forceparams[type].qangle.c[j];
+ dVdt -= j*c*dtp;
+ dtp *= dt;
+ va += c*dtp;
+ }
+ /* 20 */
+
+ vtot += va;
+
+ cos_theta2 = gmx::square(cos_theta); /* 1 */
+ if (cos_theta2 < 1)
+ {
+ int m;
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx::invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ nrij2 = iprod(r_ij, r_ij);
+
+ cik = st*gmx::invsqrt(nrkj2*nrij2); /* 12 */
+ cii = sth/nrij2; /* 10 */
+ ckk = sth/nrkj2; /* 10 */
+
+ for (m = 0; (m < DIM); m++) /* 39 */
+ {
+ f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ }
+ } /* 153 TOTAL */
+ }
+ return vtot;
+}
+
+
+#if GMX_SIMD_HAVE_REAL
+
+/* As dih_angle above, but calculates 4 dihedral angles at once using SIMD,
+ * also calculates the pre-factor required for the dihedral force update.
+ * Note that bv and buf should be register aligned.
+ */
+inline void
+dih_angle_simd(const rvec *x,
+ const int *ai, const int *aj, const int *ak, const int *al,
+ const real *pbc_simd,
+ SimdReal *phi_S,
+ SimdReal *mx_S, SimdReal *my_S, SimdReal *mz_S,
+ SimdReal *nx_S, SimdReal *ny_S, SimdReal *nz_S,
+ SimdReal *nrkj_m2_S,
+ SimdReal *nrkj_n2_S,
+ SimdReal *p_S,
+ SimdReal *q_S)
+{
+ SimdReal xi_S, yi_S, zi_S;
+ SimdReal xj_S, yj_S, zj_S;
+ SimdReal xk_S, yk_S, zk_S;
+ SimdReal xl_S, yl_S, zl_S;
+ SimdReal rijx_S, rijy_S, rijz_S;
+ SimdReal rkjx_S, rkjy_S, rkjz_S;
+ SimdReal rklx_S, rkly_S, rklz_S;
+ SimdReal cx_S, cy_S, cz_S;
+ SimdReal cn_S;
+ SimdReal s_S;
+ SimdReal ipr_S;
+ SimdReal iprm_S, iprn_S;
+ SimdReal nrkj2_S, nrkj_1_S, nrkj_2_S, nrkj_S;
+ SimdReal toler_S;
+ SimdReal nrkj2_min_S;
+ SimdReal real_eps_S;
+
+ /* Used to avoid division by zero.
+ * We take into acount that we multiply the result by real_eps_S.
+ */
+ nrkj2_min_S = SimdReal(GMX_REAL_MIN/(2*GMX_REAL_EPS));
+
+ /* The value of the last significant bit (GMX_REAL_EPS is half of that) */
+ real_eps_S = SimdReal(2*GMX_REAL_EPS);
+
+ /* Store the non PBC corrected distances packed and aligned */
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ai, &xi_S, &yi_S, &zi_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), aj, &xj_S, &yj_S, &zj_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), ak, &xk_S, &yk_S, &zk_S);
+ gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), al, &xl_S, &yl_S, &zl_S);
+ rijx_S = xi_S - xj_S;
+ rijy_S = yi_S - yj_S;
+ rijz_S = zi_S - zj_S;
+ rkjx_S = xk_S - xj_S;
+ rkjy_S = yk_S - yj_S;
+ rkjz_S = zk_S - zj_S;
+ rklx_S = xk_S - xl_S;
+ rkly_S = yk_S - yl_S;
+ rklz_S = zk_S - zl_S;
+
+ pbc_correct_dx_simd(&rijx_S, &rijy_S, &rijz_S, pbc_simd);
+ pbc_correct_dx_simd(&rkjx_S, &rkjy_S, &rkjz_S, pbc_simd);
+ pbc_correct_dx_simd(&rklx_S, &rkly_S, &rklz_S, pbc_simd);
+
+ cprod(rijx_S, rijy_S, rijz_S,
+ rkjx_S, rkjy_S, rkjz_S,
+ mx_S, my_S, mz_S);
+
+ cprod(rkjx_S, rkjy_S, rkjz_S,
+ rklx_S, rkly_S, rklz_S,
+ nx_S, ny_S, nz_S);
+
+ cprod(*mx_S, *my_S, *mz_S,
+ *nx_S, *ny_S, *nz_S,
+ &cx_S, &cy_S, &cz_S);
+
+ cn_S = sqrt(norm2(cx_S, cy_S, cz_S));
+
+ s_S = iprod(*mx_S, *my_S, *mz_S, *nx_S, *ny_S, *nz_S);
+
+ /* Determine the dihedral angle, the sign might need correction */
+ *phi_S = atan2(cn_S, s_S);
+
+ ipr_S = iprod(rijx_S, rijy_S, rijz_S,
+ *nx_S, *ny_S, *nz_S);
+
+ iprm_S = norm2(*mx_S, *my_S, *mz_S);
+ iprn_S = norm2(*nx_S, *ny_S, *nz_S);
+
+ nrkj2_S = norm2(rkjx_S, rkjy_S, rkjz_S);
+
+ /* Avoid division by zero. When zero, the result is multiplied by 0
+ * anyhow, so the 3 max below do not affect the final result.
+ */
+ nrkj2_S = max(nrkj2_S, nrkj2_min_S);
+ nrkj_1_S = invsqrt(nrkj2_S);
+ nrkj_2_S = nrkj_1_S * nrkj_1_S;
+ nrkj_S = nrkj2_S * nrkj_1_S;
+
+ toler_S = nrkj2_S * real_eps_S;
+
+ /* Here the plain-C code uses a conditional, but we can't do that in SIMD.
+ * So we take a max with the tolerance instead. Since we multiply with
+ * m or n later, the max does not affect the results.
+ */
+ iprm_S = max(iprm_S, toler_S);
+ iprn_S = max(iprn_S, toler_S);
+ *nrkj_m2_S = nrkj_S * inv(iprm_S);
+ *nrkj_n2_S = nrkj_S * inv(iprn_S);
+
+ /* Set sign of phi_S with the sign of ipr_S; phi_S is currently positive */
+ *phi_S = copysign(*phi_S, ipr_S);
+ *p_S = iprod(rijx_S, rijy_S, rijz_S, rkjx_S, rkjy_S, rkjz_S);
+ *p_S = *p_S * nrkj_2_S;
+
+ *q_S = iprod(rklx_S, rkly_S, rklz_S, rkjx_S, rkjy_S, rkjz_S);
+ *q_S = *q_S * nrkj_2_S;
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+} // namespace
+
+template <BondedKernelFlavor flavor>
+void do_dih_fup(int i, int j, int k, int l, real ddphi,
+ rvec r_ij, rvec r_kj, rvec r_kl,
+ rvec m, rvec n, rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ const rvec x[], int t1, int t2, int t3)
+{
+ /* 143 FLOPS */
+ rvec f_i, f_j, f_k, f_l;
+ rvec uvec, vvec, svec, dx_jl;
+ real iprm, iprn, nrkj, nrkj2, nrkj_1, nrkj_2;
+ real a, b, p, q, toler;
+ ivec jt, dt_ij, dt_kj, dt_lj;
+
+ iprm = iprod(m, m); /* 5 */
+ iprn = iprod(n, n); /* 5 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ toler = nrkj2*GMX_REAL_EPS;
+ if ((iprm > toler) && (iprn > toler))
+ {
+ nrkj_1 = gmx::invsqrt(nrkj2); /* 10 */
+ nrkj_2 = nrkj_1*nrkj_1; /* 1 */
+ nrkj = nrkj2*nrkj_1; /* 1 */
+ a = -ddphi*nrkj/iprm; /* 11 */
+ svmul(a, m, f_i); /* 3 */
+ b = ddphi*nrkj/iprn; /* 11 */
+ svmul(b, n, f_l); /* 3 */
+ p = iprod(r_ij, r_kj); /* 5 */
+ p *= nrkj_2; /* 1 */
+ q = iprod(r_kl, r_kj); /* 5 */
+ q *= nrkj_2; /* 1 */
+ svmul(p, f_i, uvec); /* 3 */
+ svmul(q, f_l, vvec); /* 3 */
+ rvec_sub(uvec, vvec, svec); /* 3 */
+ rvec_sub(f_i, svec, f_j); /* 3 */
+ rvec_add(f_l, svec, f_k); /* 3 */
+ rvec_inc(f[i], f_i); /* 3 */
+ rvec_dec(f[j], f_j); /* 3 */
+ rvec_dec(f[k], f_k); /* 3 */
+ rvec_inc(f[l], f_l); /* 3 */
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, j), jt);
+ ivec_sub(SHIFT_IVEC(g, i), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, k), jt, dt_kj);
+ ivec_sub(SHIFT_IVEC(g, l), jt, dt_lj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ t3 = IVEC2IS(dt_lj);
+ }
+ else if (pbc)
+ {
+ t3 = pbc_rvec_sub(pbc, x[l], x[j], dx_jl);
+ }
+ else
+ {
+ t3 = CENTRAL;
+ }
+
+ rvec_inc(fshift[t1], f_i);
+ rvec_dec(fshift[CENTRAL], f_j);
+ rvec_dec(fshift[t2], f_k);
+ rvec_inc(fshift[t3], f_l);
+ }
+ }
+ /* 112 TOTAL */
+}
+
+namespace
+{
+
+#if GMX_SIMD_HAVE_REAL
+/* As do_dih_fup_noshiftf above, but with SIMD and pre-calculated pre-factors */
+inline void gmx_simdcall
+do_dih_fup_noshiftf_simd(const int *ai, const int *aj, const int *ak, const int *al,
+ SimdReal p, SimdReal q,
+ SimdReal f_i_x, SimdReal f_i_y, SimdReal f_i_z,
+ SimdReal mf_l_x, SimdReal mf_l_y, SimdReal mf_l_z,
+ rvec4 f[])
+{
+ SimdReal sx = p * f_i_x + q * mf_l_x;
+ SimdReal sy = p * f_i_y + q * mf_l_y;
+ SimdReal sz = p * f_i_z + q * mf_l_z;
+ SimdReal f_j_x = f_i_x - sx;
+ SimdReal f_j_y = f_i_y - sy;
+ SimdReal f_j_z = f_i_z - sz;
+ SimdReal f_k_x = mf_l_x - sx;
+ SimdReal f_k_y = mf_l_y - sy;
+ SimdReal f_k_z = mf_l_z - sz;
+ transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ai, f_i_x, f_i_y, f_i_z);
+ transposeScatterDecrU<4>(reinterpret_cast<real *>(f), aj, f_j_x, f_j_y, f_j_z);
+ transposeScatterIncrU<4>(reinterpret_cast<real *>(f), ak, f_k_x, f_k_y, f_k_z);
+ transposeScatterDecrU<4>(reinterpret_cast<real *>(f), al, mf_l_x, mf_l_y, mf_l_z);
+}
+#endif // GMX_SIMD_HAVE_REAL
+
+/*! \brief Computes and returns the proper dihedral force
+ *
+ * With the appropriate kernel flavor, also computes and accumulates
+ * the energy and dV/dlambda.
+ */
+template <BondedKernelFlavor flavor>
+real dopdihs(real cpA, real cpB, real phiA, real phiB, int mult,
+ real phi, real lambda, real *V, real *dvdlambda)
+{
+ const real L1 = 1.0 - lambda;
+ const real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD;
+ const real dph0 = (phiB - phiA)*DEG2RAD;
+ const real cp = L1*cpA + lambda*cpB;
+
+ const real mdphi = mult*phi - ph0;
+ const real sdphi = std::sin(mdphi);
+ const real ddphi = -cp*mult*sdphi;
+ if (computeEnergy(flavor))
+ {
+ const real v1 = 1 + std::cos(mdphi);
+ *V += cp*v1;
+
+ *dvdlambda += (cpB - cpA)*v1 + cp*dph0*sdphi;
+ }
+
+ return ddphi;
+ /* That was 40 flops */
+}
+
+/*! \brief Similar to \p dopdihs(), except for a minus sign and a different treatment of mult/phi0 */
+real dopdihs_min(real cpA, real cpB, real phiA, real phiB, int mult,
+ real phi, real lambda, real *V, real *F)
+{
+ real v, dvdlambda, mdphi, v1, sdphi, ddphi;
+ real L1 = 1.0 - lambda;
+ real ph0 = (L1*phiA + lambda*phiB)*DEG2RAD;
+ real dph0 = (phiB - phiA)*DEG2RAD;
+ real cp = L1*cpA + lambda*cpB;
+
+ mdphi = mult*(phi-ph0);
+ sdphi = std::sin(mdphi);
+ ddphi = cp*mult*sdphi;
+ v1 = 1.0-std::cos(mdphi);
+ v = cp*v1;
+
+ dvdlambda = (cpB-cpA)*v1 + cp*dph0*sdphi;
+
+ *V = v;
+ *F = ddphi;
+
+ return dvdlambda;
+
+ /* That was 40 flops */
+}
+
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+pdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+
+ real vtot = 0.0;
+
+ for (int i = 0; i < nbonds; )
+ {
+ const int ai = forceatoms[i + 1];
+ const int aj = forceatoms[i + 2];
+ const int ak = forceatoms[i + 3];
+ const int al = forceatoms[i + 4];
+
+ const real phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &t1, &t2, &t3); /* 84 */
+
+ /* Loop over dihedrals working on the same atoms,
+ * so we avoid recalculating angles and distributing forces.
+ */
+ real ddphi_tot = 0;
+ do
+ {
+ const int type = forceatoms[i];
+ ddphi_tot +=
+ dopdihs<flavor>(forceparams[type].pdihs.cpA,
+ forceparams[type].pdihs.cpB,
+ forceparams[type].pdihs.phiA,
+ forceparams[type].pdihs.phiB,
+ forceparams[type].pdihs.mult,
+ phi, lambda, &vtot, dvdlambda);
+
+ i += 5;
+ }
+ while (i < nbonds &&
+ forceatoms[i + 1] == ai &&
+ forceatoms[i + 2] == aj &&
+ forceatoms[i + 3] == ak &&
+ forceatoms[i + 4] == al);
+
+ do_dih_fup<flavor>(ai, aj, ak, al, ddphi_tot, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ } /* 223 TOTAL */
+
+ return vtot;
+}
+
+#if GMX_SIMD_HAVE_REAL
+
+/* As pdihs above, but using SIMD to calculate multiple dihedrals at once */
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+pdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const int nfa1 = 5;
+ int i, iu, s;
+ int type;
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ai[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t aj[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ak[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t al[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) real buf[3*GMX_SIMD_REAL_WIDTH];
+ real *cp, *phi0, *mult;
+ SimdReal deg2rad_S(DEG2RAD);
+ SimdReal p_S, q_S;
+ SimdReal phi0_S, phi_S;
+ SimdReal mx_S, my_S, mz_S;
+ SimdReal nx_S, ny_S, nz_S;
+ SimdReal nrkj_m2_S, nrkj_n2_S;
+ SimdReal cp_S, mdphi_S, mult_S;
+ SimdReal sin_S, cos_S;
+ SimdReal mddphi_S;
+ SimdReal sf_i_S, msf_l_S;
+ alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+
+ /* Extract aligned pointer for parameters and variables */
+ cp = buf + 0*GMX_SIMD_REAL_WIDTH;
+ phi0 = buf + 1*GMX_SIMD_REAL_WIDTH;
+ mult = buf + 2*GMX_SIMD_REAL_WIDTH;
+
+ set_pbc_simd(pbc, pbc_simd);
+
+ /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+ for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ {
+ /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+ * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+ al[s] = forceatoms[iu+4];
+
+ /* At the end fill the arrays with the last atoms and 0 params */
+ if (i + s*nfa1 < nbonds)
+ {
+ cp[s] = forceparams[type].pdihs.cpA;
+ phi0[s] = forceparams[type].pdihs.phiA;
+ mult[s] = forceparams[type].pdihs.mult;
+
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+ else
+ {
+ cp[s] = 0;
+ phi0[s] = 0;
+ mult[s] = 0;
+ }
+ }
+
+ /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+ dih_angle_simd(x, ai, aj, ak, al, pbc_simd,
+ &phi_S,
+ &mx_S, &my_S, &mz_S,
+ &nx_S, &ny_S, &nz_S,
+ &nrkj_m2_S,
+ &nrkj_n2_S,
+ &p_S, &q_S);
+
+ cp_S = load<SimdReal>(cp);
+ phi0_S = load<SimdReal>(phi0) * deg2rad_S;
+ mult_S = load<SimdReal>(mult);
+
+ mdphi_S = fms(mult_S, phi_S, phi0_S);
+
+ /* Calculate GMX_SIMD_REAL_WIDTH sines at once */
+ sincos(mdphi_S, &sin_S, &cos_S);
+ mddphi_S = cp_S * mult_S * sin_S;
+ sf_i_S = mddphi_S * nrkj_m2_S;
+ msf_l_S = mddphi_S * nrkj_n2_S;
+
+ /* After this m?_S will contain f[i] */
+ mx_S = sf_i_S * mx_S;
+ my_S = sf_i_S * my_S;
+ mz_S = sf_i_S * mz_S;
+
+ /* After this m?_S will contain -f[l] */
+ nx_S = msf_l_S * nx_S;
+ ny_S = msf_l_S * ny_S;
+ nz_S = msf_l_S * nz_S;
+
+ do_dih_fup_noshiftf_simd(ai, aj, ak, al,
+ p_S, q_S,
+ mx_S, my_S, mz_S,
+ nx_S, ny_S, nz_S,
+ f);
+ }
+
+ return 0;
+}
+
+/* This is mostly a copy of the SIMD flavor of pdihs above, but with using
+ * the RB potential instead of a harmonic potential.
+ * This function can replace rbdihs() when no energy and virial are needed.
+ */
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor == BondedKernelFlavor::ForcesSimdWhenAvailable, real>
+rbdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec gmx_unused fshift[],
+ const t_pbc *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const int nfa1 = 5;
+ int i, iu, s, j;
+ int type;
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ai[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t aj[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t ak[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) std::int32_t al[GMX_SIMD_REAL_WIDTH];
+ alignas(GMX_SIMD_ALIGNMENT) real parm[NR_RBDIHS*GMX_SIMD_REAL_WIDTH];
+
+ SimdReal p_S, q_S;
+ SimdReal phi_S;
+ SimdReal ddphi_S, cosfac_S;
+ SimdReal mx_S, my_S, mz_S;
+ SimdReal nx_S, ny_S, nz_S;
+ SimdReal nrkj_m2_S, nrkj_n2_S;
+ SimdReal parm_S, c_S;
+ SimdReal sin_S, cos_S;
+ SimdReal sf_i_S, msf_l_S;
+ alignas(GMX_SIMD_ALIGNMENT) real pbc_simd[9*GMX_SIMD_REAL_WIDTH];
+
+ SimdReal pi_S(M_PI);
+ SimdReal one_S(1.0);
+
+ set_pbc_simd(pbc, pbc_simd);
+
+ /* nbonds is the number of dihedrals times nfa1, here we step GMX_SIMD_REAL_WIDTH dihs */
+ for (i = 0; (i < nbonds); i += GMX_SIMD_REAL_WIDTH*nfa1)
+ {
+ /* Collect atoms quadruplets for GMX_SIMD_REAL_WIDTH dihedrals.
+ * iu indexes into forceatoms, we should not let iu go beyond nbonds.
+ */
+ iu = i;
+ for (s = 0; s < GMX_SIMD_REAL_WIDTH; s++)
+ {
+ type = forceatoms[iu];
+ ai[s] = forceatoms[iu+1];
+ aj[s] = forceatoms[iu+2];
+ ak[s] = forceatoms[iu+3];
+ al[s] = forceatoms[iu+4];
+
+ /* At the end fill the arrays with the last atoms and 0 params */
+ if (i + s*nfa1 < nbonds)
+ {
+ /* We don't need the first parameter, since that's a constant
+ * which only affects the energies, not the forces.
+ */
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm[j*GMX_SIMD_REAL_WIDTH + s] =
+ forceparams[type].rbdihs.rbcA[j];
+ }
+
+ if (iu + nfa1 < nbonds)
+ {
+ iu += nfa1;
+ }
+ }
+ else
+ {
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm[j*GMX_SIMD_REAL_WIDTH + s] = 0;
+ }
+ }
+ }
+
+ /* Caclulate GMX_SIMD_REAL_WIDTH dihedral angles at once */
+ dih_angle_simd(x, ai, aj, ak, al, pbc_simd,
+ &phi_S,
+ &mx_S, &my_S, &mz_S,
+ &nx_S, &ny_S, &nz_S,
+ &nrkj_m2_S,
+ &nrkj_n2_S,
+ &p_S, &q_S);
+
+ /* Change to polymer convention */
+ phi_S = phi_S - pi_S;
+
+ sincos(phi_S, &sin_S, &cos_S);
+
+ ddphi_S = setZero();
+ c_S = one_S;
+ cosfac_S = one_S;
+ for (j = 1; j < NR_RBDIHS; j++)
+ {
+ parm_S = load<SimdReal>(parm + j*GMX_SIMD_REAL_WIDTH);
+ ddphi_S = fma(c_S * parm_S, cosfac_S, ddphi_S);
+ cosfac_S = cosfac_S * cos_S;
+ c_S = c_S + one_S;
+ }
+
+ /* Note that here we do not use the minus sign which is present
+ * in the normal RB code. This is corrected for through (m)sf below.
+ */
+ ddphi_S = ddphi_S * sin_S;
+
+ sf_i_S = ddphi_S * nrkj_m2_S;
+ msf_l_S = ddphi_S * nrkj_n2_S;
+
+ /* After this m?_S will contain f[i] */
+ mx_S = sf_i_S * mx_S;
+ my_S = sf_i_S * my_S;
+ mz_S = sf_i_S * mz_S;
+
+ /* After this m?_S will contain -f[l] */
+ nx_S = msf_l_S * nx_S;
+ ny_S = msf_l_S * ny_S;
+ nz_S = msf_l_S * nz_S;
+
+ do_dih_fup_noshiftf_simd(ai, aj, ak, al,
+ p_S, q_S,
+ mx_S, my_S, mz_S,
+ nx_S, ny_S, nz_S,
+ f);
+ }
+
+ return 0;
+}
+
+#endif // GMX_SIMD_HAVE_REAL
+
+
+template <BondedKernelFlavor flavor>
+real idihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, type, ai, aj, ak, al;
+ int t1, t2, t3;
+ real phi, phi0, dphi0, ddphi, vtot;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real L1, kk, dp, dp2, kA, kB, pA, pB, dvdl_term;
+
+ L1 = 1.0-lambda;
+ dvdl_term = 0;
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &t1, &t2, &t3); /* 84 */
+
+ /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ * force changes if we just apply a normal harmonic.
+ * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ * This means we will never have the periodicity problem, unless
+ * the dihedral is Pi away from phiO, which is very unlikely due to
+ * the potential.
+ */
+ kA = forceparams[type].harmonic.krA;
+ kB = forceparams[type].harmonic.krB;
+ pA = forceparams[type].harmonic.rA;
+ pB = forceparams[type].harmonic.rB;
+
+ kk = L1*kA + lambda*kB;
+ phi0 = (L1*pA + lambda*pB)*DEG2RAD;
+ dphi0 = (pB - pA)*DEG2RAD;
+
+ dp = phi-phi0;
+
+ make_dp_periodic(&dp);
+
+ dp2 = dp*dp;
+
+ vtot += 0.5*kk*dp2;
+ ddphi = -kk*dp;
+
+ dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
+
+ do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ /* 218 TOTAL */
+ }
+
+ *dvdlambda += dvdl_term;
+ return vtot;
+}
+
+/*! \brief Computes angle restraints of two different types */
+template <BondedKernelFlavor flavor>
+real low_angres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ gmx_bool bZAxis)
+{
+ int i, m, type, ai, aj, ak, al;
+ int t1, t2;
+ real phi, cos_phi, cos_phi2, vid, vtot, dVdphi;
+ rvec r_ij, r_kl, f_i, f_k = {0, 0, 0};
+ real st, sth, nrij2, nrkl2, c, cij, ckl;
+
+ ivec dt;
+ t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */
+
+ vtot = 0.0;
+ ak = al = 0; /* to avoid warnings */
+ for (i = 0; i < nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ t1 = pbc_rvec_sub(pbc, x[aj], x[ai], r_ij); /* 3 */
+ if (!bZAxis)
+ {
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+ t2 = pbc_rvec_sub(pbc, x[al], x[ak], r_kl); /* 3 */
+ }
+ else
+ {
+ r_kl[XX] = 0;
+ r_kl[YY] = 0;
+ r_kl[ZZ] = 1;
+ }
+
+ cos_phi = cos_angle(r_ij, r_kl); /* 25 */
+ phi = std::acos(cos_phi); /* 10 */
+
+ *dvdlambda += dopdihs_min(forceparams[type].pdihs.cpA,
+ forceparams[type].pdihs.cpB,
+ forceparams[type].pdihs.phiA,
+ forceparams[type].pdihs.phiB,
+ forceparams[type].pdihs.mult,
+ phi, lambda, &vid, &dVdphi); /* 40 */
+
+ vtot += vid;
+
+ cos_phi2 = gmx::square(cos_phi); /* 1 */
+ if (cos_phi2 < 1)
+ {
+ st = -dVdphi*gmx::invsqrt(1 - cos_phi2); /* 12 */
+ sth = st*cos_phi; /* 1 */
+ nrij2 = iprod(r_ij, r_ij); /* 5 */
+ nrkl2 = iprod(r_kl, r_kl); /* 5 */
+
+ c = st*gmx::invsqrt(nrij2*nrkl2); /* 11 */
+ cij = sth/nrij2; /* 10 */
+ ckl = sth/nrkl2; /* 10 */
+
+ for (m = 0; m < DIM; m++) /* 18+18 */
+ {
+ f_i[m] = (c*r_kl[m]-cij*r_ij[m]);
+ f[ai][m] += f_i[m];
+ f[aj][m] -= f_i[m];
+ if (!bZAxis)
+ {
+ f_k[m] = (c*r_ij[m]-ckl*r_kl[m]);
+ f[ak][m] += f_k[m];
+ f[al][m] -= f_k[m];
+ }
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ai), SHIFT_IVEC(g, aj), dt);
+ t1 = IVEC2IS(dt);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_dec(fshift[CENTRAL], f_i);
+ if (!bZAxis)
+ {
+ if (g)
+ {
+ ivec_sub(SHIFT_IVEC(g, ak), SHIFT_IVEC(g, al), dt);
+ t2 = IVEC2IS(dt);
+ }
+ rvec_inc(fshift[t2], f_k);
+ rvec_dec(fshift[CENTRAL], f_k);
+ }
+ }
+ }
+ }
+
+ return vtot; /* 184 / 157 (bZAxis) total */
+}
+
+template <BondedKernelFlavor flavor>
+real angres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
+ lambda, dvdlambda, FALSE);
+}
+
+template <BondedKernelFlavor flavor>
+real angresz(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ return low_angres<flavor>(nbonds, forceatoms, forceparams, x, f, fshift, pbc, g,
+ lambda, dvdlambda, TRUE);
+}
+
+template <BondedKernelFlavor flavor>
+real dihres(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ real vtot = 0;
+ int ai, aj, ak, al, i, type, t1, t2, t3;
+ real phi0A, phi0B, dphiA, dphiB, kfacA, kfacB, phi0, dphi, kfac;
+ real phi, ddphi, ddp, ddp2, dp, d2r, L1;
+ rvec r_ij, r_kj, r_kl, m, n;
+
+ L1 = 1.0-lambda;
+
+ d2r = DEG2RAD;
+
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi0A = forceparams[type].dihres.phiA*d2r;
+ dphiA = forceparams[type].dihres.dphiA*d2r;
+ kfacA = forceparams[type].dihres.kfacA;
+
+ phi0B = forceparams[type].dihres.phiB*d2r;
+ dphiB = forceparams[type].dihres.dphiB*d2r;
+ kfacB = forceparams[type].dihres.kfacB;
+
+ phi0 = L1*phi0A + lambda*phi0B;
+ dphi = L1*dphiA + lambda*dphiB;
+ kfac = L1*kfacA + lambda*kfacB;
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &t1, &t2, &t3);
+ /* 84 flops */
+
+ /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
+ * force changes if we just apply a normal harmonic.
+ * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
+ * This means we will never have the periodicity problem, unless
+ * the dihedral is Pi away from phiO, which is very unlikely due to
+ * the potential.
+ */
+ dp = phi-phi0;
+ make_dp_periodic(&dp);
+
+ if (dp > dphi)
+ {
+ ddp = dp-dphi;
+ }
+ else if (dp < -dphi)
+ {
+ ddp = dp+dphi;
+ }
+ else
+ {
+ ddp = 0;
+ }
+
+ if (ddp != 0.0)
+ {
+ ddp2 = ddp*ddp;
+ vtot += 0.5*kfac*ddp2;
+ ddphi = kfac*ddp;
+
+ *dvdlambda += 0.5*(kfacB - kfacA)*ddp2;
+ /* lambda dependence from changing restraint distances */
+ if (ddp > 0)
+ {
+ *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A));
+ }
+ else if (ddp < 0)
+ {
+ *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A));
+ }
+ do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ }
+ }
+ return vtot;
+}
+
+
+real unimplemented(int gmx_unused nbonds,
+ const t_iatom gmx_unused forceatoms[], const t_iparams gmx_unused forceparams[],
+ const rvec gmx_unused x[], rvec4 gmx_unused f[], rvec gmx_unused fshift[],
+ const t_pbc gmx_unused *pbc, const t_graph gmx_unused *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ gmx_impl("*** you are using a not implemented function");
+}
+
+template <BondedKernelFlavor flavor>
+real restrangles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, d, ai, aj, ak, type, m;
+ int t1, t2;
+ real v, vtot;
+ ivec jt, dt_ij, dt_kj;
+ rvec f_i, f_j, f_k;
+ double prefactor, ratio_ante, ratio_post;
+ rvec delta_ante, delta_post, vec_temp;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+ pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_post);
+
+
+ /* This function computes factors needed for restricted angle potential.
+ * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+ * when three particles align and the dihedral angle and dihedral potential
+ * cannot be calculated. This potential is calculated using the formula:
+ real restrangles(int nbonds,
+ const t_iatom forceatoms[],const t_iparams forceparams[],
+ const rvec x[],rvec4 f[],rvec fshift[],
+ const t_pbc *pbc,const t_graph *g,
+ real gmx_unused lambda,real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md,t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+ {
+ int i, d, ai, aj, ak, type, m;
+ int t1, t2;
+ rvec r_ij,r_kj;
+ real v, vtot;
+ ivec jt,dt_ij,dt_kj;
+ rvec f_i, f_j, f_k;
+ real prefactor, ratio_ante, ratio_post;
+ rvec delta_ante, delta_post, vec_temp;
+
+ vtot = 0.0;
+ for(i=0; (i<nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta} \frac{(\cos\theta_i - \cos\theta_0)^2}
+ * {\sin^2\theta_i}\f] ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+ * For more explanations see comments file "restcbt.h". */
+
+ compute_factors_restangles(type, forceparams, delta_ante, delta_post,
+ &prefactor, &ratio_ante, &ratio_post, &v);
+
+ /* Forces are computed per component */
+ for (d = 0; d < DIM; d++)
+ {
+ f_i[d] = prefactor * (ratio_ante * delta_ante[d] - delta_post[d]);
+ f_j[d] = prefactor * ((ratio_post + 1.0) * delta_post[d] - (ratio_ante + 1.0) * delta_ante[d]);
+ f_k[d] = prefactor * (delta_ante[d] - ratio_post * delta_post[d]);
+ }
+
+ /* Computation of potential energy */
+
+ vtot += v;
+
+ /* Update forces */
+
+ for (m = 0; (m < DIM); m++)
+ {
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ }
+ }
+ return vtot;
+}
+
+
+template <BondedKernelFlavor flavor>
+real restrdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, d, type, ai, aj, ak, al;
+ rvec f_i, f_j, f_k, f_l;
+ rvec dx_jl;
+ ivec jt, dt_ij, dt_kj, dt_lj;
+ int t1, t2, t3;
+ real v, vtot;
+ rvec delta_ante, delta_crnt, delta_post, vec_temp;
+ real factor_phi_ai_ante, factor_phi_ai_crnt, factor_phi_ai_post;
+ real factor_phi_aj_ante, factor_phi_aj_crnt, factor_phi_aj_post;
+ real factor_phi_ak_ante, factor_phi_ak_crnt, factor_phi_ak_post;
+ real factor_phi_al_ante, factor_phi_al_crnt, factor_phi_al_post;
+ real prefactor_phi;
+
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+ pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+ pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+ pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+
+ /* This function computes factors needed for restricted angle potential.
+ * The restricted angle potential is used in coarse-grained simulations to avoid singularities
+ * when three particles align and the dihedral angle and dihedral potential cannot be calculated.
+ * This potential is calculated using the formula:
+ * \f[V_{\rm ReB}(\theta_i) = \frac{1}{2} k_{\theta}
+ * \frac{(\cos\theta_i - \cos\theta_0)^2}{\sin^2\theta_i}\f]
+ * ({eq:ReB} and ref \cite{MonicaGoga2013} from the manual).
+ * For more explanations see comments file "restcbt.h" */
+
+ compute_factors_restrdihs(type, forceparams,
+ delta_ante, delta_crnt, delta_post,
+ &factor_phi_ai_ante, &factor_phi_ai_crnt, &factor_phi_ai_post,
+ &factor_phi_aj_ante, &factor_phi_aj_crnt, &factor_phi_aj_post,
+ &factor_phi_ak_ante, &factor_phi_ak_crnt, &factor_phi_ak_post,
+ &factor_phi_al_ante, &factor_phi_al_crnt, &factor_phi_al_post,
+ &prefactor_phi, &v);
+
+
+ /* Computation of forces per component */
+ for (d = 0; d < DIM; d++)
+ {
+ f_i[d] = prefactor_phi * (factor_phi_ai_ante * delta_ante[d] + factor_phi_ai_crnt * delta_crnt[d] + factor_phi_ai_post * delta_post[d]);
+ f_j[d] = prefactor_phi * (factor_phi_aj_ante * delta_ante[d] + factor_phi_aj_crnt * delta_crnt[d] + factor_phi_aj_post * delta_post[d]);
+ f_k[d] = prefactor_phi * (factor_phi_ak_ante * delta_ante[d] + factor_phi_ak_crnt * delta_crnt[d] + factor_phi_ak_post * delta_post[d]);
+ f_l[d] = prefactor_phi * (factor_phi_al_ante * delta_ante[d] + factor_phi_al_crnt * delta_crnt[d] + factor_phi_al_post * delta_post[d]);
+ }
+ /* Computation of the energy */
+
+ vtot += v;
+
+
+
+ /* Updating the forces */
+
+ rvec_inc(f[ai], f_i);
+ rvec_inc(f[aj], f_j);
+ rvec_inc(f[ak], f_k);
+ rvec_inc(f[al], f_l);
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ t3 = IVEC2IS(dt_lj);
+ }
+ else if (pbc)
+ {
+ t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+ }
+ else
+ {
+ t3 = CENTRAL;
+ }
+
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ rvec_inc(fshift[t3], f_l);
+ }
+ }
+
+ return vtot;
+}
+
+
+template <BondedKernelFlavor flavor>
+real cbtdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int type, ai, aj, ak, al, i, d;
+ int t1, t2, t3;
+ real v, vtot;
+ rvec vec_temp;
+ rvec f_i, f_j, f_k, f_l;
+ ivec jt, dt_ij, dt_kj, dt_lj;
+ rvec dx_jl;
+ rvec delta_ante, delta_crnt, delta_post;
+ rvec f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al;
+ rvec f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak;
+ rvec f_theta_post_aj, f_theta_post_ak, f_theta_post_al;
+
+
+
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], vec_temp);
+ pbc_rvec_sub(pbc, x[aj], x[ai], delta_ante);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], vec_temp);
+ pbc_rvec_sub(pbc, x[ak], x[aj], delta_crnt);
+ pbc_rvec_sub(pbc, x[ak], x[al], vec_temp);
+ pbc_rvec_sub(pbc, x[al], x[ak], delta_post);
+
+ /* \brief Compute factors for CBT potential
+ * The combined bending-torsion potential goes to zero in a very smooth manner, eliminating the numerical
+ * instabilities, when three coarse-grained particles align and the dihedral angle and standard
+ * dihedral potentials cannot be calculated. The CBT potential is calculated using the formula:
+ * \f[V_{\rm CBT}(\theta_{i-1}, \theta_i, \phi_i) = k_{\phi} \sin^3\theta_{i-1} \sin^3\theta_{i}
+ * \sum_{n=0}^4 { a_n \cos^n\phi_i}\f] ({eq:CBT} and ref \cite{MonicaGoga2013} from the manual).
+ * It contains in its expression not only the dihedral angle \f$\phi\f$
+ * but also \f[\theta_{i-1}\f] (theta_ante bellow) and \f[\theta_{i}\f] (theta_post bellow)
+ * --- the adjacent bending angles.
+ * For more explanations see comments file "restcbt.h". */
+
+ compute_factors_cbtdihs(type, forceparams, delta_ante, delta_crnt, delta_post,
+ f_phi_ai, f_phi_aj, f_phi_ak, f_phi_al,
+ f_theta_ante_ai, f_theta_ante_aj, f_theta_ante_ak,
+ f_theta_post_aj, f_theta_post_ak, f_theta_post_al,
+ &v);
+
+
+ /* Acumulate the resuts per beads */
+ for (d = 0; d < DIM; d++)
+ {
+ f_i[d] = f_phi_ai[d] + f_theta_ante_ai[d];
+ f_j[d] = f_phi_aj[d] + f_theta_ante_aj[d] + f_theta_post_aj[d];
+ f_k[d] = f_phi_ak[d] + f_theta_ante_ak[d] + f_theta_post_ak[d];
+ f_l[d] = f_phi_al[d] + f_theta_post_al[d];
+ }
+
+ /* Compute the potential energy */
+
+ vtot += v;
+
+
+ /* Updating the forces */
+ rvec_inc(f[ai], f_i);
+ rvec_inc(f[aj], f_j);
+ rvec_inc(f[ak], f_k);
+ rvec_inc(f[al], f_l);
+
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ ivec_sub(SHIFT_IVEC(g, al), jt, dt_lj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ t3 = IVEC2IS(dt_lj);
+ }
+ else if (pbc)
+ {
+ t3 = pbc_rvec_sub(pbc, x[al], x[aj], dx_jl);
+ }
+ else
+ {
+ t3 = CENTRAL;
+ }
+
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ rvec_inc(fshift[t3], f_l);
+ }
+ }
+
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+std::enable_if_t<flavor != BondedKernelFlavor::ForcesSimdWhenAvailable || !GMX_SIMD_HAVE_REAL, real>
+rbdihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ const real c0 = 0.0, c1 = 1.0, c2 = 2.0, c3 = 3.0, c4 = 4.0, c5 = 5.0;
+ int type, ai, aj, ak, al, i, j;
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real parmA[NR_RBDIHS];
+ real parmB[NR_RBDIHS];
+ real parm[NR_RBDIHS];
+ real cos_phi, phi, rbp, rbpBA;
+ real v, ddphi, sin_phi;
+ real cosfac, vtot;
+ real L1 = 1.0-lambda;
+ real dvdl_term = 0;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &t1, &t2, &t3); /* 84 */
+
+ /* Change to polymer convention */
+ if (phi < c0)
+ {
+ phi += M_PI;
+ }
+ else
+ {
+ phi -= M_PI; /* 1 */
+
+ }
+ cos_phi = std::cos(phi);
+ /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
+ sin_phi = std::sin(phi);
+
+ for (j = 0; (j < NR_RBDIHS); j++)
+ {
+ parmA[j] = forceparams[type].rbdihs.rbcA[j];
+ parmB[j] = forceparams[type].rbdihs.rbcB[j];
+ parm[j] = L1*parmA[j]+lambda*parmB[j];
+ }
+ /* Calculate cosine powers */
+ /* Calculate the energy */
+ /* Calculate the derivative */
+
+ v = parm[0];
+ dvdl_term += (parmB[0]-parmA[0]);
+ ddphi = c0;
+ cosfac = c1;
+
+ rbp = parm[1];
+ rbpBA = parmB[1]-parmA[1];
+ ddphi += rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[2];
+ rbpBA = parmB[2]-parmA[2];
+ ddphi += c2*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[3];
+ rbpBA = parmB[3]-parmA[3];
+ ddphi += c3*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[4];
+ rbpBA = parmB[4]-parmA[4];
+ ddphi += c4*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+ rbp = parm[5];
+ rbpBA = parmB[5]-parmA[5];
+ ddphi += c5*rbp*cosfac;
+ cosfac *= cos_phi;
+ v += cosfac*rbp;
+ dvdl_term += cosfac*rbpBA;
+
+ ddphi = -ddphi*sin_phi; /* 11 */
+
+ do_dih_fup<flavor>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+ vtot += v;
+ }
+ *dvdlambda += dvdl_term;
+
+ return vtot;
+}
+
+//! \endcond
+
+/*! \brief Mysterious undocumented function */
+int
+cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
+{
+ int im1, ip1, ip2;
+
+ if (ip < 0)
+ {
+ ip = ip + grid_spacing - 1;
+ }
+ else if (ip > grid_spacing)
+ {
+ ip = ip - grid_spacing - 1;
+ }
+
+ im1 = ip - 1;
+ ip1 = ip + 1;
+ ip2 = ip + 2;
+
+ if (ip == 0)
+ {
+ im1 = grid_spacing - 1;
+ }
+ else if (ip == grid_spacing-2)
+ {
+ ip2 = 0;
+ }
+ else if (ip == grid_spacing-1)
+ {
+ ip1 = 0;
+ ip2 = 1;
+ }
+
+ *ipm1 = im1;
+ *ipp1 = ip1;
+ *ipp2 = ip2;
+
+ return ip;
+
+}
+
+} // namespace
+
+real
+cmap_dihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const gmx_cmap_t *cmap_grid,
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const struct t_pbc *pbc, const struct t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, n;
+ int ai, aj, ak, al, am;
+ int a1i, a1j, a1k, a1l, a2i, a2j, a2k, a2l;
+ int type;
+ int t11, t21, t31, t12, t22, t32;
+ int iphi1, ip1m1, ip1p1, ip1p2;
+ int iphi2, ip2m1, ip2p1, ip2p2;
+ int l1, l2, l3;
+ int pos1, pos2, pos3, pos4;
+
+ real ty[4], ty1[4], ty2[4], ty12[4], tx[16];
+ real phi1, cos_phi1, sin_phi1, xphi1;
+ real phi2, cos_phi2, sin_phi2, xphi2;
+ real dx, tt, tu, e, df1, df2, vtot;
+ real ra21, rb21, rg21, rg1, rgr1, ra2r1, rb2r1, rabr1;
+ real ra22, rb22, rg22, rg2, rgr2, ra2r2, rb2r2, rabr2;
+ real fg1, hg1, fga1, hgb1, gaa1, gbb1;
+ real fg2, hg2, fga2, hgb2, gaa2, gbb2;
+ real fac;
+
+ rvec r1_ij, r1_kj, r1_kl, m1, n1;
+ rvec r2_ij, r2_kj, r2_kl, m2, n2;
+ rvec f1_i, f1_j, f1_k, f1_l;
+ rvec f2_i, f2_j, f2_k, f2_l;
+ rvec a1, b1, a2, b2;
+ rvec f1, g1, h1, f2, g2, h2;
+ rvec dtf1, dtg1, dth1, dtf2, dtg2, dth2;
+ ivec jt1, dt1_ij, dt1_kj, dt1_lj;
+ ivec jt2, dt2_ij, dt2_kj, dt2_lj;
+
+ int loop_index[4][4] = {
+ {0, 4, 8, 12},
+ {1, 5, 9, 13},
+ {2, 6, 10, 14},
+ {3, 7, 11, 15}
+ };
+
+ /* Total CMAP energy */
+ vtot = 0;
+
+ for (n = 0; n < nbonds; )
+ {
+ /* Five atoms are involved in the two torsions */
+ type = forceatoms[n++];
+ ai = forceatoms[n++];
+ aj = forceatoms[n++];
+ ak = forceatoms[n++];
+ al = forceatoms[n++];
+ am = forceatoms[n++];
+
+ /* Which CMAP type is this */
+ const int cmapA = forceparams[type].cmap.cmapA;
+ const real *cmapd = cmap_grid->cmapdata[cmapA].cmap.data();
+
+ /* First torsion */
+ a1i = ai;
+ a1j = aj;
+ a1k = ak;
+ a1l = al;
+
+ phi1 = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1,
+ &t11, &t21, &t31); /* 84 */
+
+ cos_phi1 = std::cos(phi1);
+
+ a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1];
+ a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2];
+ a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */
+
+ b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1];
+ b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
+ b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
+
+ pbc_rvec_sub(pbc, x[a1l], x[a1k], h1);
+
+ ra21 = iprod(a1, a1); /* 5 */
+ rb21 = iprod(b1, b1); /* 5 */
+ rg21 = iprod(r1_kj, r1_kj); /* 5 */
+ rg1 = sqrt(rg21);
+
+ rgr1 = 1.0/rg1;
+ ra2r1 = 1.0/ra21;
+ rb2r1 = 1.0/rb21;
+ rabr1 = sqrt(ra2r1*rb2r1);
+
+ sin_phi1 = rg1 * rabr1 * iprod(a1, h1) * (-1);
+
+ if (cos_phi1 < -0.5 || cos_phi1 > 0.5)
+ {
+ phi1 = std::asin(sin_phi1);
+
+ if (cos_phi1 < 0)
+ {
+ if (phi1 > 0)
+ {
+ phi1 = M_PI - phi1;
+ }
+ else
+ {
+ phi1 = -M_PI - phi1;
+ }
+ }
+ }
+ else
+ {
+ phi1 = std::acos(cos_phi1);
+
+ if (sin_phi1 < 0)
+ {
+ phi1 = -phi1;
+ }
+ }
+
+ xphi1 = phi1 + M_PI; /* 1 */
+
+ /* Second torsion */
+ a2i = aj;
+ a2j = ak;
+ a2k = al;
+ a2l = am;
+
+ phi2 = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2,
+ &t12, &t22, &t32); /* 84 */
+
+ cos_phi2 = std::cos(phi2);
+
+ a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1];
+ a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2];
+ a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */
+
+ b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1];
+ b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
+ b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
+
+ pbc_rvec_sub(pbc, x[a2l], x[a2k], h2);
+
+ ra22 = iprod(a2, a2); /* 5 */
+ rb22 = iprod(b2, b2); /* 5 */
+ rg22 = iprod(r2_kj, r2_kj); /* 5 */
+ rg2 = sqrt(rg22);
+
+ rgr2 = 1.0/rg2;
+ ra2r2 = 1.0/ra22;
+ rb2r2 = 1.0/rb22;
+ rabr2 = sqrt(ra2r2*rb2r2);
+
+ sin_phi2 = rg2 * rabr2 * iprod(a2, h2) * (-1);
+
+ if (cos_phi2 < -0.5 || cos_phi2 > 0.5)
+ {
+ phi2 = std::asin(sin_phi2);
+
+ if (cos_phi2 < 0)
+ {
+ if (phi2 > 0)
+ {
+ phi2 = M_PI - phi2;
+ }
+ else
+ {
+ phi2 = -M_PI - phi2;
+ }
+ }
+ }
+ else
+ {
+ phi2 = std::acos(cos_phi2);
+
+ if (sin_phi2 < 0)
+ {
+ phi2 = -phi2;
+ }
+ }
+
+ xphi2 = phi2 + M_PI; /* 1 */
+
+ /* Range mangling */
+ if (xphi1 < 0)
+ {
+ xphi1 = xphi1 + 2*M_PI;
+ }
+ else if (xphi1 >= 2*M_PI)
+ {
+ xphi1 = xphi1 - 2*M_PI;
+ }
+
+ if (xphi2 < 0)
+ {
+ xphi2 = xphi2 + 2*M_PI;
+ }
+ else if (xphi2 >= 2*M_PI)
+ {
+ xphi2 = xphi2 - 2*M_PI;
+ }
+
+ /* Number of grid points */
+ dx = 2*M_PI / cmap_grid->grid_spacing;
+
+ /* Where on the grid are we */
+ iphi1 = static_cast<int>(xphi1/dx);
+ iphi2 = static_cast<int>(xphi2/dx);
+
+ iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1, &ip1p1, &ip1p2);
+ iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1, &ip2p1, &ip2p2);
+
+ pos1 = iphi1*cmap_grid->grid_spacing+iphi2;
+ pos2 = ip1p1*cmap_grid->grid_spacing+iphi2;
+ pos3 = ip1p1*cmap_grid->grid_spacing+ip2p1;
+ pos4 = iphi1*cmap_grid->grid_spacing+ip2p1;
+
+ ty[0] = cmapd[pos1*4];
+ ty[1] = cmapd[pos2*4];
+ ty[2] = cmapd[pos3*4];
+ ty[3] = cmapd[pos4*4];
+
+ ty1[0] = cmapd[pos1*4+1];
+ ty1[1] = cmapd[pos2*4+1];
+ ty1[2] = cmapd[pos3*4+1];
+ ty1[3] = cmapd[pos4*4+1];
+
+ ty2[0] = cmapd[pos1*4+2];
+ ty2[1] = cmapd[pos2*4+2];
+ ty2[2] = cmapd[pos3*4+2];
+ ty2[3] = cmapd[pos4*4+2];
+
+ ty12[0] = cmapd[pos1*4+3];
+ ty12[1] = cmapd[pos2*4+3];
+ ty12[2] = cmapd[pos3*4+3];
+ ty12[3] = cmapd[pos4*4+3];
+
+ /* Switch to degrees */
+ dx = 360.0 / cmap_grid->grid_spacing;
+ xphi1 = xphi1 * RAD2DEG;
+ xphi2 = xphi2 * RAD2DEG;
+
+ for (i = 0; i < 4; i++) /* 16 */
+ {
+ tx[i] = ty[i];
+ tx[i+4] = ty1[i]*dx;
+ tx[i+8] = ty2[i]*dx;
+ tx[i+12] = ty12[i]*dx*dx;
+ }
+
+ real tc[16] = {0};
+ for (int idx = 0; idx < 16; idx++) /* 1056 */
+ {
+ for (int k = 0; k < 16; k++)
+ {
+ tc[idx] += cmap_coeff_matrix[k*16+idx]*tx[k];
+ }
+ }
+
+ tt = (xphi1-iphi1*dx)/dx;
+ tu = (xphi2-iphi2*dx)/dx;
+
+ e = 0;
+ df1 = 0;
+ df2 = 0;
+
+ for (i = 3; i >= 0; i--)
+ {
+ l1 = loop_index[i][3];
+ l2 = loop_index[i][2];
+ l3 = loop_index[i][1];
+
+ e = tt * e + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
+ df1 = tu * df1 + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
+ df2 = tt * df2 + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
+ }
+
+ fac = RAD2DEG/dx;
+ df1 = df1 * fac;
+ df2 = df2 * fac;
+
+ /* CMAP energy */
+ vtot += e;
+
+ /* Do forces - first torsion */
+ fg1 = iprod(r1_ij, r1_kj);
+ hg1 = iprod(r1_kl, r1_kj);
+ fga1 = fg1*ra2r1*rgr1;
+ hgb1 = hg1*rb2r1*rgr1;
+ gaa1 = -ra2r1*rg1;
+ gbb1 = rb2r1*rg1;
+
+ for (i = 0; i < DIM; i++)
+ {
+ dtf1[i] = gaa1 * a1[i];
+ dtg1[i] = fga1 * a1[i] - hgb1 * b1[i];
+ dth1[i] = gbb1 * b1[i];
+
+ f1[i] = df1 * dtf1[i];
+ g1[i] = df1 * dtg1[i];
+ h1[i] = df1 * dth1[i];
+
+ f1_i[i] = f1[i];
+ f1_j[i] = -f1[i] - g1[i];
+ f1_k[i] = h1[i] + g1[i];
+ f1_l[i] = -h1[i];
+
+ f[a1i][i] = f[a1i][i] + f1_i[i];
+ f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */
+ f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */
+ f[a1l][i] = f[a1l][i] + f1_l[i]; /* h1[i] */
+ }
+
+ /* Do forces - second torsion */
+ fg2 = iprod(r2_ij, r2_kj);
+ hg2 = iprod(r2_kl, r2_kj);
+ fga2 = fg2*ra2r2*rgr2;
+ hgb2 = hg2*rb2r2*rgr2;
+ gaa2 = -ra2r2*rg2;
+ gbb2 = rb2r2*rg2;
+
+ for (i = 0; i < DIM; i++)
+ {
+ dtf2[i] = gaa2 * a2[i];
+ dtg2[i] = fga2 * a2[i] - hgb2 * b2[i];
+ dth2[i] = gbb2 * b2[i];
+
+ f2[i] = df2 * dtf2[i];
+ g2[i] = df2 * dtg2[i];
+ h2[i] = df2 * dth2[i];
+
+ f2_i[i] = f2[i];
+ f2_j[i] = -f2[i] - g2[i];
+ f2_k[i] = h2[i] + g2[i];
+ f2_l[i] = -h2[i];
+
+ f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */
+ f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */
+ f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */
+ f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */
+ }
+
+ /* Shift forces */
+ if (fshift != nullptr)
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, a1j), jt1);
+ ivec_sub(SHIFT_IVEC(g, a1i), jt1, dt1_ij);
+ ivec_sub(SHIFT_IVEC(g, a1k), jt1, dt1_kj);
+ ivec_sub(SHIFT_IVEC(g, a1l), jt1, dt1_lj);
+ t11 = IVEC2IS(dt1_ij);
+ t21 = IVEC2IS(dt1_kj);
+ t31 = IVEC2IS(dt1_lj);
+
+ copy_ivec(SHIFT_IVEC(g, a2j), jt2);
+ ivec_sub(SHIFT_IVEC(g, a2i), jt2, dt2_ij);
+ ivec_sub(SHIFT_IVEC(g, a2k), jt2, dt2_kj);
+ ivec_sub(SHIFT_IVEC(g, a2l), jt2, dt2_lj);
+ t12 = IVEC2IS(dt2_ij);
+ t22 = IVEC2IS(dt2_kj);
+ t32 = IVEC2IS(dt2_lj);
+ }
+ else if (pbc)
+ {
+ t31 = pbc_rvec_sub(pbc, x[a1l], x[a1j], h1);
+ t32 = pbc_rvec_sub(pbc, x[a2l], x[a2j], h2);
+ }
+ else
+ {
+ t31 = CENTRAL;
+ t32 = CENTRAL;
+ }
+
+ rvec_inc(fshift[t11], f1_i);
+ rvec_inc(fshift[CENTRAL], f1_j);
+ rvec_inc(fshift[t21], f1_k);
+ rvec_inc(fshift[t31], f1_l);
+
++ rvec_inc(fshift[t12], f2_i);
+ rvec_inc(fshift[CENTRAL], f2_j);
+ rvec_inc(fshift[t22], f2_k);
+ rvec_inc(fshift[t32], f2_l);
+ }
+ }
+ return vtot;
+}
+
+namespace
+{
+
+//! \cond
+/***********************************************************
+ *
+ * G R O M O S 9 6 F U N C T I O N S
+ *
+ ***********************************************************/
+real g96harmonic(real kA, real kB, real xA, real xB, real x, real lambda,
+ real *V, real *F)
+{
+ const real half = 0.5;
+ real L1, kk, x0, dx, dx2;
+ real v, f, dvdlambda;
+
+ L1 = 1.0-lambda;
+ kk = L1*kA+lambda*kB;
+ x0 = L1*xA+lambda*xB;
+
+ dx = x-x0;
+ dx2 = dx*dx;
+
+ f = -kk*dx;
+ v = half*kk*dx2;
+ dvdlambda = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
+
+ *F = f;
+ *V = v;
+
+ return dvdlambda;
+
+ /* That was 21 flops */
+}
+
+template <BondedKernelFlavor flavor>
+real g96bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ki, ai, aj, type;
+ real dr2, fbond, vbond, vtot;
+ rvec dx;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+
+ *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA,
+ forceparams[type].harmonic.rB,
+ dr2, lambda, &vbond, &fbond);
+
+ vtot += 0.5*vbond; /* 1*/
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 44 TOTAL */
+ return vtot;
+}
+
+real g96bond_angle(const rvec xi, const rvec xj, const rvec xk, const t_pbc *pbc,
+ rvec r_ij, rvec r_kj,
+ int *t1, int *t2)
+/* Return value is the angle between the bonds i-j and j-k */
+{
+ real costh;
+
+ *t1 = pbc_rvec_sub(pbc, xi, xj, r_ij); /* 3 */
+ *t2 = pbc_rvec_sub(pbc, xk, xj, r_kj); /* 3 */
+
+ costh = cos_angle(r_ij, r_kj); /* 25 */
+ /* 41 TOTAL */
+ return costh;
+}
+
+template <BondedKernelFlavor flavor>
+real g96angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ai, aj, ak, type, m, t1, t2;
+ rvec r_ij, r_kj;
+ real cos_theta, dVdt, va, vtot;
+ real rij_1, rij_2, rkj_1, rkj_2, rijrkj_1;
+ rvec f_i, f_j, f_k;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ cos_theta = g96bond_angle(x[ai], x[aj], x[ak], pbc, r_ij, r_kj, &t1, &t2);
+
+ *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
+ forceparams[type].harmonic.krB,
+ forceparams[type].harmonic.rA,
+ forceparams[type].harmonic.rB,
+ cos_theta, lambda, &va, &dVdt);
+ vtot += va;
+
+ rij_1 = gmx::invsqrt(iprod(r_ij, r_ij));
+ rkj_1 = gmx::invsqrt(iprod(r_kj, r_kj));
+ rij_2 = rij_1*rij_1;
+ rkj_2 = rkj_1*rkj_1;
+ rijrkj_1 = rij_1*rkj_1; /* 23 */
+
+ for (m = 0; (m < DIM); m++) /* 42 */
+ {
+ f_i[m] = dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta);
+ f_k[m] = dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k); /* 9 */
+ }
+ /* 163 TOTAL */
+ }
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real cross_bond_bond(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
+ * pp. 842-847
+ */
+ int i, ai, aj, ak, type, m, t1, t2;
+ rvec r_ij, r_kj;
+ real vtot, vrr, s1, s2, r1, r2, r1e, r2e, krr;
+ rvec f_i, f_j, f_k;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ r1e = forceparams[type].cross_bb.r1e;
+ r2e = forceparams[type].cross_bb.r2e;
+ krr = forceparams[type].cross_bb.krr;
+
+ /* Compute distance vectors ... */
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+
+ /* ... and their lengths */
+ r1 = norm(r_ij);
+ r2 = norm(r_kj);
+
+ /* Deviations from ideality */
+ s1 = r1-r1e;
+ s2 = r2-r2e;
+
+ /* Energy (can be negative!) */
+ vrr = krr*s1*s2;
+ vtot += vrr;
+
+ /* Forces */
+ svmul(-krr*s2/r1, r_ij, f_i);
+ svmul(-krr*s1/r2, r_kj, f_k);
+
+ for (m = 0; (m < DIM); m++) /* 12 */
+ {
+ f_j[m] = -f_i[m] - f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k); /* 9 */
+ }
+ /* 163 TOTAL */
+ }
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real cross_bond_angle(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real gmx_unused lambda, real gmx_unused *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata gmx_unused *fcd,
+ int gmx_unused *global_atom_index)
+{
+ /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
+ * pp. 842-847
+ */
+ int i, ai, aj, ak, type, m, t1, t2;
+ rvec r_ij, r_kj, r_ik;
+ real vtot, vrt, s1, s2, s3, r1, r2, r3, r1e, r2e, r3e, krt, k1, k2, k3;
+ rvec f_i, f_j, f_k;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ r1e = forceparams[type].cross_ba.r1e;
+ r2e = forceparams[type].cross_ba.r2e;
+ r3e = forceparams[type].cross_ba.r3e;
+ krt = forceparams[type].cross_ba.krt;
+
+ /* Compute distance vectors ... */
+ t1 = pbc_rvec_sub(pbc, x[ai], x[aj], r_ij);
+ t2 = pbc_rvec_sub(pbc, x[ak], x[aj], r_kj);
+ pbc_rvec_sub(pbc, x[ai], x[ak], r_ik);
+
+ /* ... and their lengths */
+ r1 = norm(r_ij);
+ r2 = norm(r_kj);
+ r3 = norm(r_ik);
+
+ /* Deviations from ideality */
+ s1 = r1-r1e;
+ s2 = r2-r2e;
+ s3 = r3-r3e;
+
+ /* Energy (can be negative!) */
+ vrt = krt*s3*(s1+s2);
+ vtot += vrt;
+
+ /* Forces */
+ k1 = -krt*(s3/r1);
+ k2 = -krt*(s3/r2);
+ k3 = -krt*(s1+s2)/r3;
+ for (m = 0; (m < DIM); m++)
+ {
+ f_i[m] = k1*r_ij[m] + k3*r_ik[m];
+ f_k[m] = k2*r_kj[m] - k3*r_ik[m];
+ f_j[m] = -f_i[m] - f_k[m];
+ }
+
+ for (m = 0; (m < DIM); m++) /* 12 */
+ {
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k); /* 9 */
+ }
+ /* 163 TOTAL */
+ }
+ return vtot;
+}
+
+/*! \brief Computes the potential and force for a tabulated potential */
+real bonded_tab(const char *type, int table_nr,
+ const bondedtable_t *table, real kA, real kB, real r,
+ real lambda, real *V, real *F)
+{
+ real k, tabscale, *VFtab, rt, eps, eps2, Yt, Ft, Geps, Heps2, Fp, VV, FF;
+ int n0, nnn;
+ real dvdlambda;
+
+ k = (1.0 - lambda)*kA + lambda*kB;
+
+ tabscale = table->scale;
+ VFtab = table->data;
+
+ rt = r*tabscale;
+ n0 = static_cast<int>(rt);
+ if (n0 >= table->n)
+ {
+ gmx_fatal(FARGS, "A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
+ type, table_nr, r, n0, n0+1, table->n);
+ }
+ eps = rt - n0;
+ eps2 = eps*eps;
+ nnn = 4*n0;
+ Yt = VFtab[nnn];
+ Ft = VFtab[nnn+1];
+ Geps = VFtab[nnn+2]*eps;
+ Heps2 = VFtab[nnn+3]*eps2;
+ Fp = Ft + Geps + Heps2;
+ VV = Yt + Fp*eps;
+ FF = Fp + Geps + 2.0*Heps2;
+
+ *F = -k*FF*tabscale;
+ *V = k*VV;
+ dvdlambda = (kB - kA)*VV;
+
+ return dvdlambda;
+
+ /* That was 22 flops */
+}
+
+template <BondedKernelFlavor flavor>
+real tab_bonds(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ki, ai, aj, type, table;
+ real dr, dr2, fbond, vbond, vtot;
+ rvec dx;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ ki = pbc_rvec_sub(pbc, x[ai], x[aj], dx); /* 3 */
+ dr2 = iprod(dx, dx); /* 5 */
+ dr = dr2*gmx::invsqrt(dr2); /* 10 */
+
+ table = forceparams[type].tab.table;
+
+ *dvdlambda += bonded_tab("bond", table,
+ &fcd->bondtab[table],
+ forceparams[type].tab.kA,
+ forceparams[type].tab.kB,
+ dr, lambda, &vbond, &fbond); /* 22 */
+
+ if (dr2 == 0.0)
+ {
+ continue;
+ }
+
+
+ vtot += vbond; /* 1*/
+ fbond *= gmx::invsqrt(dr2); /* 6 */
+
+ spreadBondForces<flavor>(fbond, dx, ai, aj, f, ki, g, fshift); /* 15 */
+ } /* 62 TOTAL */
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real tab_angles(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, ai, aj, ak, t1, t2, type, table;
+ rvec r_ij, r_kj;
+ real cos_theta, cos_theta2, theta, dVdt, va, vtot;
+ ivec jt, dt_ij, dt_kj;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+
+ theta = bond_angle(x[ai], x[aj], x[ak], pbc,
+ r_ij, r_kj, &cos_theta, &t1, &t2); /* 41 */
+
+ table = forceparams[type].tab.table;
+
+ *dvdlambda += bonded_tab("angle", table,
+ &fcd->angletab[table],
+ forceparams[type].tab.kA,
+ forceparams[type].tab.kB,
+ theta, lambda, &va, &dVdt); /* 22 */
+ vtot += va;
+
+ cos_theta2 = gmx::square(cos_theta); /* 1 */
+ if (cos_theta2 < 1)
+ {
+ int m;
+ real st, sth;
+ real cik, cii, ckk;
+ real nrkj2, nrij2;
+ rvec f_i, f_j, f_k;
+
+ st = dVdt*gmx::invsqrt(1 - cos_theta2); /* 12 */
+ sth = st*cos_theta; /* 1 */
+ nrkj2 = iprod(r_kj, r_kj); /* 5 */
+ nrij2 = iprod(r_ij, r_ij);
+
+ cik = st*gmx::invsqrt(nrkj2*nrij2); /* 12 */
+ cii = sth/nrij2; /* 10 */
+ ckk = sth/nrkj2; /* 10 */
+
+ for (m = 0; (m < DIM); m++) /* 39 */
+ {
+ f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
+ f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
+ f_j[m] = -f_i[m]-f_k[m];
+ f[ai][m] += f_i[m];
+ f[aj][m] += f_j[m];
+ f[ak][m] += f_k[m];
+ }
+
+ if (computeVirial(flavor))
+ {
+ if (g)
+ {
+ copy_ivec(SHIFT_IVEC(g, aj), jt);
+
+ ivec_sub(SHIFT_IVEC(g, ai), jt, dt_ij);
+ ivec_sub(SHIFT_IVEC(g, ak), jt, dt_kj);
+ t1 = IVEC2IS(dt_ij);
+ t2 = IVEC2IS(dt_kj);
+ }
+ rvec_inc(fshift[t1], f_i);
+ rvec_inc(fshift[CENTRAL], f_j);
+ rvec_inc(fshift[t2], f_k);
+ }
+ } /* 169 TOTAL */
+ }
+ return vtot;
+}
+
+template <BondedKernelFlavor flavor>
+real tab_dihs(int nbonds,
+ const t_iatom forceatoms[], const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const t_pbc *pbc, const t_graph *g,
+ real lambda, real *dvdlambda,
+ const t_mdatoms gmx_unused *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index)
+{
+ int i, type, ai, aj, ak, al, table;
+ int t1, t2, t3;
+ rvec r_ij, r_kj, r_kl, m, n;
+ real phi, ddphi, vpd, vtot;
+
+ vtot = 0.0;
+ for (i = 0; (i < nbonds); )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ak = forceatoms[i++];
+ al = forceatoms[i++];
+
+ phi = dih_angle(x[ai], x[aj], x[ak], x[al], pbc, r_ij, r_kj, r_kl, m, n,
+ &t1, &t2, &t3); /* 84 */
+
+ table = forceparams[type].tab.table;
+
+ /* Hopefully phi+M_PI never results in values < 0 */
+ *dvdlambda += bonded_tab("dihedral", table,
+ &fcd->dihtab[table],
+ forceparams[type].tab.kA,
+ forceparams[type].tab.kB,
+ phi+M_PI, lambda, &vpd, &ddphi);
+
+ vtot += vpd;
+ do_dih_fup<flavor>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n,
+ f, fshift, pbc, g, x, t1, t2, t3); /* 112 */
+
+ } /* 227 TOTAL */
+
+ return vtot;
+}
+
+struct BondedInteractions
+{
+ BondedFunction function;
+ int nrnbIndex;
+};
+
+/*! \brief Lookup table of bonded interaction functions
+ *
+ * This must have as many entries as interaction_function in ifunc.cpp */
+template<BondedKernelFlavor flavor>
+const std::array<BondedInteractions, F_NRE> c_bondedInteractionFunctions
+ = {
+ BondedInteractions {bonds<flavor>, eNR_BONDS }, // F_BONDS
+ BondedInteractions {g96bonds<flavor>, eNR_BONDS }, // F_G96BONDS
+ BondedInteractions {morse_bonds<flavor>, eNR_MORSE }, // F_MORSE
+ BondedInteractions {cubic_bonds<flavor>, eNR_CUBICBONDS }, // F_CUBICBONDS
+ BondedInteractions {unimplemented, -1 }, // F_CONNBONDS
+ BondedInteractions {bonds<flavor>, eNR_BONDS }, // F_HARMONIC
+ BondedInteractions {FENE_bonds<flavor>, eNR_FENEBONDS }, // F_FENEBONDS
+ BondedInteractions {tab_bonds<flavor>, eNR_TABBONDS }, // F_TABBONDS
+ BondedInteractions {tab_bonds<flavor>, eNR_TABBONDS }, // F_TABBONDSNC
+ BondedInteractions {restraint_bonds<flavor>, eNR_RESTRBONDS }, // F_RESTRBONDS
+ BondedInteractions {angles<flavor>, eNR_ANGLES }, // F_ANGLES
+ BondedInteractions {g96angles<flavor>, eNR_ANGLES }, // F_G96ANGLES
+ BondedInteractions {restrangles<flavor>, eNR_ANGLES }, // F_RESTRANGLES
+ BondedInteractions {linear_angles<flavor>, eNR_ANGLES }, // F_LINEAR_ANGLES
+ BondedInteractions {cross_bond_bond<flavor>, eNR_CROSS_BOND_BOND }, // F_CROSS_BOND_BONDS
+ BondedInteractions {cross_bond_angle<flavor>, eNR_CROSS_BOND_ANGLE }, // F_CROSS_BOND_ANGLES
+ BondedInteractions {urey_bradley<flavor>, eNR_UREY_BRADLEY }, // F_UREY_BRADLEY
+ BondedInteractions {quartic_angles<flavor>, eNR_QANGLES }, // F_QUARTIC_ANGLES
+ BondedInteractions {tab_angles<flavor>, eNR_TABANGLES }, // F_TABANGLES
+ BondedInteractions {pdihs<flavor>, eNR_PROPER }, // F_PDIHS
+ BondedInteractions {rbdihs<flavor>, eNR_RB }, // F_RBDIHS
+ BondedInteractions {restrdihs<flavor>, eNR_PROPER }, // F_RESTRDIHS
+ BondedInteractions {cbtdihs<flavor>, eNR_RB }, // F_CBTDIHS
+ BondedInteractions {rbdihs<flavor>, eNR_FOURDIH }, // F_FOURDIHS
+ BondedInteractions {idihs<flavor>, eNR_IMPROPER }, // F_IDIHS
+ BondedInteractions {pdihs<flavor>, eNR_IMPROPER }, // F_PIDIHS
+ BondedInteractions {tab_dihs<flavor>, eNR_TABDIHS }, // F_TABDIHS
+ BondedInteractions {unimplemented, eNR_CMAP }, // F_CMAP
+ BondedInteractions {unimplemented, -1 }, // F_GB12_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_GB13_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_GB14_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_GBPOL_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_NPSOLVATION_NOLONGERUSED
+ BondedInteractions {unimplemented, eNR_NB14 }, // F_LJ14
+ BondedInteractions {unimplemented, -1 }, // F_COUL14
+ BondedInteractions {unimplemented, eNR_NB14 }, // F_LJC14_Q
+ BondedInteractions {unimplemented, eNR_NB14 }, // F_LJC_PAIRS_NB
+ BondedInteractions {unimplemented, -1 }, // F_LJ
+ BondedInteractions {unimplemented, -1 }, // F_BHAM
+ BondedInteractions {unimplemented, -1 }, // F_LJ_LR_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_BHAM_LR_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_DISPCORR
+ BondedInteractions {unimplemented, -1 }, // F_COUL_SR
+ BondedInteractions {unimplemented, -1 }, // F_COUL_LR_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_RF_EXCL
+ BondedInteractions {unimplemented, -1 }, // F_COUL_RECIP
+ BondedInteractions {unimplemented, -1 }, // F_LJ_RECIP
+ BondedInteractions {unimplemented, -1 }, // F_DPD
+ BondedInteractions {polarize<flavor>, eNR_POLARIZE }, // F_POLARIZATION
+ BondedInteractions {water_pol<flavor>, eNR_WPOL }, // F_WATER_POL
+ BondedInteractions {thole_pol<flavor>, eNR_THOLE }, // F_THOLE_POL
+ BondedInteractions {anharm_polarize<flavor>, eNR_ANHARM_POL }, // F_ANHARM_POL
+ BondedInteractions {unimplemented, -1 }, // F_POSRES
+ BondedInteractions {unimplemented, -1 }, // F_FBPOSRES
+ BondedInteractions {ta_disres, eNR_DISRES }, // F_DISRES
+ BondedInteractions {unimplemented, -1 }, // F_DISRESVIOL
+ BondedInteractions {orires, eNR_ORIRES }, // F_ORIRES
+ BondedInteractions {unimplemented, -1 }, // F_ORIRESDEV
+ BondedInteractions {angres<flavor>, eNR_ANGRES }, // F_ANGRES
+ BondedInteractions {angresz<flavor>, eNR_ANGRESZ }, // F_ANGRESZ
+ BondedInteractions {dihres<flavor>, eNR_DIHRES }, // F_DIHRES
+ BondedInteractions {unimplemented, -1 }, // F_DIHRESVIOL
+ BondedInteractions {unimplemented, -1 }, // F_CONSTR
+ BondedInteractions {unimplemented, -1 }, // F_CONSTRNC
+ BondedInteractions {unimplemented, -1 }, // F_SETTLE
+ BondedInteractions {unimplemented, -1 }, // F_VSITE2
+ BondedInteractions {unimplemented, -1 }, // F_VSITE3
+ BondedInteractions {unimplemented, -1 }, // F_VSITE3FD
+ BondedInteractions {unimplemented, -1 }, // F_VSITE3FAD
+ BondedInteractions {unimplemented, -1 }, // F_VSITE3OUT
+ BondedInteractions {unimplemented, -1 }, // F_VSITE4FD
+ BondedInteractions {unimplemented, -1 }, // F_VSITE4FDN
+ BondedInteractions {unimplemented, -1 }, // F_VSITEN
+ BondedInteractions {unimplemented, -1 }, // F_COM_PULL
+ BondedInteractions {unimplemented, -1 }, // F_DENSITYFITTING
+ BondedInteractions {unimplemented, -1 }, // F_EQM
+ BondedInteractions {unimplemented, -1 }, // F_EPOT
+ BondedInteractions {unimplemented, -1 }, // F_EKIN
+ BondedInteractions {unimplemented, -1 }, // F_ETOT
+ BondedInteractions {unimplemented, -1 }, // F_ECONSERVED
+ BondedInteractions {unimplemented, -1 }, // F_TEMP
+ BondedInteractions {unimplemented, -1 }, // F_VTEMP_NOLONGERUSED
+ BondedInteractions {unimplemented, -1 }, // F_PDISPCORR
+ BondedInteractions {unimplemented, -1 }, // F_PRES
+ BondedInteractions {unimplemented, -1 }, // F_DVDL_CONSTR
+ BondedInteractions {unimplemented, -1 }, // F_DVDL
+ BondedInteractions {unimplemented, -1 }, // F_DKDL
+ BondedInteractions {unimplemented, -1 }, // F_DVDL_COUL
+ BondedInteractions {unimplemented, -1 }, // F_DVDL_VDW
+ BondedInteractions {unimplemented, -1 }, // F_DVDL_BONDED
+ BondedInteractions {unimplemented, -1 }, // F_DVDL_RESTRAINT
+ BondedInteractions {unimplemented, -1 }, // F_DVDL_TEMPERATURE
+ };
+
+/*! \brief List of instantiated BondedInteractions list */
+const gmx::EnumerationArray < BondedKernelFlavor, std::array < BondedInteractions, F_NRE>> c_bondedInteractionFunctionsPerFlavor =
+{
+ c_bondedInteractionFunctions<BondedKernelFlavor::ForcesSimdWhenAvailable>,
+ c_bondedInteractionFunctions<BondedKernelFlavor::ForcesNoSimd>,
+ c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndVirialAndEnergy>,
+ c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndEnergy>
+};
+
+//! \endcond
+
+} // namespace
+
+real calculateSimpleBond(const int ftype,
+ const int numForceatoms, const t_iatom forceatoms[],
+ const t_iparams forceparams[],
+ const rvec x[], rvec4 f[], rvec fshift[],
+ const struct t_pbc *pbc,
+ const struct t_graph gmx_unused *g,
+ const real lambda, real *dvdlambda,
+ const t_mdatoms *md, t_fcdata *fcd,
+ int gmx_unused *global_atom_index,
+ const BondedKernelFlavor bondedKernelFlavor)
+{
+ const BondedInteractions &bonded =
+ c_bondedInteractionFunctionsPerFlavor[bondedKernelFlavor][ftype];
+
+ real v = bonded.function(numForceatoms, forceatoms,
+ forceparams,
+ x, f, fshift,
+ pbc, g, lambda, dvdlambda,
+ md, fcd, global_atom_index);
+
+ return v;
+}
+
+int nrnbIndex(int ftype)
+{
+ return c_bondedInteractionFunctions<BondedKernelFlavor::ForcesAndVirialAndEnergy>[ftype].nrnbIndex;
+}
--- /dev/null
- int gpu_pick_ewald_kernel_type(const bool bTwinCut)
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define OpenCL implementation of nbnxm_gpu.h
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Szilárd Páll <pall.szilard@gmail.com>
+ * \ingroup module_nbnxm
+ *
+ * TODO (psz):
+ * - Add a static const cl_uint c_pruneKernelWorkDim / c_nbnxnKernelWorkDim = 3;
+ * - Rework the copying of OCL data structures done before every invocation of both
+ * nb and prune kernels (using fillin_ocl_structures); also consider at the same
+ * time calling clSetKernelArg only on the updated parameters (if tracking changed
+ * parameters is feasible);
+ * - Consider using the event_wait_list argument to clEnqueueNDRangeKernel to mark
+ * dependencies on the kernel launched: e.g. the non-local nb kernel's dependency
+ * on the misc_ops_and_local_H2D_done event could be better expressed this way.
+ *
+ * - Consider extracting common sections of the OpenCL and CUDA nbnxn logic, e.g:
+ * - in nbnxn_gpu_launch_kernel_pruneonly() the pre- and post-kernel launch logic
+ * is identical in the two implementations, so a 3-way split might allow sharing
+ * code;
+ * -
+ *
+ */
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#if defined(_MSVC)
+#include <limits>
+#endif
+
+#include "thread_mpi/atomic.h"
+
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/mdtypes/simulation_workload.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_common.h"
+#include "gromacs/nbnxm/gpu_common_utils.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
+#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+#include "nbnxm_ocl_internal.h"
+#include "nbnxm_ocl_types.h"
+
+namespace Nbnxm
+{
+
+/*! \brief Convenience constants */
+//@{
+static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
+static const int c_clSize = c_nbnxnGpuClusterSize;
+//@}
+
+
+/*! \brief Validates the input global work size parameter.
+ */
+static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo)
+{
+ cl_uint device_size_t_size_bits;
+ cl_uint host_size_t_size_bits;
+
+ assert(dinfo);
+
+ size_t global_work_size[3];
+ GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
+ for (int i = 0; i < work_dim; i++)
+ {
+ global_work_size[i] = config.blockSize[i] * config.gridSize[i];
+ }
+
+ /* Each component of a global_work_size must not exceed the range given by the
+ sizeof(device size_t) for the device on which the kernel execution will
+ be enqueued. See:
+ https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
+ */
+ device_size_t_size_bits = dinfo->adress_bits;
+ host_size_t_size_bits = static_cast<cl_uint>(sizeof(size_t) * 8);
+
+ /* If sizeof(host size_t) <= sizeof(device size_t)
+ => global_work_size components will always be valid
+ else
+ => get device limit for global work size and
+ compare it against each component of global_work_size.
+ */
+ if (host_size_t_size_bits > device_size_t_size_bits)
+ {
+ size_t device_limit;
+
+ device_limit = (1ULL << device_size_t_size_bits) - 1;
+
+ for (int i = 0; i < work_dim; i++)
+ {
+ if (global_work_size[i] > device_limit)
+ {
+ gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
+ "The number of nonbonded work units (=number of super-clusters) exceeds the"
+ "device capabilities. Global work size limit exceeded (%zu > %zu)!",
+ global_work_size[i], device_limit);
+ }
+ }
+ }
+}
+
+/* Constant arrays listing non-bonded kernel function names. The arrays are
+ * organized in 2-dim arrays by: electrostatics and VDW type.
+ *
+ * Note that the row- and column-order of function pointers has to match the
+ * order of corresponding enumerated electrostatics and vdw types, resp.,
+ * defined in nbnxm_ocl_types.h.
+ */
+
+/*! \brief Force-only kernel function names. */
+static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+ { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
+};
+
+/*! \brief Force + energy kernel function pointers. */
+static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
+{
+ { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
+};
+
+/*! \brief Force + pruning kernel function pointers. */
+static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+ { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
+};
+
+/*! \brief Force + energy + pruning kernel function pointers. */
+static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
+{
+ { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
+ { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
+};
+
+/*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
+ *
+ * \param[in] kernel_pruneonly array of prune kernel objects
+ * \param[in] firstPrunePass true if the first pruning pass is being executed
+ */
+static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
+ bool firstPrunePass)
+{
+ cl_kernel *kernelPtr;
+
+ if (firstPrunePass)
+ {
+ kernelPtr = &(kernel_pruneonly[epruneFirst]);
+ }
+ else
+ {
+ kernelPtr = &(kernel_pruneonly[epruneRolling]);
+ }
+ // TODO: consider creating the prune kernel object here to avoid a
+ // clCreateKernel for the rolling prune kernel if this is not needed.
+ return *kernelPtr;
+}
+
+/*! \brief Return a pointer to the kernel version to be executed at the current step.
+ * OpenCL kernel objects are cached in nb. If the requested kernel is not
+ * found in the cache, it will be created and the cache will be updated.
+ */
+static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t *nb,
+ int eeltype,
+ int evdwtype,
+ bool bDoEne,
+ bool bDoPrune)
+{
+ const char* kernel_name_to_run;
+ cl_kernel *kernel_ptr;
+ cl_int cl_error;
+
+ assert(eeltype < eelOclNR);
+ assert(evdwtype < evdwOclNR);
+
+ if (bDoEne)
+ {
+ if (bDoPrune)
+ {
+ kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+ kernel_ptr = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+ }
+ else
+ {
+ kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+ kernel_ptr = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+ }
+ }
+ else
+ {
+ if (bDoPrune)
+ {
+ kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+ kernel_ptr = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+ }
+ else
+ {
+ kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+ kernel_ptr = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+ }
+ }
+
+ if (nullptr == kernel_ptr[0])
+ {
+ *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
+ assert(cl_error == CL_SUCCESS);
+ }
+ // TODO: handle errors
+
+ return *kernel_ptr;
+}
+
+/*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
+ */
+static inline int calc_shmem_required_nonbonded(int vdwType,
+ bool bPrefetchLjParam)
+{
+ int shmem;
+
+ /* size of shmem (force-buffers/xq/atom type preloading) */
+ /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
+ /* i-atom x+q in shared memory */
+ shmem = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+ /* cj in shared memory, for both warps separately
+ * TODO: in the "nowarp kernels we load cj only once so the factor 2 is not needed.
+ */
+ shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs */
+ if (bPrefetchLjParam)
+ {
+ if (useLjCombRule(vdwType))
+ {
+ /* i-atom LJ combination parameters in shared memory */
+ shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
+ }
+ else
+ {
+ /* i-atom types in shared memory */
+ shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+ }
+ }
+ /* force reduction buffers in shared memory */
+ shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
+ /* Warp vote. In fact it must be * number of warps in block.. */
+ shmem += sizeof(cl_uint) * 2; /* warp_any */
+ return shmem;
+}
+
+/*! \brief Initializes data structures that are going to be sent to the OpenCL device.
+ *
+ * The device can't use the same data structures as the host for two main reasons:
+ * - OpenCL restrictions (pointers are not accepted inside data structures)
+ * - some host side fields are not needed for the OpenCL kernels.
+ *
+ * This function is called before the launch of both nbnxn and prune kernels.
+ */
+static void fillin_ocl_structures(cl_nbparam_t *nbp,
+ cl_nbparam_params_t *nbparams_params)
+{
+ nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
+ nbparams_params->c_rf = nbp->c_rf;
+ nbparams_params->dispersion_shift = nbp->dispersion_shift;
+ nbparams_params->eeltype = nbp->eeltype;
+ nbparams_params->epsfac = nbp->epsfac;
+ nbparams_params->ewaldcoeff_lj = nbp->ewaldcoeff_lj;
+ nbparams_params->ewald_beta = nbp->ewald_beta;
+ nbparams_params->rcoulomb_sq = nbp->rcoulomb_sq;
+ nbparams_params->repulsion_shift = nbp->repulsion_shift;
+ nbparams_params->rlistOuter_sq = nbp->rlistOuter_sq;
+ nbparams_params->rvdw_sq = nbp->rvdw_sq;
+ nbparams_params->rlistInner_sq = nbp->rlistInner_sq;
+ nbparams_params->rvdw_switch = nbp->rvdw_switch;
+ nbparams_params->sh_ewald = nbp->sh_ewald;
+ nbparams_params->sh_lj_ewald = nbp->sh_lj_ewald;
+ nbparams_params->two_k_rf = nbp->two_k_rf;
+ nbparams_params->vdwtype = nbp->vdwtype;
+ nbparams_params->vdw_switch = nbp->vdw_switch;
+}
+
+/*! \brief Enqueues a wait for event completion.
+ *
+ * Then it releases the event and sets it to 0.
+ * Don't use this function when more than one wait will be issued for the event.
+ * Equivalent to Cuda Stream Sync. */
+static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
+{
+ cl_int gmx_unused cl_error;
+
+ /* Enqueue wait */
+ cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
+ GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
+
+ /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
+ cl_error = clReleaseEvent(*ocl_event);
+ assert(CL_SUCCESS == cl_error);
+ *ocl_event = nullptr;
+}
+
+/*! \brief Launch asynchronously the xq buffer host to device copy. */
+void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
+ const nbnxn_atomdata_t *nbatom,
+ const AtomLocality atomLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+
+ /* local/nonlocal offset and length used for xq and f */
+ int adat_begin, adat_len;
+
+ cl_atomdata_t *adat = nb->atdat;
+ cl_plist_t *plist = nb->plist[iloc];
+ cl_timers_t *t = nb->timers;
+ cl_command_queue stream = nb->stream[iloc];
+
+ bool bDoTime = (nb->bDoTime) != 0;
+
+ /* Don't launch the non-local H2D copy if there is no dependent
+ work to do: neither non-local nor other (e.g. bonded) work
+ to do that has as input the nbnxn coordinates.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local local x+q copy (and the rest of the local
+ work in nbnxn_gpu_launch_kernel().
+ */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ {
+ plist->haveFreshList = false;
+
+ return;
+ }
+
+ /* calculate the atom data index range based on locality */
+ if (atomLocality == AtomLocality::Local)
+ {
+ adat_begin = 0;
+ adat_len = adat->natoms_local;
+ }
+ else
+ {
+ adat_begin = adat->natoms_local;
+ adat_len = adat->natoms - adat->natoms_local;
+ }
+
+ /* beginning of timed HtoD section */
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+ }
+
+ /* HtoD x, q */
+ ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4,
+ adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ /* When we get here all misc operations issues in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one. */
+ if (nb->bUseTwoStreams)
+ {
+ if (iloc == InteractionLocality::Local)
+ {
+ cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
+ assert(CL_SUCCESS == cl_error);
+
+ /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+ * in the local stream in order to be able to sync with the above event
+ * from the non-local stream.
+ */
+ cl_error = clFlush(stream);
+ assert(CL_SUCCESS == cl_error);
+ }
+ else
+ {
+ sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
+ }
+ }
+}
+
+
+/*! \brief Launch GPU kernel
+
+ As we execute nonbonded workload in separate queues, before launching
+ the kernel we need to make sure that he following operations have completed:
+ - atomdata allocation and related H2D transfers (every nstlist step);
+ - pair list H2D transfer (every nstlist step);
+ - shift vector H2D transfer (every nstlist step);
+ - force (+shift force and energy) output clearing (every step).
+
+ These operations are issued in the local queue at the beginning of the step
+ and therefore always complete before the local kernel launch. The non-local
+ kernel is launched after the local on the same device/context, so this is
+ inherently scheduled after the operations in the local stream (including the
+ above "misc_ops").
+ However, for the sake of having a future-proof implementation, we use the
+ misc_ops_done event to record the point in time when the above operations
+ are finished and synchronize with this event in the non-local stream.
+ */
+void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
+ const gmx::StepWorkload &stepWork,
+ const Nbnxm::InteractionLocality iloc)
+{
+ cl_atomdata_t *adat = nb->atdat;
+ cl_nbparam_t *nbp = nb->nbparam;
+ cl_plist_t *plist = nb->plist[iloc];
+ cl_timers_t *t = nb->timers;
+ cl_command_queue stream = nb->stream[iloc];
+
+ bool bDoTime = (nb->bDoTime) != 0;
+
+ cl_nbparam_params_t nbparams_params;
+
+ /* Don't launch the non-local kernel if there is no work to do.
+ Doing the same for the local kernel is more complicated, since the
+ local part of the force array also depends on the non-local kernel.
+ So to avoid complicating the code and to reduce the risk of bugs,
+ we always call the local kernel and later (not in
+ this function) the stream wait, local f copyback and the f buffer
+ clearing. All these operations, except for the local interaction kernel,
+ are needed for the non-local interactions. The skip of the local kernel
+ call is taken care of later in this function. */
+ if (canSkipNonbondedWork(*nb, iloc))
+ {
+ plist->haveFreshList = false;
+
+ return;
+ }
+
+ if (nbp->useDynamicPruning && plist->haveFreshList)
+ {
+ /* Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
+ (that's the way the timing accounting can distinguish between
+ separate prune kernel and combined force+prune).
+ */
+ Nbnxm::gpu_launch_kernel_pruneonly(nb, iloc, 1);
+ }
+
+ if (plist->nsci == 0)
+ {
+ /* Don't launch an empty local kernel (is not allowed with OpenCL).
+ */
+ return;
+ }
+
+ /* beginning of timed nonbonded calculation section */
+ if (bDoTime)
+ {
+ t->interaction[iloc].nb_k.openTimingRegion(stream);
+ }
+
+ /* kernel launch config */
+
+ KernelLaunchConfig config;
+ config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
+ config.stream = stream;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.gridSize[0] = plist->nsci;
+
+ validate_global_work_size(config, 3, nb->dev_info);
+
+ if (debug)
+ {
+ fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+ "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
+ config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
+ c_numClPerSupercl, plist->na_c);
+ }
+
+ fillin_ocl_structures(nbp, &nbparams_params);
+
+ auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ constexpr char kernelName[] = "k_calc_nb";
+ const auto kernel = select_nbnxn_kernel(nb,
+ nbp->eeltype,
+ nbp->vdwtype,
+ stepWork.computeEnergy,
+ (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
+
+
+ // The OpenCL kernel takes int as second to last argument because bool is
+ // not supported as a kernel argument type (sizeof(bool) is implementation defined).
+ const int computeFshift = static_cast<int>(stepWork.computeVirial);
+ if (useLjCombRule(nb->nbparam->vdwtype))
+ {
+ const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
+ &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
+ &adat->lj_comb,
+ &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
+ &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
+
+ launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ }
+ else
+ {
+ const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
+ &adat->ntypes,
+ &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
+ &adat->atom_types,
+ &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
+ &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
+ launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+ }
+
+ if (bDoTime)
+ {
+ t->interaction[iloc].nb_k.closeTimingRegion(stream);
+ }
+}
+
+
+/*! \brief Calculates the amount of shared memory required by the prune kernel.
+ *
+ * Note that for the sake of simplicity we use the CUDA terminology "shared memory"
+ * for OpenCL local memory.
+ *
+ * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd dimension.
+ * \returns the amount of local memory in bytes required by the pruning kernel
+ */
+static inline int calc_shmem_required_prune(const int num_threads_z)
+{
+ int shmem;
+
+ /* i-atom x in shared memory (for convenience we load all 4 components including q) */
+ shmem = c_numClPerSupercl * c_clSize * sizeof(float)*4;
+ /* cj in shared memory, for each warp separately
+ * Note: only need to load once per wavefront, but to keep the code simple,
+ * for now we load twice on AMD.
+ */
+ shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
+ /* Warp vote, requires one uint per warp/32 threads per block. */
+ shmem += sizeof(cl_uint) * 2*num_threads_z;
+
+ return shmem;
+}
+
+void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
+ const InteractionLocality iloc,
+ const int numParts)
+{
+ cl_atomdata_t *adat = nb->atdat;
+ cl_nbparam_t *nbp = nb->nbparam;
+ cl_plist_t *plist = nb->plist[iloc];
+ cl_timers_t *t = nb->timers;
+ cl_command_queue stream = nb->stream[iloc];
+ bool bDoTime = nb->bDoTime == CL_TRUE;
+
+ if (plist->haveFreshList)
+ {
+ GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
+
+ /* Set rollingPruningNumParts to signal that it is not set */
+ plist->rollingPruningNumParts = 0;
+ plist->rollingPruningPart = 0;
+ }
+ else
+ {
+ if (plist->rollingPruningNumParts == 0)
+ {
+ plist->rollingPruningNumParts = numParts;
+ }
+ else
+ {
+ GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
+ }
+ }
+
+ /* Use a local variable for part and update in plist, so we can return here
+ * without duplicating the part increment code.
+ */
+ int part = plist->rollingPruningPart;
+
+ plist->rollingPruningPart++;
+ if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
+ {
+ plist->rollingPruningPart = 0;
+ }
+
+ /* Compute the number of list entries to prune in this pass */
+ int numSciInPart = (plist->nsci - part)/numParts;
+
+ /* Don't launch the kernel if there is no work to do. */
+ if (numSciInPart <= 0)
+ {
+ plist->haveFreshList = false;
+
+ return;
+ }
+
+ GpuRegionTimer *timer = nullptr;
+ if (bDoTime)
+ {
+ timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ }
+
+ /* beginning of timed prune calculation section */
+ if (bDoTime)
+ {
+ timer->openTimingRegion(stream);
+ }
+
+ /* Kernel launch config:
+ * - The thread block dimensions match the size of i-clusters, j-clusters,
+ * and j-cluster concurrency, in x, y, and z, respectively.
+ * - The 1D block-grid contains as many blocks as super-clusters.
+ */
+ int num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
+
+ /* kernel launch config */
+ KernelLaunchConfig config;
+ config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
+ config.stream = stream;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = numSciInPart;
+
+ validate_global_work_size(config, 3, nb->dev_info);
+
+ if (debug)
+ {
+ fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+ "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %zu\n",
+ config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
+ c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
+ }
+
+ cl_nbparam_params_t nbparams_params;
+ fillin_ocl_structures(nbp, &nbparams_params);
+
+ auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
+ constexpr char kernelName[] = "k_pruneonly";
+ const auto pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
+ const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config,
+ &nbparams_params, &adat->xq, &adat->shift_vec,
+ &plist->sci, &plist->cj4, &plist->imask, &numParts, &part);
+ launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+
+ if (plist->haveFreshList)
+ {
+ plist->haveFreshList = false;
+ /* Mark that pruning has been done */
+ nb->timers->interaction[iloc].didPrune = true;
+ }
+ else
+ {
+ /* Mark that rolling pruning has been done */
+ nb->timers->interaction[iloc].didRollingPrune = true;
+ }
+
+ if (bDoTime)
+ {
+ timer->closeTimingRegion(stream);
+ }
+}
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
+ struct nbnxn_atomdata_t *nbatom,
+ const gmx::StepWorkload &stepWork,
+ const AtomLocality aloc,
+ const bool gmx_unused copyBackNbForce)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ cl_int gmx_unused cl_error;
+ int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+
+ /* determine interaction locality from atom locality */
+ const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
+
+ cl_atomdata_t *adat = nb->atdat;
+ cl_timers_t *t = nb->timers;
+ bool bDoTime = nb->bDoTime == CL_TRUE;
+ cl_command_queue stream = nb->stream[iloc];
+
+ /* don't launch non-local copy-back if there was no non-local work to do */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+ {
+ /* TODO An alternative way to signal that non-local work is
+ complete is to use a clEnqueueMarker+clEnqueueBarrier
+ pair. However, the use of bNonLocalStreamActive has the
+ advantage of being local to the host, so probably minimizes
+ overhead. Curiously, for NVIDIA OpenCL with an empty-domain
+ test case, overall simulation performance was higher with
+ the API calls, but this has not been tested on AMD OpenCL,
+ so could be worth considering in future. */
+ nb->bNonLocalStreamActive = CL_FALSE;
+ return;
+ }
+
+ getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
+
+ /* beginning of timed D2H section */
+ if (bDoTime)
+ {
+ t->xf[aloc].nb_d2h.openTimingRegion(stream);
+ }
+
+ /* With DD the local D2H transfer can only start after the non-local
+ has been launched. */
+ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+ {
+ sync_ocl_event(stream, &(nb->nonlocal_done));
+ }
+
+ /* DtoH f */
+ ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
+ (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+
+ /* kick off work */
+ cl_error = clFlush(stream);
+ assert(CL_SUCCESS == cl_error);
+
+ /* After the non-local D2H is launched the nonlocal_done event can be
+ recorded which signals that the local D2H can proceed. This event is not
+ placed after the non-local kernel because we first need the non-local
+ data back first. */
+ if (iloc == InteractionLocality::NonLocal)
+ {
+ cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
+ assert(CL_SUCCESS == cl_error);
+ nb->bNonLocalStreamActive = CL_TRUE;
+ }
+
+ /* only transfer energies in the local stream */
+ if (iloc == InteractionLocality::Local)
+ {
+ /* DtoH fshift when virial is needed */
+ if (stepWork.computeVirial)
+ {
+ ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
+ SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ }
+
+ /* DtoH energies */
+ if (stepWork.computeEnergy)
+ {
+ ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
+ sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+
+ ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
+ sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ }
+ }
+
+ if (bDoTime)
+ {
+ t->xf[aloc].nb_d2h.closeTimingRegion(stream);
+ }
+}
+
+
+/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
++int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t &ic)
+{
++ bool bTwinCut = (ic.rcoulomb != ic.rvdw);
+ bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+ int kernel_type;
+
+ /* Benchmarking/development environment variables to force the use of
+ analytical or tabulated Ewald kernel. */
+ bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
+ bForceTabulatedEwald = (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
+
+ if (bForceAnalyticalEwald && bForceTabulatedEwald)
+ {
+ gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
+ "requested through environment variables.");
+ }
+
+ /* OpenCL: By default, use analytical Ewald
+ * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
+ *
+ * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
+ *
+ */
+ /* By default use analytical Ewald. */
+ bUseAnalyticalEwald = true;
+ if (bForceAnalyticalEwald)
+ {
+ if (debug)
+ {
+ fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
+ }
+ }
+ else if (bForceTabulatedEwald)
+ {
+ bUseAnalyticalEwald = false;
+
+ if (debug)
+ {
+ fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
+ }
+ }
+
+ /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+ forces it (use it for debugging/benchmarking only). */
+ if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == nullptr))
+ {
+ kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
+ }
+ else
+ {
+ kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
+ }
+
+ return kernel_type;
+}
+
+} // namespace Nbnxm
--- /dev/null
- /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
- *gpu_eeltype = gpu_pick_ewald_kernel_type(false);
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief Define OpenCL implementation of nbnxm_gpu_data_mgmt.h
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ * \author Teemu Virolainen <teemu@streamcomputing.eu>
+ * \author Szilárd Páll <pall.szilard@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cmath>
+
+// TODO We would like to move this down, but the way gmx_nbnxn_gpu_t
+// is currently declared means this has to be before gpu_types.h
+#include "nbnxm_ocl_types.h"
+
+// TODO Remove this comment when the above order issue is resolved
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/hardware/gpu_hw_info.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/force_flags.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gpu_jit_support.h"
+#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
+#include "gromacs/nbnxm/pairlistsets.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "nbnxm_ocl_internal.h"
+
+namespace Nbnxm
+{
+
+/*! \brief Copies of values from cl_driver_diagnostics_intel.h,
+ * which isn't guaranteed to be available. */
+/**@{*/
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
+/**@}*/
+
+/*! \brief This parameter should be determined heuristically from the
+ * kernel execution times
+ *
+ * This value is best for small systems on a single AMD Radeon R9 290X
+ * (and about 5% faster than 40, which is the default for CUDA
+ * devices). Larger simulation systems were quite insensitive to the
+ * value of this parameter.
+ */
+static unsigned int gpu_min_ci_balanced_factor = 50;
+
+
+/*! \brief Returns true if LJ combination rules are used in the non-bonded kernels.
+ *
+ * Full doc in nbnxn_ocl_internal.h */
+bool useLjCombRule(int vdwType)
+{
+ return (vdwType == evdwOclCUTCOMBGEOM ||
+ vdwType == evdwOclCUTCOMBLB);
+}
+
+/*! \brief Tabulates the Ewald Coulomb force and initializes the size/scale
+ * and the table GPU array.
+ *
+ * If called with an already allocated table, it just re-uploads the
+ * table.
+ */
+static void init_ewald_coulomb_force_table(const EwaldCorrectionTables &tables,
+ cl_nbparam_t *nbp,
+ const gmx_device_runtime_data_t *runData)
+{
+ cl_mem coul_tab;
+
+ cl_int cl_error;
+
+ if (nbp->coulomb_tab_climg2d != nullptr)
+ {
+ freeDeviceBuffer(&(nbp->coulomb_tab_climg2d));
+ }
+
+ /* Switched from using textures to using buffers */
+ // TODO: decide which alternative is most efficient - textures or buffers.
+ /*
+ cl_image_format array_format;
+
+ array_format.image_channel_data_type = CL_FLOAT;
+ array_format.image_channel_order = CL_R;
+
+ coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ &array_format, tabsize, 1, 0, ftmp, &cl_error);
+ */
+
+ coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ tables.tableF.size()*sizeof(cl_float), const_cast<real *>(tables.tableF.data()), &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ nbp->coulomb_tab_climg2d = coul_tab;
+ nbp->coulomb_tab_scale = tables.scale;
+}
+
+
+/*! \brief Initializes the atomdata structure first time, it only gets filled at
+ pair-search.
+ */
+static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
+{
+ cl_int cl_error;
+
+ ad->ntypes = ntypes;
+
+ /* An element of the shift_vec device buffer has the same size as one element
+ of the host side shift_vec buffer. */
+ ad->shift_vec_elem_size = sizeof(*nbnxn_atomdata_t::shift_vec.data());
+
+ ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ SHIFTS * ad->shift_vec_elem_size, nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ ad->bShiftVecUploaded = CL_FALSE;
+
+ /* An element of the fshift device buffer has the same size as one element
+ of the host side fshift buffer. */
+ ad->fshift_elem_size = sizeof(*cl_nb_staging_t::fshift);
+
+ ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ SHIFTS * ad->fshift_elem_size, nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ sizeof(float), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ sizeof(float), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ /* initialize to nullptr pointers to data that is not allocated here and will
+ need reallocation in nbnxn_gpu_init_atomdata */
+ ad->xq = nullptr;
+ ad->f = nullptr;
+
+ /* size -1 indicates that the respective array hasn't been initialized yet */
+ ad->natoms = -1;
+ ad->nalloc = -1;
+}
+
+/*! \brief Copies all parameters related to the cut-off from ic to nbp
+ */
+static void set_cutoff_parameters(cl_nbparam_t *nbp,
+ const interaction_const_t *ic,
+ const PairlistParams &listParams)
+{
+ nbp->ewald_beta = ic->ewaldcoeff_q;
+ nbp->sh_ewald = ic->sh_ewald;
+ nbp->epsfac = ic->epsfac;
+ nbp->two_k_rf = 2.0 * ic->k_rf;
+ nbp->c_rf = ic->c_rf;
+ nbp->rvdw_sq = ic->rvdw * ic->rvdw;
+ nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+ nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
+ nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
+ nbp->useDynamicPruning = listParams.useDynamicPruning;
+
+ nbp->sh_lj_ewald = ic->sh_lj_ewald;
+ nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
+
+ nbp->rvdw_switch = ic->rvdw_switch;
+ nbp->dispersion_shift = ic->dispersion_shift;
+ nbp->repulsion_shift = ic->repulsion_shift;
+ nbp->vdw_switch = ic->vdw_switch;
+}
+
+/*! \brief Returns the kinds of electrostatics and Vdw OpenCL
+ * kernels that will be used.
+ *
+ * Respectively, these values are from enum eelOcl and enum
+ * evdwOcl. */
+static void
+map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
+ int combRule,
+ int *gpu_eeltype,
+ int *gpu_vdwtype)
+{
+ if (ic->vdwtype == evdwCUT)
+ {
+ switch (ic->vdw_modifier)
+ {
+ case eintmodNONE:
+ case eintmodPOTSHIFT:
+ switch (combRule)
+ {
+ case ljcrNONE:
+ *gpu_vdwtype = evdwOclCUT;
+ break;
+ case ljcrGEOM:
+ *gpu_vdwtype = evdwOclCUTCOMBGEOM;
+ break;
+ case ljcrLB:
+ *gpu_vdwtype = evdwOclCUTCOMBLB;
+ break;
+ default:
+ gmx_incons("The requested LJ combination rule is not implemented in the OpenCL GPU accelerated kernels!");
+ }
+ break;
+ case eintmodFORCESWITCH:
+ *gpu_vdwtype = evdwOclFSWITCH;
+ break;
+ case eintmodPOTSWITCH:
+ *gpu_vdwtype = evdwOclPSWITCH;
+ break;
+ default:
+ gmx_incons("The requested VdW interaction modifier is not implemented in the GPU accelerated kernels!");
+ }
+ }
+ else if (ic->vdwtype == evdwPME)
+ {
+ if (ic->ljpme_comb_rule == ljcrGEOM)
+ {
+ *gpu_vdwtype = evdwOclEWALDGEOM;
+ }
+ else
+ {
+ *gpu_vdwtype = evdwOclEWALDLB;
+ }
+ }
+ else
+ {
+ gmx_incons("The requested VdW type is not implemented in the GPU accelerated kernels!");
+ }
+
+ if (ic->eeltype == eelCUT)
+ {
+ *gpu_eeltype = eelOclCUT;
+ }
+ else if (EEL_RF(ic->eeltype))
+ {
+ *gpu_eeltype = eelOclRF;
+ }
+ else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
+ {
- nbp->eeltype = gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
++ *gpu_eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
+ }
+ else
+ {
+ /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+ gmx_incons("The requested electrostatics type is not implemented in the GPU accelerated kernels!");
+ }
+}
+
+/*! \brief Initializes the nonbonded parameter data structure.
+ */
+static void init_nbparam(cl_nbparam_t *nbp,
+ const interaction_const_t *ic,
+ const PairlistParams &listParams,
+ const nbnxn_atomdata_t::Params &nbatParams,
+ const gmx_device_runtime_data_t *runData)
+{
+ cl_int cl_error;
+
+ set_cutoff_parameters(nbp, ic, listParams);
+
+ map_interaction_types_to_gpu_kernel_flavors(ic,
+ nbatParams.comb_rule,
+ &(nbp->eeltype),
+ &(nbp->vdwtype));
+
+ if (ic->vdwtype == evdwPME)
+ {
+ if (ic->ljpme_comb_rule == ljcrGEOM)
+ {
+ GMX_ASSERT(nbatParams.comb_rule == ljcrGEOM, "Combination rule mismatch!");
+ }
+ else
+ {
+ GMX_ASSERT(nbatParams.comb_rule == ljcrLB, "Combination rule mismatch!");
+ }
+ }
+ /* generate table for PME */
+ nbp->coulomb_tab_climg2d = nullptr;
+ if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
+ {
+ GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, runData);
+ }
+ else
+ // TODO: improvement needed.
+ // The image2d is created here even if eeltype is not eelCuEWALD_TAB or eelCuEWALD_TAB_TWIN because the OpenCL kernels
+ // don't accept nullptr values for image2D parameters.
+ {
+ /* Switched from using textures to using buffers */
+ // TODO: decide which alternative is most efficient - textures or buffers.
+ /*
+ cl_image_format array_format;
+
+ array_format.image_channel_data_type = CL_FLOAT;
+ array_format.image_channel_order = CL_R;
+
+ nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
+ &array_format, 1, 1, 0, nullptr, &cl_error);
+ */
+
+ nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ }
+
+ const int nnbfp = 2*nbatParams.numTypes*nbatParams.numTypes;
+ const int nnbfp_comb = 2*nbatParams.numTypes;
+
+ {
+ /* Switched from using textures to using buffers */
+ // TODO: decide which alternative is most efficient - textures or buffers.
+ /*
+ cl_image_format array_format;
+
+ array_format.image_channel_data_type = CL_FLOAT;
+ array_format.image_channel_order = CL_R;
+
+ nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
+ */
+
+ nbp->nbfp_climg2d =
+ clCreateBuffer(runData->context,
+ CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ nnbfp*sizeof(cl_float),
+ const_cast<float *>(nbatParams.nbfp.data()),
+ &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ if (ic->vdwtype == evdwPME)
+ {
+ /* Switched from using textures to using buffers */
+ // TODO: decide which alternative is most efficient - textures or buffers.
+ /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
+ nbp->nbfp_comb_climg2d =
+ clCreateBuffer(runData->context,
+ CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ nnbfp_comb*sizeof(cl_float),
+ const_cast<float *>(nbatParams.nbfp_comb.data()),
+ &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ }
+ else
+ {
+ // TODO: improvement needed.
+ // The image2d is created here even if vdwtype is not evdwPME because the OpenCL kernels
+ // don't accept nullptr values for image2D parameters.
+ /* Switched from using textures to using buffers */
+ // TODO: decide which alternative is most efficient - textures or buffers.
+ /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
+ &array_format, 1, 1, 0, nullptr, &cl_error);*/
+ nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ }
+ }
+}
+
+//! This function is documented in the header file
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
+ const interaction_const_t *ic)
+{
+ if (!nbv || !nbv->useGpu())
+ {
+ return;
+ }
+ gmx_nbnxn_ocl_t *nb = nbv->gpu_nbv;
+ cl_nbparam_t *nbp = nb->nbparam;
+
+ set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
+
++ nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
+
+ GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, nb->dev_rundata);
+}
+
+/*! \brief Initializes the pair list data structure.
+ */
+static void init_plist(cl_plist_t *pl)
+{
+ /* initialize to nullptr pointers to data that is not allocated here and will
+ need reallocation in nbnxn_gpu_init_pairlist */
+ pl->sci = nullptr;
+ pl->cj4 = nullptr;
+ pl->imask = nullptr;
+ pl->excl = nullptr;
+
+ /* size -1 indicates that the respective array hasn't been initialized yet */
+ pl->na_c = -1;
+ pl->nsci = -1;
+ pl->sci_nalloc = -1;
+ pl->ncj4 = -1;
+ pl->cj4_nalloc = -1;
+ pl->nimask = -1;
+ pl->imask_nalloc = -1;
+ pl->nexcl = -1;
+ pl->excl_nalloc = -1;
+ pl->haveFreshList = false;
+}
+
+/*! \brief Initializes the timings data structure.
+ */
+static void init_timings(gmx_wallclock_gpu_nbnxn_t *t)
+{
+ int i, j;
+
+ t->nb_h2d_t = 0.0;
+ t->nb_d2h_t = 0.0;
+ t->nb_c = 0;
+ t->pl_h2d_t = 0.0;
+ t->pl_h2d_c = 0;
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t->ktime[i][j].t = 0.0;
+ t->ktime[i][j].c = 0;
+ }
+ }
+
+ t->pruneTime.c = 0;
+ t->pruneTime.t = 0.0;
+ t->dynamicPruneTime.c = 0;
+ t->dynamicPruneTime.t = 0.0;
+}
+
+
+//! OpenCL notification callback function
+static void CL_CALLBACK
+ocl_notify_fn(const char *pErrInfo, const void gmx_unused *private_info, size_t gmx_unused cb, void gmx_unused *user_data)
+{
+ if (pErrInfo != nullptr)
+ {
+ printf("%s\n", pErrInfo ); // Print error/hint
+ }
+}
+
+/*! \brief Creates context for OpenCL GPU given by \p mygpu
+ *
+ * A fatal error results if creation fails.
+ *
+ * \param[inout] runtimeData runtime data including program and context
+ * \param[in] devInfo device info struct
+ * \param[in] rank MPI rank (for error reporting)
+ */
+static void
+nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
+ const gmx_device_info_t *devInfo,
+ int rank)
+{
+ cl_context_properties context_properties[5];
+ cl_platform_id platform_id;
+ cl_device_id device_id;
+ cl_context context;
+ cl_int cl_error;
+
+ assert(runtimeData != nullptr);
+ assert(devInfo != nullptr);
+
+ platform_id = devInfo->ocl_gpu_id.ocl_platform_id;
+ device_id = devInfo->ocl_gpu_id.ocl_device_id;
+
+ int i = 0;
+ context_properties[i++] = CL_CONTEXT_PLATFORM;
+ context_properties[i++] = reinterpret_cast<cl_context_properties>(platform_id);
+ if (getenv("GMX_OCL_SHOW_DIAGNOSTICS"))
+ {
+ context_properties[i++] = CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL;
+ context_properties[i++] = CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL;
+ }
+ context_properties[i++] = 0; /* Terminates the list of properties */
+
+ context = clCreateContext(context_properties, 1, &device_id, ocl_notify_fn, nullptr, &cl_error);
+ if (CL_SUCCESS != cl_error)
+ {
+ gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
+ rank,
+ devInfo->device_name,
+ cl_error, ocl_get_error_string(cl_error).c_str());
+ }
+
+ runtimeData->context = context;
+}
+
+/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
+static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
+ const char *kernel_name)
+{
+ cl_kernel kernel;
+ cl_int cl_error;
+
+ kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
+ if (CL_SUCCESS != cl_error)
+ {
+ gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
+ kernel_name,
+ nb->dev_info->device_name,
+ cl_error);
+ }
+
+ return kernel;
+}
+
+/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU.
+ */
+static void
+nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb)
+{
+
+ cl_int cl_error;
+ cl_atomdata_t * adat = nb->atdat;
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
+
+ size_t local_work_size[3] = {1, 1, 1};
+ size_t global_work_size[3] = {1, 1, 1};
+
+ cl_int shifts = SHIFTS*3;
+
+ cl_int arg_no;
+
+ cl_kernel zero_e_fshift = nb->kernel_zero_e_fshift;
+
+ local_work_size[0] = 64;
+ // Round the total number of threads up from the array size
+ global_work_size[0] = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0];
+
+ arg_no = 0;
+ cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift));
+ cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj));
+ cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el));
+ cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts);
+ GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
+
+ cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr);
+ GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str());
+}
+
+/*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
+static void nbnxn_gpu_init_kernels(gmx_nbnxn_ocl_t *nb)
+{
+ /* Init to 0 main kernel arrays */
+ /* They will be later on initialized in select_nbnxn_kernel */
+ // TODO: consider always creating all variants of the kernels here so that there is no
+ // need for late call to clCreateKernel -- if that gives any advantage?
+ memset(nb->kernel_ener_noprune_ptr, 0, sizeof(nb->kernel_ener_noprune_ptr));
+ memset(nb->kernel_ener_prune_ptr, 0, sizeof(nb->kernel_ener_prune_ptr));
+ memset(nb->kernel_noener_noprune_ptr, 0, sizeof(nb->kernel_noener_noprune_ptr));
+ memset(nb->kernel_noener_prune_ptr, 0, sizeof(nb->kernel_noener_prune_ptr));
+
+ /* Init pruning kernels
+ *
+ * TODO: we could avoid creating kernels if dynamic pruning is turned off,
+ * but ATM that depends on force flags not passed into the initialization.
+ */
+ nb->kernel_pruneonly[epruneFirst] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_opencl");
+ nb->kernel_pruneonly[epruneRolling] = nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
+
+ /* Init auxiliary kernels */
+ nb->kernel_zero_e_fshift = nbnxn_gpu_create_kernel(nb, "zero_e_fshift");
+}
+
+/*! \brief Initializes simulation constant data.
+ *
+ * Initializes members of the atomdata and nbparam structs and
+ * clears e/fshift output buffers.
+ */
+static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t *nb,
+ const interaction_const_t *ic,
+ const PairlistParams &listParams,
+ const nbnxn_atomdata_t::Params &nbatParams)
+{
+ init_atomdata_first(nb->atdat, nbatParams.numTypes, nb->dev_rundata);
+ init_nbparam(nb->nbparam, ic, listParams, nbatParams, nb->dev_rundata);
+}
+
+
+//! This function is documented in the header file
+gmx_nbnxn_ocl_t *
+gpu_init(const gmx_device_info_t *deviceInfo,
+ const interaction_const_t *ic,
+ const PairlistParams &listParams,
+ const nbnxn_atomdata_t *nbat,
+ const int rank,
+ const gmx_bool bLocalAndNonlocal)
+{
+ gmx_nbnxn_ocl_t *nb;
+ cl_int cl_error;
+ cl_command_queue_properties queue_properties;
+
+ assert(ic);
+
+ snew(nb, 1);
+ snew(nb->atdat, 1);
+ snew(nb->nbparam, 1);
+ snew(nb->plist[InteractionLocality::Local], 1);
+ if (bLocalAndNonlocal)
+ {
+ snew(nb->plist[InteractionLocality::NonLocal], 1);
+ }
+
+ nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal);
+
+ nb->timers = new cl_timers_t();
+ snew(nb->timings, 1);
+
+ /* set device info, just point it to the right GPU among the detected ones */
+ nb->dev_info = deviceInfo;
+ snew(nb->dev_rundata, 1);
+
+ /* init nbst */
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
+
+ init_plist(nb->plist[InteractionLocality::Local]);
+
+ /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
+ nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+
+ /* Create queues only after bDoTime has been initialized */
+ if (nb->bDoTime)
+ {
+ queue_properties = CL_QUEUE_PROFILING_ENABLE;
+ }
+ else
+ {
+ queue_properties = 0;
+ }
+
+ nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
+
+ /* local/non-local GPU streams */
+ nb->stream[InteractionLocality::Local] =
+ clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+ if (CL_SUCCESS != cl_error)
+ {
+ gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+ rank,
+ nb->dev_info->device_name,
+ cl_error);
+ }
+
+ if (nb->bUseTwoStreams)
+ {
+ init_plist(nb->plist[InteractionLocality::NonLocal]);
+
+ nb->stream[InteractionLocality::NonLocal] =
+ clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+ if (CL_SUCCESS != cl_error)
+ {
+ gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+ rank,
+ nb->dev_info->device_name,
+ cl_error);
+ }
+ }
+
+ if (nb->bDoTime)
+ {
+ init_timings(nb->timings);
+ }
+
+ nbnxn_ocl_init_const(nb, ic, listParams, nbat->params());
+
+ /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
+ * TODO: decide about NVIDIA
+ */
+ nb->bPrefetchLjParam =
+ (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr) &&
+ ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (nb->dev_info->vendor_e == OCL_VENDOR_INTEL)
+ || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr));
+
+ /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here,
+ * but sadly this is not supported in OpenCL (yet?). Consider adding it if
+ * it becomes supported.
+ */
+ nbnxn_gpu_compile_kernels(nb);
+ nbnxn_gpu_init_kernels(nb);
+
+ /* clear energy and shift force outputs */
+ nbnxn_ocl_clear_e_fshift(nb);
+
+ if (debug)
+ {
+ fprintf(debug, "Initialized OpenCL data structures.\n");
+ }
+
+ return nb;
+}
+
+/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array.
+ */
+static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear)
+{
+ if (natoms_clear == 0)
+ {
+ return;
+ }
+
+ cl_int gmx_used_in_debug cl_error;
+
+ cl_atomdata_t *atomData = nb->atdat;
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
+ cl_float value = 0.0F;
+
+ cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float),
+ 0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clEnqueueFillBuffer failed: " +
+ ocl_get_error_string(cl_error)).c_str());
+}
+
+//! This function is documented in the header file
+void gpu_clear_outputs(gmx_nbnxn_ocl_t *nb,
+ bool computeVirial)
+{
+ nbnxn_ocl_clear_f(nb, nb->atdat->natoms);
+ /* clear shift force array and energies if the outputs were
+ used in the current step */
+ if (computeVirial)
+ {
+ nbnxn_ocl_clear_e_fshift(nb);
+ }
+
+ /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
+ cl_int gmx_unused cl_error;
+ cl_error = clFlush(nb->stream[InteractionLocality::Local]);
+ assert(CL_SUCCESS == cl_error);
+}
+
+//! This function is documented in the header file
+void gpu_init_pairlist(gmx_nbnxn_ocl_t *nb,
+ const NbnxnPairlistGpu *h_plist,
+ const InteractionLocality iloc)
+{
+ char sbuf[STRLEN];
+ // Timing accumulation should happen only if there was work to do
+ // because getLastRangeTime() gets skipped with empty lists later
+ // which leads to the counter not being reset.
+ bool bDoTime = ((nb->bDoTime == CL_TRUE) && !h_plist->sci.empty());
+ cl_command_queue stream = nb->stream[iloc];
+ cl_plist_t *d_plist = nb->plist[iloc];
+
+ if (d_plist->na_c < 0)
+ {
+ d_plist->na_c = h_plist->na_ci;
+ }
+ else
+ {
+ if (d_plist->na_c != h_plist->na_ci)
+ {
+ sprintf(sbuf, "In cu_init_plist: the #atoms per cell has changed (from %d to %d)",
+ d_plist->na_c, h_plist->na_ci);
+ gmx_incons(sbuf);
+ }
+ }
+
+ gpu_timers_t::Interaction &iTimers = nb->timers->interaction[iloc];
+
+ if (bDoTime)
+ {
+ iTimers.pl_h2d.openTimingRegion(stream);
+ iTimers.didPairlistH2D = true;
+ }
+
+ // TODO most of this function is same in CUDA and OpenCL, move into the header
+ Context context = nb->dev_rundata->context;
+
+ reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(),
+ &d_plist->nsci, &d_plist->sci_nalloc, context);
+ copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(),
+ stream, GpuApiCallBehavior::Async,
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+
+ reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(),
+ &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+ copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(),
+ stream, GpuApiCallBehavior::Async,
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+
+ reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size()*c_nbnxnGpuClusterpairSplit,
+ &d_plist->nimask, &d_plist->imask_nalloc, context);
+
+ reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(),
+ &d_plist->nexcl, &d_plist->excl_nalloc, context);
+ copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(),
+ stream, GpuApiCallBehavior::Async,
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ iTimers.pl_h2d.closeTimingRegion(stream);
+ }
+
+ /* need to prune the pair list during the next step */
+ d_plist->haveFreshList = true;
+}
+
+//! This function is documented in the header file
+void gpu_upload_shiftvec(gmx_nbnxn_ocl_t *nb,
+ const nbnxn_atomdata_t *nbatom)
+{
+ cl_atomdata_t *adat = nb->atdat;
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
+
+ /* only if we have a dynamic box */
+ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
+ {
+ ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
+ SHIFTS * adat->shift_vec_elem_size, ls, nullptr);
+ adat->bShiftVecUploaded = CL_TRUE;
+ }
+}
+
+//! This function is documented in the header file
+void gpu_init_atomdata(gmx_nbnxn_ocl_t *nb,
+ const nbnxn_atomdata_t *nbat)
+{
+ cl_int cl_error;
+ int nalloc, natoms;
+ bool realloced;
+ bool bDoTime = nb->bDoTime == CL_TRUE;
+ cl_timers_t *timers = nb->timers;
+ cl_atomdata_t *d_atdat = nb->atdat;
+ cl_command_queue ls = nb->stream[InteractionLocality::Local];
+
+ natoms = nbat->numAtoms();
+ realloced = false;
+
+ if (bDoTime)
+ {
+ /* time async copy */
+ timers->atdat.openTimingRegion(ls);
+ }
+
+ /* need to reallocate if we have to copy more atoms than the amount of space
+ available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
+ if (natoms > d_atdat->nalloc)
+ {
+ nalloc = over_alloc_small(natoms);
+
+ /* free up first if the arrays have already been initialized */
+ if (d_atdat->nalloc != -1)
+ {
+ freeDeviceBuffer(&d_atdat->f);
+ freeDeviceBuffer(&d_atdat->xq);
+ freeDeviceBuffer(&d_atdat->lj_comb);
+ freeDeviceBuffer(&d_atdat->atom_types);
+ }
+
+ d_atdat->f_elem_size = sizeof(rvec);
+
+ d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+ nalloc * d_atdat->f_elem_size, nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ nalloc * sizeof(cl_float4), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ if (useLjCombRule(nb->nbparam->vdwtype))
+ {
+ d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ nalloc * sizeof(cl_float2), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ }
+ else
+ {
+ d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+ nalloc * sizeof(int), nullptr, &cl_error);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+ }
+
+ d_atdat->nalloc = nalloc;
+ realloced = true;
+ }
+
+ d_atdat->natoms = natoms;
+ d_atdat->natoms_local = nbat->natoms_local;
+
+ /* need to clear GPU f output if realloc happened */
+ if (realloced)
+ {
+ nbnxn_ocl_clear_f(nb, nalloc);
+ }
+
+ if (useLjCombRule(nb->nbparam->vdwtype))
+ {
+ ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0,
+ natoms*sizeof(cl_float2), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ }
+ else
+ {
+ ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0,
+ natoms*sizeof(int), ls, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+
+ }
+
+ if (bDoTime)
+ {
+ timers->atdat.closeTimingRegion(ls);
+ }
+
+ /* kick off the tasks enqueued above to ensure concurrency with the search */
+ cl_error = clFlush(ls);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+ ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
+}
+
+/*! \brief Releases an OpenCL kernel pointer */
+static void free_kernel(cl_kernel *kernel_ptr)
+{
+ cl_int gmx_unused cl_error;
+
+ assert(nullptr != kernel_ptr);
+
+ if (*kernel_ptr)
+ {
+ cl_error = clReleaseKernel(*kernel_ptr);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseKernel failed: " + ocl_get_error_string(cl_error)).c_str());
+
+ *kernel_ptr = nullptr;
+ }
+}
+
+/*! \brief Releases a list of OpenCL kernel pointers */
+static void free_kernels(cl_kernel *kernels, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++)
+ {
+ free_kernel(kernels + i);
+ }
+}
+
+/*! \brief Free the OpenCL runtime data (context and program).
+ *
+ * The function releases the OpenCL context and program assuciated with the
+ * device that the calling PP rank is running on.
+ *
+ * \param runData [in] porinter to the structure with runtime data.
+ */
+static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
+{
+ if (runData == nullptr)
+ {
+ return;
+ }
+
+ cl_int gmx_unused cl_error;
+
+ if (runData->context)
+ {
+ cl_error = clReleaseContext(runData->context);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
+ runData->context = nullptr;
+ }
+
+ if (runData->program)
+ {
+ cl_error = clReleaseProgram(runData->program);
+ GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
+ runData->program = nullptr;
+ }
+
+}
+
+//! This function is documented in the header file
+void gpu_free(gmx_nbnxn_ocl_t *nb)
+{
+ if (nb == nullptr)
+ {
+ return;
+ }
+
+ /* Free kernels */
+ int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
+ free_kernels(nb->kernel_ener_noprune_ptr[0], kernel_count);
+
+ kernel_count = sizeof(nb->kernel_ener_prune_ptr) / sizeof(nb->kernel_ener_prune_ptr[0][0]);
+ free_kernels(nb->kernel_ener_prune_ptr[0], kernel_count);
+
+ kernel_count = sizeof(nb->kernel_noener_noprune_ptr) / sizeof(nb->kernel_noener_noprune_ptr[0][0]);
+ free_kernels(nb->kernel_noener_noprune_ptr[0], kernel_count);
+
+ kernel_count = sizeof(nb->kernel_noener_prune_ptr) / sizeof(nb->kernel_noener_prune_ptr[0][0]);
+ free_kernels(nb->kernel_noener_prune_ptr[0], kernel_count);
+
+ free_kernel(&(nb->kernel_zero_e_fshift));
+
+ /* Free atdat */
+ freeDeviceBuffer(&(nb->atdat->xq));
+ freeDeviceBuffer(&(nb->atdat->f));
+ freeDeviceBuffer(&(nb->atdat->e_lj));
+ freeDeviceBuffer(&(nb->atdat->e_el));
+ freeDeviceBuffer(&(nb->atdat->fshift));
+ freeDeviceBuffer(&(nb->atdat->lj_comb));
+ freeDeviceBuffer(&(nb->atdat->atom_types));
+ freeDeviceBuffer(&(nb->atdat->shift_vec));
+ sfree(nb->atdat);
+
+ /* Free nbparam */
+ freeDeviceBuffer(&(nb->nbparam->nbfp_climg2d));
+ freeDeviceBuffer(&(nb->nbparam->nbfp_comb_climg2d));
+ freeDeviceBuffer(&(nb->nbparam->coulomb_tab_climg2d));
+ sfree(nb->nbparam);
+
+ /* Free plist */
+ auto *plist = nb->plist[InteractionLocality::Local];
+ freeDeviceBuffer(&plist->sci);
+ freeDeviceBuffer(&plist->cj4);
+ freeDeviceBuffer(&plist->imask);
+ freeDeviceBuffer(&plist->excl);
+ sfree(plist);
+ if (nb->bUseTwoStreams)
+ {
+ auto *plist_nl = nb->plist[InteractionLocality::NonLocal];
+ freeDeviceBuffer(&plist_nl->sci);
+ freeDeviceBuffer(&plist_nl->cj4);
+ freeDeviceBuffer(&plist_nl->imask);
+ freeDeviceBuffer(&plist_nl->excl);
+ sfree(plist_nl);
+ }
+
+ /* Free nbst */
+ pfree(nb->nbst.e_lj);
+ nb->nbst.e_lj = nullptr;
+
+ pfree(nb->nbst.e_el);
+ nb->nbst.e_el = nullptr;
+
+ pfree(nb->nbst.fshift);
+ nb->nbst.fshift = nullptr;
+
+ /* Free command queues */
+ clReleaseCommandQueue(nb->stream[InteractionLocality::Local]);
+ nb->stream[InteractionLocality::Local] = nullptr;
+ if (nb->bUseTwoStreams)
+ {
+ clReleaseCommandQueue(nb->stream[InteractionLocality::NonLocal]);
+ nb->stream[InteractionLocality::NonLocal] = nullptr;
+ }
+ /* Free other events */
+ if (nb->nonlocal_done)
+ {
+ clReleaseEvent(nb->nonlocal_done);
+ nb->nonlocal_done = nullptr;
+ }
+ if (nb->misc_ops_and_local_H2D_done)
+ {
+ clReleaseEvent(nb->misc_ops_and_local_H2D_done);
+ nb->misc_ops_and_local_H2D_done = nullptr;
+ }
+
+ free_gpu_device_runtime_data(nb->dev_rundata);
+ sfree(nb->dev_rundata);
+
+ /* Free timers and timings */
+ delete nb->timers;
+ sfree(nb->timings);
+ sfree(nb);
+
+ if (debug)
+ {
+ fprintf(debug, "Cleaned up OpenCL data structures.\n");
+ }
+}
+
+//! This function is documented in the header file
+gmx_wallclock_gpu_nbnxn_t *gpu_get_timings(gmx_nbnxn_ocl_t *nb)
+{
+ return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
+}
+
+//! This function is documented in the header file
+void gpu_reset_timings(nonbonded_verlet_t* nbv)
+{
+ if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
+ {
+ init_timings(nbv->gpu_nbv->timings);
+ }
+}
+
+//! This function is documented in the header file
+int gpu_min_ci_balanced(gmx_nbnxn_ocl_t *nb)
+{
+ return nb != nullptr ?
+ gpu_min_ci_balanced_factor * nb->dev_info->compute_units : 0;
+}
+
+//! This function is documented in the header file
+gmx_bool gpu_is_kernel_ewald_analytical(const gmx_nbnxn_ocl_t *nb)
+{
+ return ((nb->nbparam->eeltype == eelOclEWALD_ANA) ||
+ (nb->nbparam->eeltype == eelOclEWALD_ANA_TWIN));
+}
+
+} // namespace Nbnxm