Fused GPU bonded kernels
authorMagnus Lundborg <lundborg.magnus@gmail.com>
Thu, 23 May 2019 14:39:50 +0000 (16:39 +0200)
committerArtem Zhmurov <zhmurov@gmail.com>
Fri, 28 Jun 2019 10:37:24 +0000 (12:37 +0200)
To reduce the GPU kernel launch times the
GPU bonded kernels have been fused to a single kernel.

Refs #2818

Change-Id: I2299e1857df5db469739e3aefbc0f771968a6bd5

docs/release-notes/2020/major/performance.rst
src/gromacs/listed_forces/gpubonded.h
src/gromacs/listed_forces/gpubonded_impl.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/listed_forces/gpubondedkernels.cu
src/gromacs/mdlib/sim_util.cpp

index b7ad1cd9317a5b628a0ba8784fd53e2bc02d64cd..9d8b32ef64584c19a8d34d395398c33ae7175770 100644 (file)
@@ -16,3 +16,10 @@ if the (often noisy) FFTW auto-tuner picks an AVX512 kernel in a run that otherw
 only uses AVX/AVX2 which could run at higher CPU clocks without AVX512 clock speed limitation.
 Now AVX512 is only used for the internal FFTW if GROMACS is also configured with
 the same SIMD flavor.
+
+Bonded kernels on GPU have been fused
+"""""""""""""""""""""""""""""""""""""
+
+Instead of launching one GPU kernel for each listed interaction type there is now one
+GPU kernel that handles all listed interactions. This improves the performance when
+running bonded calculations on a GPU.
index 442e42041febc5cd40f1585bcf1a6d96bbfa4b55..3f0f5eba3f3b571be19c7f2aff8f3b2f4cb7c0c5 100644 (file)
@@ -60,12 +60,12 @@ struct t_forcerec;
 struct t_idef;
 struct t_inputrec;
 
+/*! \brief The number on bonded function types supported on GPUs */
+static constexpr int nFtypesOnGpu = 8;
+
 namespace gmx
 {
 
-/*! \brief The number on bonded function types supported on GPUs */
-constexpr int c_numFtypesOnGpu = 8;
-
 /*! \brief List of all bonded function types supported on GPUs
  *
  * \note This list should be in sync with the actual GPU code.
@@ -73,7 +73,7 @@ constexpr int c_numFtypesOnGpu = 8;
  * \note The function types in the list are ordered on increasing value.
  * \note Currently bonded are only supported with CUDA, not with OpenCL.
  */
-constexpr std::array<int, c_numFtypesOnGpu> ftypesOnGpu =
+constexpr std::array<int, nFtypesOnGpu> ftypesOnGpu =
 {
     F_BONDS,
     F_ANGLES,
@@ -131,10 +131,10 @@ class GpuBonded
         /*! \brief Returns whether there are bonded interactions
          * assigned to the GPU */
         bool haveInteractions() const;
-        /*! \brief Launches bonded kernels on a GPU */
-        void launchKernels(const t_forcerec *fr,
-                           int               forceFlags,
-                           const matrix      box);
+        /*! \brief Launches bonded kernel on a GPU */
+        void launchKernel(const t_forcerec *fr,
+                          int               forceFlags,
+                          const matrix      box);
         /*! \brief Launches the transfer of computed bonded energies. */
         void launchEnergyTransfer();
         /*! \brief Waits on the energy transfer, and accumulates bonded energies to \c enerd. */
index 77c3256510565b5bc93c743f34e32d834c49265d..dc816facc3865503a46039ad13989691aebe5a44 100644 (file)
@@ -192,9 +192,9 @@ GpuBonded::haveInteractions() const
 }
 
 void
-GpuBonded::launchKernels(const t_forcerec * /* fr */,
-                         int            /* forceFlags */,
-                         const matrix   /* box */)
+GpuBonded::launchKernel(const t_forcerec * /* fr */,
+                        int            /* forceFlags */,
+                        const matrix   /* box */)
 {
 }
 
index 40f2c6dd186b906c271ce339d2e89ce261bb375e..050efc0f2d753e8f7e88c0365da74ea506e32d5d 100644 (file)
@@ -39,6 +39,7 @@
  * \author Berk Hess <hess@kth.se>
  * \author Szilárd Páll <pall.szilard@gmail.com>
  * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Magnus Lundborg <lundborg.magnus@gmail.com>
  *
  * \ingroup module_listed_forces
  */
@@ -47,6 +48,7 @@
 
 #include "gpubonded_impl.h"
 
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpu_vec.cuh"
@@ -86,6 +88,18 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
         iListsDevice[ftype].iatoms = nullptr;
         iListsDevice[ftype].nalloc = 0;
     }
+
+    kernelParams_.forceparamsDevice      = forceparamsDevice;
+    kernelParams_.xqDevice               = xqDevice;
+    kernelParams_.forceDevice            = forceDevice;
+    kernelParams_.fshiftDevice           = fshiftDevice;
+    kernelParams_.vtotDevice             = vtotDevice;
+    for (int i = 0; i < nFtypesOnGpu; i++)
+    {
+        kernelParams_.iatoms[i]          = nullptr;
+        kernelParams_.ftypeRangeStart[i] = 0;
+        kernelParams_.ftypeRangeEnd[i]   = -1;
+    }
 }
 
 GpuBonded::Impl::~Impl()
@@ -136,6 +150,20 @@ static void convertIlistToNbnxnOrder(const t_ilist       &src,
     }
 }
 
+//! Returns \p input rounded up to the closest multiple of \p factor.
+static inline int roundUpToFactor(const int input, const int factor)
+{
+    GMX_ASSERT(factor > 0, "The factor to round up to must be > 0.");
+
+    int remainder = input % factor;
+
+    if (remainder == 0)
+    {
+        return (input);
+    }
+    return (input + (factor - remainder));
+}
+
 // TODO Consider whether this function should be a factory method that
 // makes an object that is the only one capable of the device
 // operations needed for the lifetime of an interaction list. This
@@ -143,7 +171,12 @@ static void convertIlistToNbnxnOrder(const t_ilist       &src,
 // of naming this method for the problem of what to name the
 // BondedDeviceInteractionListHandler type.
 
-//! Divides bonded interactions over threads and GPU
+/*! Divides bonded interactions over threads and GPU.
+ *  The bonded interactions are assigned by interaction type to GPU threads. The intereaction
+ *  types are assigned in blocks sized as <warp_size>. The beginning and end (thread index) of each
+ *  interaction type are stored in kernelParams_. Pointers to the relevant data structures on the
+ *  GPU are also stored in kernelParams_.
+ */
 void
 GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                         const t_idef        &idef,
@@ -153,6 +186,7 @@ GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbn
 {
     // TODO wallcycle sub start
     haveInteractions_ = false;
+    int ftypesCounter = 0;
 
     for (int ftype : ftypesOnGpu)
     {
@@ -189,11 +223,39 @@ GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbn
                                0, iList.size(),
                                stream, GpuApiCallBehavior::Async, nullptr);
         }
+        kernelParams_.ftypesOnGpu[ftypesCounter]   = ftype;
+        kernelParams_.nrFTypeIAtoms[ftypesCounter] = iList.size();
+        int nBonds = iList.size() / (interaction_function[ftype].nratoms + 1);
+        kernelParams_.nrFTypeBonds[ftypesCounter]  = nBonds;
+        kernelParams_.iatoms[ftypesCounter]        = iListsDevice[ftype].iatoms;
+        if (ftypesCounter == 0)
+        {
+            kernelParams_.ftypeRangeStart[ftypesCounter] = 0;
+        }
+        else
+        {
+            kernelParams_.ftypeRangeStart[ftypesCounter] = kernelParams_.ftypeRangeEnd[ftypesCounter - 1] + 1;
+        }
+        kernelParams_.ftypeRangeEnd[ftypesCounter] = kernelParams_.ftypeRangeStart[ftypesCounter] + roundUpToFactor(nBonds, warp_size) - 1;
+
+        GMX_ASSERT(nBonds > 0 || kernelParams_.ftypeRangeEnd[ftypesCounter] <= kernelParams_.ftypeRangeStart[ftypesCounter],
+                   "Invalid GPU listed forces setup. nBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
+        GMX_ASSERT(kernelParams_.ftypeRangeStart[ftypesCounter] % warp_size == 0 && (kernelParams_.ftypeRangeEnd[ftypesCounter] + 1) % warp_size == 0,
+                   "The bonded interactions must be assigned to the GPU in blocks of warp size.");
+
+        ftypesCounter++;
     }
 
     xqDevice     = static_cast<float4 *>(xqDevicePtr);
     forceDevice  = static_cast<fvec *>(forceDevicePtr);
     fshiftDevice = static_cast<fvec *>(fshiftDevicePtr);
+
+    kernelParams_.xqDevice          = xqDevice;
+    kernelParams_.forceDevice       = forceDevice;
+    kernelParams_.fshiftDevice      = fshiftDevice;
+    kernelParams_.forceparamsDevice = forceparamsDevice;
+    kernelParams_.vtotDevice        = vtotDevice;
+
     // TODO wallcycle sub stop
 }
 
index 79c0ad157e4bf3027624a93c197951187c6369e8..0cfc429f4467898a7676aca934b1a98571a550d9 100644 (file)
@@ -52,6 +52,7 @@
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/pbcutil/pbc_aiuc.h"
 #include "gromacs/topology/idef.h"
 
 struct gmx_ffparams_t;
@@ -73,6 +74,57 @@ struct HostInteractionList
     HostVector<int> iatoms = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)};
 };
 
+/* \brief Bonded parameters and GPU pointers
+ *
+ * This is used to accumulate all the parameters and pointers so they can be passed
+ * to the GPU as a single structure.
+ *
+ */
+struct BondedCudaKernelParameters
+{
+    //! Periodic boundary data
+    PbcAiuc             pbcAiuc;
+    //! Scale factor
+    float               scaleFactor;
+    //! The bonded types on GPU
+    int                 ftypesOnGpu[nFtypesOnGpu];
+    //! The number of interaction atom (iatom) elements for every function type
+    int                 nrFTypeIAtoms[nFtypesOnGpu];
+    //! The number of bonds for every function type
+    int                 nrFTypeBonds[nFtypesOnGpu];
+    //! The start index in the range of each interaction type
+    int                 ftypeRangeStart[nFtypesOnGpu];
+    //! The end index in the range of each interaction type
+    int                 ftypeRangeEnd[nFtypesOnGpu];
+
+    //! Force parameters (on GPU)
+    t_iparams          *forceparamsDevice;
+    //! Coordinates before the timestep (on GPU)
+    const float4       *xqDevice;
+    //! Forces on atoms (on GPU)
+    fvec               *forceDevice;
+    //! Force shifts on atoms (on GPU)
+    fvec               *fshiftDevice;
+    //! Total Energy (on GPU)
+    float              *vtotDevice;
+    //! Interaction list atoms (on GPU)
+    t_iatom            *iatoms[nFtypesOnGpu];
+
+    BondedCudaKernelParameters()
+    {
+        matrix boxDummy = { {0, 0, 0}, {0, 0, 0}, {0, 0, 0} };
+
+        setPbcAiuc(0, boxDummy, &pbcAiuc);
+
+        scaleFactor       = 1.0;
+        forceparamsDevice = nullptr;
+        xqDevice          = nullptr;
+        forceDevice       = nullptr;
+        fshiftDevice      = nullptr;
+        vtotDevice        = nullptr;
+    }
+};
+
 /*! \internal \brief Implements GPU bondeds */
 class GpuBonded::Impl
 {
@@ -96,11 +148,11 @@ class GpuBonded::Impl
                                                     void                *forceDevice,
                                                     void                *fshiftDevice);
 
-        /*! \brief Launches bonded kernels on a GPU */
+        /*! \brief Launches bonded kernel on a GPU */
         template <bool calcVir, bool calcEner>
         void
-        launchKernels(const t_forcerec *fr,
-                      const matrix      box);
+        launchKernel(const t_forcerec *fr,
+                     const matrix      box);
         /*! \brief Returns whether there are bonded interactions
          * assigned to the GPU */
         bool haveInteractions() const;
@@ -136,6 +188,9 @@ class GpuBonded::Impl
 
         //! \brief Bonded GPU stream, not owned by this module
         CommandStream         stream;
+
+        //! Parameters and pointers, passed to the CUDA kernel
+        BondedCudaKernelParameters kernelParams_;
 };
 
 }   // namespace gmx
index d72562f6d44275f0e1514237d1e6b320f7a01087..b0f19a866883a5ba4469ec6dcab31d39ff23dc7a 100644 (file)
@@ -48,6 +48,8 @@
 
 #include "gmxpre.h"
 
+#include <cassert>
+
 #include <math_constants.h>
 
 #include "gromacs/gpu_utils/cudautils.cuh"
@@ -99,39 +101,19 @@ static void harmonic_gpu(const float kA, const float xA, const float x, float *V
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void bonds_gpu(float *vtot, const int nbonds,
+__device__
+void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
                const t_iatom forceatoms[], const t_iparams forceparams[],
-               const float4 xq[], fvec force[], fvec fshift[],
+               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc)
 {
-    const int        i = blockIdx.x * blockDim.x + threadIdx.x;
-
-    __shared__ float vtot_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtot_loc = 0.0f;
-        }
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-        __syncthreads();
-    }
-
-    if (i < nbonds)
+    if (i < nBonds)
     {
         int type = forceatoms[3*i];
         int ai   = forceatoms[3*i + 1];
         int aj   = forceatoms[3*i + 2];
 
-        /* dx = xi - xj, corrected for periodic boundry conditions. */
+        /* dx = xi - xj, corrected for periodic boundary conditions. */
         fvec  dx;
         int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
 
@@ -146,7 +128,7 @@ void bonds_gpu(float *vtot, const int nbonds,
 
         if (calcEner)
         {
-            atomicAdd(&vtot_loc, vbond);
+            *vtot_loc += vbond;
         }
 
         if (dr2 != 0.0f)
@@ -161,25 +143,12 @@ void bonds_gpu(float *vtot, const int nbonds,
                 atomicAdd(&force[aj][m], -fij);
                 if (calcVir && ki != CENTRAL)
                 {
-                    atomicAdd(&fshift_loc[ki][m], fij);
-                    atomicAdd(&fshift_loc[CENTRAL][m], -fij);
+                    atomicAdd(&sm_fShiftLoc[ki][m], fij);
+                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fij);
                 }
             }
         }
     }
-
-    if (calcVir || calcEner)
-    {
-        __syncthreads();
-        if (calcEner && threadIdx.x == 0)
-        {
-            atomicAdd(vtot, vtot_loc);
-        }
-        if (calcVir && threadIdx.x < SHIFTS)
-        {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
-        }
-    }
 }
 
 template <bool returnShift>
@@ -200,34 +169,13 @@ static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void angles_gpu(float *vtot, const int nbonds,
+__device__
+void angles_gpu(const int i, float *vtot_loc, const int nBonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 x[], fvec force[], fvec fshift[],
+                const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    __shared__ float vtot_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    const int        i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtot_loc = 0.0f;
-        }
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-
-        __syncthreads();
-    }
-
-    if (i < nbonds)
+    if (i < nBonds)
     {
         int   type = forceatoms[4*i];
         int   ai   = forceatoms[4*i + 1];
@@ -240,7 +188,7 @@ void angles_gpu(float *vtot, const int nbonds,
         int   t1;
         int   t2;
         float theta =
-            bond_angle_gpu<calcVir>(x[ai], x[aj], x[ak], pbcAiuc,
+            bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
                                     r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
@@ -251,7 +199,7 @@ void angles_gpu(float *vtot, const int nbonds,
 
         if (calcEner)
         {
-            atomicAdd(&vtot_loc, va);
+            *vtot_loc += va;
         }
 
         float cos_theta2 = cos_theta*cos_theta;
@@ -272,6 +220,7 @@ void angles_gpu(float *vtot, const int nbonds,
             fvec  f_i;
             fvec  f_k;
             fvec  f_j;
+#pragma unroll
             for (int m = 0; m < DIM; m++)
             {
                 f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
@@ -280,61 +229,26 @@ void angles_gpu(float *vtot, const int nbonds,
                 atomicAdd(&force[ai][m], f_i[m]);
                 atomicAdd(&force[aj][m], f_j[m]);
                 atomicAdd(&force[ak][m], f_k[m]);
-            }
-            if (calcVir)
-            {
-                fvec_inc_atomic(fshift_loc[t1], f_i);
-                fvec_inc_atomic(fshift_loc[CENTRAL], f_j);
-                fvec_inc_atomic(fshift_loc[t2], f_k);
+                if (calcVir)
+                {
+                    atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
+                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
+                    atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
+                }
             }
         }
 
     }
-
-    if (calcVir || calcEner)
-    {
-        __syncthreads();
-
-        if (calcEner && threadIdx.x == 0)
-        {
-            atomicAdd(vtot, vtot_loc);
-        }
-        if (calcVir && threadIdx.x < SHIFTS)
-        {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
-        }
-    }
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void urey_bradley_gpu(float *vtot, const int nbonds,
+__device__
+void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
                       const t_iatom forceatoms[], const t_iparams forceparams[],
-                      const float4 x[], fvec force[], fvec fshift[],
+                      const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
                       const PbcAiuc pbcAiuc)
 {
-    __shared__ float vtot_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    const int        i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtot_loc = 0.0f;
-        }
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-
-        __syncthreads();
-    }
-
-    if (i < nbonds)
+    if (i < nBonds)
     {
         int   type  = forceatoms[4*i];
         int   ai    = forceatoms[4*i+1];
@@ -351,7 +265,7 @@ void urey_bradley_gpu(float *vtot, const int nbonds,
         float cos_theta;
         int   t1;
         int   t2;
-        float theta = bond_angle_gpu<calcVir>(x[ai], x[aj], x[ak], pbcAiuc,
+        float theta = bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
                                               r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
@@ -360,11 +274,11 @@ void urey_bradley_gpu(float *vtot, const int nbonds,
 
         if (calcEner)
         {
-            atomicAdd(&vtot_loc, va);
+            *vtot_loc += va;
         }
 
         fvec  r_ik;
-        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, x[ai], x[ak], r_ik);
+        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[ak], r_ik);
 
         float dr2  = iprod_gpu(r_ik, r_ik);
         float dr   = dr2*rsqrtf(dr2);
@@ -389,6 +303,7 @@ void urey_bradley_gpu(float *vtot, const int nbonds,
             fvec  f_i;
             fvec  f_j;
             fvec  f_k;
+#pragma unroll
             for (int m = 0; m < DIM; m++)
             {
                 f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
@@ -397,10 +312,13 @@ void urey_bradley_gpu(float *vtot, const int nbonds,
                 atomicAdd(&force[ai][m], f_i[m]);
                 atomicAdd(&force[aj][m], f_j[m]);
                 atomicAdd(&force[ak][m], f_k[m]);
+                if (calcVir)
+                {
+                    atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
+                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
+                    atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
+                }
             }
-            fvec_inc_atomic(fshift_loc[t1], f_i);
-            fvec_inc_atomic(fshift_loc[CENTRAL], f_j);
-            fvec_inc_atomic(fshift_loc[t2], f_k);
         }
 
         /* Time for the bond calculations */
@@ -408,11 +326,12 @@ void urey_bradley_gpu(float *vtot, const int nbonds,
         {
             if (calcEner)
             {
-                atomicAdd(&vtot_loc, vbond);
+                *vtot_loc += vbond;
             }
 
             fbond *= rsqrtf(dr2);
 
+#pragma unroll
             for (int m = 0; m < DIM; m++)
             {
                 float fik = fbond*r_ik[m];
@@ -421,26 +340,12 @@ void urey_bradley_gpu(float *vtot, const int nbonds,
 
                 if (calcVir && ki != CENTRAL)
                 {
-                    atomicAdd(&fshift_loc[ki][m], fik);
-                    atomicAdd(&fshift_loc[CENTRAL][m], -fik);
+                    atomicAdd(&sm_fShiftLoc[ki][m], fik);
+                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fik);
                 }
             }
         }
     }
-
-    if (calcVir || calcEner)
-    {
-        __syncthreads();
-
-        if (calcEner && threadIdx.x == 0)
-        {
-            atomicAdd(vtot, vtot_loc);
-        }
-        if (calcVir && threadIdx.x < SHIFTS)
-        {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
-        }
-    }
 }
 
 template <bool returnShift, typename T>
@@ -483,7 +388,7 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
                            const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
                            const fvec m, const fvec n, fvec force[], fvec fshift[],
                            const PbcAiuc &pbcAiuc,
-                           const float4 x[], const int t1, const int t2, const int gmx_unused t3)
+                           const float4 xq[], const int t1, const int t2, const int gmx_unused t3)
 {
     float iprm  = iprod_gpu(m, m);
     float iprn  = iprod_gpu(n, n);
@@ -526,44 +431,28 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
         if (calcVir)
         {
             fvec dx_jl;
-            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, x[l], x[j], dx_jl);
+            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, xq[l], xq[j], dx_jl);
 
-            fvec_inc_atomic(fshift[t1], f_i);
-            fvec_dec_atomic(fshift[CENTRAL], f_j);
-            fvec_dec_atomic(fshift[t2], f_k);
-            fvec_inc_atomic(fshift[t3], f_l);
+#pragma unroll
+            for (int m = 0; (m < DIM); m++)
+            {
+                atomicAdd(&fshift[t1][m], f_i[m]);
+                atomicAdd(&fshift[CENTRAL][m], -f_j[m]);
+                atomicAdd(&fshift[t2][m], -f_k[m]);
+                atomicAdd(&fshift[t3][m], f_l[m]);
+            }
         }
     }
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void  pdihs_gpu(float *vtot, const int nbonds,
+__device__
+void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 x[], fvec f[], fvec fshift[],
+                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    const int        i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    __shared__ float vtot_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtot_loc = 0.0f;
-        }
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-        __syncthreads();
-    }
-
-    if (i < nbonds)
+    if (i < nBonds)
     {
         int   type = forceatoms[5*i];
         int   ai   = forceatoms[5*i + 1];
@@ -580,7 +469,7 @@ void  pdihs_gpu(float *vtot, const int nbonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         float vpd;
@@ -592,62 +481,27 @@ void  pdihs_gpu(float *vtot, const int nbonds,
 
         if (calcEner)
         {
-            atomicAdd(&vtot_loc, vpd);
+            *vtot_loc += vpd;
         }
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, fshift_loc, pbcAiuc,
-                                x, t1, t2, t3);
+                                f, sm_fShiftLoc, pbcAiuc,
+                                xq, t1, t2, t3);
 
     }
-
-    if (calcVir || calcEner)
-    {
-        __syncthreads();
-
-        if (calcEner && threadIdx.x == 0)
-        {
-            atomicAdd(vtot, vtot_loc);
-        }
-        if (calcVir && threadIdx.x < SHIFTS)
-        {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
-        }
-    }
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void rbdihs_gpu(float *vtot, const int nbonds,
+__device__
+void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 x[], fvec f[], fvec fshift[],
+                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
     constexpr float  c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 
-    __shared__ float vtot_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    const int        i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtot_loc = 0.0f;
-        }
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-
-        __syncthreads();
-    }
-
-    if (i < nbonds)
+    if (i < nBonds)
     {
         int   type = forceatoms[5*i];
         int   ai   = forceatoms[5*i+1];
@@ -664,7 +518,7 @@ void rbdihs_gpu(float *vtot, const int nbonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* Change to polymer convention */
@@ -733,25 +587,11 @@ void rbdihs_gpu(float *vtot, const int nbonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, fshift_loc, pbcAiuc,
-                                x, t1, t2, t3);
+                                f, sm_fShiftLoc, pbcAiuc,
+                                xq, t1, t2, t3);
         if (calcEner)
         {
-            atomicAdd(&vtot_loc, v);
-        }
-    }
-
-    if (calcVir || calcEner)
-    {
-        __syncthreads();
-
-        if (calcEner && threadIdx.x == 0)
-        {
-            atomicAdd(vtot, vtot_loc);
-        }
-        if (calcVir && threadIdx.x < SHIFTS)
-        {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
+            *vtot_loc += v;
         }
     }
 }
@@ -771,33 +611,13 @@ static void make_dp_periodic_gpu(float *dp)
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void  idihs_gpu(float *vtot, const int nbonds,
+__device__
+void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
                 const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 x[], fvec f[], fvec fshift[],
+                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    const int        i = blockIdx.x*blockDim.x + threadIdx.x;
-
-    __shared__ float vtot_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtot_loc = 0.0f;
-        }
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-        __syncthreads();
-    }
-
-    if (i < nbonds)
+    if (i < nBonds)
     {
         int   type = forceatoms[5*i];
         int   ai   = forceatoms[5*i + 1];
@@ -814,7 +634,7 @@ void  idihs_gpu(float *vtot, const int nbonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
@@ -837,63 +657,26 @@ void  idihs_gpu(float *vtot, const int nbonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 -ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, fshift_loc, pbcAiuc,
-                                x, t1, t2, t3);
+                                f, sm_fShiftLoc, pbcAiuc,
+                                xq, t1, t2, t3);
 
         if (calcEner)
         {
-            atomicAdd(&vtot_loc, -0.5f*ddphi*dp);
-        }
-    }
-
-    if (calcVir || calcEner)
-    {
-        __syncthreads();
-
-        if (calcEner && threadIdx.x == 0)
-        {
-            atomicAdd(vtot, vtot_loc);
-        }
-        if (calcVir && threadIdx.x < SHIFTS)
-        {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
+            *vtot_loc += -0.5f*ddphi*dp;
         }
     }
 }
 
 template <bool calcVir, bool calcEner>
-__global__
-void pairs_gpu(const int nbonds,
+__device__
+void pairs_gpu(const int i, const int nBonds,
                const t_iatom iatoms[], const t_iparams iparams[],
-               const float4 xq[], fvec force[], fvec fshift[],
+               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc,
                const float scale_factor,
-               float *vtotVdw, float *vtotElec)
+               float *vtotVdw_loc, float *vtotElec_loc)
 {
-    const int        i = blockIdx.x*blockDim.x+threadIdx.x;
-
-    __shared__ float vtotVdw_loc;
-    __shared__ float vtotElec_loc;
-    __shared__ fvec  fshift_loc[SHIFTS];
-
-    if (calcVir || calcEner)
-    {
-        if (threadIdx.x == 0)
-        {
-            vtotVdw_loc  = 0.0f;
-            vtotElec_loc = 0.0f;
-        }
-
-        if (threadIdx.x < SHIFTS)
-        {
-            fshift_loc[threadIdx.x][XX] = 0.0f;
-            fshift_loc[threadIdx.x][YY] = 0.0f;
-            fshift_loc[threadIdx.x][ZZ] = 0.0f;
-        }
-        __syncthreads();
-    }
-
-    if (i <  nbonds)
+    if (i <  nBonds)
     {
         int   itype = iatoms[3*i];
         int   ai    = iatoms[3*i + 1];
@@ -928,209 +711,179 @@ void pairs_gpu(const int nbonds,
         {
             atomicAdd(&force[ai][m], f[m]);
             atomicAdd(&force[aj][m], -f[m]);
+            if (calcVir && fshift_index != CENTRAL)
+            {
+                atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
+                atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f[m]);
+            }
         }
 
         if (calcEner)
         {
-            atomicAdd(&vtotVdw_loc, (c12*rinv6 - c6)*rinv6);
-            atomicAdd(&vtotElec_loc, velec);
+            *vtotVdw_loc  += (c12*rinv6 - c6)*rinv6;
+            *vtotElec_loc += velec;
         }
+    }
+}
 
-        if (calcVir && fshift_index != CENTRAL)
+namespace gmx
+{
+
+template <bool calcVir, bool calcEner>
+__global__
+void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
+{
+    assert(blockDim.y == 1 && blockDim.z == 1);
+    const int       threadIndex  = blockIdx.x*blockDim.x+threadIdx.x;
+    float           vtot_loc     = 0;
+    float           vtotVdw_loc  = 0;
+    float           vtotElec_loc = 0;
+    __shared__ fvec sm_fShiftLoc[SHIFTS];
+
+    if (calcVir)
+    {
+        if (threadIdx.x < SHIFTS)
         {
-            fvec_inc_atomic(fshift_loc[fshift_index], f);
-            fvec_dec_atomic(fshift_loc[CENTRAL], f);
+            sm_fShiftLoc[threadIdx.x][XX] = 0.0f;
+            sm_fShiftLoc[threadIdx.x][YY] = 0.0f;
+            sm_fShiftLoc[threadIdx.x][ZZ] = 0.0f;
         }
+        __syncthreads();
     }
 
-    if (calcVir || calcEner)
+    int  ftype;
+    bool threadComputedPotential = false;
+#pragma unroll
+    for (int j = 0; j < nFtypesOnGpu; j++)
     {
-        __syncthreads();
-
-        if (calcEner && threadIdx.x == 0)
+        if (threadIndex >= kernelParams.ftypeRangeStart[j] && threadIndex <= kernelParams.ftypeRangeEnd[j])
         {
-            atomicAdd(vtotVdw, vtotVdw_loc);
-            atomicAdd(vtotElec, vtotElec_loc);
+            const int      nBonds           = kernelParams.nrFTypeBonds[j];
+
+            int            localThreadIndex = threadIndex - kernelParams.ftypeRangeStart[j];
+            const t_iatom *iatoms           = kernelParams.iatoms[j];
+            ftype                           = kernelParams.ftypesOnGpu[j];
+            if (calcEner)
+            {
+                threadComputedPotential         = true;
+            }
+
+            switch (ftype)
+            {
+                case F_BONDS:
+                    bonds_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    break;
+                case F_ANGLES:
+                    angles_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    break;
+                case F_UREY_BRADLEY:
+                    urey_bradley_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                        kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    break;
+                case F_PDIHS:
+                case F_PIDIHS:
+                    pdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    break;
+                case F_RBDIHS:
+                    rbdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    break;
+                case F_IDIHS:
+                    idihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    break;
+                case F_LJ14:
+                    pairs_gpu<calcVir, calcEner>(localThreadIndex, nBonds, iatoms, kernelParams.forceparamsDevice,
+                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc,
+                                                 kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
+                    break;
+            }
+            break;
         }
-        if (calcVir && threadIdx.x < SHIFTS)
+    }
+
+    if (threadComputedPotential)
+    {
+        float *vtotVdw  = kernelParams.vtotDevice + F_LJ14;
+        float *vtotElec = kernelParams.vtotDevice + F_COUL14;
+        atomicAdd(kernelParams.vtotDevice + ftype, vtot_loc);
+        atomicAdd(vtotVdw, vtotVdw_loc);
+        atomicAdd(vtotElec, vtotElec_loc);
+    }
+    /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
+    if (calcVir)
+    {
+        __syncthreads();
+        if (threadIdx.x < SHIFTS)
         {
-            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
+            fvec_inc_atomic(kernelParams.fshiftDevice[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
         }
     }
 }
 
-/*-------------------------------- End CUDA kernels-----------------------------*/
 
+/*-------------------------------- End CUDA kernels-----------------------------*/
 
-namespace gmx
-{
 
 template <bool calcVir, bool calcEner>
 void
-GpuBonded::Impl::launchKernels(const t_forcerec *fr,
-                               const matrix      box)
+GpuBonded::Impl::launchKernel(const t_forcerec *fr,
+                              const matrix      box)
 {
     GMX_ASSERT(haveInteractions_,
                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
+    static_assert(TPB_BONDED >= SHIFTS, "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
 
     PbcAiuc       pbcAiuc;
     setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
 
-    const t_iparams *forceparams_d = forceparamsDevice;
-    float           *vtot_d        = vtotDevice;
-    const float4    *xq_d          = xqDevice;
-    fvec            *force_d       = forceDevice;
-    fvec            *fshift_d      = fshiftDevice;
+    int                ftypeRangeEnd = kernelParams_.ftypeRangeEnd[nFtypesOnGpu - 1];
 
-    for (int ftype : ftypesOnGpu)
+    if (ftypeRangeEnd < 0)
     {
-        const auto &iList = iLists[ftype];
-
-        if (iList.size() > 0)
-        {
-            int                nat1   = interaction_function[ftype].nratoms + 1;
-            int                nbonds = iList.size()/nat1;
-
-            KernelLaunchConfig config;
-            config.blockSize[0] = TPB_BONDED;
-            config.blockSize[1] = 1;
-            config.blockSize[2] = 1;
-            config.gridSize[0]  = (nbonds + TPB_BONDED - 1)/TPB_BONDED;
-            config.gridSize[1]  = 1;
-            config.gridSize[2]  = 1;
-            config.stream       = stream;
-
-            const t_iatom *iatoms = iListsDevice[ftype].iatoms;
-
-            if (ftype == F_PDIHS || ftype == F_PIDIHS)
-            {
-                auto       kernelPtr      = pdihs_gpu<calcVir, calcEner>;
-                float     *ftypeEnergyPtr = vtot_d + ftype;
-                const auto kernelArgs     = prepareGpuKernelArguments(kernelPtr, config,
-                                                                      &ftypeEnergyPtr, &nbonds,
-                                                                      &iatoms, &forceparams_d,
-                                                                      &xq_d, &force_d, &fshift_d,
-                                                                      &pbcAiuc);
-                launchGpuKernel(kernelPtr, config, nullptr, "pdihs_gpu<calcVir, calcEner>", kernelArgs);
-            }
-        }
+        return;
     }
 
-    for (int ftype : ftypesOnGpu)
-    {
-        const auto &iList = iLists[ftype];
+    KernelLaunchConfig config;
+    config.blockSize[0] = TPB_BONDED;
+    config.blockSize[1] = 1;
+    config.blockSize[2] = 1;
+    config.gridSize[0]  = (ftypeRangeEnd + TPB_BONDED)/TPB_BONDED;
+    config.gridSize[1]  = 1;
+    config.gridSize[2]  = 1;
+    config.stream       = stream;
 
-        if (iList.size() > 0)
-        {
-            int                nat1   = interaction_function[ftype].nratoms + 1;
-            int                nbonds = iList.size()/nat1;
-
-            const t_iatom     *iatoms = iListsDevice[ftype].iatoms;
-
-            KernelLaunchConfig config;
-            config.blockSize[0] = TPB_BONDED;
-            config.blockSize[1] = 1;
-            config.blockSize[2] = 1;
-            config.gridSize[0]  = (nbonds + TPB_BONDED - 1)/TPB_BONDED;
-            config.gridSize[1]  = 1;
-            config.gridSize[2]  = 1;
-            config.stream       = stream;
-
-            float *ftypeEnergyPtr = vtot_d + ftype;
-            // TODO consider using a map to assign the fn pointers to ftypes
-            if (ftype == F_BONDS)
-            {
-                auto       kernelPtr  = bonds_gpu<calcVir, calcEner>;
-                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
-                                                                  &ftypeEnergyPtr, &nbonds,
-                                                                  &iatoms, &forceparams_d,
-                                                                  &xq_d, &force_d, &fshift_d,
-                                                                  &pbcAiuc);
-                launchGpuKernel(kernelPtr, config, nullptr, "bonds_gpu<calcVir, calcEner>", kernelArgs);
-            }
+    auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
+    kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;
+    kernelParams_.pbcAiuc     = pbcAiuc;
 
-            if (ftype == F_ANGLES)
-            {
-                auto       kernelPtr  = angles_gpu<calcVir, calcEner>;
-                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
-                                                                  &ftypeEnergyPtr, &nbonds,
-                                                                  &iatoms, &forceparams_d,
-                                                                  &xq_d, &force_d, &fshift_d,
-                                                                  &pbcAiuc);
-                launchGpuKernel(kernelPtr, config, nullptr, "angles_gpu<calcVir, calcEner>", kernelArgs);
-            }
+    const auto kernelArgs     = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
 
-            if (ftype == F_UREY_BRADLEY)
-            {
-                auto       kernelPtr  = urey_bradley_gpu<calcVir, calcEner>;
-                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
-                                                                  &ftypeEnergyPtr, &nbonds,
-                                                                  &iatoms, &forceparams_d,
-                                                                  &xq_d, &force_d, &fshift_d,
-                                                                  &pbcAiuc);
-                launchGpuKernel(kernelPtr, config, nullptr, "urey_bradley_gpu<calcVir, calcEner>", kernelArgs);
-            }
-
-            if (ftype == F_RBDIHS)
-            {
-                auto       kernelPtr  = rbdihs_gpu<calcVir, calcEner>;
-                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
-                                                                  &ftypeEnergyPtr, &nbonds,
-                                                                  &iatoms, &forceparams_d,
-                                                                  &xq_d, &force_d, &fshift_d,
-                                                                  &pbcAiuc);
-                launchGpuKernel(kernelPtr, config, nullptr, "rbdihs_gpu<calcVir, calcEner>", kernelArgs);
-            }
-
-            if (ftype == F_IDIHS)
-            {
-                auto       kernelPtr  = idihs_gpu<calcVir, calcEner>;
-                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
-                                                                  &ftypeEnergyPtr, &nbonds,
-                                                                  &iatoms, &forceparams_d,
-                                                                  &xq_d, &force_d, &fshift_d,
-                                                                  &pbcAiuc);
-                launchGpuKernel(kernelPtr, config, nullptr, "idihs_gpu<calcVir, calcEner>", kernelArgs);
-            }
-
-            if (ftype == F_LJ14)
-            {
-                auto       kernelPtr       = pairs_gpu<calcVir, calcEner>;
-                float      scale_factor    = fr->ic->epsfac*fr->fudgeQQ;
-                float     *lj14Energy      = vtot_d + F_LJ14;
-                float     *coulomb14Energy = vtot_d + F_COUL14;
-                const auto kernelArgs      = prepareGpuKernelArguments(kernelPtr, config,
-                                                                       &nbonds,
-                                                                       &iatoms, &forceparams_d,
-                                                                       &xq_d, &force_d, &fshift_d,
-                                                                       &pbcAiuc,
-                                                                       &scale_factor,
-                                                                       &lj14Energy, &coulomb14Energy);
-                launchGpuKernel(kernelPtr, config, nullptr, "pairs_gpu<calcVir, calcEner>", kernelArgs);
-            }
-        }
-    }
+    launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
 }
 
 void
-GpuBonded::launchKernels(const t_forcerec *fr,
-                         int               forceFlags,
-                         const matrix      box)
+GpuBonded::launchKernel(const t_forcerec *fr,
+                        int               forceFlags,
+                        const matrix      box)
 {
     if (forceFlags & GMX_FORCE_ENERGY)
     {
         // When we need the energy, we also need the virial
-        impl_->launchKernels<true, true>
+        impl_->launchKernel<true, true>
             (fr, box);
     }
     else if (forceFlags & GMX_FORCE_VIRIAL)
     {
-        impl_->launchKernels<true, false>
+        impl_->launchKernel<true, false>
             (fr, box);
     }
     else
     {
-        impl_->launchKernels<false, false>
+        impl_->launchKernel<false, false>
             (fr, box);
     }
 }
index 7e6304774668a6136c24e09f3d94efe9407b684e..c90dfed17d9c5175fc807170f94d9c867e4a06ab 100644 (file)
@@ -1079,7 +1079,7 @@ void do_force(FILE                                     *fplog,
         if (ppForceWorkload->haveGpuBondedWork && !havePPDomainDecomposition(cr))
         {
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-            fr->gpuBonded->launchKernels(fr, flags, box);
+            fr->gpuBonded->launchKernel(fr, flags, box);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         }
 
@@ -1146,7 +1146,7 @@ void do_force(FILE                                     *fplog,
             if (ppForceWorkload->haveGpuBondedWork)
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
-                fr->gpuBonded->launchKernels(fr, flags, box);
+                fr->gpuBonded->launchKernel(fr, flags, box);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
             }