Cleaning up the GPU bonded kernel
author     Magnus Lundborg <lundborg.magnus@gmail.com>
           Tue, 18 Jun 2019 12:22:10 +0000 (14:22 +0200)
committer  Mark Abraham <mark.j.abraham@gmail.com>
           Sun, 30 Jun 2019 21:09:31 +0000 (23:09 +0200)
Improved variable names to follow the naming conventions (d_ prefixes
for device buffers, trailing underscores for class members, num* for
counts). Removed unnecessary includes.

Refs #2988

Change-Id: Ife5f50e68f27994203305e376f2781a076052b64

src/gromacs/listed_forces/gpubonded.h
src/gromacs/listed_forces/gpubonded_impl.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/listed_forces/gpubondedkernels.cu
src/gromacs/listed_forces/manage_threading.cpp

index 3f0f5eba3f3b571be19c7f2aff8f3b2f4cb7c0c5..df63d675d074ee7188d37684084da11893330de8 100644
@@ -61,7 +61,7 @@ struct t_idef;
 struct t_inputrec;
 
 /*! \brief The number of bonded function types supported on GPUs */
-static constexpr int nFtypesOnGpu = 8;
+static constexpr int numFTypesOnGpu = 8;
 
 namespace gmx
 {
@@ -73,7 +73,7 @@ namespace gmx
  * \note The function types in the list are ordered by increasing value.
  * \note Currently bonded interactions are only supported with CUDA, not with OpenCL.
  */
-constexpr std::array<int, nFtypesOnGpu> ftypesOnGpu =
+constexpr std::array<int, numFTypesOnGpu> fTypesOnGpu =
 {
     F_BONDS,
     F_ANGLES,
index dc816facc3865503a46039ad13989691aebe5a44..b7b20fdd52530ffbc75c28826d51aa59f4766adf 100644
@@ -58,9 +58,9 @@ namespace gmx
 //! Returns whether there are any interactions in ilists suitable for a GPU.
 static bool someInteractionsCanRunOnGpu(const InteractionLists &ilists)
 {
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (!ilists[ftype].iatoms.empty())
+        if (!ilists[fType].iatoms.empty())
         {
             // Perturbation is not implemented in the GPU bonded
             // kernels. If all the interactions were actually
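
For orientation, a minimal sketch of this predicate's overall shape, assuming it returns true on the first non-empty candidate list (perturbed entries are screened out later, when the lists are converted for the device):

    static bool someInteractionsCanRunOnGpu(const InteractionLists &ilists)
    {
        for (int fType : fTypesOnGpu)
        {
            if (!ilists[fType].iatoms.empty())
            {
                return true;   // at least one list the GPU kernels can cover
            }
        }
        return false;          // nothing suitable for the GPU
    }
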
index 050efc0f2d753e8f7e88c0365da74ea506e32d5d..065e88e253b049c5b08da31af00a82da6d2fe02d 100644
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/gpu_utils/hostallocator.h"
-#include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/topology/forcefieldparameters.h"
-#include "gromacs/topology/idef.h"
 
 struct t_forcerec;
 
@@ -69,62 +64,62 @@ namespace gmx
 GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
                       void                 *streamPtr)
 {
-    stream = *static_cast<CommandStream*>(streamPtr);
+    stream_ = *static_cast<CommandStream*>(streamPtr);
 
-    allocateDeviceBuffer(&forceparamsDevice, ffparams.numTypes(), nullptr);
+    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
     // This could be an async transfer (if the source is pinned), so
     // long as it uses the same stream as the kernels and we are happy
     // to consume additional pinned pages.
-    copyToDeviceBuffer(&forceparamsDevice, ffparams.iparams.data(),
+    copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(),
                        0, ffparams.numTypes(),
-                       stream, GpuApiCallBehavior::Sync, nullptr);
-    vtot.resize(F_NRE);
-    allocateDeviceBuffer(&vtotDevice, F_NRE, nullptr);
-    clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
+                       stream_, GpuApiCallBehavior::Sync, nullptr);
+    vTot_.resize(F_NRE);
+    allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 
-    for (int ftype = 0; ftype < F_NRE; ftype++)
+    for (int fType = 0; fType < F_NRE; fType++)
     {
-        iListsDevice[ftype].nr     = 0;
-        iListsDevice[ftype].iatoms = nullptr;
-        iListsDevice[ftype].nalloc = 0;
+        d_iLists_[fType].nr     = 0;
+        d_iLists_[fType].iatoms = nullptr;
+        d_iLists_[fType].nalloc = 0;
     }
 
-    kernelParams_.forceparamsDevice      = forceparamsDevice;
-    kernelParams_.xqDevice               = xqDevice;
-    kernelParams_.forceDevice            = forceDevice;
-    kernelParams_.fshiftDevice           = fshiftDevice;
-    kernelParams_.vtotDevice             = vtotDevice;
-    for (int i = 0; i < nFtypesOnGpu; i++)
+    kernelParams_.d_forceParams          = d_forceParams_;
+    kernelParams_.d_xq                   = d_xq_;
+    kernelParams_.d_f                    = d_f_;
+    kernelParams_.d_fShift               = d_fShift_;
+    kernelParams_.d_vTot                 = d_vTot_;
+    for (int i = 0; i < numFTypesOnGpu; i++)
     {
-        kernelParams_.iatoms[i]          = nullptr;
-        kernelParams_.ftypeRangeStart[i] = 0;
-        kernelParams_.ftypeRangeEnd[i]   = -1;
+        kernelParams_.d_iatoms[i]        = nullptr;
+        kernelParams_.fTypeRangeStart[i] = 0;
+        kernelParams_.fTypeRangeEnd[i]   = -1;
     }
 }
 
 GpuBonded::Impl::~Impl()
 {
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (iListsDevice[ftype].iatoms)
+        if (d_iLists_[fType].iatoms)
         {
-            freeDeviceBuffer(&iListsDevice[ftype].iatoms);
-            iListsDevice[ftype].iatoms = nullptr;
+            freeDeviceBuffer(&d_iLists_[fType].iatoms);
+            d_iLists_[fType].iatoms = nullptr;
         }
     }
 
-    freeDeviceBuffer(&forceparamsDevice);
-    freeDeviceBuffer(&vtotDevice);
+    freeDeviceBuffer(&d_forceParams_);
+    freeDeviceBuffer(&d_vTot_);
 }
 
-//! Return whether function type \p ftype in \p idef has perturbed interactions
-static bool ftypeHasPerturbedEntries(const t_idef  &idef,
-                                     int            ftype)
+//! Return whether function type \p fType in \p idef has perturbed interactions
+static bool fTypeHasPerturbedEntries(const t_idef  &idef,
+                                     int            fType)
 {
     GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
                "Perturbed interations should be sorted here");
 
-    const t_ilist &ilist = idef.il[ftype];
+    const t_ilist &ilist = idef.il[fType];
 
     return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
 }
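
The check relies on the invariant that, with ilsortFE_SORTED, perturbed interactions are stored at the tail of each list, so a single count comparison suffices. A self-contained illustration with simplified t_ilist fields:

    struct IlistCounts
    {
        int nr;               // total entries in the iatoms array
        int nr_nonperturbed;  // entries before the first perturbed one
    };

    // With free-energy sorting active, perturbed interactions occupy the
    // tail, so "has perturbed entries" reduces to comparing two counts.
    static bool hasPerturbed(const IlistCounts &il)
    {
        return il.nr_nonperturbed != il.nr;
    }
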
@@ -180,29 +175,29 @@ static inline int roundUpToFactor(const int input, const int factor)
 void
 GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                         const t_idef        &idef,
-                                                        void                *xqDevicePtr,
-                                                        void                *forceDevicePtr,
-                                                        void                *fshiftDevicePtr)
+                                                        void                *d_xqPtr,
+                                                        void                *d_fPtr,
+                                                        void                *d_fShiftPtr)
 {
     // TODO wallcycle sub start
     haveInteractions_ = false;
-    int ftypesCounter = 0;
+    int fTypesCounter = 0;
 
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        auto &iList = iLists[ftype];
+        auto &iList = iLists_[fType];
 
         /* Perturbation is not implemented in the GPU bonded kernels.
          * But instead of doing all interactions on the CPU, we can
          * still easily handle the types that have no perturbed
          * interactions on the GPU. */
-        if (idef.il[ftype].nr > 0 && !ftypeHasPerturbedEntries(idef, ftype))
+        if (idef.il[fType].nr > 0 && !fTypeHasPerturbedEntries(idef, fType))
         {
             haveInteractions_ = true;
 
-            convertIlistToNbnxnOrder(idef.il[ftype],
+            convertIlistToNbnxnOrder(idef.il[fType],
                                      &iList,
-                                     NRAL(ftype), nbnxnAtomOrder);
+                                     NRAL(fType), nbnxnAtomOrder);
         }
         else
         {
@@ -215,46 +210,46 @@ GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbn
         // end.
         if (iList.size() > 0)
         {
-            t_ilist &iListDevice = iListsDevice[ftype];
+            t_ilist &d_iList = d_iLists_[fType];
 
-            reallocateDeviceBuffer(&iListDevice.iatoms, iList.size(), &iListDevice.nr, &iListDevice.nalloc, nullptr);
+            reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr);
 
-            copyToDeviceBuffer(&iListDevice.iatoms, iList.iatoms.data(),
+            copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(),
                                0, iList.size(),
-                               stream, GpuApiCallBehavior::Async, nullptr);
+                               stream_, GpuApiCallBehavior::Async, nullptr);
         }
-        kernelParams_.ftypesOnGpu[ftypesCounter]   = ftype;
-        kernelParams_.nrFTypeIAtoms[ftypesCounter] = iList.size();
-        int nBonds = iList.size() / (interaction_function[ftype].nratoms + 1);
-        kernelParams_.nrFTypeBonds[ftypesCounter]  = nBonds;
-        kernelParams_.iatoms[ftypesCounter]        = iListsDevice[ftype].iatoms;
-        if (ftypesCounter == 0)
+        kernelParams_.fTypesOnGpu[fTypesCounter]    = fType;
+        kernelParams_.numFTypeIAtoms[fTypesCounter] = iList.size();
+        int numBonds = iList.size() / (interaction_function[fType].nratoms + 1);
+        kernelParams_.numFTypeBonds[fTypesCounter]  = numBonds;
+        kernelParams_.d_iatoms[fTypesCounter]       = d_iLists_[fType].iatoms;
+        if (fTypesCounter == 0)
         {
-            kernelParams_.ftypeRangeStart[ftypesCounter] = 0;
+            kernelParams_.fTypeRangeStart[fTypesCounter] = 0;
         }
         else
         {
-            kernelParams_.ftypeRangeStart[ftypesCounter] = kernelParams_.ftypeRangeEnd[ftypesCounter - 1] + 1;
+            kernelParams_.fTypeRangeStart[fTypesCounter] = kernelParams_.fTypeRangeEnd[fTypesCounter - 1] + 1;
         }
-        kernelParams_.ftypeRangeEnd[ftypesCounter] = kernelParams_.ftypeRangeStart[ftypesCounter] + roundUpToFactor(nBonds, warp_size) - 1;
+        kernelParams_.fTypeRangeEnd[fTypesCounter] = kernelParams_.fTypeRangeStart[fTypesCounter] + roundUpToFactor(numBonds, warp_size) - 1;
 
-        GMX_ASSERT(nBonds > 0 || kernelParams_.ftypeRangeEnd[ftypesCounter] <= kernelParams_.ftypeRangeStart[ftypesCounter],
-                   "Invalid GPU listed forces setup. nBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
-        GMX_ASSERT(kernelParams_.ftypeRangeStart[ftypesCounter] % warp_size == 0 && (kernelParams_.ftypeRangeEnd[ftypesCounter] + 1) % warp_size == 0,
+        GMX_ASSERT(numBonds > 0 || kernelParams_.fTypeRangeEnd[fTypesCounter] <= kernelParams_.fTypeRangeStart[fTypesCounter],
+                   "Invalid GPU listed forces setup. numBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
+        GMX_ASSERT(kernelParams_.fTypeRangeStart[fTypesCounter] % warp_size == 0 && (kernelParams_.fTypeRangeEnd[fTypesCounter] + 1) % warp_size == 0,
                    "The bonded interactions must be assigned to the GPU in blocks of warp size.");
 
-        ftypesCounter++;
+        fTypesCounter++;
     }
 
-    xqDevice     = static_cast<float4 *>(xqDevicePtr);
-    forceDevice  = static_cast<fvec *>(forceDevicePtr);
-    fshiftDevice = static_cast<fvec *>(fshiftDevicePtr);
+    d_xq_                           = static_cast<float4 *>(d_xqPtr);
+    d_f_                            = static_cast<fvec *>(d_fPtr);
+    d_fShift_                       = static_cast<fvec *>(d_fShiftPtr);
 
-    kernelParams_.xqDevice          = xqDevice;
-    kernelParams_.forceDevice       = forceDevice;
-    kernelParams_.fshiftDevice      = fshiftDevice;
-    kernelParams_.forceparamsDevice = forceparamsDevice;
-    kernelParams_.vtotDevice        = vtotDevice;
+    kernelParams_.d_xq              = d_xq_;
+    kernelParams_.d_f               = d_f_;
+    kernelParams_.d_fShift          = d_fShift_;
+    kernelParams_.d_forceParams     = d_forceParams_;
+    kernelParams_.d_vTot            = d_vTot_;
 
     // TODO wallcycle sub stop
 }
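
The rounding to warp_size keeps every function type in whole warps, so no warp ever spans two fType ranges; that is what the two asserts above verify. roundUpToFactor itself lies outside the hunk; a plausible implementation consistent with those asserts:

    static inline int roundUpToFactor(const int input, const int factor)
    {
        const int remainder = input % factor;
        return (remainder == 0) ? input : input + (factor - remainder);
    }

For example, with warp_size = 32, two types with 33 and 10 bonds get the inclusive thread ranges [0, 63] and [64, 95]; threads beyond the last bond of a type simply fail the i < numBonds test in the kernel. A type with zero bonds gets an end index below its start, which is what the asserts allow for.
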
@@ -271,10 +266,10 @@ GpuBonded::Impl::launchEnergyTransfer()
     // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
 
-    float *vtot_h   = vtot.data();
-    copyFromDeviceBuffer(vtot_h, &vtotDevice,
+    float *h_vTot   = vTot_.data();
+    copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
-                         stream, GpuApiCallBehavior::Async, nullptr);
+                         stream_, GpuApiCallBehavior::Async, nullptr);
 }
 
 void
@@ -284,29 +279,29 @@ GpuBonded::Impl::accumulateEnergyTerms(gmx_enerdata_t *enerd)
     // wait goes in to the "Rest" counter
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed or transferred, so accumulation should not occur");
 
-    cudaError_t stat = cudaStreamSynchronize(stream);
+    cudaError_t stat = cudaStreamSynchronize(stream_);
     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
 
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (ftype != F_LJ14 && ftype != F_COUL14)
+        if (fType != F_LJ14 && fType != F_COUL14)
         {
-            enerd->term[ftype] += vtot[ftype];
+            enerd->term[fType] += vTot_[fType];
         }
     }
 
     // Note: We do not support energy groups here
     gmx_grppairener_t *grppener = &enerd->grpp;
     GMX_RELEASE_ASSERT(grppener->nener == 1, "No energy group support for bondeds on the GPU");
-    grppener->ener[egLJ14][0]   += vtot[F_LJ14];
-    grppener->ener[egCOUL14][0] += vtot[F_COUL14];
+    grppener->ener[egLJ14][0]   += vTot_[F_LJ14];
+    grppener->ener[egCOUL14][0] += vTot_[F_COUL14];
 }
 
 void
 GpuBonded::Impl::clearEnergies()
 {
     // TODO should wrap with ewcLAUNCH_GPU
-    clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 }
 
 // ---- GpuBonded
@@ -322,12 +317,12 @@ GpuBonded::~GpuBonded() = default;
 void
 GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                   const t_idef        &idef,
-                                                  void                *xqDevice,
-                                                  void                *forceDevice,
-                                                  void                *fshiftDevice)
+                                                  void                *d_xq,
+                                                  void                *d_f,
+                                                  void                *d_fShift)
 {
     impl_->updateInteractionListsAndDeviceBuffers
-        (nbnxnAtomOrder, idef, xqDevice, forceDevice, fshiftDevice);
+        (nbnxnAtomOrder, idef, d_xq, d_f, d_fShift);
 }
 
 bool
index 0cfc429f4467898a7676aca934b1a98571a550d9..f33f6453246ac60bea10dc0c330d39c4e888b6fc 100644
@@ -53,7 +53,6 @@
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/pbcutil/pbc_aiuc.h"
-#include "gromacs/topology/idef.h"
 
 struct gmx_ffparams_t;
 struct t_forcerec;
@@ -87,28 +86,28 @@ struct BondedCudaKernelParameters
     //! Scale factor
     float               scaleFactor;
     //! The bonded types on GPU
-    int                 ftypesOnGpu[nFtypesOnGpu];
+    int                 fTypesOnGpu[numFTypesOnGpu];
     //! The number of interaction atom (iatom) elements for every function type
-    int                 nrFTypeIAtoms[nFtypesOnGpu];
+    int                 numFTypeIAtoms[numFTypesOnGpu];
     //! The number of bonds for every function type
-    int                 nrFTypeBonds[nFtypesOnGpu];
+    int                 numFTypeBonds[numFTypesOnGpu];
     //! The start index in the range of each interaction type
-    int                 ftypeRangeStart[nFtypesOnGpu];
+    int                 fTypeRangeStart[numFTypesOnGpu];
     //! The end index in the range of each interaction type
-    int                 ftypeRangeEnd[nFtypesOnGpu];
+    int                 fTypeRangeEnd[numFTypesOnGpu];
 
     //! Force parameters (on GPU)
-    t_iparams          *forceparamsDevice;
+    t_iparams          *d_forceParams;
     //! Coordinates before the timestep (on GPU)
-    const float4       *xqDevice;
+    const float4       *d_xq;
     //! Forces on atoms (on GPU)
-    fvec               *forceDevice;
+    fvec               *d_f;
     //! Force shifts on atoms (on GPU)
-    fvec               *fshiftDevice;
+    fvec               *d_fShift;
     //! Total Energy (on GPU)
-    float              *vtotDevice;
+    float              *d_vTot;
     //! Interaction list atoms (on GPU)
-    t_iatom            *iatoms[nFtypesOnGpu];
+    t_iatom            *d_iatoms[numFTypesOnGpu];
 
     BondedCudaKernelParameters()
     {
@@ -116,12 +115,12 @@ struct BondedCudaKernelParameters
 
         setPbcAiuc(0, boxDummy, &pbcAiuc);
 
-        scaleFactor       = 1.0;
-        forceparamsDevice = nullptr;
-        xqDevice          = nullptr;
-        forceDevice       = nullptr;
-        fshiftDevice      = nullptr;
-        vtotDevice        = nullptr;
+        scaleFactor   = 1.0;
+        d_forceParams = nullptr;
+        d_xq          = nullptr;
+        d_f           = nullptr;
+        d_fShift      = nullptr;
+        d_vTot        = nullptr;
     }
 };
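
Collecting the device pointers and index ranges in one POD struct lets the host pass everything to the kernel by value in a single argument. A reduced sketch of the pattern (hypothetical names; the real launch goes through the GROMACS kernel-launch helpers):

    #include <cuda_runtime.h>

    struct ParamsSketch
    {
        const float4 *d_xq;        // device pointers are plain struct members
        float        *d_vTot;
        int           lastThread;  // inclusive end of the useful thread range
    };

    __global__ void kernelSketch(ParamsSketch p)  // struct copied by value at launch
    {
        const int tid = blockIdx.x*blockDim.x + threadIdx.x;
        if (tid <= p.lastThread)
        {
            atomicAdd(p.d_vTot, p.d_xq[tid].w);   // e.g. accumulate the charges
        }
    }
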
 
@@ -167,27 +166,27 @@ class GpuBonded::Impl
          *
          * \todo This is potentially several pinned allocations, which
          * could contribute to exhausting such pages. */
-        std::array<HostInteractionList, F_NRE> iLists;
+        std::array<HostInteractionList, F_NRE> iLists_;
+
        //! Tells whether there are any interactions in iLists_.
-        bool                                   haveInteractions_;
+        bool                  haveInteractions_;
         //! Interaction lists on the device.
-        t_ilist                                iListsDevice[F_NRE];
-
+        t_ilist               d_iLists_[F_NRE];
         //! Bonded parameters for device-side use.
-        t_iparams            *forceparamsDevice = nullptr;
+        t_iparams            *d_forceParams_ = nullptr;
         //! Position-charge vector on the device.
-        const float4         *xqDevice = nullptr;
+        const float4         *d_xq_ = nullptr;
         //! Force vector on the device.
-        fvec                 *forceDevice = nullptr;
+        fvec                 *d_f_ = nullptr;
         //! Shift force vector on the device.
-        fvec                 *fshiftDevice = nullptr;
+        fvec                 *d_fShift_ = nullptr;
        //! \brief Host-side buffer for the total bonded energies
-        HostVector <float>    vtot = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)};
+        HostVector <float>    vTot_ = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)};
        //! \brief Device-side buffer for the total bonded energies
-        float                *vtotDevice   = nullptr;
+        float                *d_vTot_   = nullptr;
 
         //! \brief Bonded GPU stream, not owned by this module
-        CommandStream         stream;
+        CommandStream         stream_;
 
         //! Parameters and pointers, passed to the CUDA kernel
         BondedCudaKernelParameters kernelParams_;
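
For reference, the naming scheme the renames in this patch converge on:

    // d_*    pointer to device (GPU) memory              d_forceParams_, d_vTot_
    // h_*    pointer to host (CPU) memory                h_vTot
    // gm_*   kernel argument residing in global memory   gm_xq, gm_f
    // sm_*   kernel-local shared memory                  sm_fShiftLoc
    // *_     private class data member                   stream_, vTot_, iLists_
    // num*   a count (replacing the n/nr prefixes)       numBonds, numFTypesOnGpu
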
index b0f19a866883a5ba4469ec6dcab31d39ff23dc7a..52a00914358a1b52c7152b02c58e55fcfa47c751 100644
 #include <math_constants.h>
 
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
 #include "gromacs/math/units.h"
 #include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/ifunc.h"
 #include "gromacs/utility/gmxassert.h"
 
 #include "gpubonded_impl.h"
@@ -102,28 +93,28 @@ static void harmonic_gpu(const float kA, const float xA, const float x, float *V
 
 template <bool calcVir, bool calcEner>
 __device__
-void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
-               const t_iatom forceatoms[], const t_iparams forceparams[],
-               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void bonds_gpu(const int i, float *vtot_loc, const int numBonds,
+               const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[3*i];
-        int ai   = forceatoms[3*i + 1];
-        int aj   = forceatoms[3*i + 2];
+        int type = d_forceatoms[3*i];
+        int ai   = d_forceatoms[3*i + 1];
+        int aj   = d_forceatoms[3*i + 2];
 
         /* dx = xi - xj, corrected for periodic boundary conditions. */
         fvec  dx;
-        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
+        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
 
         float dr2 = iprod_gpu(dx, dx);
         float dr  = sqrt(dr2);
 
         float vbond;
         float fbond;
-        harmonic_gpu(forceparams[type].harmonic.krA,
-                     forceparams[type].harmonic.rA,
+        harmonic_gpu(d_forceparams[type].harmonic.krA,
+                     d_forceparams[type].harmonic.rA,
                      dr, &vbond, &fbond);
 
         if (calcEner)
@@ -139,8 +130,8 @@ void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
             for (int m = 0; m < DIM; m++)
             {
                 float fij = fbond*dx[m];
-                atomicAdd(&force[ai][m], fij);
-                atomicAdd(&force[aj][m], -fij);
+                atomicAdd(&gm_f[ai][m], fij);
+                atomicAdd(&gm_f[aj][m], -fij);
                 if (calcVir && ki != CENTRAL)
                 {
                     atomicAdd(&sm_fShiftLoc[ki][m], fij);
@@ -170,17 +161,17 @@ static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
 
 template <bool calcVir, bool calcEner>
 __device__
-void angles_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void angles_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[4*i];
-        int   ai   = forceatoms[4*i + 1];
-        int   aj   = forceatoms[4*i + 2];
-        int   ak   = forceatoms[4*i + 3];
+        int   type = d_forceatoms[4*i];
+        int   ai   = d_forceatoms[4*i + 1];
+        int   aj   = d_forceatoms[4*i + 2];
+        int   ak   = d_forceatoms[4*i + 3];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -188,13 +179,13 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t1;
         int   t2;
         float theta =
-            bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
+            bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
                                     r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
         float dVdt;
-        harmonic_gpu(forceparams[type].harmonic.krA,
-                     forceparams[type].harmonic.rA*DEG2RAD,
+        harmonic_gpu(d_forceparams[type].harmonic.krA,
+                     d_forceparams[type].harmonic.rA*DEG2RAD,
                      theta, &va, &dVdt);
 
         if (calcEner)
@@ -226,9 +217,9 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
                 f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
                 f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
                 f_j[m]    = -f_i[m] - f_k[m];
-                atomicAdd(&force[ai][m], f_i[m]);
-                atomicAdd(&force[aj][m], f_j[m]);
-                atomicAdd(&force[ak][m], f_k[m]);
+                atomicAdd(&gm_f[ai][m], f_i[m]);
+                atomicAdd(&gm_f[aj][m], f_j[m]);
+                atomicAdd(&gm_f[ak][m], f_k[m]);
                 if (calcVir)
                 {
                     atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
@@ -243,29 +234,29 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
 
 template <bool calcVir, bool calcEner>
 __device__
-void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
-                      const t_iatom forceatoms[], const t_iparams forceparams[],
-                      const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void urey_bradley_gpu(const int i, float *vtot_loc, const int numBonds,
+                      const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                      const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                       const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type  = forceatoms[4*i];
-        int   ai    = forceatoms[4*i+1];
-        int   aj    = forceatoms[4*i+2];
-        int   ak    = forceatoms[4*i+3];
+        int   type  = d_forceatoms[4*i];
+        int   ai    = d_forceatoms[4*i+1];
+        int   aj    = d_forceatoms[4*i+2];
+        int   ak    = d_forceatoms[4*i+3];
 
-        float th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
-        float kthA  = forceparams[type].u_b.kthetaA;
-        float r13A  = forceparams[type].u_b.r13A;
-        float kUBA  = forceparams[type].u_b.kUBA;
+        float th0A  = d_forceparams[type].u_b.thetaA*DEG2RAD;
+        float kthA  = d_forceparams[type].u_b.kthetaA;
+        float r13A  = d_forceparams[type].u_b.r13A;
+        float kUBA  = d_forceparams[type].u_b.kUBA;
 
         fvec  r_ij;
         fvec  r_kj;
         float cos_theta;
         int   t1;
         int   t2;
-        float theta = bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
+        float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
                                               r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
@@ -278,7 +269,7 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
         }
 
         fvec  r_ik;
-        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[ak], r_ik);
+        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
 
         float dr2  = iprod_gpu(r_ik, r_ik);
         float dr   = dr2*rsqrtf(dr2);
@@ -309,9 +300,9 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
                 f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
                 f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
                 f_j[m]    = -f_i[m]-f_k[m];
-                atomicAdd(&force[ai][m], f_i[m]);
-                atomicAdd(&force[aj][m], f_j[m]);
-                atomicAdd(&force[ak][m], f_k[m]);
+                atomicAdd(&gm_f[ai][m], f_i[m]);
+                atomicAdd(&gm_f[aj][m], f_j[m]);
+                atomicAdd(&gm_f[ak][m], f_k[m]);
                 if (calcVir)
                 {
                     atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
@@ -335,8 +326,8 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
             for (int m = 0; m < DIM; m++)
             {
                 float fik = fbond*r_ik[m];
-                atomicAdd(&force[ai][m], fik);
-                atomicAdd(&force[ak][m], -fik);
+                atomicAdd(&gm_f[ai][m], fik);
+                atomicAdd(&gm_f[ak][m], -fik);
 
                 if (calcVir && ki != CENTRAL)
                 {
@@ -372,23 +363,23 @@ static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
 
 __device__ __forceinline__
 static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
-                        const float phi, float *V, float *F)
+                        const float phi, float *v, float *f)
 {
     float mdphi, sdphi;
 
     mdphi = mult*phi - phiA*DEG2RAD;
     sdphi = sinf(mdphi);
-    *V    = cpA * (1.0f + cosf(mdphi));
-    *F    = -cpA*mult*sdphi;
+    *v    = cpA * (1.0f + cosf(mdphi));
+    *f    = -cpA*mult*sdphi;
 }
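
In the notation of the parameters above (k = cpA, n = mult, φ_s = phiA·DEG2RAD), dopdihs_gpu evaluates the proper-dihedral potential and its derivative:

    V(\phi) = k \left[ 1 + \cos(n\phi - \phi_s) \right], \qquad
    \frac{dV}{d\phi} = -k \, n \, \sin(n\phi - \phi_s)

so *v receives V and *f receives −k n sin(nφ − φ_s); the sign conventions for the actual atomic forces are applied by do_dih_fup_gpu below.
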
 
 template <bool calcVir>
 __device__
 static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
                            const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
-                           const fvec m, const fvec n, fvec force[], fvec fshift[],
+                           const fvec m, const fvec n, fvec gm_f[], fvec sm_fShiftLoc[],
                            const PbcAiuc &pbcAiuc,
-                           const float4 xq[], const int t1, const int t2, const int gmx_unused t3)
+                           const float4 gm_xq[], const int t1, const int t2, const int gmx_unused t3)
 {
     float iprm  = iprod_gpu(m, m);
     float iprn  = iprod_gpu(n, n);
@@ -422,24 +413,24 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
 #pragma unroll
         for (int m = 0; (m < DIM); m++)
         {
-            atomicAdd(&force[i][m], f_i[m]);
-            atomicAdd(&force[j][m], -f_j[m]);
-            atomicAdd(&force[k][m], -f_k[m]);
-            atomicAdd(&force[l][m], f_l[m]);
+            atomicAdd(&gm_f[i][m], f_i[m]);
+            atomicAdd(&gm_f[j][m], -f_j[m]);
+            atomicAdd(&gm_f[k][m], -f_k[m]);
+            atomicAdd(&gm_f[l][m], f_l[m]);
         }
 
         if (calcVir)
         {
             fvec dx_jl;
-            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, xq[l], xq[j], dx_jl);
+            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
 
 #pragma unroll
             for (int m = 0; (m < DIM); m++)
             {
-                atomicAdd(&fshift[t1][m], f_i[m]);
-                atomicAdd(&fshift[CENTRAL][m], -f_j[m]);
-                atomicAdd(&fshift[t2][m], -f_k[m]);
-                atomicAdd(&fshift[t3][m], f_l[m]);
+                atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
+                atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f_j[m]);
+                atomicAdd(&sm_fShiftLoc[t2][m], -f_k[m]);
+                atomicAdd(&sm_fShiftLoc[t3][m], f_l[m]);
             }
         }
     }
@@ -447,18 +438,18 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
 
 template <bool calcVir, bool calcEner>
 __device__
-void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void  pdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[5*i];
-        int   ai   = forceatoms[5*i + 1];
-        int   aj   = forceatoms[5*i + 2];
-        int   ak   = forceatoms[5*i + 3];
-        int   al   = forceatoms[5*i + 4];
+        int   type = d_forceatoms[5*i];
+        int   ai   = d_forceatoms[5*i + 1];
+        int   aj   = d_forceatoms[5*i + 2];
+        int   ak   = d_forceatoms[5*i + 3];
+        int   al   = d_forceatoms[5*i + 4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -469,14 +460,14 @@ void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         float vpd;
         float ddphi;
-        dopdihs_gpu(forceparams[type].pdihs.cpA,
-                    forceparams[type].pdihs.phiA,
-                    forceparams[type].pdihs.mult,
+        dopdihs_gpu(d_forceparams[type].pdihs.cpA,
+                    d_forceparams[type].pdihs.phiA,
+                    d_forceparams[type].pdihs.mult,
                     phi, &vpd, &ddphi);
 
         if (calcEner)
@@ -486,28 +477,28 @@ void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
 
     }
 }
 
 template <bool calcVir, bool calcEner>
 __device__
-void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void rbdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
     constexpr float  c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[5*i];
-        int   ai   = forceatoms[5*i+1];
-        int   aj   = forceatoms[5*i+2];
-        int   ak   = forceatoms[5*i+3];
-        int   al   = forceatoms[5*i+4];
+        int   type = d_forceatoms[5*i];
+        int   ai   = d_forceatoms[5*i+1];
+        int   aj   = d_forceatoms[5*i+2];
+        int   ak   = d_forceatoms[5*i+3];
+        int   al   = d_forceatoms[5*i+4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -518,7 +509,7 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* Change to polymer convention */
@@ -538,7 +529,7 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         float parm[NR_RBDIHS];
         for (int j = 0; j < NR_RBDIHS; j++)
         {
-            parm[j]  = forceparams[type].rbdihs.rbcA[j];
+            parm[j]  = d_forceparams[type].rbdihs.rbcA[j];
         }
         /* Calculate cosine powers */
         /* Calculate the energy */
@@ -587,8 +578,8 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
         if (calcEner)
         {
             *vtot_loc += v;
@@ -612,18 +603,18 @@ static void make_dp_periodic_gpu(float *dp)
 
 template <bool calcVir, bool calcEner>
 __device__
-void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void  idihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[5*i];
-        int   ai   = forceatoms[5*i + 1];
-        int   aj   = forceatoms[5*i + 2];
-        int   ak   = forceatoms[5*i + 3];
-        int   al   = forceatoms[5*i + 4];
+        int   type = d_forceatoms[5*i];
+        int   ai   = d_forceatoms[5*i + 1];
+        int   aj   = d_forceatoms[5*i + 2];
+        int   ak   = d_forceatoms[5*i + 3];
+        int   al   = d_forceatoms[5*i + 4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -634,7 +625,7 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
@@ -644,8 +635,8 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
           * the dihedral is Pi away from phi0, which is very unlikely due to
          * the potential.
          */
-        float kA   = forceparams[type].harmonic.krA;
-        float pA   = forceparams[type].harmonic.rA;
+        float kA   = d_forceparams[type].harmonic.krA;
+        float pA   = d_forceparams[type].harmonic.rA;
 
         float phi0 = pA*DEG2RAD;
 
@@ -657,8 +648,8 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 -ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
 
         if (calcEner)
         {
@@ -669,26 +660,26 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
 template <bool calcVir, bool calcEner>
 __device__
-void pairs_gpu(const int i, const int nBonds,
+void pairs_gpu(const int i, const int numBonds,
                const t_iatom iatoms[], const t_iparams iparams[],
-               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc,
                const float scale_factor,
                float *vtotVdw_loc, float *vtotElec_loc)
 {
-    if (i <  nBonds)
+    if (i <  numBonds)
     {
         int   itype = iatoms[3*i];
         int   ai    = iatoms[3*i + 1];
         int   aj    = iatoms[3*i + 2];
 
-        float qq    = xq[ai].w*xq[aj].w;
+        float qq    = gm_xq[ai].w*gm_xq[aj].w;
         float c6    = iparams[itype].lj14.c6A;
         float c12   = iparams[itype].lj14.c12A;
 
         /* Do we need to apply full periodic boundary conditions? */
         fvec  dr;
-        int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);
+        int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
 
         float r2    = norm2_gpu(dr);
         float rinv  = rsqrtf(r2);
@@ -709,8 +700,8 @@ void pairs_gpu(const int i, const int nBonds,
 #pragma unroll
         for (int m = 0; m < DIM; m++)
         {
-            atomicAdd(&force[ai][m], f[m]);
-            atomicAdd(&force[aj][m], -f[m]);
+            atomicAdd(&gm_f[ai][m], f[m]);
+            atomicAdd(&gm_f[aj][m], -f[m]);
             if (calcVir && fshift_index != CENTRAL)
             {
                 atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
@@ -734,7 +725,7 @@ __global__
 void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 {
     assert(blockDim.y == 1 && blockDim.z == 1);
-    const int       threadIndex  = blockIdx.x*blockDim.x+threadIdx.x;
+    const int       tid          = blockIdx.x*blockDim.x+threadIdx.x;
     float           vtot_loc     = 0;
     float           vtotVdw_loc  = 0;
     float           vtotElec_loc = 0;
@@ -751,53 +742,52 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
         __syncthreads();
     }
 
-    int  ftype;
+    int  fType;
     bool threadComputedPotential = false;
 #pragma unroll
-    for (int j = 0; j < nFtypesOnGpu; j++)
+    for (int j = 0; j < numFTypesOnGpu; j++)
     {
-        if (threadIndex >= kernelParams.ftypeRangeStart[j] && threadIndex <= kernelParams.ftypeRangeEnd[j])
+        if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
         {
-            const int      nBonds           = kernelParams.nrFTypeBonds[j];
-
-            int            localThreadIndex = threadIndex - kernelParams.ftypeRangeStart[j];
-            const t_iatom *iatoms           = kernelParams.iatoms[j];
-            ftype                           = kernelParams.ftypesOnGpu[j];
+            const int      numBonds = kernelParams.numFTypeBonds[j];
+            int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
+            const t_iatom *iatoms   = kernelParams.d_iatoms[j];
+            fType                   = kernelParams.fTypesOnGpu[j];
             if (calcEner)
             {
                 threadComputedPotential         = true;
             }
 
-            switch (ftype)
+            switch (fType)
             {
                 case F_BONDS:
-                    bonds_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_ANGLES:
-                    angles_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    angles_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_UREY_BRADLEY:
-                    urey_bradley_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                        kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    urey_bradley_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                        kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_PDIHS:
                 case F_PIDIHS:
-                    pdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_RBDIHS:
-                    rbdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    rbdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_IDIHS:
-                    idihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_LJ14:
-                    pairs_gpu<calcVir, calcEner>(localThreadIndex, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc,
+                    pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
                                                  kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
                     break;
             }
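
Each thread locates its function type by an interval test against the precomputed ranges; because the ranges are contiguous and warp-aligned, all threads of a warp take the same switch branch. A condensed view of the dispatch (a sketch, not the verbatim kernel):

    for (int j = 0; j < numFTypesOnGpu; j++)
    {
        if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
        {
            // local index within this type's block of threads
            const int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
            // switch on kernelParams.fTypesOnGpu[j] and hand fTypeTid to the
            // per-type device function; a thread falls in at most one range
        }
    }
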
@@ -807,9 +797,9 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 
     if (threadComputedPotential)
     {
-        float *vtotVdw  = kernelParams.vtotDevice + F_LJ14;
-        float *vtotElec = kernelParams.vtotDevice + F_COUL14;
-        atomicAdd(kernelParams.vtotDevice + ftype, vtot_loc);
+        float *vtotVdw  = kernelParams.d_vTot + F_LJ14;
+        float *vtotElec = kernelParams.d_vTot + F_COUL14;
+        atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
         atomicAdd(vtotVdw, vtotVdw_loc);
         atomicAdd(vtotElec, vtotElec_loc);
     }
@@ -819,7 +809,7 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
         __syncthreads();
         if (threadIdx.x < SHIFTS)
         {
-            fvec_inc_atomic(kernelParams.fshiftDevice[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
+            fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
         }
     }
 }
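
The shift forces are staged in shared memory (zeroed at the top of the kernel) and flushed once per block, so the per-interaction atomics stay block-local and only SHIFTS atomic updates per block reach global memory. The skeleton of that pattern, with the interaction work elided:

    __shared__ fvec sm_fShiftLoc[SHIFTS];  // one shift-force vector per PBC image
    if (calcVir && threadIdx.x < SHIFTS)   // zero once per block
    {
        sm_fShiftLoc[threadIdx.x][XX] = 0.0f;
        sm_fShiftLoc[threadIdx.x][YY] = 0.0f;
        sm_fShiftLoc[threadIdx.x][ZZ] = 0.0f;
    }
    __syncthreads();
    // ... per-interaction atomicAdd into sm_fShiftLoc ...
    __syncthreads();
    if (calcVir && threadIdx.x < SHIFTS)   // one global atomic per shift vector
    {
        fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
    }
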
@@ -840,9 +830,9 @@ GpuBonded::Impl::launchKernel(const t_forcerec *fr,
     PbcAiuc       pbcAiuc;
     setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
 
-    int                ftypeRangeEnd = kernelParams_.ftypeRangeEnd[nFtypesOnGpu - 1];
+    int                fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 
-    if (ftypeRangeEnd < 0)
+    if (fTypeRangeEnd < 0)
     {
         return;
     }
@@ -851,10 +841,10 @@ GpuBonded::Impl::launchKernel(const t_forcerec *fr,
     config.blockSize[0] = TPB_BONDED;
     config.blockSize[1] = 1;
     config.blockSize[2] = 1;
-    config.gridSize[0]  = (ftypeRangeEnd + TPB_BONDED)/TPB_BONDED;
+    config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED;
     config.gridSize[1]  = 1;
     config.gridSize[2]  = 1;
-    config.stream       = stream;
+    config.stream       = stream_;
 
     auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
     kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;
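
Note that fTypeRangeEnd is an inclusive index, so fTypeRangeEnd + 1 threads are needed, and (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED is exactly the ceiling division ceil((fTypeRangeEnd + 1)/TPB_BONDED). For example, with TPB_BONDED = 256 and fTypeRangeEnd = 511, this gives (511 + 256)/256 = 2 blocks for 512 threads. The fTypeRangeEnd < 0 early return above covers the case where no interactions were assigned to the GPU at all.
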
index ee89e9b69acb07cd7b8b26749f2a6b155b224e74..3ad73d5dc1aa76a8be397c46ae93d4d58f287a24 100644
@@ -76,13 +76,13 @@ typedef struct {
 
 /*! \brief Divides listed interactions over threads
  *
- * This routine attempts to divide all interactions of the ntype bondeds
+ * This routine attempts to divide all interactions of the numType bonded
  * types stored in ild over the threads such that each thread has roughly
  * equal load and different threads avoid touching the same atoms as much
  * as possible.
  */
 static void divide_bondeds_by_locality(bonded_threading_t *bt,
-                                       int                 ntype,
+                                       int                 numType,
                                        const ilist_data_t *ild)
 {
     int nat_tot, nat_sum;
@@ -90,10 +90,10 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
     int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
     int f, t;
 
-    assert(ntype <= F_NRE);
+    assert(numType <= F_NRE);
 
     nat_tot = 0;
-    for (f = 0; f < ntype; f++)
+    for (f = 0; f < numType; f++)
     {
         /* Sum #bondeds*#atoms_per_bond over all bonded types */
         nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
@@ -108,7 +108,7 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
     /* Loop over the end bounds of the nthreads threads to determine
      * which interactions threads 0 to nthreads shall calculate.
      *
-     * NOTE: The cost of these combined loops is #interactions*ntype.
+     * NOTE: The cost of these combined loops is #interactions*numType.
      * This code is running single threaded (difficult to parallelize
      * over threads). So the relative cost of this function increases
      * linearly with the number of threads. Since the inner-most loop
@@ -146,14 +146,14 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
 
             /* Find out which of the types has the lowest atom index */
             f_min = 0;
-            for (f = 1; f < ntype; f++)
+            for (f = 1; f < numType; f++)
             {
                 if (at_ind[f] < at_ind[f_min])
                 {
                     f_min = f;
                 }
             }
-            assert(f_min >= 0 && f_min < ntype);
+            assert(f_min >= 0 && f_min < numType);
 
             /* Assign the interaction with the lowest atom index (of type
              * index f_min) to thread t-1 by increasing ind.
@@ -177,13 +177,13 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
         }
 
         /* Store the bonded end boundaries (at index t) for thread t-1 */
-        for (f = 0; f < ntype; f++)
+        for (f = 0; f < numType; f++)
         {
             bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
         }
     }
 
-    for (f = 0; f < ntype; f++)
+    for (f = 0; f < numType; f++)
     {
         assert(ind[f] == ild[f].il->nr);
     }
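
divide_bondeds_by_locality is in effect a k-way merge over the interaction types: at every step it assigns the interaction whose first atom index is lowest among the remaining types, keeping each thread's atoms spatially compact. The selection step, reduced to its core (same two index arrays as in the function above):

    // ind[f]    : next unassigned interaction of type f (in iatoms units)
    // at_ind[f] : first atom index of that interaction (sentinel when exhausted)
    int f_min = 0;
    for (int f = 1; f < numType; f++)
    {
        if (at_ind[f] < at_ind[f_min])
        {
            f_min = f;
        }
    }
    // assign interaction ind[f_min] of type f_min to the current thread,
    // then advance ind[f_min] and refresh at_ind[f_min]
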
@@ -211,29 +211,29 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
     assert(bt->nthreads > 0);
 
     bt->haveBondeds      = false;
-    int    ntype         = 0;
-    size_t ftypeGpuIndex = 0;
-    for (int ftype = 0; ftype < F_NRE; ftype++)
+    int    numType       = 0;
+    size_t fTypeGpuIndex = 0;
+    for (int fType = 0; fType < F_NRE; fType++)
     {
-        if (!ftype_is_bonded_potential(ftype))
+        if (!ftype_is_bonded_potential(fType))
         {
             continue;
         }
 
-        const t_ilist &il                     = idef.il[ftype];
+        const t_ilist &il                     = idef.il[fType];
         int            nrToAssignToCpuThreads = il.nr;
 
         if (useGpuForBondeds &&
-            ftypeGpuIndex < gmx::ftypesOnGpu.size() &&
-            gmx::ftypesOnGpu[ftypeGpuIndex] == ftype)
+            fTypeGpuIndex < gmx::fTypesOnGpu.size() &&
+            gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
         {
-            ftypeGpuIndex++;
+            fTypeGpuIndex++;
 
             /* Perturbation is not implemented in the GPU bonded kernels.
              * But instead of doing all on the CPU, we could do only
              * the actually perturbed interactions on the CPU.
              */
-            if (!ftypeHasPerturbedEntries(idef, ftype))
+            if (!ftypeHasPerturbedEntries(idef, fType))
             {
                 /* We will assign this interaction type to the GPU */
                 nrToAssignToCpuThreads = 0;
@@ -250,23 +250,23 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
             /* No interactions, avoid all the integer math below */
             for (int t = 0; t <= bt->nthreads; t++)
             {
-                bt->workDivision.setBound(ftype, t, 0);
+                bt->workDivision.setBound(fType, t, 0);
             }
         }
-        else if (bt->nthreads <= bt->max_nthread_uniform || ftype == F_DISRES)
+        else if (bt->nthreads <= bt->max_nthread_uniform || fType == F_DISRES)
         {
             /* On up to 4 threads, load balancing the bonded work
              * is more important than minimizing the reduction cost.
              */
 
-            const int stride = 1 + NRAL(ftype);
+            const int stride = 1 + NRAL(fType);
 
             for (int t = 0; t <= bt->nthreads; t++)
             {
                 /* Divide equally over the threads */
                 int nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride;
 
-                if (ftype == F_DISRES)
+                if (fType == F_DISRES)
                 {
                     /* Ensure that distance restraint pairs with the same label
                      * end up on the same thread.
@@ -279,27 +279,27 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
                     }
                 }
 
-                bt->workDivision.setBound(ftype, t, nr_t);
+                bt->workDivision.setBound(fType, t, nr_t);
             }
         }
         else
         {
-            /* Add this ftype to the list to be distributed */
-            int nat          = NRAL(ftype);
-            ild[ntype].ftype = ftype;
-            ild[ntype].il    = &il;
-            ild[ntype].nat   = nat;
+            /* Add this fType to the list to be distributed */
+            int nat          = NRAL(fType);
+            ild[numType].ftype = fType;
+            ild[numType].il    = &il;
+            ild[numType].nat   = nat;
 
             /* The first index for the thread division is always 0 */
-            bt->workDivision.setBound(ftype, 0, 0);
+            bt->workDivision.setBound(fType, 0, 0);
 
-            ntype++;
+            numType++;
         }
     }
 
-    if (ntype > 0)
+    if (numType > 0)
     {
-        divide_bondeds_by_locality(bt, ntype, ild);
+        divide_bondeds_by_locality(bt, numType, ild);
     }
 
     if (debug)
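
Worked example for the equal-division bound nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride above: with 10 bonds of a two-atom type (stride = 3, so il.nr = 30) on 4 threads, the bounds for t = 0..4 come out as 0, 6, 15, 21, 30, i.e. the threads are assigned 2, 3, 2, 3 bonds; multiplying by stride last guarantees every boundary falls on a whole interaction.
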