Cleaning up the GPU bonded kernel
author     Magnus Lundborg <lundborg.magnus@gmail.com>
           Tue, 18 Jun 2019 12:22:10 +0000 (14:22 +0200)
committer  Mark Abraham <mark.j.abraham@gmail.com>
           Sun, 30 Jun 2019 21:09:31 +0000 (23:09 +0200)
Improved variable names to follow the naming conventions (d_ prefixes
for device buffers, trailing underscores for class members, num* for
counts). Removed unnecessary includes.

Refs #2988

Change-Id: Ife5f50e68f27994203305e376f2781a076052b64

src/gromacs/listed_forces/gpubonded.h
src/gromacs/listed_forces/gpubonded_impl.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/listed_forces/gpubondedkernels.cu
src/gromacs/listed_forces/manage_threading.cpp

index 3f0f5eba3f3b571be19c7f2aff8f3b2f4cb7c0c5..df63d675d074ee7188d37684084da11893330de8 100644
@@ -61,7 +61,7 @@ struct t_idef;
 struct t_inputrec;
 
 /*! \brief The number of bonded function types supported on GPUs */
-static constexpr int nFtypesOnGpu = 8;
+static constexpr int numFTypesOnGpu = 8;
 
 namespace gmx
 {
@@ -73,7 +73,7 @@ namespace gmx
  * \note The function types in the list are ordered by increasing value.
  * \note Currently bonded interactions are only supported with CUDA, not with OpenCL.
  */
-constexpr std::array<int, nFtypesOnGpu> ftypesOnGpu =
+constexpr std::array<int, numFTypesOnGpu> fTypesOnGpu =
 {
     F_BONDS,
     F_ANGLES,
index dc816facc3865503a46039ad13989691aebe5a44..b7b20fdd52530ffbc75c28826d51aa59f4766adf 100644
@@ -58,9 +58,9 @@ namespace gmx
 //! Returns whether there are any interactions in ilists suitable for a GPU.
 static bool someInteractionsCanRunOnGpu(const InteractionLists &ilists)
 {
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (!ilists[ftype].iatoms.empty())
+        if (!ilists[fType].iatoms.empty())
         {
             // Perturbation is not implemented in the GPU bonded
             // kernels. If all the interactions were actually
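
For orientation, a minimal sketch of this predicate's overall shape, assuming it returns true on the first non-empty candidate list (perturbed entries are screened out later, when the lists are converted for the device):

    static bool someInteractionsCanRunOnGpu(const InteractionLists &ilists)
    {
        for (int fType : fTypesOnGpu)
        {
            if (!ilists[fType].iatoms.empty())
            {
                return true;   // at least one list the GPU kernels can cover
            }
        }
        return false;          // nothing suitable for the GPU
    }
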
index 050efc0f2d753e8f7e88c0365da74ea506e32d5d..065e88e253b049c5b08da31af00a82da6d2fe02d 100644
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/gpu_utils/hostallocator.h"
-#include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/topology/forcefieldparameters.h"
-#include "gromacs/topology/idef.h"
 
 struct t_forcerec;
 
@@ -69,62 +64,62 @@ namespace gmx
 GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
                       void                 *streamPtr)
 {
-    stream = *static_cast<CommandStream*>(streamPtr);
+    stream_ = *static_cast<CommandStream*>(streamPtr);
 
-    allocateDeviceBuffer(&forceparamsDevice, ffparams.numTypes(), nullptr);
+    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
     // This could be an async transfer (if the source is pinned), so
     // long as it uses the same stream as the kernels and we are happy
     // to consume additional pinned pages.
-    copyToDeviceBuffer(&forceparamsDevice, ffparams.iparams.data(),
+    copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(),
                        0, ffparams.numTypes(),
-                       stream, GpuApiCallBehavior::Sync, nullptr);
-    vtot.resize(F_NRE);
-    allocateDeviceBuffer(&vtotDevice, F_NRE, nullptr);
-    clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
+                       stream_, GpuApiCallBehavior::Sync, nullptr);
+    vTot_.resize(F_NRE);
+    allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 
-    for (int ftype = 0; ftype < F_NRE; ftype++)
+    for (int fType = 0; fType < F_NRE; fType++)
     {
-        iListsDevice[ftype].nr     = 0;
-        iListsDevice[ftype].iatoms = nullptr;
-        iListsDevice[ftype].nalloc = 0;
+        d_iLists_[fType].nr     = 0;
+        d_iLists_[fType].iatoms = nullptr;
+        d_iLists_[fType].nalloc = 0;
     }
 
-    kernelParams_.forceparamsDevice      = forceparamsDevice;
-    kernelParams_.xqDevice               = xqDevice;
-    kernelParams_.forceDevice            = forceDevice;
-    kernelParams_.fshiftDevice           = fshiftDevice;
-    kernelParams_.vtotDevice             = vtotDevice;
-    for (int i = 0; i < nFtypesOnGpu; i++)
+    kernelParams_.d_forceParams          = d_forceParams_;
+    kernelParams_.d_xq                   = d_xq_;
+    kernelParams_.d_f                    = d_f_;
+    kernelParams_.d_fShift               = d_fShift_;
+    kernelParams_.d_vTot                 = d_vTot_;
+    for (int i = 0; i < numFTypesOnGpu; i++)
     {
-        kernelParams_.iatoms[i]          = nullptr;
-        kernelParams_.ftypeRangeStart[i] = 0;
-        kernelParams_.ftypeRangeEnd[i]   = -1;
+        kernelParams_.d_iatoms[i]        = nullptr;
+        kernelParams_.fTypeRangeStart[i] = 0;
+        kernelParams_.fTypeRangeEnd[i]   = -1;
     }
 }
 
 GpuBonded::Impl::~Impl()
 {
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (iListsDevice[ftype].iatoms)
+        if (d_iLists_[fType].iatoms)
         {
-            freeDeviceBuffer(&iListsDevice[ftype].iatoms);
-            iListsDevice[ftype].iatoms = nullptr;
+            freeDeviceBuffer(&d_iLists_[fType].iatoms);
+            d_iLists_[fType].iatoms = nullptr;
         }
     }
 
-    freeDeviceBuffer(&forceparamsDevice);
-    freeDeviceBuffer(&vtotDevice);
+    freeDeviceBuffer(&d_forceParams_);
+    freeDeviceBuffer(&d_vTot_);
 }
 
-//! Return whether function type \p ftype in \p idef has perturbed interactions
-static bool ftypeHasPerturbedEntries(const t_idef  &idef,
-                                     int            ftype)
+//! Return whether function type \p fType in \p idef has perturbed interactions
+static bool fTypeHasPerturbedEntries(const t_idef  &idef,
+                                     int            fType)
 {
     GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
                "Perturbed interations should be sorted here");
 
-    const t_ilist &ilist = idef.il[ftype];
+    const t_ilist &ilist = idef.il[fType];
 
     return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
 }
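
The check relies on the invariant that, with ilsortFE_SORTED, perturbed interactions are stored at the tail of each list, so a single count comparison suffices. A self-contained illustration with simplified t_ilist fields:

    struct IlistCounts
    {
        int nr;               // total entries in the iatoms array
        int nr_nonperturbed;  // entries before the first perturbed one
    };

    // With free-energy sorting active, perturbed interactions occupy the
    // tail, so "has perturbed entries" reduces to comparing two counts.
    static bool hasPerturbed(const IlistCounts &il)
    {
        return il.nr_nonperturbed != il.nr;
    }
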
@@ -180,29 +175,29 @@ static inline int roundUpToFactor(const int input, const int factor)
 void
 GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                         const t_idef        &idef,
-                                                        void                *xqDevicePtr,
-                                                        void                *forceDevicePtr,
-                                                        void                *fshiftDevicePtr)
+                                                        void                *d_xqPtr,
+                                                        void                *d_fPtr,
+                                                        void                *d_fShiftPtr)
 {
     // TODO wallcycle sub start
     haveInteractions_ = false;
-    int ftypesCounter = 0;
+    int fTypesCounter = 0;
 
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        auto &iList = iLists[ftype];
+        auto &iList = iLists_[fType];
 
         /* Perturbation is not implemented in the GPU bonded kernels.
          * But instead of doing all interactions on the CPU, we can
          * still easily handle the types that have no perturbed
          * interactions on the GPU. */
-        if (idef.il[ftype].nr > 0 && !ftypeHasPerturbedEntries(idef, ftype))
+        if (idef.il[fType].nr > 0 && !fTypeHasPerturbedEntries(idef, fType))
         {
             haveInteractions_ = true;
 
-            convertIlistToNbnxnOrder(idef.il[ftype],
+            convertIlistToNbnxnOrder(idef.il[fType],
                                      &iList,
-                                     NRAL(ftype), nbnxnAtomOrder);
+                                     NRAL(fType), nbnxnAtomOrder);
         }
         else
         {
@@ -215,46 +210,46 @@ GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbn
         // end.
         if (iList.size() > 0)
         {
-            t_ilist &iListDevice = iListsDevice[ftype];
+            t_ilist &d_iList = d_iLists_[fType];
 
-            reallocateDeviceBuffer(&iListDevice.iatoms, iList.size(), &iListDevice.nr, &iListDevice.nalloc, nullptr);
+            reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr);
 
-            copyToDeviceBuffer(&iListDevice.iatoms, iList.iatoms.data(),
+            copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(),
                                0, iList.size(),
-                               stream, GpuApiCallBehavior::Async, nullptr);
+                               stream_, GpuApiCallBehavior::Async, nullptr);
         }
-        kernelParams_.ftypesOnGpu[ftypesCounter]   = ftype;
-        kernelParams_.nrFTypeIAtoms[ftypesCounter] = iList.size();
-        int nBonds = iList.size() / (interaction_function[ftype].nratoms + 1);
-        kernelParams_.nrFTypeBonds[ftypesCounter]  = nBonds;
-        kernelParams_.iatoms[ftypesCounter]        = iListsDevice[ftype].iatoms;
-        if (ftypesCounter == 0)
+        kernelParams_.fTypesOnGpu[fTypesCounter]    = fType;
+        kernelParams_.numFTypeIAtoms[fTypesCounter] = iList.size();
+        int numBonds = iList.size() / (interaction_function[fType].nratoms + 1);
+        kernelParams_.numFTypeBonds[fTypesCounter]  = numBonds;
+        kernelParams_.d_iatoms[fTypesCounter]       = d_iLists_[fType].iatoms;
+        if (fTypesCounter == 0)
         {
-            kernelParams_.ftypeRangeStart[ftypesCounter] = 0;
+            kernelParams_.fTypeRangeStart[fTypesCounter] = 0;
         }
         else
         {
-            kernelParams_.ftypeRangeStart[ftypesCounter] = kernelParams_.ftypeRangeEnd[ftypesCounter - 1] + 1;
+            kernelParams_.fTypeRangeStart[fTypesCounter] = kernelParams_.fTypeRangeEnd[fTypesCounter - 1] + 1;
         }
-        kernelParams_.ftypeRangeEnd[ftypesCounter] = kernelParams_.ftypeRangeStart[ftypesCounter] + roundUpToFactor(nBonds, warp_size) - 1;
+        kernelParams_.fTypeRangeEnd[fTypesCounter] = kernelParams_.fTypeRangeStart[fTypesCounter] + roundUpToFactor(numBonds, warp_size) - 1;
 
-        GMX_ASSERT(nBonds > 0 || kernelParams_.ftypeRangeEnd[ftypesCounter] <= kernelParams_.ftypeRangeStart[ftypesCounter],
-                   "Invalid GPU listed forces setup. nBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
-        GMX_ASSERT(kernelParams_.ftypeRangeStart[ftypesCounter] % warp_size == 0 && (kernelParams_.ftypeRangeEnd[ftypesCounter] + 1) % warp_size == 0,
+        GMX_ASSERT(numBonds > 0 || kernelParams_.fTypeRangeEnd[fTypesCounter] <= kernelParams_.fTypeRangeStart[fTypesCounter],
+                   "Invalid GPU listed forces setup. numBonds must be > 0 if there are threads allocated to do work on that interaction function type.");
+        GMX_ASSERT(kernelParams_.fTypeRangeStart[fTypesCounter] % warp_size == 0 && (kernelParams_.fTypeRangeEnd[fTypesCounter] + 1) % warp_size == 0,
                    "The bonded interactions must be assigned to the GPU in blocks of warp size.");
 
-        ftypesCounter++;
+        fTypesCounter++;
     }
 
-    xqDevice     = static_cast<float4 *>(xqDevicePtr);
-    forceDevice  = static_cast<fvec *>(forceDevicePtr);
-    fshiftDevice = static_cast<fvec *>(fshiftDevicePtr);
+    d_xq_                           = static_cast<float4 *>(d_xqPtr);
+    d_f_                            = static_cast<fvec *>(d_fPtr);
+    d_fShift_                       = static_cast<fvec *>(d_fShiftPtr);
 
-    kernelParams_.xqDevice          = xqDevice;
-    kernelParams_.forceDevice       = forceDevice;
-    kernelParams_.fshiftDevice      = fshiftDevice;
-    kernelParams_.forceparamsDevice = forceparamsDevice;
-    kernelParams_.vtotDevice        = vtotDevice;
+    kernelParams_.d_xq              = d_xq_;
+    kernelParams_.d_f               = d_f_;
+    kernelParams_.d_fShift          = d_fShift_;
+    kernelParams_.d_forceParams     = d_forceParams_;
+    kernelParams_.d_vTot            = d_vTot_;
 
     // TODO wallcycle sub stop
 }
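
The rounding to warp_size keeps every function type in whole warps, so no warp ever spans two fType ranges; that is what the two asserts above verify. roundUpToFactor itself lies outside the hunk; a plausible implementation consistent with those asserts:

    static inline int roundUpToFactor(const int input, const int factor)
    {
        const int remainder = input % factor;
        return (remainder == 0) ? input : input + (factor - remainder);
    }

For example, with warp_size = 32, two types with 33 and 10 bonds get the inclusive thread ranges [0, 63] and [64, 95]; threads beyond the last bond of a type simply fail the i < numBonds test in the kernel. A type with zero bonds gets an end index below its start, which is what the asserts allow for.
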
@@ -271,10 +266,10 @@ GpuBonded::Impl::launchEnergyTransfer()
     // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
 
-    float *vtot_h   = vtot.data();
-    copyFromDeviceBuffer(vtot_h, &vtotDevice,
+    float *h_vTot   = vTot_.data();
+    copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
-                         stream, GpuApiCallBehavior::Async, nullptr);
+                         stream_, GpuApiCallBehavior::Async, nullptr);
 }
 
 void
@@ -284,29 +279,29 @@ GpuBonded::Impl::accumulateEnergyTerms(gmx_enerdata_t *enerd)
     // wait goes in to the "Rest" counter
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed or transferred, so accumulation should not occur");
 
-    cudaError_t stat = cudaStreamSynchronize(stream);
+    cudaError_t stat = cudaStreamSynchronize(stream_);
     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
 
-    for (int ftype : ftypesOnGpu)
+    for (int fType : fTypesOnGpu)
     {
-        if (ftype != F_LJ14 && ftype != F_COUL14)
+        if (fType != F_LJ14 && fType != F_COUL14)
         {
-            enerd->term[ftype] += vtot[ftype];
+            enerd->term[fType] += vTot_[fType];
         }
     }
 
     // Note: We do not support energy groups here
     gmx_grppairener_t *grppener = &enerd->grpp;
     GMX_RELEASE_ASSERT(grppener->nener == 1, "No energy group support for bondeds on the GPU");
-    grppener->ener[egLJ14][0]   += vtot[F_LJ14];
-    grppener->ener[egCOUL14][0] += vtot[F_COUL14];
+    grppener->ener[egLJ14][0]   += vTot_[F_LJ14];
+    grppener->ener[egCOUL14][0] += vTot_[F_COUL14];
 }
 
 void
 GpuBonded::Impl::clearEnergies()
 {
     // TODO should wrap with ewcLAUNCH_GPU
-    clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 }
 
 // ---- GpuBonded
@@ -322,12 +317,12 @@ GpuBonded::~GpuBonded() = default;
 void
 GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
                                                   const t_idef        &idef,
-                                                  void                *xqDevice,
-                                                  void                *forceDevice,
-                                                  void                *fshiftDevice)
+                                                  void                *d_xq,
+                                                  void                *d_f,
+                                                  void                *d_fShift)
 {
     impl_->updateInteractionListsAndDeviceBuffers
-        (nbnxnAtomOrder, idef, xqDevice, forceDevice, fshiftDevice);
+        (nbnxnAtomOrder, idef, d_xq, d_f, d_fShift);
 }
 
 bool
index 0cfc429f4467898a7676aca934b1a98571a550d9..f33f6453246ac60bea10dc0c330d39c4e888b6fc 100644
@@ -53,7 +53,6 @@
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/pbcutil/pbc_aiuc.h"
-#include "gromacs/topology/idef.h"
 
 struct gmx_ffparams_t;
 struct t_forcerec;
@@ -87,28 +86,28 @@ struct BondedCudaKernelParameters
     //! Scale factor
     float               scaleFactor;
     //! The bonded types on GPU
-    int                 ftypesOnGpu[nFtypesOnGpu];
+    int                 fTypesOnGpu[numFTypesOnGpu];
     //! The number of interaction atom (iatom) elements for every function type
-    int                 nrFTypeIAtoms[nFtypesOnGpu];
+    int                 numFTypeIAtoms[numFTypesOnGpu];
     //! The number of bonds for every function type
-    int                 nrFTypeBonds[nFtypesOnGpu];
+    int                 numFTypeBonds[numFTypesOnGpu];
     //! The start index in the range of each interaction type
-    int                 ftypeRangeStart[nFtypesOnGpu];
+    int                 fTypeRangeStart[numFTypesOnGpu];
     //! The end index in the range of each interaction type
-    int                 ftypeRangeEnd[nFtypesOnGpu];
+    int                 fTypeRangeEnd[numFTypesOnGpu];
 
     //! Force parameters (on GPU)
-    t_iparams          *forceparamsDevice;
+    t_iparams          *d_forceParams;
     //! Coordinates before the timestep (on GPU)
-    const float4       *xqDevice;
+    const float4       *d_xq;
     //! Forces on atoms (on GPU)
-    fvec               *forceDevice;
+    fvec               *d_f;
     //! Force shifts on atoms (on GPU)
-    fvec               *fshiftDevice;
+    fvec               *d_fShift;
     //! Total Energy (on GPU)
-    float              *vtotDevice;
+    float              *d_vTot;
     //! Interaction list atoms (on GPU)
-    t_iatom            *iatoms[nFtypesOnGpu];
+    t_iatom            *d_iatoms[numFTypesOnGpu];
 
     BondedCudaKernelParameters()
     {
@@ -116,12 +115,12 @@ struct BondedCudaKernelParameters
 
         setPbcAiuc(0, boxDummy, &pbcAiuc);
 
-        scaleFactor       = 1.0;
-        forceparamsDevice = nullptr;
-        xqDevice          = nullptr;
-        forceDevice       = nullptr;
-        fshiftDevice      = nullptr;
-        vtotDevice        = nullptr;
+        scaleFactor   = 1.0;
+        d_forceParams = nullptr;
+        d_xq          = nullptr;
+        d_f           = nullptr;
+        d_fShift      = nullptr;
+        d_vTot        = nullptr;
     }
 };
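
Collecting the device pointers and index ranges in one POD struct lets the host pass everything to the kernel by value in a single argument. A reduced sketch of the pattern (hypothetical names; the real launch goes through the GROMACS kernel-launch helpers):

    #include <cuda_runtime.h>

    struct ParamsSketch
    {
        const float4 *d_xq;        // device pointers are plain struct members
        float        *d_vTot;
        int           lastThread;  // inclusive end of the useful thread range
    };

    __global__ void kernelSketch(ParamsSketch p)  // struct copied by value at launch
    {
        const int tid = blockIdx.x*blockDim.x + threadIdx.x;
        if (tid <= p.lastThread)
        {
            atomicAdd(p.d_vTot, p.d_xq[tid].w);   // e.g. accumulate the charges
        }
    }
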
 
@@ -167,27 +166,27 @@ class GpuBonded::Impl
          *
          * \todo This is potentially several pinned allocations, which
          * could contribute to exhausting such pages. */
-        std::array<HostInteractionList, F_NRE> iLists;
+        std::array<HostInteractionList, F_NRE> iLists_;
+
        //! Tells whether there are any interactions in iLists_.
-        bool                                   haveInteractions_;
+        bool                  haveInteractions_;
         //! Interaction lists on the device.
-        t_ilist                                iListsDevice[F_NRE];
-
+        t_ilist               d_iLists_[F_NRE];
         //! Bonded parameters for device-side use.
-        t_iparams            *forceparamsDevice = nullptr;
+        t_iparams            *d_forceParams_ = nullptr;
         //! Position-charge vector on the device.
-        const float4         *xqDevice = nullptr;
+        const float4         *d_xq_ = nullptr;
         //! Force vector on the device.
-        fvec                 *forceDevice = nullptr;
+        fvec                 *d_f_ = nullptr;
         //! Shift force vector on the device.
-        fvec                 *fshiftDevice = nullptr;
+        fvec                 *d_fShift_ = nullptr;
        //! \brief Host-side buffer for the total bonded energies
-        HostVector <float>    vtot = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)};
+        HostVector <float>    vTot_ = {{}, gmx::HostAllocationPolicy(gmx::PinningPolicy::PinnedIfSupported)};
        //! \brief Device-side buffer for the total bonded energies
-        float                *vtotDevice   = nullptr;
+        float                *d_vTot_   = nullptr;
 
         //! \brief Bonded GPU stream, not owned by this module
-        CommandStream         stream;
+        CommandStream         stream_;
 
         //! Parameters and pointers, passed to the CUDA kernel
         BondedCudaKernelParameters kernelParams_;
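
For reference, the naming scheme the renames in this patch converge on:

    // d_*    pointer to device (GPU) memory              d_forceParams_, d_vTot_
    // h_*    pointer to host (CPU) memory                h_vTot
    // gm_*   kernel argument residing in global memory   gm_xq, gm_f
    // sm_*   kernel-local shared memory                  sm_fShiftLoc
    // *_     private class data member                   stream_, vTot_, iLists_
    // num*   a count (replacing the n/nr prefixes)       numBonds, numFTypesOnGpu
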
index b0f19a866883a5ba4469ec6dcab31d39ff23dc7a..52a00914358a1b52c7152b02c58e55fcfa47c751 100644
 #include <math_constants.h>
 
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpu_vec.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/listed_forces/gpubonded.h"
-#include "gromacs/listed_forces/listed_forces.h"
 #include "gromacs/math/units.h"
 #include "gromacs/mdlib/force_flags.h"
-#include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/forcerec.h"
-#include "gromacs/mdtypes/group.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
-#include "gromacs/topology/idef.h"
-#include "gromacs/topology/ifunc.h"
 #include "gromacs/utility/gmxassert.h"
 
 #include "gpubonded_impl.h"
@@ -102,28 +93,28 @@ static void harmonic_gpu(const float kA, const float xA, const float x, float *V
 
 template <bool calcVir, bool calcEner>
 __device__
-void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
-               const t_iatom forceatoms[], const t_iparams forceparams[],
-               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void bonds_gpu(const int i, float *vtot_loc, const int numBonds,
+               const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int type = forceatoms[3*i];
-        int ai   = forceatoms[3*i + 1];
-        int aj   = forceatoms[3*i + 2];
+        int type = d_forceatoms[3*i];
+        int ai   = d_forceatoms[3*i + 1];
+        int aj   = d_forceatoms[3*i + 2];
 
         /* dx = xi - xj, corrected for periodic boundary conditions. */
         fvec  dx;
-        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
+        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
 
         float dr2 = iprod_gpu(dx, dx);
         float dr  = sqrt(dr2);
 
         float vbond;
         float fbond;
-        harmonic_gpu(forceparams[type].harmonic.krA,
-                     forceparams[type].harmonic.rA,
+        harmonic_gpu(d_forceparams[type].harmonic.krA,
+                     d_forceparams[type].harmonic.rA,
                      dr, &vbond, &fbond);
 
         if (calcEner)
@@ -139,8 +130,8 @@ void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
             for (int m = 0; m < DIM; m++)
             {
                 float fij = fbond*dx[m];
-                atomicAdd(&force[ai][m], fij);
-                atomicAdd(&force[aj][m], -fij);
+                atomicAdd(&gm_f[ai][m], fij);
+                atomicAdd(&gm_f[aj][m], -fij);
                 if (calcVir && ki != CENTRAL)
                 {
                     atomicAdd(&sm_fShiftLoc[ki][m], fij);
@@ -170,17 +161,17 @@ static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
 
 template <bool calcVir, bool calcEner>
 __device__
-void angles_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void angles_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[4*i];
-        int   ai   = forceatoms[4*i + 1];
-        int   aj   = forceatoms[4*i + 2];
-        int   ak   = forceatoms[4*i + 3];
+        int   type = d_forceatoms[4*i];
+        int   ai   = d_forceatoms[4*i + 1];
+        int   aj   = d_forceatoms[4*i + 2];
+        int   ak   = d_forceatoms[4*i + 3];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -188,13 +179,13 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t1;
         int   t2;
         float theta =
-            bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
+            bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
                                     r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
         float dVdt;
-        harmonic_gpu(forceparams[type].harmonic.krA,
-                     forceparams[type].harmonic.rA*DEG2RAD,
+        harmonic_gpu(d_forceparams[type].harmonic.krA,
+                     d_forceparams[type].harmonic.rA*DEG2RAD,
                      theta, &va, &dVdt);
 
         if (calcEner)
@@ -226,9 +217,9 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
                 f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
                 f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
                 f_j[m]    = -f_i[m] - f_k[m];
-                atomicAdd(&force[ai][m], f_i[m]);
-                atomicAdd(&force[aj][m], f_j[m]);
-                atomicAdd(&force[ak][m], f_k[m]);
+                atomicAdd(&gm_f[ai][m], f_i[m]);
+                atomicAdd(&gm_f[aj][m], f_j[m]);
+                atomicAdd(&gm_f[ak][m], f_k[m]);
                 if (calcVir)
                 {
                     atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
@@ -243,29 +234,29 @@ void angles_gpu(const int i, float *vtot_loc, const int nBonds,
 
 template <bool calcVir, bool calcEner>
 __device__
-void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
-                      const t_iatom forceatoms[], const t_iparams forceparams[],
-                      const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+void urey_bradley_gpu(const int i, float *vtot_loc, const int numBonds,
+                      const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                      const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                       const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type  = forceatoms[4*i];
-        int   ai    = forceatoms[4*i+1];
-        int   aj    = forceatoms[4*i+2];
-        int   ak    = forceatoms[4*i+3];
+        int   type  = d_forceatoms[4*i];
+        int   ai    = d_forceatoms[4*i+1];
+        int   aj    = d_forceatoms[4*i+2];
+        int   ak    = d_forceatoms[4*i+3];
 
-        float th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
-        float kthA  = forceparams[type].u_b.kthetaA;
-        float r13A  = forceparams[type].u_b.r13A;
-        float kUBA  = forceparams[type].u_b.kUBA;
+        float th0A  = d_forceparams[type].u_b.thetaA*DEG2RAD;
+        float kthA  = d_forceparams[type].u_b.kthetaA;
+        float r13A  = d_forceparams[type].u_b.r13A;
+        float kUBA  = d_forceparams[type].u_b.kUBA;
 
         fvec  r_ij;
         fvec  r_kj;
         float cos_theta;
         int   t1;
         int   t2;
-        float theta = bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
+        float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
                                               r_ij, r_kj, &cos_theta, &t1, &t2);
 
         float va;
@@ -278,7 +269,7 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
         }
 
         fvec  r_ik;
-        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[ak], r_ik);
+        int   ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
 
         float dr2  = iprod_gpu(r_ik, r_ik);
         float dr   = dr2*rsqrtf(dr2);
@@ -309,9 +300,9 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
                 f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
                 f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
                 f_j[m]    = -f_i[m]-f_k[m];
-                atomicAdd(&force[ai][m], f_i[m]);
-                atomicAdd(&force[aj][m], f_j[m]);
-                atomicAdd(&force[ak][m], f_k[m]);
+                atomicAdd(&gm_f[ai][m], f_i[m]);
+                atomicAdd(&gm_f[aj][m], f_j[m]);
+                atomicAdd(&gm_f[ak][m], f_k[m]);
                 if (calcVir)
                 {
                     atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
@@ -335,8 +326,8 @@ void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
             for (int m = 0; m < DIM; m++)
             {
                 float fik = fbond*r_ik[m];
-                atomicAdd(&force[ai][m], fik);
-                atomicAdd(&force[ak][m], -fik);
+                atomicAdd(&gm_f[ai][m], fik);
+                atomicAdd(&gm_f[ak][m], -fik);
 
                 if (calcVir && ki != CENTRAL)
                 {
@@ -372,23 +363,23 @@ static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
 
 __device__ __forceinline__
 static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
-                        const float phi, float *V, float *F)
+                        const float phi, float *v, float *f)
 {
     float mdphi, sdphi;
 
     mdphi = mult*phi - phiA*DEG2RAD;
     sdphi = sinf(mdphi);
-    *V    = cpA * (1.0f + cosf(mdphi));
-    *F    = -cpA*mult*sdphi;
+    *v    = cpA * (1.0f + cosf(mdphi));
+    *f    = -cpA*mult*sdphi;
 }
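
In the notation of the parameters above (k = cpA, n = mult, φ_s = phiA·DEG2RAD), dopdihs_gpu evaluates the proper-dihedral potential and its derivative:

    V(\phi) = k \left[ 1 + \cos(n\phi - \phi_s) \right], \qquad
    \frac{dV}{d\phi} = -k \, n \, \sin(n\phi - \phi_s)

so *v receives V and *f receives −k n sin(nφ − φ_s); the sign conventions for the actual atomic forces are applied by do_dih_fup_gpu below.
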
 
 template <bool calcVir>
 __device__
 static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
                            const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
-                           const fvec m, const fvec n, fvec force[], fvec fshift[],
+                           const fvec m, const fvec n, fvec gm_f[], fvec sm_fShiftLoc[],
                            const PbcAiuc &pbcAiuc,
-                           const float4 xq[], const int t1, const int t2, const int gmx_unused t3)
+                           const float4 gm_xq[], const int t1, const int t2, const int gmx_unused t3)
 {
     float iprm  = iprod_gpu(m, m);
     float iprn  = iprod_gpu(n, n);
@@ -422,24 +413,24 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
 #pragma unroll
         for (int m = 0; (m < DIM); m++)
         {
-            atomicAdd(&force[i][m], f_i[m]);
-            atomicAdd(&force[j][m], -f_j[m]);
-            atomicAdd(&force[k][m], -f_k[m]);
-            atomicAdd(&force[l][m], f_l[m]);
+            atomicAdd(&gm_f[i][m], f_i[m]);
+            atomicAdd(&gm_f[j][m], -f_j[m]);
+            atomicAdd(&gm_f[k][m], -f_k[m]);
+            atomicAdd(&gm_f[l][m], f_l[m]);
         }
 
         if (calcVir)
         {
             fvec dx_jl;
-            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, xq[l], xq[j], dx_jl);
+            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
 
 #pragma unroll
             for (int m = 0; (m < DIM); m++)
             {
-                atomicAdd(&fshift[t1][m], f_i[m]);
-                atomicAdd(&fshift[CENTRAL][m], -f_j[m]);
-                atomicAdd(&fshift[t2][m], -f_k[m]);
-                atomicAdd(&fshift[t3][m], f_l[m]);
+                atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
+                atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f_j[m]);
+                atomicAdd(&sm_fShiftLoc[t2][m], -f_k[m]);
+                atomicAdd(&sm_fShiftLoc[t3][m], f_l[m]);
             }
         }
     }
@@ -447,18 +438,18 @@ static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
 
 template <bool calcVir, bool calcEner>
 __device__
-void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void  pdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[5*i];
-        int   ai   = forceatoms[5*i + 1];
-        int   aj   = forceatoms[5*i + 2];
-        int   ak   = forceatoms[5*i + 3];
-        int   al   = forceatoms[5*i + 4];
+        int   type = d_forceatoms[5*i];
+        int   ai   = d_forceatoms[5*i + 1];
+        int   aj   = d_forceatoms[5*i + 2];
+        int   ak   = d_forceatoms[5*i + 3];
+        int   al   = d_forceatoms[5*i + 4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -469,14 +460,14 @@ void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         float vpd;
         float ddphi;
-        dopdihs_gpu(forceparams[type].pdihs.cpA,
-                    forceparams[type].pdihs.phiA,
-                    forceparams[type].pdihs.mult,
+        dopdihs_gpu(d_forceparams[type].pdihs.cpA,
+                    d_forceparams[type].pdihs.phiA,
+                    d_forceparams[type].pdihs.mult,
                     phi, &vpd, &ddphi);
 
         if (calcEner)
@@ -486,28 +477,28 @@ void  pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
 
     }
 }
 
 template <bool calcVir, bool calcEner>
 __device__
-void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void rbdihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
     constexpr float  c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[5*i];
-        int   ai   = forceatoms[5*i+1];
-        int   aj   = forceatoms[5*i+2];
-        int   ak   = forceatoms[5*i+3];
-        int   al   = forceatoms[5*i+4];
+        int   type = d_forceatoms[5*i];
+        int   ai   = d_forceatoms[5*i+1];
+        int   aj   = d_forceatoms[5*i+2];
+        int   ak   = d_forceatoms[5*i+3];
+        int   al   = d_forceatoms[5*i+4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -518,7 +509,7 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* Change to polymer convention */
@@ -538,7 +529,7 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
         float parm[NR_RBDIHS];
         for (int j = 0; j < NR_RBDIHS; j++)
         {
-            parm[j]  = forceparams[type].rbdihs.rbcA[j];
+            parm[j]  = d_forceparams[type].rbdihs.rbcA[j];
         }
         /* Calculate cosine powers */
         /* Calculate the energy */
@@ -587,8 +578,8 @@ void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
         if (calcEner)
         {
             *vtot_loc += v;
@@ -612,18 +603,18 @@ static void make_dp_periodic_gpu(float *dp)
 
 template <bool calcVir, bool calcEner>
 __device__
-void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
-                const t_iatom forceatoms[], const t_iparams forceparams[],
-                const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
+void  idihs_gpu(const int i, float *vtot_loc, const int numBonds,
+                const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
+                const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                 const PbcAiuc pbcAiuc)
 {
-    if (i < nBonds)
+    if (i < numBonds)
     {
-        int   type = forceatoms[5*i];
-        int   ai   = forceatoms[5*i + 1];
-        int   aj   = forceatoms[5*i + 2];
-        int   ak   = forceatoms[5*i + 3];
-        int   al   = forceatoms[5*i + 4];
+        int   type = d_forceatoms[5*i];
+        int   ai   = d_forceatoms[5*i + 1];
+        int   aj   = d_forceatoms[5*i + 2];
+        int   ak   = d_forceatoms[5*i + 3];
+        int   al   = d_forceatoms[5*i + 4];
 
         fvec  r_ij;
         fvec  r_kj;
@@ -634,7 +625,7 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
         int   t2;
         int   t3;
         float phi  =
-            dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
+            dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 
         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
@@ -644,8 +635,8 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
           * the dihedral is Pi away from phi0, which is very unlikely due to
          * the potential.
          */
-        float kA   = forceparams[type].harmonic.krA;
-        float pA   = forceparams[type].harmonic.rA;
+        float kA   = d_forceparams[type].harmonic.krA;
+        float pA   = d_forceparams[type].harmonic.rA;
 
         float phi0 = pA*DEG2RAD;
 
@@ -657,8 +648,8 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                                 -ddphi, r_ij, r_kj, r_kl, m, n,
-                                f, sm_fShiftLoc, pbcAiuc,
-                                xq, t1, t2, t3);
+                                gm_f, sm_fShiftLoc, pbcAiuc,
+                                gm_xq, t1, t2, t3);
 
         if (calcEner)
         {
@@ -669,26 +660,26 @@ void  idihs_gpu(const int i, float *vtot_loc, const int nBonds,
 
 template <bool calcVir, bool calcEner>
 __device__
-void pairs_gpu(const int i, const int nBonds,
+void pairs_gpu(const int i, const int numBonds,
                const t_iatom iatoms[], const t_iparams iparams[],
-               const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
+               const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
                const PbcAiuc pbcAiuc,
                const float scale_factor,
                float *vtotVdw_loc, float *vtotElec_loc)
 {
-    if (i <  nBonds)
+    if (i <  numBonds)
     {
         int   itype = iatoms[3*i];
         int   ai    = iatoms[3*i + 1];
         int   aj    = iatoms[3*i + 2];
 
-        float qq    = xq[ai].w*xq[aj].w;
+        float qq    = gm_xq[ai].w*gm_xq[aj].w;
         float c6    = iparams[itype].lj14.c6A;
         float c12   = iparams[itype].lj14.c12A;
 
         /* Do we need to apply full periodic boundary conditions? */
         fvec  dr;
-        int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);
+        int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
 
         float r2    = norm2_gpu(dr);
         float rinv  = rsqrtf(r2);
@@ -709,8 +700,8 @@ void pairs_gpu(const int i, const int nBonds,
 #pragma unroll
         for (int m = 0; m < DIM; m++)
         {
-            atomicAdd(&force[ai][m], f[m]);
-            atomicAdd(&force[aj][m], -f[m]);
+            atomicAdd(&gm_f[ai][m], f[m]);
+            atomicAdd(&gm_f[aj][m], -f[m]);
             if (calcVir && fshift_index != CENTRAL)
             {
                 atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
@@ -734,7 +725,7 @@ __global__
 void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 {
     assert(blockDim.y == 1 && blockDim.z == 1);
-    const int       threadIndex  = blockIdx.x*blockDim.x+threadIdx.x;
+    const int       tid          = blockIdx.x*blockDim.x+threadIdx.x;
     float           vtot_loc     = 0;
     float           vtotVdw_loc  = 0;
     float           vtotElec_loc = 0;
@@ -751,53 +742,52 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
         __syncthreads();
     }
 
-    int  ftype;
+    int  fType;
     bool threadComputedPotential = false;
 #pragma unroll
-    for (int j = 0; j < nFtypesOnGpu; j++)
+    for (int j = 0; j < numFTypesOnGpu; j++)
     {
-        if (threadIndex >= kernelParams.ftypeRangeStart[j] && threadIndex <= kernelParams.ftypeRangeEnd[j])
+        if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
         {
-            const int      nBonds           = kernelParams.nrFTypeBonds[j];
-
-            int            localThreadIndex = threadIndex - kernelParams.ftypeRangeStart[j];
-            const t_iatom *iatoms           = kernelParams.iatoms[j];
-            ftype                           = kernelParams.ftypesOnGpu[j];
+            const int      numBonds = kernelParams.numFTypeBonds[j];
+            int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
+            const t_iatom *iatoms   = kernelParams.d_iatoms[j];
+            fType                   = kernelParams.fTypesOnGpu[j];
             if (calcEner)
             {
                 threadComputedPotential         = true;
             }
 
-            switch (ftype)
+            switch (fType)
             {
                 case F_BONDS:
-                    bonds_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_ANGLES:
-                    angles_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    angles_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_UREY_BRADLEY:
-                    urey_bradley_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                        kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    urey_bradley_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                        kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_PDIHS:
                 case F_PIDIHS:
-                    pdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_RBDIHS:
-                    rbdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                  kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    rbdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_IDIHS:
-                    idihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
+                    idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                     break;
                 case F_LJ14:
-                    pairs_gpu<calcVir, calcEner>(localThreadIndex, nBonds, iatoms, kernelParams.forceparamsDevice,
-                                                 kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc,
+                    pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
+                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
                                                  kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
                     break;
             }
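
Each thread locates its function type by an interval test against the precomputed ranges; because the ranges are contiguous and warp-aligned, all threads of a warp take the same switch branch. A condensed view of the dispatch (a sketch, not the verbatim kernel):

    for (int j = 0; j < numFTypesOnGpu; j++)
    {
        if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
        {
            // local index within this type's block of threads
            const int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
            // switch on kernelParams.fTypesOnGpu[j] and hand fTypeTid to the
            // per-type device function; a thread falls in at most one range
        }
    }
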
@@ -807,9 +797,9 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 
     if (threadComputedPotential)
     {
-        float *vtotVdw  = kernelParams.vtotDevice + F_LJ14;
-        float *vtotElec = kernelParams.vtotDevice + F_COUL14;
-        atomicAdd(kernelParams.vtotDevice + ftype, vtot_loc);
+        float *vtotVdw  = kernelParams.d_vTot + F_LJ14;
+        float *vtotElec = kernelParams.d_vTot + F_COUL14;
+        atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
         atomicAdd(vtotVdw, vtotVdw_loc);
         atomicAdd(vtotElec, vtotElec_loc);
     }
@@ -819,7 +809,7 @@ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
         __syncthreads();
         if (threadIdx.x < SHIFTS)
         {
-            fvec_inc_atomic(kernelParams.fshiftDevice[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
+            fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
         }
     }
 }
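
The shift forces are staged in shared memory (zeroed at the top of the kernel) and flushed once per block, so the per-interaction atomics stay block-local and only SHIFTS atomic updates per block reach global memory. The skeleton of that pattern, with the interaction work elided:

    __shared__ fvec sm_fShiftLoc[SHIFTS];  // one shift-force vector per PBC image
    if (calcVir && threadIdx.x < SHIFTS)   // zero once per block
    {
        sm_fShiftLoc[threadIdx.x][XX] = 0.0f;
        sm_fShiftLoc[threadIdx.x][YY] = 0.0f;
        sm_fShiftLoc[threadIdx.x][ZZ] = 0.0f;
    }
    __syncthreads();
    // ... per-interaction atomicAdd into sm_fShiftLoc ...
    __syncthreads();
    if (calcVir && threadIdx.x < SHIFTS)   // one global atomic per shift vector
    {
        fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
    }
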
@@ -840,9 +830,9 @@ GpuBonded::Impl::launchKernel(const t_forcerec *fr,
     PbcAiuc       pbcAiuc;
     setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
 
-    int                ftypeRangeEnd = kernelParams_.ftypeRangeEnd[nFtypesOnGpu - 1];
+    int                fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 
-    if (ftypeRangeEnd < 0)
+    if (fTypeRangeEnd < 0)
     {
         return;
     }
@@ -851,10 +841,10 @@ GpuBonded::Impl::launchKernel(const t_forcerec *fr,
     config.blockSize[0] = TPB_BONDED;
     config.blockSize[1] = 1;
     config.blockSize[2] = 1;
-    config.gridSize[0]  = (ftypeRangeEnd + TPB_BONDED)/TPB_BONDED;
+    config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED;
     config.gridSize[1]  = 1;
     config.gridSize[2]  = 1;
-    config.stream       = stream;
+    config.stream       = stream_;
 
     auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
     kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;
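
Note that fTypeRangeEnd is an inclusive index, so fTypeRangeEnd + 1 threads are needed, and (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED is exactly the ceiling division ceil((fTypeRangeEnd + 1)/TPB_BONDED). For example, with TPB_BONDED = 256 and fTypeRangeEnd = 511, this gives (511 + 256)/256 = 2 blocks for 512 threads. The fTypeRangeEnd < 0 early return above covers the case where no interactions were assigned to the GPU at all.
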
index ee89e9b69acb07cd7b8b26749f2a6b155b224e74..3ad73d5dc1aa76a8be397c46ae93d4d58f287a24 100644
@@ -76,13 +76,13 @@ typedef struct {
 
 /*! \brief Divides listed interactions over threads
  *
- * This routine attempts to divide all interactions of the ntype bondeds
+ * This routine attempts to divide all interactions of the numType bonded
  * types stored in ild over the threads such that each thread has roughly
  * equal load and different threads avoid touching the same atoms as much
  * as possible.
  */
 static void divide_bondeds_by_locality(bonded_threading_t *bt,
-                                       int                 ntype,
+                                       int                 numType,
                                        const ilist_data_t *ild)
 {
     int nat_tot, nat_sum;
@@ -90,10 +90,10 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
     int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
     int f, t;
 
-    assert(ntype <= F_NRE);
+    assert(numType <= F_NRE);
 
     nat_tot = 0;
-    for (f = 0; f < ntype; f++)
+    for (f = 0; f < numType; f++)
     {
         /* Sum #bondeds*#atoms_per_bond over all bonded types */
         nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
@@ -108,7 +108,7 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
     /* Loop over the end bounds of the nthreads threads to determine
      * which interactions threads 0 to nthreads shall calculate.
      *
-     * NOTE: The cost of these combined loops is #interactions*ntype.
+     * NOTE: The cost of these combined loops is #interactions*numType.
      * This code is running single threaded (difficult to parallelize
      * over threads). So the relative cost of this function increases
      * linearly with the number of threads. Since the inner-most loop
@@ -146,14 +146,14 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
 
             /* Find out which of the types has the lowest atom index */
             f_min = 0;
-            for (f = 1; f < ntype; f++)
+            for (f = 1; f < numType; f++)
             {
                 if (at_ind[f] < at_ind[f_min])
                 {
                     f_min = f;
                 }
             }
-            assert(f_min >= 0 && f_min < ntype);
+            assert(f_min >= 0 && f_min < numType);
 
             /* Assign the interaction with the lowest atom index (of type
              * index f_min) to thread t-1 by increasing ind.
@@ -177,13 +177,13 @@ static void divide_bondeds_by_locality(bonded_threading_t *bt,
         }
 
         /* Store the bonded end boundaries (at index t) for thread t-1 */
-        for (f = 0; f < ntype; f++)
+        for (f = 0; f < numType; f++)
         {
             bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
         }
     }
 
-    for (f = 0; f < ntype; f++)
+    for (f = 0; f < numType; f++)
     {
         assert(ind[f] == ild[f].il->nr);
     }
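
divide_bondeds_by_locality is in effect a k-way merge over the interaction types: at every step it assigns the interaction whose first atom index is lowest among the remaining types, keeping each thread's atoms spatially compact. The selection step, reduced to its core (same two index arrays as in the function above):

    // ind[f]    : next unassigned interaction of type f (in iatoms units)
    // at_ind[f] : first atom index of that interaction (sentinel when exhausted)
    int f_min = 0;
    for (int f = 1; f < numType; f++)
    {
        if (at_ind[f] < at_ind[f_min])
        {
            f_min = f;
        }
    }
    // assign interaction ind[f_min] of type f_min to the current thread,
    // then advance ind[f_min] and refresh at_ind[f_min]
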
@@ -211,29 +211,29 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
     assert(bt->nthreads > 0);
 
     bt->haveBondeds      = false;
-    int    ntype         = 0;
-    size_t ftypeGpuIndex = 0;
-    for (int ftype = 0; ftype < F_NRE; ftype++)
+    int    numType       = 0;
+    size_t fTypeGpuIndex = 0;
+    for (int fType = 0; fType < F_NRE; fType++)
     {
-        if (!ftype_is_bonded_potential(ftype))
+        if (!ftype_is_bonded_potential(fType))
         {
             continue;
         }
 
-        const t_ilist &il                     = idef.il[ftype];
+        const t_ilist &il                     = idef.il[fType];
         int            nrToAssignToCpuThreads = il.nr;
 
         if (useGpuForBondeds &&
-            ftypeGpuIndex < gmx::ftypesOnGpu.size() &&
-            gmx::ftypesOnGpu[ftypeGpuIndex] == ftype)
+            fTypeGpuIndex < gmx::fTypesOnGpu.size() &&
+            gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
         {
-            ftypeGpuIndex++;
+            fTypeGpuIndex++;
 
             /* Perturbation is not implemented in the GPU bonded kernels.
              * But instead of doing all on the CPU, we could do only
              * the actually perturbed interactions on the CPU.
              */
-            if (!ftypeHasPerturbedEntries(idef, ftype))
+            if (!ftypeHasPerturbedEntries(idef, fType))
             {
                 /* We will assign this interaction type to the GPU */
                 nrToAssignToCpuThreads = 0;
@@ -250,23 +250,23 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
             /* No interactions, avoid all the integer math below */
             for (int t = 0; t <= bt->nthreads; t++)
             {
-                bt->workDivision.setBound(ftype, t, 0);
+                bt->workDivision.setBound(fType, t, 0);
             }
         }
-        else if (bt->nthreads <= bt->max_nthread_uniform || ftype == F_DISRES)
+        else if (bt->nthreads <= bt->max_nthread_uniform || fType == F_DISRES)
         {
             /* On up to 4 threads, load balancing the bonded work
              * is more important than minimizing the reduction cost.
              */
 
-            const int stride = 1 + NRAL(ftype);
+            const int stride = 1 + NRAL(fType);
 
             for (int t = 0; t <= bt->nthreads; t++)
             {
                 /* Divide equally over the threads */
                 int nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride;
 
-                if (ftype == F_DISRES)
+                if (fType == F_DISRES)
                 {
                     /* Ensure that distance restraint pairs with the same label
                      * end up on the same thread.
@@ -279,27 +279,27 @@ static void divide_bondeds_over_threads(bonded_threading_t *bt,
                     }
                 }
 
-                bt->workDivision.setBound(ftype, t, nr_t);
+                bt->workDivision.setBound(fType, t, nr_t);
             }
         }
         else
         {
-            /* Add this ftype to the list to be distributed */
-            int nat          = NRAL(ftype);
-            ild[ntype].ftype = ftype;
-            ild[ntype].il    = &il;
-            ild[ntype].nat   = nat;
+            /* Add this fType to the list to be distributed */
+            int nat          = NRAL(fType);
+            ild[numType].ftype = fType;
+            ild[numType].il    = &il;
+            ild[numType].nat   = nat;
 
             /* The first index for the thread division is always 0 */
-            bt->workDivision.setBound(ftype, 0, 0);
+            bt->workDivision.setBound(fType, 0, 0);
 
-            ntype++;
+            numType++;
         }
     }
 
-    if (ntype > 0)
+    if (numType > 0)
     {
-        divide_bondeds_by_locality(bt, ntype, ild);
+        divide_bondeds_by_locality(bt, numType, ild);
     }
 
     if (debug)
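
Worked example for the equal-division bound nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride above: with 10 bonds of a two-atom type (stride = 3, so il.nr = 30) on 4 threads, the bounds for t = 0..4 come out as 0, 6, 15, 21, 30, i.e. the threads are assigned 2, 3, 2, 3 bonds; multiplying by stride last guarantees every boundary falls on a whole interaction.
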