Revert "Wrap more device pointers in DeviceBuffer" (!1244)
author    Andrey Alekseenko <al42and@gmail.com>
          Wed, 10 Mar 2021 09:06:41 +0000
committer Paul Bauer <paul.bauer.q@gmail.com>
          Wed, 10 Mar 2021 09:06:41 +0000
This reverts commit 850429f3ebe34d27dabed6d8c31c08968befd1f5,
which broke some MPI functionality, as revealed in post-merge tests.
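
The change being reverted had wrapped the raw void* / rvec* device pointers of the
PME, GPU force-reduction, update-constraints, and nbnxm interfaces in typed
DeviceBuffer<gmx::RVec> handles; restoring the raw-pointer form means callers once
again cast at each use site (see the pme_only.cpp and sim_util.cpp hunks below).
As a minimal self-contained sketch of the two interface styles (stand-in types
only, not GROMACS code; in CUDA builds DeviceBuffer<T> is a plain alias for T*,
while the OpenCL and SYCL builds use a wrapper type):

    // Sketch with hypothetical stand-in types; it illustrates the two handle
    // styles this revert switches between, not actual GROMACS declarations.
    #include <cstdio>

    struct float3
    {
        float x, y, z;
    };

    template<typename T>
    using DeviceBuffer = T*; // CUDA-style alias; other backends wrap the handle

    // Style before the revert: the getter returns a typed device-buffer handle,
    // so the element type travels with the pointer.
    static DeviceBuffer<float3> getDeviceForcesTyped(DeviceBuffer<float3> d_f)
    {
        return d_f;
    }

    // Style after the revert: the getter returns void*, and each caller casts
    // back to the concrete element type at the use site.
    static void* getDeviceForcesUntyped(DeviceBuffer<float3> d_f)
    {
        return d_f;
    }

    int main()
    {
        float3 forces[8] = {}; // host array standing in for a device allocation
        DeviceBuffer<float3> typed = getDeviceForcesTyped(forces);
        auto* untyped = static_cast<float3*>(getDeviceForcesUntyped(forces));
        std::printf("same address: %s\n", typed == untyped ? "yes" : "no");
        return 0;
    }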

26 files changed:
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_types.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/mdlib/gpuforcereduction.h
src/gromacs/mdlib/gpuforcereduction_impl.cpp
src/gromacs/mdlib/gpuforcereduction_impl.cu
src/gromacs/mdlib/gpuforcereduction_impl.cuh [moved from src/gromacs/mdlib/gpuforcereduction_impl.h with 93% similarity]
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index d4a591b7bfc0b1949126d2a212d84ba81a316e42..3f2d33f6c69ff8cc969ea01b9aed3682dd3273cd 100644
@@ -467,12 +467,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+        GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
- * \returns                  Pointer to synchronizer
+ * \returns                  Pointer to synchronizer
  */
 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index ec88e8569babd0df83a3cec327152cd79528d0a3..bcc3b1e39330cd530bdcfef9b76000c6d0b34639 100644
@@ -45,7 +45,6 @@
 #include <memory>
 
 #include "gromacs/math/vectypes.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceStream;
@@ -84,7 +83,7 @@ public:
      * Initialization of GPU PME Force sender
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
+    void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
      * Send PP data to PP rank
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 88cc5ca11c7401e02cf87a9b35d516861e9e886b..915d0953ed7e292d048fb7a9d0cc51aab18de5a7 100644
@@ -75,7 +75,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<gmx::RVec> /* d_f */)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* /* d_f */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 972c1c7d57c7541257f29446850bdb8acd12d9d2..07d37dcd7e31dc12ff0c40eeb96d4088a1a9cbdf 100644
@@ -48,7 +48,6 @@
 #include "config.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -69,7 +68,7 @@ PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx:
 PmeForceSenderGpu::Impl::~Impl() = default;
 
 /*! \brief  sends force buffer address to PP ranks */
-void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
+void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
 {
     int ind_start = 0;
     int ind_end   = 0;
@@ -116,7 +115,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream&    pmeStream,
 
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* d_f)
 {
     impl_->sendForceBufferAddressToPpRanks(d_f);
 }
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index f0b0f74bad7a03d0c65eae9418bac0770d2482ec..70be40cc7f927b9f971d95d3a928f58a227403b3 100644
@@ -44,7 +44,6 @@
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
 #include "gromacs/ewald/pme_force_sender_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/arrayref.h"
 
@@ -69,7 +68,7 @@ public:
      * sends force buffer address to PP rank
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
+    void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
      * Send PP data to PP rank
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 82eeed85a1b5742a4901bfe5a8c33912890debb6..35111b52ec18fdd7e7f2a6aed828f28a7961f857 100644
@@ -351,7 +351,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
     const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
     const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
-    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);
+    float* __restrict__ gm_forces              = kernelParams.atoms.d_forces;
 
     /* Global memory pointers for readGlobal */
     const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 225fb1050a281a33cfc3cff9bec8dea32d03e7b9..e0c32e207d1b46edcf1365c6750581f980c013d3 100644
@@ -440,11 +440,11 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle)
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
 
-DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
+void* pme_gpu_get_device_f(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
     {
-        return DeviceBuffer<gmx::RVec>{};
+        return nullptr;
     }
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index c0d422d5728a111cf884dbc5aad785a6628ebd86..028a66a35a9f30df54cdfc40e1628b878b9ae154 100644
@@ -229,7 +229,7 @@ void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
 
 void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
 {
-    const size_t newForcesSize = pmeGpu->nAtomsAlloc;
+    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
                            newForcesSize,
@@ -248,10 +248,11 @@ void pme_gpu_free_forces(const PmeGpu* pmeGpu)
 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
 {
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
+    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
-                       pmeGpu->staging.h_forces.data(),
+                       h_forcesFloat,
                        0,
-                       pmeGpu->kernelParams->atoms.nAtoms,
+                       DIM * pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream_,
                        pmeGpu->settings.transferKind,
                        nullptr);
@@ -260,10 +261,11 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
 {
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
-    copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(),
+    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
+    copyFromDeviceBuffer(h_forcesFloat,
                          &pmeGpu->kernelParams->atoms.d_forces,
                          0,
-                         pmeGpu->kernelParams->atoms.nAtoms,
+                         DIM * pmeGpu->kernelParams->atoms.nAtoms,
                          pmeGpu->archSpecific->pmeStream_,
                          pmeGpu->settings.transferKind,
                          nullptr);
@@ -1704,7 +1706,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
     }
 }
 
-DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
+void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
     {
@@ -1712,7 +1714,7 @@ DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
     else
     {
-        return DeviceBuffer<gmx::RVec>{};
+        return nullptr;
     }
 }
 
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index 7baa6bd3475eb1a79ff5e7f498c0554a7eb5f735..41b912e2b6a7385ac8ee102ebe3ab198b52e043f 100644
@@ -405,8 +405,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+        GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h
index e2c067390ad2c82f28d514e1951ba00d0a3595d9..abf7a17ed002900ab17583d96f7256565b656ebb 100644
@@ -171,7 +171,7 @@ struct PmeGpuAtomParams
      * The forces change and need to be copied from (and possibly to) the GPU for every PME
      * computation, but reallocation happens only at DD.
      */
-    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_forces;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_forces;
     /*! \brief Global GPU memory array handle with ivec atom gridline indices.
      * Computed on GPU in the spline calculation part.
      */
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5a01c898c8a2c98b0c7d8a7097751d7e1a3efa33..c895975144b5af2b2e80fc76e3713a157bbe8e40 100644
@@ -445,7 +445,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                     // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
                     pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
                             stateGpu->getCoordinates());
-                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(pme_gpu_get_device_f(pme));
+                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(
+                            reinterpret_cast<rvec*>(pme_gpu_get_device_f(pme)));
                 }
             }
 
@@ -582,13 +583,8 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
         if (pme_pp->useGpuDirectComm)
         {
             // Data will be transferred directly from GPU.
-            DeviceBuffer<gmx::RVec> gmx_unused d_f = pme_gpu_get_device_f(&pme);
-#    if GMX_GPU_CUDA
-            // OpenCL does not allow host-side pointer arithmetic on buffers. Neither does SYCL.
-            sendbuf = reinterpret_cast<void*>(&d_f[ind_start]);
-#    else
-            GMX_RELEASE_ASSERT(false, "Can only use GPU Direct Communications with CUDA");
-#    endif
+            rvec* d_f = reinterpret_cast<rvec*>(pme_gpu_get_device_f(&pme));
+            sendbuf   = reinterpret_cast<void*>(&d_f[ind_start]);
         }
         sendFToPP(sendbuf, receiver, pme_pp, &messages);
     }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 886e0c221b8f8a456b99ef4e40fcea45070da3e9..3e56da9af3e8604c3030e7b04772a18be7b62c83 100644
@@ -44,7 +44,6 @@
 
 #include <memory>
 
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
@@ -100,7 +99,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
+    void* getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index d31b976c10f5a9de691420b747c26d9c0574d8ec..c5f92aa53f01cbba57da1682e4df4d28a60a2836 100644
@@ -102,12 +102,12 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
                "implementation.");
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+void* PmePpCommGpu::getGpuForceStagingPtr()
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
-    return DeviceBuffer<gmx::RVec>{};
+    return nullptr;
 }
 
 GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer()
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 0d66883979cc4b90bea44e5ef7cc237c7017ae98..0ecf0281333003926fb8a875fd61eb60ca4337ff 100644
@@ -155,9 +155,9 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
     GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
 }
-DeviceBuffer<gmx::RVec> PmePpCommGpu::Impl::getGpuForceStagingPtr()
+void* PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
-    return d_pmeForces_;
+    return static_cast<void*>(d_pmeForces_);
 }
 
 GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
@@ -194,7 +194,7 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
             sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+void* PmePpCommGpu::getGpuForceStagingPtr()
 {
     return impl_->getGpuForceStagingPtr();
 }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 58e22c2e46b832fbf1f2e65d0c1b8d5f08d3776d..042891a04d3aaf044914e9cbf666c4c3254712f1 100644
@@ -44,7 +44,6 @@
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
@@ -111,7 +110,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
+    void* getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
@@ -124,15 +123,15 @@ private:
     //! Handle for CUDA stream used for the communication operations in this class
     const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
-    DeviceBuffer<gmx::RVec> remotePmeXBuffer_ = nullptr;
+    void* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
-    DeviceBuffer<gmx::RVec> remotePmeFBuffer_ = nullptr;
+    void* remotePmeFBuffer_ = nullptr;
     //! communicator for simulation
     MPI_Comm comm_;
     //! Rank of PME task
     int pmeRank_ = -1;
     //! Buffer for staging PME force on GPU
-    DeviceBuffer<gmx::RVec> d_pmeForces_ = nullptr;
+    rvec* d_pmeForces_ = nullptr;
     //! number of atoms in PME force staging array
     int d_pmeForcesSize_ = -1;
     //! number of atoms allocated in recvbuf array
diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h
index 82e8db33f56e3c289c41fcfeb7633b47a7c85839..157c4c7eca5a2be662f7a47ce7dc4d287803fe51 100644
@@ -86,13 +86,13 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerNbnxmForce(DeviceBuffer<RVec> forcePtr);
+    void registerNbnxmForce(void* forcePtr);
 
     /*! \brief Register a rvec-format force to be reduced
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerRvecForce(DeviceBuffer<RVec> forcePtr);
+    void registerRvecForce(void* forcePtr);
 
     /*! \brief Add a dependency for this force reduction
      *
@@ -102,14 +102,14 @@ public:
 
     /*! \brief Reinitialize the GPU force reduction
      *
-     * \param [in] baseForce        Pointer to force to be used as a base
+     * \param [in] baseForcePtr     Pointer to force to be used as a base
      * \param [in] numAtoms         The number of atoms
      * \param [in] cell             Pointer to the cell array
      * \param [in] atomStart        The start atom for the reduction
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(DeviceBuffer<RVec>    baseForce,
+    void reinit(DeviceBuffer<RVec>    baseForcePtr,
                 int                   numAtoms,
                 ArrayRef<const int>   cell,
                 int                   atomStart,
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
index b69759b848a18e8b1477e79962ae6f59f5e1903d..1e0a30b2b6aba01c90831eaba7661d7866255e80 100644
@@ -76,13 +76,13 @@ void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerRvecForce(void* /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu
index ac89b47d24712f8bb8474b25fad098871809b185..6e1e7e920a50b2b91e8e2d26149beee45c6d1380 100644
@@ -43,7 +43,7 @@
 
 #include "gmxpre.h"
 
-#include "gpuforcereduction_impl.h"
+#include "gpuforcereduction_impl.cuh"
 
 #include <stdio.h>
 
@@ -112,15 +112,15 @@ GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
     deviceStream_(deviceStream),
     wcycle_(wcycle){};
 
-void GpuForceReduction::Impl::reinit(DeviceBuffer<gmx::RVec> baseForce,
-                                     const int               numAtoms,
-                                     ArrayRef<const int>     cell,
-                                     const int               atomStart,
-                                     const bool              accumulate,
-                                     GpuEventSynchronizer*   completionMarker)
+void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
+                                     const int             numAtoms,
+                                     ArrayRef<const int>   cell,
+                                     const int             atomStart,
+                                     const bool            accumulate,
+                                     GpuEventSynchronizer* completionMarker)
 {
-    GMX_ASSERT((baseForce != nullptr), "Input base force for reduction has no data");
-    baseForce_        = baseForce;
+    GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
+    baseForce_        = &(baseForcePtr[atomStart]);
     numAtoms_         = numAtoms;
     atomStart_        = atomStart;
     accumulate_       = accumulate;
@@ -144,13 +144,13 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer<gmx::RVec> baseForce,
 
 void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT((forcePtr), "Input force for reduction has no data");
+    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
     nbnxmForceToAdd_ = forcePtr;
 };
 
 void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT((forcePtr), "Input force for reduction has no data");
+    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
     rvecForceToAdd_ = forcePtr;
 };
 
@@ -172,12 +172,11 @@ void GpuForceReduction::Impl::execute()
     GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data");
 
     // Enqueue wait on all dependencies passed
-    for (const auto& synchronizer : dependencyList_)
+    for (auto const synchronizer : dependencyList_)
     {
         synchronizer->enqueueWaitEvent(deviceStream_);
     }
 
-    float3* d_baseForce      = &(asFloat3(baseForce_)[atomStart_]);
     float3* d_nbnxmForce     = asFloat3(nbnxmForceToAdd_);
     float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]);
 
@@ -196,7 +195,7 @@ void GpuForceReduction::Impl::execute()
                             : (accumulate_ ? reduceKernel<false, true> : reduceKernel<false, false>);
 
     const auto kernelArgs = prepareGpuKernelArguments(
-            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &d_baseForce, &cellInfo_.d_cell, &numAtoms_);
+            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &baseForce_, &cellInfo_.d_cell, &numAtoms_);
 
     launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs);
 
@@ -219,14 +218,14 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
 {
 }
 
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
+void GpuForceReduction::registerNbnxmForce(void* forcePtr)
 {
-    impl_->registerNbnxmForce(std::move(forcePtr));
+    impl_->registerNbnxmForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
 }
 
-void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> forcePtr)
+void GpuForceReduction::registerRvecForce(void* forcePtr)
 {
-    impl_->registerRvecForce(std::move(forcePtr));
+    impl_->registerRvecForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
 }
 
 void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
@@ -234,14 +233,14 @@ void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
     impl_->addDependency(dependency);
 }
 
-void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForce,
+void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
                                const int             numAtoms,
                                ArrayRef<const int>   cell,
                                const int             atomStart,
                                const bool            accumulate,
                                GpuEventSynchronizer* completionMarker)
 {
-    impl_->reinit(baseForce, numAtoms, cell, atomStart, accumulate, completionMarker);
+    impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker);
 }
 void GpuForceReduction::execute()
 {
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.h b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
similarity index 93%
rename from src/gromacs/mdlib/gpuforcereduction_impl.h
rename to src/gromacs/mdlib/gpuforcereduction_impl.cuh
index 491fd95f21126f4123db83e4e349d46920bc7102..bd222e40a643a01296b1ec222b9f25204af3c3d8 100644
@@ -58,7 +58,7 @@ struct cellInfo
     //! cell index mapping for any nbat-format forces
     const int* cell = nullptr;
     //! device copy of cell index mapping for any nbat-format forces
-    DeviceBuffer<int> d_cell;
+    int* d_cell = nullptr;
     //! number of atoms in cell array
     int cellSize = -1;
     //! number of atoms allocated in cell array
@@ -75,7 +75,7 @@ public:
      * \param [in] deviceContext GPU device context
      * \param [in] wcycle        The wallclock counter
      */
-    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
+    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
     ~Impl();
 
     /*! \brief Register a nbnxm-format force to be reduced
@@ -98,14 +98,14 @@ public:
 
     /*! \brief Reinitialize the GPU force reduction
      *
-     * \param [in] baseForce        Pointer to force to be used as a base
+     * \param [in] baseForcePtr     Pointer to force to be used as a base
      * \param [in] numAtoms         The number of atoms
      * \param [in] cell             Pointer to the cell array
      * \param [in] atomStart        The start atom for the reduction
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(DeviceBuffer<RVec>    baseForce,
+    void reinit(float3*               baseForcePtr,
                 const int             numAtoms,
                 ArrayRef<const int>   cell,
                 const int             atomStart,
@@ -117,13 +117,13 @@ public:
 
 private:
     //! force to be used as a base for this reduction
-    DeviceBuffer<RVec> baseForce_;
+    float3* baseForce_ = nullptr;
     //! starting atom
     int atomStart_ = 0;
     //! number of atoms
     int numAtoms_ = 0;
     //! whether reduction is accumulated into base force buffer
-    bool accumulate_ = true;
+    int accumulate_ = true;
     //! cell information for any nbat-format forces
     struct cellInfo cellInfo_;
     //! GPU context object
@@ -133,9 +133,9 @@ private:
     //! stream to be used for this reduction
     const DeviceStream& deviceStream_;
     //! Nbnxm force to be added in this reduction
-    DeviceBuffer<RVec> nbnxmForceToAdd_;
+    DeviceBuffer<RVec> nbnxmForceToAdd_ = nullptr;
     //! Rvec-format force to be added in this reduction
-    DeviceBuffer<RVec> rvecForceToAdd_;
+    DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
     //! event to be marked when reduction launch has been completed
     GpuEventSynchronizer* completionMarker_ = nullptr;
     //! The wallclock counter
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index f309e82c26864b9df5f9b0ea7dacafcf4d3465c8..0bcf0c5451e9eb77b6532f5ec1e826516099db23 100644
@@ -1115,10 +1115,9 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->simulationWork.useGpuPme
         && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
     {
-        DeviceBuffer<gmx::RVec> forcePtr =
-                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                              :                    // PME force buffer on same GPU
-                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+        void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
+                                                       : // PME force buffer on same GPU
+                                 fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
         fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
 
         GpuEventSynchronizer* const pmeSynchronizer =
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index 831bbfb3c315f30c76c2b48b61ddddf97a17d988..f03ab778c3951b84a1f83ee7243bc07c96ed5952 100644
@@ -60,7 +60,6 @@
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/leapfrog_gpu.h"
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 34712d494158952ed521484afd2620bd316c14b4..8e101b8cd0ed51ffb1c73b474c3b000ea09bd138 100644
 #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
 #define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
 
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+#include "gromacs/mdlib/lincs_gpu.cuh"
+#include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/pbcutil/pbc_aiuc.h"
-
-#if GMX_GPU_CUDA
-#    include "gromacs/gpu_utils/gputraits.cuh"
-#endif
-
-class GpuEventSynchronizer;
-namespace gmx
-{
-class LincsGpu;
-class SettleGpu;
-class LeapFrogGpu;
-} // namespace gmx
 
 namespace gmx
 {
@@ -201,14 +193,14 @@ private:
     int numAtoms_;
 
     //! Local copy of the pointer to the device positions buffer
-    DeviceBuffer<float3> d_x_;
+    float3* d_x_;
     //! Local copy of the pointer to the device velocities buffer
-    DeviceBuffer<float3> d_v_;
+    float3* d_v_;
     //! Local copy of the pointer to the device forces buffer
-    DeviceBuffer<float3> d_f_;
+    float3* d_f_;
 
     //! Device buffer for intermediate positions (maintained internally)
-    DeviceBuffer<float3> d_xp_;
+    float3* d_xp_;
     //! Number of elements in shifted coordinates buffer
     int numXp_ = -1;
     //! Allocation size for the shifted coordinates buffer
@@ -216,7 +208,7 @@ private:
 
 
     //! 1/mass for all atoms (GPU)
-    DeviceBuffer<real> d_inverseMasses_;
+    real* d_inverseMasses_;
     //! Number of elements in reciprocal masses buffer
     int numInverseMasses_ = -1;
     //! Allocation size for the reciprocal masses buffer
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e30f8e8c8762a42b094a96431931a51848e8f606..6ed6c1ff78429803c928bc435ca838883e5ac680 100644
@@ -907,7 +907,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
-DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu* nb)
+void* getGpuForces(NbnxmGpu* nb)
 {
     return nb->atdat->f;
 }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index c96291a1c2d99aa5ba7e0d71499c27c7a3373f3f..96714bb15454d7397a23e8f0fd4acd0b4640aaa6 100644
@@ -215,7 +215,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const
     return numAtoms;
 }
 
-DeviceBuffer<gmx::RVec> nonbonded_verlet_t::getGpuForces() const
+void* nonbonded_verlet_t::getGpuForces() const
 {
     return Nbnxm::getGpuForces(gpu_nbv);
 }
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 9732b01a39ec2fbac49e59fdddbaaa67a7f544d6..7ea59e4e5a063c7dfd598df4c546b01415f10df1 100644
@@ -398,7 +398,7 @@ public:
      *
      * \returns A pointer to the force buffer in GPU memory
      */
-    DeviceBuffer<gmx::RVec> getGpuForces() const;
+    void* getGpuForces() const;
 
     //! Return the kernel setup
     const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index c3fdb36a19eaa4daf3bfbcf6d0846fca9b959cd4..fe836969347c3e33d2f141f3c3923eb548969efc 100644
@@ -315,8 +315,7 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM;
  * \returns       A pointer to the force buffer in GPU memory
  */
 CUDA_FUNC_QUALIFIER
-DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu gmx_unused* nb)
-        CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 } // namespace Nbnxm
 #endif