Use DeviceBuffer<RVec> in GPU force reduction and PME code
author    Andrey Alekseenko <al42and@gmail.com>
          Tue, 16 Mar 2021 10:03:24 +0000 (11:03 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
          Fri, 19 Mar 2021 15:27:49 +0000 (15:27 +0000)
... instead of raw device pointers.

Preparation for #3932. The PME changes are incidental; the main focus is
GpuForceReduction.
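
For context, a minimal sketch of the pattern this change applies across the
APIs below (hedged: DeviceBuffer is GROMACS's typed device-memory handle, so
under CUDA this is a type-safety change rather than a behavioral one):

    // Before: untyped device pointers crossed the API boundary.
    void* pme_gpu_get_device_f(const gmx_pme_t* pme);
    void  GpuForceReduction::registerRvecForce(void* forcePtr);

    // After: the element type travels with the handle, and a
    // default-constructed (empty) buffer replaces nullptr as the
    // "no data" return value.
    DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme);
    void GpuForceReduction::registerRvecForce(DeviceBuffer<gmx::RVec> forcePtr);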

20 files changed:
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/mdlib/gpuforcereduction.h
src/gromacs/mdlib/gpuforcereduction_impl.cpp
src/gromacs/mdlib/gpuforcereduction_impl.cu
src/gromacs/mdlib/gpuforcereduction_impl.h [moved from src/gromacs/mdlib/gpuforcereduction_impl.cuh with 95% similarity]
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h

index c9f1a6e2809e342327916ef1ba26a007614e7748..d897c0fb0caf08004fa44908da06c5738d09d859 100644 (file)
@@ -467,12 +467,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
+GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
- * \returns                  Pointer to sychronizer
+ * \returns                  Pointer to synchronizer
  */
 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
index 081ba454e63ae688f05f470d7f7f4781e05970ba..261260977993847c232c726f5778007944890a1d 100644 (file)
@@ -45,6 +45,7 @@
 #include <memory>
 
 #include "gromacs/math/vectypes.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class GpuEventSynchronizer;
@@ -83,7 +84,7 @@ public:
      * Initialization of GPU PME Force sender
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(rvec* d_f);
+    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
 
     /*! \brief
      * Send force synchronizer to PP rank
index 8d8b97f5c5712330f90a6688c697b91bc260b133..69974b31196ab9ca0cc6269158669c608cd20d62 100644 (file)
@@ -48,6 +48,7 @@
 #include "config.h"
 
 #include "gromacs/ewald/pme_force_sender_gpu.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/gmxassert.h"
 
@@ -75,7 +76,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* /*pmeForcesReady */,
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* /* d_f */)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_f */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
index 44a2e30de33111941ea49abaa02b5d6d7dcc0078..509a624d041bccad080957ae53542c6dcf066b32 100644 (file)
@@ -70,7 +70,7 @@ PmeForceSenderGpu::Impl::Impl(GpuEventSynchronizer*  pmeForcesReady,
 PmeForceSenderGpu::Impl::~Impl() = default;
 
 /*! \brief  sends force buffer address to PP ranks */
-void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
+void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f)
 {
     int ind_start = 0;
     int ind_end   = 0;
@@ -113,7 +113,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer*  pmeForcesReady,
 
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* d_f)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
 {
     impl_->sendForceBufferAddressToPpRanks(d_f);
 }
index ad9718c4685b55d18cdf47edd78ff7993de562c7..c7d4c0d76c98fdf8f6e94d83d311d938a27a7692 100644 (file)
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
 #include "gromacs/ewald/pme_force_sender_gpu.h"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/utility/arrayref.h"
 
+class GpuEventSynchronizer;
+
 namespace gmx
 {
 
@@ -68,7 +71,7 @@ public:
      * sends force buffer address to PP rank
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(rvec* d_f);
+    void sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f);
 
     /*! \brief
      * Send force synchronizer to PP rank
index e0c32e207d1b46edcf1365c6750581f980c013d3..225fb1050a281a33cfc3cff9bec8dea32d03e7b9 100644 (file)
@@ -440,11 +440,11 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle)
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
 
-void* pme_gpu_get_device_f(const gmx_pme_t* pme)
+DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
     {
-        return nullptr;
+        return DeviceBuffer<gmx::RVec>{};
     }
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
index 8eea806ac1934177f4f10364b09017354983efe6..798a9be9ef3563485370ca9d39bd7b18c37b3398 100644 (file)
@@ -1701,7 +1701,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
     }
 }
 
-void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
+DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
     {
@@ -1709,7 +1709,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
     else
     {
-        return nullptr;
+        return DeviceBuffer<gmx::RVec>{};
     }
 }
 
index 41b912e2b6a7385ac8ee102ebe3ab198b52e043f..7baa6bd3475eb1a79ff5e7f498c0554a7eb5f735 100644 (file)
@@ -405,8 +405,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
+GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
 
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
index 9ba22e26207b00f76218c0d153620f66f13f3b15..138711440c505085f1b358706209baacebf0f686 100644 (file)
@@ -445,8 +445,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                     // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
                     pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
                             stateGpu->getCoordinates());
-                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(
-                            reinterpret_cast<rvec*>(pme_gpu_get_device_f(pme)));
+                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(pme_gpu_get_device_f(pme));
                 }
             }
 
index 3e56da9af3e8604c3030e7b04772a18be7b62c83..886e0c221b8f8a456b99ef4e40fcea45070da3e9 100644 (file)
@@ -44,6 +44,7 @@
 
 #include <memory>
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
@@ -99,7 +100,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    void* getGpuForceStagingPtr();
+    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
index c5f92aa53f01cbba57da1682e4df4d28a60a2836..d31b976c10f5a9de691420b747c26d9c0574d8ec 100644 (file)
@@ -102,12 +102,12 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
                "implementation.");
 }
 
-void* PmePpCommGpu::getGpuForceStagingPtr()
+DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
-    return nullptr;
+    return DeviceBuffer<gmx::RVec>{};
 }
 
 GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer()
index 0ecf0281333003926fb8a875fd61eb60ca4337ff..cb9e787c446836d27549d3e1c6775017520bc07f 100644 (file)
@@ -64,7 +64,8 @@ PmePpCommGpu::Impl::Impl(MPI_Comm             comm,
     deviceContext_(deviceContext),
     pmePpCommStream_(deviceStream),
     comm_(comm),
-    pmeRank_(pmeRank)
+    pmeRank_(pmeRank),
+    d_pmeForces_(nullptr)
 {
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
@@ -155,9 +156,10 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
     GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
 }
-void* PmePpCommGpu::Impl::getGpuForceStagingPtr()
+
+DeviceBuffer<Float3> PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
-    return static_cast<void*>(d_pmeForces_);
+    return d_pmeForces_;
 }
 
 GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
@@ -194,7 +196,7 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
             sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
 }
 
-void* PmePpCommGpu::getGpuForceStagingPtr()
+DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
 {
     return impl_->getGpuForceStagingPtr();
 }
index 042891a04d3aaf044914e9cbf666c4c3254712f1..70ef8f937c3476b20e91e2d048fb759d4b36f9fb 100644 (file)
@@ -44,7 +44,9 @@
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -110,7 +112,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    void* getGpuForceStagingPtr();
+    DeviceBuffer<Float3> getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
@@ -131,7 +133,7 @@ private:
     //! Rank of PME task
     int pmeRank_ = -1;
     //! Buffer for staging PME force on GPU
-    rvec* d_pmeForces_ = nullptr;
+    DeviceBuffer<gmx::RVec> d_pmeForces_;
     //! number of atoms in PME force staging array
     int d_pmeForcesSize_ = -1;
     //! number of atoms allocated in recvbuf array
index b23df660ed3fc1fb5ecdce731463e304c6538ac7..2955dd60c2013e48ac126ec1c97ae7f6ed58065d 100644 (file)
@@ -92,7 +92,7 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerRvecForce(void* forcePtr);
+    void registerRvecForce(DeviceBuffer<gmx::RVec> forcePtr);
 
     /*! \brief Add a dependency for this force reduction
      *
index b431fbad495c950c63cd568c287cf3305810ac52..6d826d66757c6936e60e8451790debedcf89954d 100644 (file)
@@ -82,7 +82,7 @@ void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerRvecForce(void* /* forcePtr */)
+void GpuForceReduction::registerRvecForce(DeviceBuffer<gmx::RVec> /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
index dab7d4da0cbd8c9938a85766211d47d7a2ce9938..f62ec44d467239343115881c5472b89ad01f44d5 100644 (file)
@@ -43,7 +43,7 @@
 
 #include "gmxpre.h"
 
-#include "gpuforcereduction_impl.cuh"
+#include "gpuforcereduction_impl.h"
 
 #include <stdio.h>
 
@@ -108,11 +108,14 @@ static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce,
 GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
                               const DeviceStream&  deviceStream,
                               gmx_wallcycle*       wcycle) :
+    baseForce_(nullptr),
     deviceContext_(deviceContext),
     deviceStream_(deviceStream),
+    nbnxmForceToAdd_(nullptr),
+    rvecForceToAdd_(nullptr),
     wcycle_(wcycle){};
 
-void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
+void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
                                      const int             numAtoms,
                                      ArrayRef<const int>   cell,
                                      const int             atomStart,
@@ -223,9 +226,9 @@ void GpuForceReduction::registerNbnxmForce(DeviceBuffer<Float3> forcePtr)
     impl_->registerNbnxmForce(forcePtr);
 }
 
-void GpuForceReduction::registerRvecForce(void* forcePtr)
+void GpuForceReduction::registerRvecForce(DeviceBuffer<gmx::RVec> forcePtr)
 {
-    impl_->registerRvecForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
+    impl_->registerRvecForce(forcePtr);
 }
 
 void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
@@ -240,7 +243,7 @@ void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
                                const bool            accumulate,
                                GpuEventSynchronizer* completionMarker)
 {
-    impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker);
+    impl_->reinit(baseForcePtr, numAtoms, cell, atomStart, accumulate, completionMarker);
 }
 void GpuForceReduction::execute()
 {
similarity index 95%
rename from src/gromacs/mdlib/gpuforcereduction_impl.cuh
rename to src/gromacs/mdlib/gpuforcereduction_impl.h
index c7d9493c821129515e44db0cb811ec3b3d6aef61..98c8ca2c7be9140babcbb234893a099a442a6429 100644 (file)
@@ -59,7 +59,7 @@ struct cellInfo
     //! cell index mapping for any nbat-format forces
     const int* cell = nullptr;
     //! device copy of cell index mapping for any nbat-format forces
-    int* d_cell = nullptr;
+    DeviceBuffer<int> d_cell;
     //! number of atoms in cell array
     int cellSize = -1;
     //! number of atoms allocated in cell array
@@ -76,7 +76,7 @@ public:
      * \param [in] deviceContext GPU device context
      * \param [in] wcycle        The wallclock counter
      */
-    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle);
+    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
     ~Impl();
 
     /*! \brief Register a nbnxm-format force to be reduced
@@ -106,7 +106,7 @@ public:
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(float3*               baseForcePtr,
+    void reinit(DeviceBuffer<Float3>  baseForcePtr,
                 const int             numAtoms,
                 ArrayRef<const int>   cell,
                 const int             atomStart,
@@ -118,7 +118,7 @@ public:
 
 private:
     //! force to be used as a base for this reduction
-    float3* baseForce_ = nullptr;
+    DeviceBuffer<Float3> baseForce_;
     //! starting atom
     int atomStart_ = 0;
     //! number of atoms
@@ -134,9 +134,9 @@ private:
     //! stream to be used for this reduction
     const DeviceStream& deviceStream_;
     //! Nbnxm force to be added in this reduction
-    DeviceBuffer<RVec> nbnxmForceToAdd_ = nullptr;
+    DeviceBuffer<RVec> nbnxmForceToAdd_;
     //! Rvec-format force to be added in this reduction
-    DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
+    DeviceBuffer<RVec> rvecForceToAdd_;
     //! event to be marked when reduction launch has been completed
     GpuEventSynchronizer* completionMarker_ = nullptr;
     //! The wallclock counter
index 99dc4758c4070546394ece82f252eecca21dc6c4..ec10bb27a0fa7dd3fc09c0c7aa819b2e6c97b6d7 100644 (file)
@@ -1123,9 +1123,10 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->simulationWork.useGpuPme
         && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
     {
-        void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                                       : // PME force buffer on same GPU
-                                 fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+        DeviceBuffer<gmx::RVec> forcePtr =
+                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
+                                              :                    // PME force buffer on same GPU
+                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
         fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
 
         GpuEventSynchronizer* const pmeSynchronizer =
index b56185651170a6895d88cffe31ac20f3e2760bde..3b428a183b591af953d6100f31ec8b7975dd3b1a 100644 (file)
@@ -60,6 +60,7 @@
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/leapfrog_gpu.h"
index 76e41398e762aaa941d41b24e7968bd0e242421e..1d1a8fd0de1a8453e6a20a022fedcf5da60c54bb 100644 (file)
 
 #include "gmxpre.h"
 
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/mdlib/leapfrog_gpu.h"
 #include "gromacs/mdlib/lincs_gpu.cuh"
 #include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/inputrec.h"
 
+class GpuEventSynchronizer;
+
 namespace gmx
 {
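
With the typed signatures in place, the wiring in setupGpuForceReductions no
longer needs casts. A condensed sketch of the resulting call flow (names taken
from the sim_util.cpp hunk above; control flow simplified):

    // Pick the PME force buffer: local if this rank runs PME,
    // otherwise the staging buffer received from the PME rank.
    DeviceBuffer<gmx::RVec> pmeForce = thisRankHasDuty(cr, DUTY_PME)
            ? pme_gpu_get_device_f(fr->pmedata)           // same-GPU PME force buffer
            : fr->pmePpCommGpu->getGpuForceStagingPtr();  // staged from the PME rank
    fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(pmeForce);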