Revert "Wrap more device pointers in DeviceBuffer" (!1244)
author    Andrey Alekseenko <al42and@gmail.com>
          Wed, 10 Mar 2021 09:06:41 +0000
committer Paul Bauer <paul.bauer.q@gmail.com>
          Wed, 10 Mar 2021 09:06:41 +0000
This reverts commit 850429f3ebe34d27dabed6d8c31c08968befd1f5,
which broke some MPI functionality, as revealed in post-merge tests.
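
The change being reverted had wrapped the raw void* / rvec* device pointers of the
PME, GPU force-reduction, update-constraints, and nbnxm interfaces in typed
DeviceBuffer<gmx::RVec> handles; restoring the raw-pointer form means callers once
again cast at each use site (see the pme_only.cpp and sim_util.cpp hunks below).
As a minimal self-contained sketch of the two interface styles (stand-in types
only, not GROMACS code; in CUDA builds DeviceBuffer<T> is a plain alias for T*,
while the OpenCL and SYCL builds use a wrapper type):

    // Sketch with hypothetical stand-in types; it illustrates the two handle
    // styles this revert switches between, not actual GROMACS declarations.
    #include <cstdio>

    struct float3
    {
        float x, y, z;
    };

    template<typename T>
    using DeviceBuffer = T*; // CUDA-style alias; other backends wrap the handle

    // Style before the revert: the getter returns a typed device-buffer handle,
    // so the element type travels with the pointer.
    static DeviceBuffer<float3> getDeviceForcesTyped(DeviceBuffer<float3> d_f)
    {
        return d_f;
    }

    // Style after the revert: the getter returns void*, and each caller casts
    // back to the concrete element type at the use site.
    static void* getDeviceForcesUntyped(DeviceBuffer<float3> d_f)
    {
        return d_f;
    }

    int main()
    {
        float3 forces[8] = {}; // host array standing in for a device allocation
        DeviceBuffer<float3> typed = getDeviceForcesTyped(forces);
        auto* untyped = static_cast<float3*>(getDeviceForcesUntyped(forces));
        std::printf("same address: %s\n", typed == untyped ? "yes" : "no");
        return 0;
    }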

26 files changed:
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_force_sender_gpu.h
src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.h
src/gromacs/ewald/pme_gather.cu
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_types.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/mdlib/gpuforcereduction.h
src/gromacs/mdlib/gpuforcereduction_impl.cpp
src/gromacs/mdlib/gpuforcereduction_impl.cu
src/gromacs/mdlib/gpuforcereduction_impl.cuh [moved from src/gromacs/mdlib/gpuforcereduction_impl.h with 93% similarity]
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index d4a591b7bfc0b1949126d2a212d84ba81a316e42..3f2d33f6c69ff8cc969ea01b9aed3682dd3273cd 100644
@@ -467,12 +467,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+        GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
- * \returns                  Pointer to synchronizer
+ * \returns                  Pointer to synchronizer
  */
 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index ec88e8569babd0df83a3cec327152cd79528d0a3..bcc3b1e39330cd530bdcfef9b76000c6d0b34639 100644
@@ -45,7 +45,6 @@
 #include <memory>
 
 #include "gromacs/math/vectypes.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceStream;
@@ -84,7 +83,7 @@ public:
      * Initialization of GPU PME Force sender
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
+    void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
      * Send PP data to PP rank
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp
index 88cc5ca11c7401e02cf87a9b35d516861e9e886b..915d0953ed7e292d048fb7a9d0cc51aab18de5a7 100644
@@ -75,7 +75,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */,
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
 /*!\brief init PME-PP GPU communication stub */
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<gmx::RVec> /* d_f */)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* /* d_f */)
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication initialization was called instead of the "
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu
index 972c1c7d57c7541257f29446850bdb8acd12d9d2..07d37dcd7e31dc12ff0c40eeb96d4088a1a9cbdf 100644
@@ -48,7 +48,6 @@
 #include "config.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/gmxmpi.h"
 
@@ -69,7 +68,7 @@ PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx:
 PmeForceSenderGpu::Impl::~Impl() = default;
 
 /*! \brief  sends force buffer address to PP ranks */
-void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
+void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
 {
     int ind_start = 0;
     int ind_end   = 0;
@@ -116,7 +115,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream&    pmeStream,
 
 PmeForceSenderGpu::~PmeForceSenderGpu() = default;
 
-void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f)
+void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* d_f)
 {
     impl_->sendForceBufferAddressToPpRanks(d_f);
 }
diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h
index f0b0f74bad7a03d0c65eae9418bac0770d2482ec..70be40cc7f927b9f971d95d3a928f58a227403b3 100644
@@ -44,7 +44,6 @@
 #define GMX_PMEFORCESENDERGPU_IMPL_H
 
 #include "gromacs/ewald/pme_force_sender_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/arrayref.h"
 
@@ -69,7 +68,7 @@ public:
      * sends force buffer address to PP rank
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
+    void sendForceBufferAddressToPpRanks(rvec* d_f);
 
     /*! \brief
      * Send PP data to PP rank
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 82eeed85a1b5742a4901bfe5a8c33912890debb6..35111b52ec18fdd7e7f2a6aed828f28a7961f857 100644
@@ -351,7 +351,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
     const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
     const float* __restrict__ gm_gridA         = kernelParams.grid.d_realGrid[0];
     const float* __restrict__ gm_gridB         = kernelParams.grid.d_realGrid[1];
-    float* __restrict__ gm_forces = reinterpret_cast<float*>(kernelParams.atoms.d_forces);
+    float* __restrict__ gm_forces              = kernelParams.atoms.d_forces;
 
     /* Global memory pointers for readGlobal */
     const float* __restrict__ gm_theta         = kernelParams.atoms.d_theta;
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 225fb1050a281a33cfc3cff9bec8dea32d03e7b9..e0c32e207d1b46edcf1365c6750581f980c013d3 100644
@@ -440,11 +440,11 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle)
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
 
-DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
+void* pme_gpu_get_device_f(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
     {
-        return DeviceBuffer<gmx::RVec>{};
+        return nullptr;
     }
     return pme_gpu_get_kernelparam_forces(pme->gpu);
 }
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index c0d422d5728a111cf884dbc5aad785a6628ebd86..028a66a35a9f30df54cdfc40e1628b878b9ae154 100644
@@ -229,7 +229,7 @@ void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
 
 void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
 {
-    const size_t newForcesSize = pmeGpu->nAtomsAlloc;
+    const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
                            newForcesSize,
@@ -248,10 +248,11 @@ void pme_gpu_free_forces(const PmeGpu* pmeGpu)
 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
 {
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
+    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces,
-                       pmeGpu->staging.h_forces.data(),
+                       h_forcesFloat,
                        0,
-                       pmeGpu->kernelParams->atoms.nAtoms,
+                       DIM * pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream_,
                        pmeGpu->settings.transferKind,
                        nullptr);
@@ -260,10 +261,11 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu)
 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu)
 {
     GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU");
-    copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(),
+    float* h_forcesFloat = reinterpret_cast<float*>(pmeGpu->staging.h_forces.data());
+    copyFromDeviceBuffer(h_forcesFloat,
                          &pmeGpu->kernelParams->atoms.d_forces,
                          0,
-                         pmeGpu->kernelParams->atoms.nAtoms,
+                         DIM * pmeGpu->kernelParams->atoms.nAtoms,
                          pmeGpu->archSpecific->pmeStream_,
                          pmeGpu->settings.transferKind,
                          nullptr);
@@ -1704,7 +1706,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
     }
 }
 
-DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
+void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
     {
@@ -1712,7 +1714,7 @@ DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu)
     }
     else
     {
-        return DeviceBuffer<gmx::RVec>{};
+        return nullptr;
     }
 }
 
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index 7baa6bd3475eb1a79ff5e7f498c0554a7eb5f735..41b912e2b6a7385ac8ee102ebe3ab198b52e043f 100644
@@ -405,8 +405,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+        GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h
index e2c067390ad2c82f28d514e1951ba00d0a3595d9..abf7a17ed002900ab17583d96f7256565b656ebb 100644
@@ -171,7 +171,7 @@ struct PmeGpuAtomParams
      * The forces change and need to be copied from (and possibly to) the GPU for every PME
      * computation, but reallocation happens only at DD.
      */
-    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_forces;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_forces;
     /*! \brief Global GPU memory array handle with ivec atom gridline indices.
      * Computed on GPU in the spline calculation part.
      */
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 5a01c898c8a2c98b0c7d8a7097751d7e1a3efa33..c895975144b5af2b2e80fc76e3713a157bbe8e40 100644
@@ -445,7 +445,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                     // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
                     pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
                             stateGpu->getCoordinates());
-                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(pme_gpu_get_device_f(pme));
+                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(
+                            reinterpret_cast<rvec*>(pme_gpu_get_device_f(pme)));
                 }
             }
 
@@ -582,13 +583,8 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
         if (pme_pp->useGpuDirectComm)
         {
             // Data will be transferred directly from GPU.
-            DeviceBuffer<gmx::RVec> gmx_unused d_f = pme_gpu_get_device_f(&pme);
-#    if GMX_GPU_CUDA
-            // OpenCL does not allow host-side pointer arithmetic on buffers. Neither does SYCL.
-            sendbuf = reinterpret_cast<void*>(&d_f[ind_start]);
-#    else
-            GMX_RELEASE_ASSERT(false, "Can only use GPU Direct Communications with CUDA");
-#    endif
+            rvec* d_f = reinterpret_cast<rvec*>(pme_gpu_get_device_f(&pme));
+            sendbuf   = reinterpret_cast<void*>(&d_f[ind_start]);
         }
         sendFToPP(sendbuf, receiver, pme_pp, &messages);
     }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 886e0c221b8f8a456b99ef4e40fcea45070da3e9..3e56da9af3e8604c3030e7b04772a18be7b62c83 100644
@@ -44,7 +44,6 @@
 
 #include <memory>
 
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
@@ -100,7 +99,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
+    void* getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
index d31b976c10f5a9de691420b747c26d9c0574d8ec..c5f92aa53f01cbba57da1682e4df4d28a60a2836 100644
@@ -102,12 +102,12 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
                "implementation.");
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+void* PmePpCommGpu::getGpuForceStagingPtr()
 {
     GMX_ASSERT(!impl_,
                "A CPU stub for PME-PP GPU communication was called instead of the correct "
                "implementation.");
-    return DeviceBuffer<gmx::RVec>{};
+    return nullptr;
 }
 
 GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer()
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index 0d66883979cc4b90bea44e5ef7cc237c7017ae98..0ecf0281333003926fb8a875fd61eb60ca4337ff 100644
@@ -155,9 +155,9 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
     GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
 }
-DeviceBuffer<gmx::RVec> PmePpCommGpu::Impl::getGpuForceStagingPtr()
+void* PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
-    return d_pmeForces_;
+    return static_cast<void*>(d_pmeForces_);
 }
 
 GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
@@ -194,7 +194,7 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void*                 sendPtr,
             sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
 }
 
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+void* PmePpCommGpu::getGpuForceStagingPtr()
 {
     return impl_->getGpuForceStagingPtr();
 }
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
index 58e22c2e46b832fbf1f2e65d0c1b8d5f08d3776d..042891a04d3aaf044914e9cbf666c4c3254712f1 100644
@@ -44,7 +44,6 @@
 #define GMX_PME_PP_COMM_GPU_IMPL_H
 
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/gmxmpi.h"
@@ -111,7 +110,7 @@ public:
     /*! \brief
      * Return pointer to buffer used for staging PME force on GPU
      */
-    DeviceBuffer<gmx::RVec> getGpuForceStagingPtr();
+    void* getGpuForceStagingPtr();
 
     /*! \brief
      * Return pointer to event recorded when forces are ready
@@ -124,15 +123,15 @@ private:
     //! Handle for CUDA stream used for the communication operations in this class
     const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
-    DeviceBuffer<gmx::RVec> remotePmeXBuffer_ = nullptr;
+    void* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
-    DeviceBuffer<gmx::RVec> remotePmeFBuffer_ = nullptr;
+    void* remotePmeFBuffer_ = nullptr;
     //! communicator for simulation
     MPI_Comm comm_;
     //! Rank of PME task
     int pmeRank_ = -1;
     //! Buffer for staging PME force on GPU
-    DeviceBuffer<gmx::RVec> d_pmeForces_ = nullptr;
+    rvec* d_pmeForces_ = nullptr;
     //! number of atoms in PME force staging array
     int d_pmeForcesSize_ = -1;
     //! number of atoms allocated in recvbuf array
diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h
index 82e8db33f56e3c289c41fcfeb7633b47a7c85839..157c4c7eca5a2be662f7a47ce7dc4d287803fe51 100644
@@ -86,13 +86,13 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerNbnxmForce(DeviceBuffer<RVec> forcePtr);
+    void registerNbnxmForce(void* forcePtr);
 
     /*! \brief Register a rvec-format force to be reduced
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerRvecForce(DeviceBuffer<RVec> forcePtr);
+    void registerRvecForce(void* forcePtr);
 
     /*! \brief Add a dependency for this force reduction
      *
@@ -102,14 +102,14 @@ public:
 
     /*! \brief Reinitialize the GPU force reduction
      *
-     * \param [in] baseForce        Pointer to force to be used as a base
+     * \param [in] baseForcePtr     Pointer to force to be used as a base
      * \param [in] numAtoms         The number of atoms
      * \param [in] cell             Pointer to the cell array
      * \param [in] atomStart        The start atom for the reduction
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(DeviceBuffer<RVec>    baseForce,
+    void reinit(DeviceBuffer<RVec>    baseForcePtr,
                 int                   numAtoms,
                 ArrayRef<const int>   cell,
                 int                   atomStart,
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
index b69759b848a18e8b1477e79962ae6f59f5e1903d..1e0a30b2b6aba01c90831eaba7661d7866255e80 100644
@@ -76,13 +76,13 @@ void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
 
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerRvecForce(void* /* forcePtr */)
 {
     GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu
index ac89b47d24712f8bb8474b25fad098871809b185..6e1e7e920a50b2b91e8e2d26149beee45c6d1380 100644
@@ -43,7 +43,7 @@
 
 #include "gmxpre.h"
 
-#include "gpuforcereduction_impl.h"
+#include "gpuforcereduction_impl.cuh"
 
 #include <stdio.h>
 
@@ -112,15 +112,15 @@ GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
     deviceStream_(deviceStream),
     wcycle_(wcycle){};
 
-void GpuForceReduction::Impl::reinit(DeviceBuffer<gmx::RVec> baseForce,
-                                     const int               numAtoms,
-                                     ArrayRef<const int>     cell,
-                                     const int               atomStart,
-                                     const bool              accumulate,
-                                     GpuEventSynchronizer*   completionMarker)
+void GpuForceReduction::Impl::reinit(float3*               baseForcePtr,
+                                     const int             numAtoms,
+                                     ArrayRef<const int>   cell,
+                                     const int             atomStart,
+                                     const bool            accumulate,
+                                     GpuEventSynchronizer* completionMarker)
 {
-    GMX_ASSERT((baseForce != nullptr), "Input base force for reduction has no data");
-    baseForce_        = baseForce;
+    GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
+    baseForce_        = &(baseForcePtr[atomStart]);
     numAtoms_         = numAtoms;
     atomStart_        = atomStart;
     accumulate_       = accumulate;
@@ -144,13 +144,13 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer<gmx::RVec> baseForce,
 
 void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT((forcePtr), "Input force for reduction has no data");
+    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
     nbnxmForceToAdd_ = forcePtr;
 };
 
 void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT((forcePtr), "Input force for reduction has no data");
+    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
     rvecForceToAdd_ = forcePtr;
 };
 
@@ -172,12 +172,11 @@ void GpuForceReduction::Impl::execute()
     GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data");
 
     // Enqueue wait on all dependencies passed
-    for (const auto& synchronizer : dependencyList_)
+    for (auto const synchronizer : dependencyList_)
     {
         synchronizer->enqueueWaitEvent(deviceStream_);
     }
 
-    float3* d_baseForce      = &(asFloat3(baseForce_)[atomStart_]);
     float3* d_nbnxmForce     = asFloat3(nbnxmForceToAdd_);
     float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]);
 
@@ -196,7 +195,7 @@ void GpuForceReduction::Impl::execute()
                             : (accumulate_ ? reduceKernel<false, true> : reduceKernel<false, false>);
 
     const auto kernelArgs = prepareGpuKernelArguments(
-            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &d_baseForce, &cellInfo_.d_cell, &numAtoms_);
+            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &baseForce_, &cellInfo_.d_cell, &numAtoms_);
 
     launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs);
 
@@ -219,14 +218,14 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
 {
 }
 
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
+void GpuForceReduction::registerNbnxmForce(void* forcePtr)
 {
-    impl_->registerNbnxmForce(std::move(forcePtr));
+    impl_->registerNbnxmForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
 }
 
-void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> forcePtr)
+void GpuForceReduction::registerRvecForce(void* forcePtr)
 {
-    impl_->registerRvecForce(std::move(forcePtr));
+    impl_->registerRvecForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
 }
 
 void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
@@ -234,14 +233,14 @@ void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
     impl_->addDependency(dependency);
 }
 
-void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForce,
+void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
                                const int             numAtoms,
                                ArrayRef<const int>   cell,
                                const int             atomStart,
                                const bool            accumulate,
                                GpuEventSynchronizer* completionMarker)
 {
-    impl_->reinit(baseForce, numAtoms, cell, atomStart, accumulate, completionMarker);
+    impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker);
 }
 void GpuForceReduction::execute()
 {
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.h b/src/gromacs/mdlib/gpuforcereduction_impl.cuh
similarity index 93%
rename from src/gromacs/mdlib/gpuforcereduction_impl.h
rename to src/gromacs/mdlib/gpuforcereduction_impl.cuh
index 491fd95f21126f4123db83e4e349d46920bc7102..bd222e40a643a01296b1ec222b9f25204af3c3d8 100644
@@ -58,7 +58,7 @@ struct cellInfo
     //! cell index mapping for any nbat-format forces
     const int* cell = nullptr;
     //! device copy of cell index mapping for any nbat-format forces
-    DeviceBuffer<int> d_cell;
+    int* d_cell = nullptr;
     //! number of atoms in cell array
     int cellSize = -1;
     //! number of atoms allocated in cell array
@@ -75,7 +75,7 @@ public:
      * \param [in] deviceContext GPU device context
      * \param [in] wcycle        The wallclock counter
      */
-    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
+    Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
     ~Impl();
 
     /*! \brief Register a nbnxm-format force to be reduced
@@ -98,14 +98,14 @@ public:
 
     /*! \brief Reinitialize the GPU force reduction
      *
-     * \param [in] baseForce        Pointer to force to be used as a base
+     * \param [in] baseForcePtr     Pointer to force to be used as a base
      * \param [in] numAtoms         The number of atoms
      * \param [in] cell             Pointer to the cell array
      * \param [in] atomStart        The start atom for the reduction
      * \param [in] accumulate       Whether reduction should be accumulated
      * \param [in] completionMarker Event to be marked when launch of reduction is complete
      */
-    void reinit(DeviceBuffer<RVec>    baseForce,
+    void reinit(float3*               baseForcePtr,
                 const int             numAtoms,
                 ArrayRef<const int>   cell,
                 const int             atomStart,
@@ -117,13 +117,13 @@ public:
 
 private:
     //! force to be used as a base for this reduction
-    DeviceBuffer<RVec> baseForce_;
+    float3* baseForce_ = nullptr;
     //! starting atom
     int atomStart_ = 0;
     //! number of atoms
     int numAtoms_ = 0;
     //! whether reduction is accumulated into base force buffer
-    bool accumulate_ = true;
+    int accumulate_ = true;
     //! cell information for any nbat-format forces
     struct cellInfo cellInfo_;
     //! GPU context object
@@ -133,9 +133,9 @@ private:
     //! stream to be used for this reduction
     const DeviceStream& deviceStream_;
     //! Nbnxm force to be added in this reduction
-    DeviceBuffer<RVec> nbnxmForceToAdd_;
+    DeviceBuffer<RVec> nbnxmForceToAdd_ = nullptr;
     //! Rvec-format force to be added in this reduction
-    DeviceBuffer<RVec> rvecForceToAdd_;
+    DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
     //! event to be marked when reduction launch has been completed
     GpuEventSynchronizer* completionMarker_ = nullptr;
     //! The wallclock counter
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index f309e82c26864b9df5f9b0ea7dacafcf4d3465c8..0bcf0c5451e9eb77b6532f5ec1e826516099db23 100644
@@ -1115,10 +1115,9 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
     if (runScheduleWork->simulationWork.useGpuPme
         && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
     {
-        DeviceBuffer<gmx::RVec> forcePtr =
-                thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
-                                              :                    // PME force buffer on same GPU
-                        fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+        void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
+                                                       : // PME force buffer on same GPU
+                                 fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
         fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
 
         GpuEventSynchronizer* const pmeSynchronizer =
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index 831bbfb3c315f30c76c2b48b61ddddf97a17d988..f03ab778c3951b84a1f83ee7243bc07c96ed5952 100644
@@ -60,7 +60,6 @@
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/leapfrog_gpu.h"
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 34712d494158952ed521484afd2620bd316c14b4..8e101b8cd0ed51ffb1c73b474c3b000ea09bd138 100644
 #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
 #define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
 
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+#include "gromacs/mdlib/lincs_gpu.cuh"
+#include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/inputrec.h"
-#include "gromacs/pbcutil/pbc_aiuc.h"
-
-#if GMX_GPU_CUDA
-#    include "gromacs/gpu_utils/gputraits.cuh"
-#endif
-
-class GpuEventSynchronizer;
-namespace gmx
-{
-class LincsGpu;
-class SettleGpu;
-class LeapFrogGpu;
-} // namespace gmx
 
 namespace gmx
 {
@@ -201,14 +193,14 @@ private:
     int numAtoms_;
 
     //! Local copy of the pointer to the device positions buffer
-    DeviceBuffer<float3> d_x_;
+    float3* d_x_;
     //! Local copy of the pointer to the device velocities buffer
-    DeviceBuffer<float3> d_v_;
+    float3* d_v_;
     //! Local copy of the pointer to the device forces buffer
-    DeviceBuffer<float3> d_f_;
+    float3* d_f_;
 
     //! Device buffer for intermediate positions (maintained internally)
-    DeviceBuffer<float3> d_xp_;
+    float3* d_xp_;
     //! Number of elements in shifted coordinates buffer
     int numXp_ = -1;
     //! Allocation size for the shifted coordinates buffer
@@ -216,7 +208,7 @@ private:
 
 
     //! 1/mass for all atoms (GPU)
-    DeviceBuffer<real> d_inverseMasses_;
+    real* d_inverseMasses_;
     //! Number of elements in reciprocal masses buffer
     int numInverseMasses_ = -1;
     //! Allocation size for the reciprocal masses buffer
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e30f8e8c8762a42b094a96431931a51848e8f606..6ed6c1ff78429803c928bc435ca838883e5ac680 100644
@@ -907,7 +907,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
-DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu* nb)
+void* getGpuForces(NbnxmGpu* nb)
 {
     return nb->atdat->f;
 }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index c96291a1c2d99aa5ba7e0d71499c27c7a3373f3f..96714bb15454d7397a23e8f0fd4acd0b4640aaa6 100644
@@ -215,7 +215,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const
     return numAtoms;
 }
 
-DeviceBuffer<gmx::RVec> nonbonded_verlet_t::getGpuForces() const
+void* nonbonded_verlet_t::getGpuForces() const
 {
     return Nbnxm::getGpuForces(gpu_nbv);
 }
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 9732b01a39ec2fbac49e59fdddbaaa67a7f544d6..7ea59e4e5a063c7dfd598df4c546b01415f10df1 100644
@@ -398,7 +398,7 @@ public:
      *
      * \returns A pointer to the force buffer in GPU memory
      */
-    DeviceBuffer<gmx::RVec> getGpuForces() const;
+    void* getGpuForces() const;
 
     //! Return the kernel setup
     const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index c3fdb36a19eaa4daf3bfbcf6d0846fca9b959cd4..fe836969347c3e33d2f141f3c3923eb548969efc 100644
@@ -315,8 +315,7 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM;
  * \returns       A pointer to the force buffer in GPU memory
  */
 CUDA_FUNC_QUALIFIER
-DeviceBuffer<gmx::RVec> getGpuForces(NbnxmGpu gmx_unused* nb)
-        CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
+void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
 
 } // namespace Nbnxm
 #endif