From af60a72fe677d8fe5de06c2e5dbd8f4d129df0ba Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Wed, 10 Mar 2021 09:06:41 +0000 Subject: [PATCH] Revert "Wrap more device pointers in DeviceBuffer" (!1244) This reverts commit 850429f3ebe34d27dabed6d8c31c08968befd1f5, which broke some MPI functionality, as revealed in post-merge tests. --- src/gromacs/ewald/pme.h | 6 +-- src/gromacs/ewald/pme_force_sender_gpu.h | 3 +- .../ewald/pme_force_sender_gpu_impl.cpp | 2 +- .../ewald/pme_force_sender_gpu_impl.cu | 5 +-- src/gromacs/ewald/pme_force_sender_gpu_impl.h | 3 +- src/gromacs/ewald/pme_gather.cu | 2 +- src/gromacs/ewald/pme_gpu.cpp | 4 +- src/gromacs/ewald/pme_gpu_internal.cpp | 16 ++++---- src/gromacs/ewald/pme_gpu_internal.h | 4 +- src/gromacs/ewald/pme_gpu_types.h | 2 +- src/gromacs/ewald/pme_only.cpp | 12 ++---- src/gromacs/ewald/pme_pp_comm_gpu.h | 3 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp | 4 +- src/gromacs/ewald/pme_pp_comm_gpu_impl.cu | 6 +-- src/gromacs/ewald/pme_pp_comm_gpu_impl.h | 9 ++--- src/gromacs/mdlib/gpuforcereduction.h | 8 ++-- src/gromacs/mdlib/gpuforcereduction_impl.cpp | 4 +- src/gromacs/mdlib/gpuforcereduction_impl.cu | 39 +++++++++---------- ...tion_impl.h => gpuforcereduction_impl.cuh} | 16 ++++---- src/gromacs/mdlib/sim_util.cpp | 7 ++-- .../mdlib/update_constrain_gpu_impl.cu | 1 - src/gromacs/mdlib/update_constrain_gpu_impl.h | 30 ++++++-------- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 2 +- src/gromacs/nbnxm/nbnxm.cpp | 2 +- src/gromacs/nbnxm/nbnxm.h | 2 +- src/gromacs/nbnxm/nbnxm_gpu.h | 3 +- 26 files changed, 88 insertions(+), 107 deletions(-) rename src/gromacs/mdlib/{gpuforcereduction_impl.h => gpuforcereduction_impl.cuh} (93%) diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index d4a591b7bf..3f2d33f6c6 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -467,12 +467,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t* GPU_FUNC_AR * \param[in] pme The PME data structure. * \returns Pointer to force data */ -GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) - GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); +GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) + GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion * \param[in] pme The PME data structure. - * \returns Pointer to synchronizer + * \returns Pointer to sychronizer */ GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h index ec88e8569b..bcc3b1e393 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu.h +++ b/src/gromacs/ewald/pme_force_sender_gpu.h @@ -45,7 +45,6 @@ #include #include "gromacs/math/vectypes.h" -#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/utility/gmxmpi.h" class DeviceStream; @@ -84,7 +83,7 @@ public: * Initialization of GPU PME Force sender * \param[in] d_f force buffer in GPU memory */ - void sendForceBufferAddressToPpRanks(DeviceBuffer d_f); + void sendForceBufferAddressToPpRanks(rvec* d_f); /*! \brief * Send PP data to PP rank diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp index 88cc5ca11c..915d0953ed 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp @@ -75,7 +75,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */, PmeForceSenderGpu::~PmeForceSenderGpu() = default; /*!\brief init PME-PP GPU communication stub */ -void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer /* d_f */) +void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* /* d_f */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication initialization was called instead of the " diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index 972c1c7d57..07d37dcd7e 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -48,7 +48,6 @@ #include "config.h" #include "gromacs/gpu_utils/cudautils.cuh" -#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/utility/gmxmpi.h" @@ -69,7 +68,7 @@ PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx: PmeForceSenderGpu::Impl::~Impl() = default; /*! \brief sends force buffer address to PP ranks */ -void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer d_f) +void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f) { int ind_start = 0; int ind_end = 0; @@ -116,7 +115,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream, PmeForceSenderGpu::~PmeForceSenderGpu() = default; -void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer d_f) +void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* d_f) { impl_->sendForceBufferAddressToPpRanks(d_f); } diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index f0b0f74bad..70be40cc7f 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -44,7 +44,6 @@ #define GMX_PMEFORCESENDERGPU_IMPL_H #include "gromacs/ewald/pme_force_sender_gpu.h" -#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/utility/arrayref.h" @@ -69,7 +68,7 @@ public: * sends force buffer address to PP rank * \param[in] d_f force buffer in GPU memory */ - void sendForceBufferAddressToPpRanks(DeviceBuffer d_f); + void sendForceBufferAddressToPpRanks(rvec* d_f); /*! \brief * Send PP data to PP rank diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 82eeed85a1..35111b52ec 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -351,7 +351,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1]; const float* __restrict__ gm_gridA = kernelParams.grid.d_realGrid[0]; const float* __restrict__ gm_gridB = kernelParams.grid.d_realGrid[1]; - float* __restrict__ gm_forces = reinterpret_cast(kernelParams.atoms.d_forces); + float* __restrict__ gm_forces = kernelParams.atoms.d_forces; /* Global memory pointers for readGlobal */ const float* __restrict__ gm_theta = kernelParams.atoms.d_theta; diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 225fb1050a..e0c32e207d 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -440,11 +440,11 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle) wallcycle_stop(wcycle, ewcLAUNCH_GPU); } -DeviceBuffer pme_gpu_get_device_f(const gmx_pme_t* pme) +void* pme_gpu_get_device_f(const gmx_pme_t* pme) { if (!pme || !pme_gpu_active(pme)) { - return DeviceBuffer{}; + return nullptr; } return pme_gpu_get_kernelparam_forces(pme->gpu); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index c0d422d572..028a66a35a 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -229,7 +229,7 @@ void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu) void pme_gpu_realloc_forces(PmeGpu* pmeGpu) { - const size_t newForcesSize = pmeGpu->nAtomsAlloc; + const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM; GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize, @@ -248,10 +248,11 @@ void pme_gpu_free_forces(const PmeGpu* pmeGpu) void pme_gpu_copy_input_forces(PmeGpu* pmeGpu) { GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); + float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, - pmeGpu->staging.h_forces.data(), + h_forcesFloat, 0, - pmeGpu->kernelParams->atoms.nAtoms, + DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); @@ -260,10 +261,11 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu) void pme_gpu_copy_output_forces(PmeGpu* pmeGpu) { GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); - copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(), + float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); + copyFromDeviceBuffer(h_forcesFloat, &pmeGpu->kernelParams->atoms.d_forces, 0, - pmeGpu->kernelParams->atoms.nAtoms, + DIM * pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); @@ -1704,7 +1706,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda) } } -DeviceBuffer pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) +void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) { if (pmeGpu && pmeGpu->kernelParams) { @@ -1712,7 +1714,7 @@ DeviceBuffer pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) } else { - return DeviceBuffer{}; + return nullptr; } } diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 7baa6bd347..41b912e2b6 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -405,8 +405,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to force data */ -GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) - GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); +GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) + GPU_FUNC_TERM_WITH_RETURN(nullptr); /*! \brief Return pointer to the sync object triggered after the PME force calculation completion * \param[in] pmeGpu The PME GPU structure. diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index e2c067390a..abf7a17ed0 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -171,7 +171,7 @@ struct PmeGpuAtomParams * The forces change and need to be copied from (and possibly to) the GPU for every PME * computation, but reallocation happens only at DD. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_forces; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_forces; /*! \brief Global GPU memory array handle with ivec atom gridline indices. * Computed on GPU in the spline calculation part. */ diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 5a01c898c8..c895975144 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -445,7 +445,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses. pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks( stateGpu->getCoordinates()); - pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(pme_gpu_get_device_f(pme)); + pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks( + reinterpret_cast(pme_gpu_get_device_f(pme))); } } @@ -582,13 +583,8 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme, if (pme_pp->useGpuDirectComm) { // Data will be transferred directly from GPU. - DeviceBuffer gmx_unused d_f = pme_gpu_get_device_f(&pme); -# if GMX_GPU_CUDA - // OpenCL does not allow host-side pointer arithmetic on buffers. Neither does SYCL. - sendbuf = reinterpret_cast(&d_f[ind_start]); -# else - GMX_RELEASE_ASSERT(false, "Can only use GPU Direct Communications with CUDA"); -# endif + rvec* d_f = reinterpret_cast(pme_gpu_get_device_f(&pme)); + sendbuf = reinterpret_cast(&d_f[ind_start]); } sendFToPP(sendbuf, receiver, pme_pp, &messages); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h index 886e0c221b..3e56da9af3 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu.h @@ -44,7 +44,6 @@ #include -#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/utility/gmxmpi.h" class DeviceContext; @@ -100,7 +99,7 @@ public: /*! \brief * Return pointer to buffer used for staging PME force on GPU */ - DeviceBuffer getGpuForceStagingPtr(); + void* getGpuForceStagingPtr(); /*! \brief * Return pointer to event recorded when forces are ready diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp index d31b976c10..c5f92aa53f 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp @@ -102,12 +102,12 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */, "implementation."); } -DeviceBuffer PmePpCommGpu::getGpuForceStagingPtr() +void* PmePpCommGpu::getGpuForceStagingPtr() { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " "implementation."); - return DeviceBuffer{}; + return nullptr; } GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer() diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 0d66883979..0ecf028133 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -155,9 +155,9 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr, GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent); #endif } -DeviceBuffer PmePpCommGpu::Impl::getGpuForceStagingPtr() +void* PmePpCommGpu::Impl::getGpuForceStagingPtr() { - return d_pmeForces_; + return static_cast(d_pmeForces_); } GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer() @@ -194,7 +194,7 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* sendPtr, sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent); } -DeviceBuffer PmePpCommGpu::getGpuForceStagingPtr() +void* PmePpCommGpu::getGpuForceStagingPtr() { return impl_->getGpuForceStagingPtr(); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index 58e22c2e46..042891a04d 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -44,7 +44,6 @@ #define GMX_PME_PP_COMM_GPU_IMPL_H #include "gromacs/ewald/pme_pp_comm_gpu.h" -#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/math/vectypes.h" #include "gromacs/utility/gmxmpi.h" @@ -111,7 +110,7 @@ public: /*! \brief * Return pointer to buffer used for staging PME force on GPU */ - DeviceBuffer getGpuForceStagingPtr(); + void* getGpuForceStagingPtr(); /*! \brief * Return pointer to event recorded when forces are ready @@ -124,15 +123,15 @@ private: //! Handle for CUDA stream used for the communication operations in this class const DeviceStream& pmePpCommStream_; //! Remote location of PME coordinate data buffer - DeviceBuffer remotePmeXBuffer_ = nullptr; + void* remotePmeXBuffer_ = nullptr; //! Remote location of PME force data buffer - DeviceBuffer remotePmeFBuffer_ = nullptr; + void* remotePmeFBuffer_ = nullptr; //! communicator for simulation MPI_Comm comm_; //! Rank of PME task int pmeRank_ = -1; //! Buffer for staging PME force on GPU - DeviceBuffer d_pmeForces_ = nullptr; + rvec* d_pmeForces_ = nullptr; //! number of atoms in PME force staging array int d_pmeForcesSize_ = -1; //! number of atoms allocated in recvbuf array diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h index 82e8db33f5..157c4c7eca 100644 --- a/src/gromacs/mdlib/gpuforcereduction.h +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -86,13 +86,13 @@ public: * * \param [in] forcePtr Pointer to force to be reduced */ - void registerNbnxmForce(DeviceBuffer forcePtr); + void registerNbnxmForce(void* forcePtr); /*! \brief Register a rvec-format force to be reduced * * \param [in] forcePtr Pointer to force to be reduced */ - void registerRvecForce(DeviceBuffer forcePtr); + void registerRvecForce(void* forcePtr); /*! \brief Add a dependency for this force reduction * @@ -102,14 +102,14 @@ public: /*! \brief Reinitialize the GPU force reduction * - * \param [in] baseForce Pointer to force to be used as a base + * \param [in] baseForcePtr Pointer to force to be used as a base * \param [in] numAtoms The number of atoms * \param [in] cell Pointer to the cell array * \param [in] atomStart The start atom for the reduction * \param [in] accumulate Whether reduction should be accumulated * \param [in] completionMarker Event to be marked when launch of reduction is complete */ - void reinit(DeviceBuffer baseForce, + void reinit(DeviceBuffer baseForcePtr, int numAtoms, ArrayRef cell, int atomStart, diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index b69759b848..1e0a30b2b6 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -76,13 +76,13 @@ void GpuForceReduction::reinit(DeviceBuffer /*baseForcePtr*/, } // NOLINTNEXTLINE readability-convert-member-functions-to-static -void GpuForceReduction::registerNbnxmForce(DeviceBuffer /* forcePtr */) +void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */) { GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); } // NOLINTNEXTLINE readability-convert-member-functions-to-static -void GpuForceReduction::registerRvecForce(DeviceBuffer /* forcePtr */) +void GpuForceReduction::registerRvecForce(void* /* forcePtr */) { GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); } diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu index ac89b47d24..6e1e7e920a 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -43,7 +43,7 @@ #include "gmxpre.h" -#include "gpuforcereduction_impl.h" +#include "gpuforcereduction_impl.cuh" #include @@ -112,15 +112,15 @@ GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, deviceStream_(deviceStream), wcycle_(wcycle){}; -void GpuForceReduction::Impl::reinit(DeviceBuffer baseForce, - const int numAtoms, - ArrayRef cell, - const int atomStart, - const bool accumulate, - GpuEventSynchronizer* completionMarker) +void GpuForceReduction::Impl::reinit(float3* baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker) { - GMX_ASSERT((baseForce != nullptr), "Input base force for reduction has no data"); - baseForce_ = baseForce; + GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data"); + baseForce_ = &(baseForcePtr[atomStart]); numAtoms_ = numAtoms; atomStart_ = atomStart; accumulate_ = accumulate; @@ -144,13 +144,13 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer baseForce, void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer forcePtr) { - GMX_ASSERT((forcePtr), "Input force for reduction has no data"); + GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); nbnxmForceToAdd_ = forcePtr; }; void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer forcePtr) { - GMX_ASSERT((forcePtr), "Input force for reduction has no data"); + GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); rvecForceToAdd_ = forcePtr; }; @@ -172,12 +172,11 @@ void GpuForceReduction::Impl::execute() GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data"); // Enqueue wait on all dependencies passed - for (const auto& synchronizer : dependencyList_) + for (auto const synchronizer : dependencyList_) { synchronizer->enqueueWaitEvent(deviceStream_); } - float3* d_baseForce = &(asFloat3(baseForce_)[atomStart_]); float3* d_nbnxmForce = asFloat3(nbnxmForceToAdd_); float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]); @@ -196,7 +195,7 @@ void GpuForceReduction::Impl::execute() : (accumulate_ ? reduceKernel : reduceKernel); const auto kernelArgs = prepareGpuKernelArguments( - kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &d_baseForce, &cellInfo_.d_cell, &numAtoms_); + kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &baseForce_, &cellInfo_.d_cell, &numAtoms_); launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs); @@ -219,14 +218,14 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, { } -void GpuForceReduction::registerNbnxmForce(DeviceBuffer forcePtr) +void GpuForceReduction::registerNbnxmForce(void* forcePtr) { - impl_->registerNbnxmForce(std::move(forcePtr)); + impl_->registerNbnxmForce(reinterpret_cast>(forcePtr)); } -void GpuForceReduction::registerRvecForce(DeviceBuffer forcePtr) +void GpuForceReduction::registerRvecForce(void* forcePtr) { - impl_->registerRvecForce(std::move(forcePtr)); + impl_->registerRvecForce(reinterpret_cast>(forcePtr)); } void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) @@ -234,14 +233,14 @@ void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) impl_->addDependency(dependency); } -void GpuForceReduction::reinit(DeviceBuffer baseForce, +void GpuForceReduction::reinit(DeviceBuffer baseForcePtr, const int numAtoms, ArrayRef cell, const int atomStart, const bool accumulate, GpuEventSynchronizer* completionMarker) { - impl_->reinit(baseForce, numAtoms, cell, atomStart, accumulate, completionMarker); + impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker); } void GpuForceReduction::execute() { diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.h b/src/gromacs/mdlib/gpuforcereduction_impl.cuh similarity index 93% rename from src/gromacs/mdlib/gpuforcereduction_impl.h rename to src/gromacs/mdlib/gpuforcereduction_impl.cuh index 491fd95f21..bd222e40a6 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.h +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh @@ -58,7 +58,7 @@ struct cellInfo //! cell index mapping for any nbat-format forces const int* cell = nullptr; //! device copy of cell index mapping for any nbat-format forces - DeviceBuffer d_cell; + int* d_cell = nullptr; //! number of atoms in cell array int cellSize = -1; //! number of atoms allocated in cell array @@ -75,7 +75,7 @@ public: * \param [in] deviceContext GPU device context * \param [in] wcycle The wallclock counter */ - Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle); + Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle); ~Impl(); /*! \brief Register a nbnxm-format force to be reduced @@ -98,14 +98,14 @@ public: /*! \brief Reinitialize the GPU force reduction * - * \param [in] baseForce Pointer to force to be used as a base + * \param [in] baseForcePtr Pointer to force to be used as a base * \param [in] numAtoms The number of atoms * \param [in] cell Pointer to the cell array * \param [in] atomStart The start atom for the reduction * \param [in] accumulate Whether reduction should be accumulated * \param [in] completionMarker Event to be marked when launch of reduction is complete */ - void reinit(DeviceBuffer baseForce, + void reinit(float3* baseForcePtr, const int numAtoms, ArrayRef cell, const int atomStart, @@ -117,13 +117,13 @@ public: private: //! force to be used as a base for this reduction - DeviceBuffer baseForce_; + float3* baseForce_ = nullptr; //! starting atom int atomStart_ = 0; //! number of atoms int numAtoms_ = 0; //! whether reduction is accumulated into base force buffer - bool accumulate_ = true; + int accumulate_ = true; //! cell information for any nbat-format forces struct cellInfo cellInfo_; //! GPU context object @@ -133,9 +133,9 @@ private: //! stream to be used for this reduction const DeviceStream& deviceStream_; //! Nbnxm force to be added in this reduction - DeviceBuffer nbnxmForceToAdd_; + DeviceBuffer nbnxmForceToAdd_ = nullptr; //! Rvec-format force to be added in this reduction - DeviceBuffer rvecForceToAdd_; + DeviceBuffer rvecForceToAdd_ = nullptr; //! event to be marked when redcution launch has been completed GpuEventSynchronizer* completionMarker_ = nullptr; //! The wallclock counter diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index f309e82c26..0bcf0c5451 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1115,10 +1115,9 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, if (runScheduleWork->simulationWork.useGpuPme && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication)) { - DeviceBuffer forcePtr = - thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) - : // PME force buffer on same GPU - fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU + void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) + : // PME force buffer on same GPU + fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr); GpuEventSynchronizer* const pmeSynchronizer = diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index 831bbfb3c3..f03ab778c3 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -60,7 +60,6 @@ #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/devicebuffer.h" -#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/mdlib/leapfrog_gpu.h" diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index 34712d4941..8e101b8cd0 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -46,22 +46,14 @@ #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H #define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H -#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gmxpre.h" + +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#include "gromacs/mdlib/leapfrog_gpu.h" +#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/inputrec.h" -#include "gromacs/pbcutil/pbc_aiuc.h" - -#if GMX_GPU_CUDA -# include "gromacs/gpu_utils/gputraits.cuh" -#endif - -class GpuEventSynchronizer; -namespace gmx -{ -class LincsGpu; -class SettleGpu; -class LeapFrogGpu; -} // namespace gmx namespace gmx { @@ -201,14 +193,14 @@ private: int numAtoms_; //! Local copy of the pointer to the device positions buffer - DeviceBuffer d_x_; + float3* d_x_; //! Local copy of the pointer to the device velocities buffer - DeviceBuffer d_v_; + float3* d_v_; //! Local copy of the pointer to the device forces buffer - DeviceBuffer d_f_; + float3* d_f_; //! Device buffer for intermediate positions (maintained internally) - DeviceBuffer d_xp_; + float3* d_xp_; //! Number of elements in shifted coordinates buffer int numXp_ = -1; //! Allocation size for the shifted coordinates buffer @@ -216,7 +208,7 @@ private: //! 1/mass for all atoms (GPU) - DeviceBuffer d_inverseMasses_; + real* d_inverseMasses_; //! Number of elements in reciprocal masses buffer int numInverseMasses_ = -1; //! Allocation size for the reciprocal masses buffer diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index e30f8e8c87..6ed6c1ff78 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -907,7 +907,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); } -DeviceBuffer getGpuForces(NbnxmGpu* nb) +void* getGpuForces(NbnxmGpu* nb) { return nb->atdat->f; } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index c96291a1c2..96714bb154 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -215,7 +215,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const return numAtoms; } -DeviceBuffer nonbonded_verlet_t::getGpuForces() const +void* nonbonded_verlet_t::getGpuForces() const { return Nbnxm::getGpuForces(gpu_nbv); } diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 9732b01a39..7ea59e4e5a 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -398,7 +398,7 @@ public: * * \returns A pointer to the force buffer in GPU memory */ - DeviceBuffer getGpuForces() const; + void* getGpuForces() const; //! Return the kernel setup const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index c3fdb36a19..fe83696934 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -315,8 +315,7 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM; * \returns A pointer to the force buffer in GPU memory */ CUDA_FUNC_QUALIFIER -DeviceBuffer getGpuForces(NbnxmGpu gmx_unused* nb) - CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); +void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); } // namespace Nbnxm #endif -- 2.22.0