From 850429f3ebe34d27dabed6d8c31c08968befd1f5 Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko
Date: Wed, 10 Mar 2021 05:28:27 +0000
Subject: [PATCH] Wrap more device pointers in DeviceBuffer

Store and pass around more device pointers as DeviceBuffer instead of
raw or void pointers.

Related changes:

- PmeGpu->kernelParams->atoms.d_forces is now RVec, not float. Forces
  in most other places are RVec, so this makes the types consistent.
- GpuForceReduction::Impl::baseForce_ now stores a pointer to the
  beginning of the array, without the atomStart shift; the shift is
  applied when the reduction kernel is launched.
---
A simplified usage sketch of the new DeviceBuffer-based call pattern is
appended after the patch.

 src/gromacs/ewald/pme.h                       |  6 +--
 src/gromacs/ewald/pme_force_sender_gpu.h      |  3 +-
 .../ewald/pme_force_sender_gpu_impl.cpp       |  4 +-
 .../ewald/pme_force_sender_gpu_impl.cu        |  7 ++--
 src/gromacs/ewald/pme_force_sender_gpu_impl.h |  5 ++-
 src/gromacs/ewald/pme_gather.cu               |  2 +-
 src/gromacs/ewald/pme_gpu.cpp                 |  4 +-
 src/gromacs/ewald/pme_gpu_internal.cpp        | 16 ++++----
 src/gromacs/ewald/pme_gpu_internal.h          |  6 +--
 src/gromacs/ewald/pme_gpu_types.h             |  2 +-
 src/gromacs/ewald/pme_only.cpp                | 14 ++++---
 src/gromacs/ewald/pme_pp_comm_gpu.h           |  3 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp    |  6 +--
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     |  8 ++--
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h      | 11 ++---
 src/gromacs/mdlib/gpuforcereduction.h         |  8 ++--
 src/gromacs/mdlib/gpuforcereduction_impl.cpp  |  6 +--
 src/gromacs/mdlib/gpuforcereduction_impl.cu   | 41 ++++++++++---------
 ...tion_impl.cuh => gpuforcereduction_impl.h} | 18 ++++----
 src/gromacs/mdlib/sim_util.cpp                |  7 ++--
 .../mdlib/update_constrain_gpu_impl.cu        |  1 +
 src/gromacs/mdlib/update_constrain_gpu_impl.h | 30 +++++++++-----
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  2 +-
 src/gromacs/nbnxm/nbnxm.cpp                   |  2 +-
 src/gromacs/nbnxm/nbnxm.h                     |  2 +-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  3 +-
 26 files changed, 118 insertions(+), 99 deletions(-)
 rename src/gromacs/mdlib/{gpuforcereduction_impl.cuh => gpuforcereduction_impl.h} (92%)

diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index 3f2d33f6c6..d4a591b7bf 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -467,12 +467,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t* GPU_FUNC_AR
  * \param[in] pme            The PME data structure.
  * \returns                  Pointer to force data
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
+GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+        GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
- * \returns                  Pointer to sychronizer
+ * \returns                  Pointer to synchronizer
  */
 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
diff --git a/src/gromacs/ewald/pme_force_sender_gpu.h b/src/gromacs/ewald/pme_force_sender_gpu.h
index bcc3b1e393..ec88e8569b 100644
--- a/src/gromacs/ewald/pme_force_sender_gpu.h
+++ b/src/gromacs/ewald/pme_force_sender_gpu.h
@@ -45,6 +45,7 @@
 #include <memory>
 
 #include "gromacs/math/vectypes.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceStream;
@@ -83,7 +84,7 @@ public:
      * Initialization of GPU PME Force sender
      * \param[in] d_f   force buffer in GPU memory
      */
-    void sendForceBufferAddressToPpRanks(rvec* d_f);
+    void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
 
     /*!
\brief * Send PP data to PP rank diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp index a30384a8c2..88cc5ca11c 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -75,7 +75,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& /*pmeStream */, PmeForceSenderGpu::~PmeForceSenderGpu() = default; /*!\brief init PME-PP GPU communication stub */ -void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* /* d_f */) +void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer /* d_f */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication initialization was called instead of the " diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu index 6e6d21eaf2..972c1c7d57 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.cu +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -48,6 +48,7 @@ #include "config.h" #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/utility/gmxmpi.h" @@ -68,7 +69,7 @@ PmeForceSenderGpu::Impl::Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx: PmeForceSenderGpu::Impl::~Impl() = default; /*! \brief sends force buffer address to PP ranks */ -void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f) +void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer d_f) { int ind_start = 0; int ind_end = 0; @@ -115,7 +116,7 @@ PmeForceSenderGpu::PmeForceSenderGpu(const DeviceStream& pmeStream, PmeForceSenderGpu::~PmeForceSenderGpu() = default; -void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(rvec* d_f) +void PmeForceSenderGpu::sendForceBufferAddressToPpRanks(DeviceBuffer d_f) { impl_->sendForceBufferAddressToPpRanks(d_f); } diff --git a/src/gromacs/ewald/pme_force_sender_gpu_impl.h b/src/gromacs/ewald/pme_force_sender_gpu_impl.h index 91fe1c1140..f0b0f74bad 100644 --- a/src/gromacs/ewald/pme_force_sender_gpu_impl.h +++ b/src/gromacs/ewald/pme_force_sender_gpu_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -44,6 +44,7 @@ #define GMX_PMEFORCESENDERGPU_IMPL_H #include "gromacs/ewald/pme_force_sender_gpu.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/utility/arrayref.h" @@ -68,7 +69,7 @@ public: * sends force buffer address to PP rank * \param[in] d_f force buffer in GPU memory */ - void sendForceBufferAddressToPpRanks(rvec* d_f); + void sendForceBufferAddressToPpRanks(DeviceBuffer d_f); /*! \brief * Send PP data to PP rank diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 35111b52ec..82eeed85a1 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -351,7 +351,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1]; const float* __restrict__ gm_gridA = kernelParams.grid.d_realGrid[0]; const float* __restrict__ gm_gridB = kernelParams.grid.d_realGrid[1]; - float* __restrict__ gm_forces = kernelParams.atoms.d_forces; + float* __restrict__ gm_forces = reinterpret_cast(kernelParams.atoms.d_forces); /* Global memory pointers for readGlobal */ const float* __restrict__ gm_theta = kernelParams.atoms.d_theta; diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index e0c32e207d..225fb1050a 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -440,11 +440,11 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle) wallcycle_stop(wcycle, ewcLAUNCH_GPU); } -void* pme_gpu_get_device_f(const gmx_pme_t* pme) +DeviceBuffer pme_gpu_get_device_f(const gmx_pme_t* pme) { if (!pme || !pme_gpu_active(pme)) { - return nullptr; + return DeviceBuffer{}; } return pme_gpu_get_kernelparam_forces(pme->gpu); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 028a66a35a..c0d422d572 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -229,7 +229,7 @@ void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu) void pme_gpu_realloc_forces(PmeGpu* pmeGpu) { - const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM; + const size_t newForcesSize = pmeGpu->nAtomsAlloc; GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU"); reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize, @@ -248,11 +248,10 @@ void pme_gpu_free_forces(const PmeGpu* pmeGpu) void pme_gpu_copy_input_forces(PmeGpu* pmeGpu) { GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); - float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, - h_forcesFloat, + pmeGpu->staging.h_forces.data(), 0, - DIM * pmeGpu->kernelParams->atoms.nAtoms, + pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); @@ -261,11 +260,10 @@ void pme_gpu_copy_input_forces(PmeGpu* pmeGpu) void pme_gpu_copy_output_forces(PmeGpu* pmeGpu) { GMX_ASSERT(pmeGpu->kernelParams->atoms.nAtoms > 0, "Bad number of atoms in PME GPU"); - float* h_forcesFloat = reinterpret_cast(pmeGpu->staging.h_forces.data()); - copyFromDeviceBuffer(h_forcesFloat, + copyFromDeviceBuffer(pmeGpu->staging.h_forces.data(), &pmeGpu->kernelParams->atoms.d_forces, 0, - DIM * pmeGpu->kernelParams->atoms.nAtoms, + pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, 
nullptr); @@ -1706,7 +1704,7 @@ void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda) } } -void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) +DeviceBuffer pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) { if (pmeGpu && pmeGpu->kernelParams) { @@ -1714,7 +1712,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) } else { - return nullptr; + return DeviceBuffer{}; } } diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 632557d13e..7baa6bd347 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -405,8 +405,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to force data */ -GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) - GPU_FUNC_TERM_WITH_RETURN(nullptr); +GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) + GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); /*! \brief Return pointer to the sync object triggered after the PME force calculation completion * \param[in] pmeGpu The PME GPU structure. diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index abf7a17ed0..e2c067390a 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -171,7 +171,7 @@ struct PmeGpuAtomParams * The forces change and need to be copied from (and possibly to) the GPU for every PME * computation, but reallocation happens only at DD. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_forces; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_forces; /*! \brief Global GPU memory array handle with ivec atom gridline indices. * Computed on GPU in the spline calculation part. */ diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 75f6c101bb..5a01c898c8 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -4,7 +4,7 @@ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -445,8 +445,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses. 
pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks( stateGpu->getCoordinates()); - pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks( - reinterpret_cast(pme_gpu_get_device_f(pme))); + pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(pme_gpu_get_device_f(pme)); } } @@ -583,8 +582,13 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme, if (pme_pp->useGpuDirectComm) { // Data will be transferred directly from GPU. - rvec* d_f = reinterpret_cast(pme_gpu_get_device_f(&pme)); - sendbuf = reinterpret_cast(&d_f[ind_start]); + DeviceBuffer gmx_unused d_f = pme_gpu_get_device_f(&pme); +# if GMX_GPU_CUDA + // OpenCL does not allow host-side pointer arithmetic on buffers. Neither does SYCL. + sendbuf = reinterpret_cast(&d_f[ind_start]); +# else + GMX_RELEASE_ASSERT(false, "Can only use GPU Direct Communications with CUDA"); +# endif } sendFToPP(sendbuf, receiver, pme_pp, &messages); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h index 3e56da9af3..886e0c221b 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu.h @@ -44,6 +44,7 @@ #include +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/utility/gmxmpi.h" class DeviceContext; @@ -99,7 +100,7 @@ public: /*! \brief * Return pointer to buffer used for staging PME force on GPU */ - void* getGpuForceStagingPtr(); + DeviceBuffer getGpuForceStagingPtr(); /*! \brief * Return pointer to event recorded when forces are ready diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp index d19004e7ed..d31b976c10 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -102,12 +102,12 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */, "implementation."); } -void* PmePpCommGpu::getGpuForceStagingPtr() +DeviceBuffer PmePpCommGpu::getGpuForceStagingPtr() { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " "implementation."); - return nullptr; + return DeviceBuffer{}; } GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer() diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 62bffa4e1b..0d66883979 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -155,9 +155,9 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr, GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent); #endif } -void* PmePpCommGpu::Impl::getGpuForceStagingPtr() +DeviceBuffer PmePpCommGpu::Impl::getGpuForceStagingPtr() { - return static_cast(d_pmeForces_); + return d_pmeForces_; } GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer() @@ -194,7 +194,7 @@ void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* sendPtr, sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent); } -void* PmePpCommGpu::getGpuForceStagingPtr() +DeviceBuffer PmePpCommGpu::getGpuForceStagingPtr() { return impl_->getGpuForceStagingPtr(); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index 0630084e59..58e22c2e46 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -44,6 +44,7 @@ #define GMX_PME_PP_COMM_GPU_IMPL_H #include "gromacs/ewald/pme_pp_comm_gpu.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/math/vectypes.h" #include "gromacs/utility/gmxmpi.h" @@ -110,7 +111,7 @@ public: /*! \brief * Return pointer to buffer used for staging PME force on GPU */ - void* getGpuForceStagingPtr(); + DeviceBuffer getGpuForceStagingPtr(); /*! \brief * Return pointer to event recorded when forces are ready @@ -123,15 +124,15 @@ private: //! Handle for CUDA stream used for the communication operations in this class const DeviceStream& pmePpCommStream_; //! Remote location of PME coordinate data buffer - void* remotePmeXBuffer_ = nullptr; + DeviceBuffer remotePmeXBuffer_ = nullptr; //! Remote location of PME force data buffer - void* remotePmeFBuffer_ = nullptr; + DeviceBuffer remotePmeFBuffer_ = nullptr; //! communicator for simulation MPI_Comm comm_; //! Rank of PME task int pmeRank_ = -1; //! Buffer for staging PME force on GPU - rvec* d_pmeForces_ = nullptr; + DeviceBuffer d_pmeForces_ = nullptr; //! number of atoms in PME force staging array int d_pmeForcesSize_ = -1; //! number of atoms allocated in recvbuf array diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h index 157c4c7eca..82e8db33f5 100644 --- a/src/gromacs/mdlib/gpuforcereduction.h +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -86,13 +86,13 @@ public: * * \param [in] forcePtr Pointer to force to be reduced */ - void registerNbnxmForce(void* forcePtr); + void registerNbnxmForce(DeviceBuffer forcePtr); /*! \brief Register a rvec-format force to be reduced * * \param [in] forcePtr Pointer to force to be reduced */ - void registerRvecForce(void* forcePtr); + void registerRvecForce(DeviceBuffer forcePtr); /*! \brief Add a dependency for this force reduction * @@ -102,14 +102,14 @@ public: /*! 
\brief Reinitialize the GPU force reduction * - * \param [in] baseForcePtr Pointer to force to be used as a base + * \param [in] baseForce Pointer to force to be used as a base * \param [in] numAtoms The number of atoms * \param [in] cell Pointer to the cell array * \param [in] atomStart The start atom for the reduction * \param [in] accumulate Whether reduction should be accumulated * \param [in] completionMarker Event to be marked when launch of reduction is complete */ - void reinit(DeviceBuffer baseForcePtr, + void reinit(DeviceBuffer baseForce, int numAtoms, ArrayRef cell, int atomStart, diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index 69876c7d15..b69759b848 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -76,13 +76,13 @@ void GpuForceReduction::reinit(DeviceBuffer /*baseForcePtr*/, } // NOLINTNEXTLINE readability-convert-member-functions-to-static -void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */) +void GpuForceReduction::registerNbnxmForce(DeviceBuffer /* forcePtr */) { GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); } // NOLINTNEXTLINE readability-convert-member-functions-to-static -void GpuForceReduction::registerRvecForce(void* /* forcePtr */) +void GpuForceReduction::registerRvecForce(DeviceBuffer /* forcePtr */) { GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); } diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu index f95f6f1439..ac89b47d24 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -43,7 +43,7 @@ #include "gmxpre.h" -#include "gpuforcereduction_impl.cuh" +#include "gpuforcereduction_impl.h" #include @@ -112,15 +112,15 @@ GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, deviceStream_(deviceStream), wcycle_(wcycle){}; -void GpuForceReduction::Impl::reinit(float3* baseForcePtr, - const int numAtoms, - ArrayRef cell, - const int atomStart, - const bool accumulate, - GpuEventSynchronizer* completionMarker) +void GpuForceReduction::Impl::reinit(DeviceBuffer baseForce, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker) { - GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data"); - baseForce_ = &(baseForcePtr[atomStart]); + GMX_ASSERT((baseForce != nullptr), "Input base force for reduction has no data"); + baseForce_ = baseForce; numAtoms_ = numAtoms; atomStart_ = atomStart; accumulate_ = accumulate; @@ -144,13 +144,13 @@ void GpuForceReduction::Impl::reinit(float3* baseForcePtr, void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer forcePtr) { - GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); + GMX_ASSERT((forcePtr), "Input force for reduction has no data"); nbnxmForceToAdd_ = forcePtr; }; void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer forcePtr) { - GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); + GMX_ASSERT((forcePtr), "Input force for reduction has no data"); rvecForceToAdd_ = forcePtr; }; @@ -172,11 +172,12 @@ void GpuForceReduction::Impl::execute() GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data"); // Enqueue wait on all dependencies passed - for (auto const synchronizer : dependencyList_) + for (const auto& synchronizer : dependencyList_) { synchronizer->enqueueWaitEvent(deviceStream_); } + float3* d_baseForce = &(asFloat3(baseForce_)[atomStart_]); float3* d_nbnxmForce = asFloat3(nbnxmForceToAdd_); float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]); @@ -195,7 +196,7 @@ void GpuForceReduction::Impl::execute() : (accumulate_ ? 
reduceKernel : reduceKernel); const auto kernelArgs = prepareGpuKernelArguments( - kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &baseForce_, &cellInfo_.d_cell, &numAtoms_); + kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &d_baseForce, &cellInfo_.d_cell, &numAtoms_); launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs); @@ -218,14 +219,14 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, { } -void GpuForceReduction::registerNbnxmForce(void* forcePtr) +void GpuForceReduction::registerNbnxmForce(DeviceBuffer forcePtr) { - impl_->registerNbnxmForce(reinterpret_cast>(forcePtr)); + impl_->registerNbnxmForce(std::move(forcePtr)); } -void GpuForceReduction::registerRvecForce(void* forcePtr) +void GpuForceReduction::registerRvecForce(DeviceBuffer forcePtr) { - impl_->registerRvecForce(reinterpret_cast>(forcePtr)); + impl_->registerRvecForce(std::move(forcePtr)); } void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) @@ -233,14 +234,14 @@ void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) impl_->addDependency(dependency); } -void GpuForceReduction::reinit(DeviceBuffer baseForcePtr, +void GpuForceReduction::reinit(DeviceBuffer baseForce, const int numAtoms, ArrayRef cell, const int atomStart, const bool accumulate, GpuEventSynchronizer* completionMarker) { - impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker); + impl_->reinit(baseForce, numAtoms, cell, atomStart, accumulate, completionMarker); } void GpuForceReduction::execute() { diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.h similarity index 92% rename from src/gromacs/mdlib/gpuforcereduction_impl.cuh rename to src/gromacs/mdlib/gpuforcereduction_impl.h index 8434a04c71..491fd95f21 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cuh +++ b/src/gromacs/mdlib/gpuforcereduction_impl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -58,7 +58,7 @@ struct cellInfo //! cell index mapping for any nbat-format forces const int* cell = nullptr; //! device copy of cell index mapping for any nbat-format forces - int* d_cell = nullptr; + DeviceBuffer d_cell; //! number of atoms in cell array int cellSize = -1; //! number of atoms allocated in cell array @@ -75,7 +75,7 @@ public: * \param [in] deviceContext GPU device context * \param [in] wcycle The wallclock counter */ - Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStreami, gmx_wallcycle* wcycle); + Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle); ~Impl(); /*! \brief Register a nbnxm-format force to be reduced @@ -98,14 +98,14 @@ public: /*! 
\brief Reinitialize the GPU force reduction * - * \param [in] baseForcePtr Pointer to force to be used as a base + * \param [in] baseForce Pointer to force to be used as a base * \param [in] numAtoms The number of atoms * \param [in] cell Pointer to the cell array * \param [in] atomStart The start atom for the reduction * \param [in] accumulate Whether reduction should be accumulated * \param [in] completionMarker Event to be marked when launch of reduction is complete */ - void reinit(float3* baseForcePtr, + void reinit(DeviceBuffer baseForce, const int numAtoms, ArrayRef cell, const int atomStart, @@ -117,13 +117,13 @@ public: private: //! force to be used as a base for this reduction - float3* baseForce_ = nullptr; + DeviceBuffer baseForce_; //! starting atom int atomStart_ = 0; //! number of atoms int numAtoms_ = 0; //! whether reduction is accumulated into base force buffer - int accumulate_ = true; + bool accumulate_ = true; //! cell information for any nbat-format forces struct cellInfo cellInfo_; //! GPU context object @@ -133,9 +133,9 @@ private: //! stream to be used for this reduction const DeviceStream& deviceStream_; //! Nbnxm force to be added in this reduction - DeviceBuffer nbnxmForceToAdd_ = nullptr; + DeviceBuffer nbnxmForceToAdd_; //! Rvec-format force to be added in this reduction - DeviceBuffer rvecForceToAdd_ = nullptr; + DeviceBuffer rvecForceToAdd_; //! event to be marked when redcution launch has been completed GpuEventSynchronizer* completionMarker_ = nullptr; //! The wallclock counter diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 0bcf0c5451..f309e82c26 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1115,9 +1115,10 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, if (runScheduleWork->simulationWork.useGpuPme && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication)) { - void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) - : // PME force buffer on same GPU - fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU + DeviceBuffer forcePtr = + thisRankHasDuty(cr, DUTY_PME) ? 
pme_gpu_get_device_f(fr->pmedata) + : // PME force buffer on same GPU + fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr); GpuEventSynchronizer* const pmeSynchronizer = diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index f03ab778c3..831bbfb3c3 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -60,6 +60,7 @@ #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/mdlib/leapfrog_gpu.h" diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index 8e101b8cd0..34712d4941 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -46,14 +46,22 @@ #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H #define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H -#include "gmxpre.h" - -#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" -#include "gromacs/mdlib/leapfrog_gpu.h" -#include "gromacs/mdlib/lincs_gpu.cuh" -#include "gromacs/mdlib/settle_gpu.cuh" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/inputrec.h" +#include "gromacs/pbcutil/pbc_aiuc.h" + +#if GMX_GPU_CUDA +# include "gromacs/gpu_utils/gputraits.cuh" +#endif + +class GpuEventSynchronizer; +namespace gmx +{ +class LincsGpu; +class SettleGpu; +class LeapFrogGpu; +} // namespace gmx namespace gmx { @@ -193,14 +201,14 @@ private: int numAtoms_; //! Local copy of the pointer to the device positions buffer - float3* d_x_; + DeviceBuffer d_x_; //! Local copy of the pointer to the device velocities buffer - float3* d_v_; + DeviceBuffer d_v_; //! Local copy of the pointer to the device forces buffer - float3* d_f_; + DeviceBuffer d_f_; //! Device buffer for intermediate positions (maintained internally) - float3* d_xp_; + DeviceBuffer d_xp_; //! Number of elements in shifted coordinates buffer int numXp_ = -1; //! Allocation size for the shifted coordinates buffer @@ -208,7 +216,7 @@ private: //! 1/mass for all atoms (GPU) - real* d_inverseMasses_; + DeviceBuffer d_inverseMasses_; //! Number of elements in reciprocal masses buffer int numInverseMasses_ = -1; //! 
Allocation size for the reciprocal masses buffer diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 6ed6c1ff78..e30f8e8c87 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -907,7 +907,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); } -void* getGpuForces(NbnxmGpu* nb) +DeviceBuffer getGpuForces(NbnxmGpu* nb) { return nb->atdat->f; } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 96714bb154..c96291a1c2 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -215,7 +215,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const return numAtoms; } -void* nonbonded_verlet_t::getGpuForces() const +DeviceBuffer nonbonded_verlet_t::getGpuForces() const { return Nbnxm::getGpuForces(gpu_nbv); } diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 7ea59e4e5a..9732b01a39 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -398,7 +398,7 @@ public: * * \returns A pointer to the force buffer in GPU memory */ - void* getGpuForces() const; + DeviceBuffer getGpuForces() const; //! Return the kernel setup const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index fe83696934..c3fdb36a19 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -315,7 +315,8 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM; * \returns A pointer to the force buffer in GPU memory */ CUDA_FUNC_QUALIFIER -void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); +DeviceBuffer getGpuForces(NbnxmGpu gmx_unused* nb) + CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); } // namespace Nbnxm #endif -- 2.22.0
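
Usage sketch. The snippet below is a simplified, self-contained illustration of the
call pattern the patch moves to; it is not GROMACS code. DeviceBuffer is modeled on
the CUDA flavour of the real alias (a typed device pointer), while ToyForceReduction,
Float3, and RVec are stand-ins invented for this sketch, and the host-side loop in
execute() stands in for the reduction kernel launch. It shows why typed DeviceBuffer
handles are preferable to void* at these interfaces, and how baseForce_ can hold the
unshifted base pointer with the atomStart offset applied only at launch time.

#include <cassert>
#include <vector>

struct Float3
{
    float x, y, z;
};
// In GROMACS, RVec and Float3 are distinct but layout-compatible; one alias suffices here.
using RVec = Float3;

// CUDA-style DeviceBuffer: a typed device pointer, so element types are checked at
// compile time (the OpenCL and SYCL flavours are small wrapper classes instead).
template<typename ValueType>
using DeviceBuffer = ValueType*;

class ToyForceReduction
{
public:
    // Before the patch this interface took void*, losing the element type.
    void registerRvecForce(DeviceBuffer<RVec> forcePtr) { rvecForceToAdd_ = forcePtr; }

    // baseForce_ keeps the unshifted base pointer; atomStart is stored separately
    // and applied only when the reduction is launched (cf. execute() in the patch).
    void reinit(DeviceBuffer<Float3> baseForce, int numAtoms, int atomStart)
    {
        assert(baseForce != nullptr);
        baseForce_ = baseForce;
        numAtoms_  = numAtoms;
        atomStart_ = atomStart;
    }

    void execute()
    {
        Float3* d_baseForce      = &baseForce_[atomStart_]; // shift applied here, not in reinit()
        Float3* d_rvecForceToAdd = &rvecForceToAdd_[atomStart_];
        for (int i = 0; i < numAtoms_; i++) // stands in for the reduction kernel launch
        {
            d_baseForce[i].x += d_rvecForceToAdd[i].x;
            d_baseForce[i].y += d_rvecForceToAdd[i].y;
            d_baseForce[i].z += d_rvecForceToAdd[i].z;
        }
    }

private:
    DeviceBuffer<Float3> baseForce_      = nullptr;
    DeviceBuffer<RVec>   rvecForceToAdd_ = nullptr;
    int                  numAtoms_       = 0;
    int                  atomStart_      = 0;
};

int main()
{
    std::vector<Float3> baseForces(8, Float3{ 1.0F, 1.0F, 1.0F });
    std::vector<RVec>   pmeForces(8, RVec{ 0.5F, 0.5F, 0.5F });

    ToyForceReduction reduction;
    reduction.registerRvecForce(pmeForces.data()); // DeviceBuffer<RVec> is RVec* in this sketch
    reduction.reinit(baseForces.data(), /*numAtoms=*/4, /*atomStart=*/4);
    reduction.execute();

    assert(baseForces[4].x == 1.5F && baseForces[3].x == 1.0F); // only atoms 4..7 were reduced
    return 0;
}

The real classes keep the same shape, but the buffers live in device memory and are
accessed through copyToDeviceBuffer()/copyFromDeviceBuffer() and launchGpuKernel()
rather than host loops.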