From 092a8f684682abcc4c397622cc37fd87b2f10b25 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 3 Sep 2019 14:23:40 +0200 Subject: [PATCH] StatePropagatorDataGpu object to manage GPU forces, positions and velocities buffers In current version the positions and forces on the GPU are managed by different modules, depending of the offload scenario for a particular run. This makes management of the buffers complicated and fragile. This commit adds the object responsible for management of the GPU buffers of coordinates, forces and velocities. The object is connected to all clients that use coordinates, forces and velocities buffers, while keeping the existing logic intact where its possible. Since the H2D and D2H copies are now done in nullptr stream, some of implicit synchronization is lost. Consequently this commit does not always work properly with newly introduced buffer ops / halo exchange features. To avoid the confusion, GPU buffer ops are disabled by the assertion. There will be a separate commit with all copies done synchronously, which will work with the buffer ops. The stream- and event-based synchronization will be introduced in the follow-up commits. Refs. #2816. Change-Id: I2e2ba1b6436f087d1f2fef4ff876445814a724e7 --- src/gromacs/domdec/domdec.cpp | 5 + src/gromacs/domdec/domdec.h | 3 + src/gromacs/domdec/gpuhaloexchange.h | 6 +- src/gromacs/domdec/gpuhaloexchange_impl.cpp | 4 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 4 +- src/gromacs/ewald/pme.h | 30 +- src/gromacs/ewald/pme_gpu.cpp | 35 +- src/gromacs/ewald/pme_gpu_internal.cpp | 69 +++- src/gromacs/ewald/pme_gpu_internal.h | 24 +- src/gromacs/ewald/pme_only.cpp | 19 +- src/gromacs/ewald/tests/pmegathertest.cpp | 5 +- .../ewald/tests/pmesplinespreadtest.cpp | 5 +- src/gromacs/ewald/tests/pmetestcommon.cpp | 26 +- src/gromacs/ewald/tests/pmetestcommon.h | 16 +- src/gromacs/mdlib/sim_util.cpp | 78 ++-- src/gromacs/mdlib/update_constrain_cuda.h | 90 +---- .../mdlib/update_constrain_cuda_impl.cpp | 47 +-- .../mdlib/update_constrain_cuda_impl.cu | 111 ++--- .../mdlib/update_constrain_cuda_impl.h | 127 ++---- src/gromacs/mdrun/md.cpp | 39 +- src/gromacs/mdrun/runner.cpp | 27 +- src/gromacs/mdtypes/CMakeLists.txt | 26 +- src/gromacs/mdtypes/forcerec.h | 6 + .../mdtypes/state_propagator_data_gpu.h | 225 +++++++++++ .../state_propagator_data_gpu_impl.cpp | 156 +++++++ .../mdtypes/state_propagator_data_gpu_impl.h | 273 +++++++++++++ .../state_propagator_data_gpu_impl_gpu.cpp | 381 ++++++++++++++++++ src/gromacs/nbnxm/atomdata.cpp | 39 +- src/gromacs/nbnxm/atomdata.h | 48 +-- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 201 +-------- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 9 - src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 12 - src/gromacs/nbnxm/nbnxm.cpp | 64 +-- src/gromacs/nbnxm/nbnxm.h | 54 +-- src/gromacs/nbnxm/nbnxm_gpu.h | 90 +---- 35 files changed, 1418 insertions(+), 936 deletions(-) create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu.h create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu_impl.h create mode 100644 src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 2807a97592..f3365e11a3 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -235,6 +235,11 @@ gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd) return &dd->comm->zones; } +int dd_numAtomsZones(const gmx_domdec_t &dd) +{ + return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones); +} + int dd_numHomeAtoms(const gmx_domdec_t &dd) { return dd.comm->atomRanges.numHomeAtoms(); diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index 9405a030ec..97b38a4300 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -110,6 +110,9 @@ void dd_store_state(struct gmx_domdec_t *dd, t_state *state); /*! \brief Returns a pointer to the gmx_domdec_zones_t struct */ struct gmx_domdec_zones_t *domdec_zones(struct gmx_domdec_t *dd); +/*! \brief Returns the range for atoms in zones*/ +int dd_numAtomsZones(const gmx_domdec_t &dd); + /*! \brief Returns the number of home atoms */ int dd_numHomeAtoms(const gmx_domdec_t &dd); diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index 4187e45cc2..f991e05123 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -42,6 +42,7 @@ #ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H #define GMX_DOMDEC_GPUHALOEXCHANGE_H +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/classhelpers.h" @@ -95,8 +96,9 @@ class GpuHaloExchange * \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory * \param [in] d_forcesBuffer pointer to coordinates buffer in GPU memory */ - void reinitHalo(rvec *d_coordinateBuffer, - rvec *d_forcesBuffer); + void reinitHalo(DeviceBuffer d_coordinateBuffer, + DeviceBuffer d_forcesBuffer); + /*! \brief GPU halo exchange of coordinates buffer. * diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index 741c79519c..c4c191b314 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */, GpuHaloExchange::~GpuHaloExchange() = default; /*!\brief init halo exhange stub. */ -void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */, - rvec * /* d_forcesBuffer */) +void GpuHaloExchange::reinitHalo(DeviceBuffer /* d_coordinatesBuffer */, + DeviceBuffer /* d_forcesBuffer */) { GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); } diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index dcb3bcd839..c195f96cb5 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -438,8 +438,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd, GpuHaloExchange::~GpuHaloExchange() = default; -void GpuHaloExchange::reinitHalo(rvec *d_coordinatesBuffer, - rvec *d_forcesBuffer) +void GpuHaloExchange::reinitHalo(DeviceBuffer d_coordinatesBuffer, + DeviceBuffer d_forcesBuffer) { impl_->reinitHalo(reinterpret_cast(d_coordinatesBuffer), reinterpret_cast(d_forcesBuffer)); } diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 6997d4e0eb..c6ec4e6fca 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -365,24 +365,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *GPU_FUNC_ARG bool GPU_FUNC_ARGUMENT(useGpuForceReduction)) GPU_FUNC_TERM; /*! \brief - * Launches H2D input transfers for PME on GPU. - * - * \param[in] pme The PME data structure. - * \param[in] coordinatesHost The array of local atoms' coordinates. - * \param[in] wcycle The wallclock counter. - */ -GPU_FUNC_QUALIFIER void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t *GPU_FUNC_ARGUMENT(pme), - const rvec *GPU_FUNC_ARGUMENT(coordinatesHost), - gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; - -/*! \brief - * Launches first stage of PME on GPU - spreading kernel, and D2H grid transfer if needed. + * Launches first stage of PME on GPU - spreading kernel. * * \param[in] pme The PME data structure. * \param[in] wcycle The wallclock counter. */ -GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme), - gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; +GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme), + gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; /*! \brief * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. @@ -476,6 +465,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG */ GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); +/*! \brief Set pointer to device copy of coordinate data. + * \param[in] pme The PME data structure. + * \param[in] d_x The pointer to the positions buffer to be set + */ +GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme), + DeviceBuffer GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM; + /*! \brief Get pointer to device copy of force data. * \param[in] pme The PME data structure. * \returns Pointer to force data @@ -488,6 +484,12 @@ GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT */ GPU_FUNC_QUALIFIER void *pme_gpu_get_device_stream(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); +/*! \brief Returns the pointer to the GPU context. + * \param[in] pme The PME data structure. + * \returns Pointer to GPU context object. + */ +GPU_FUNC_QUALIFIER void *pme_gpu_get_device_context(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion * \param[in] pme The PME data structure. * \returns Pointer to sychronizer diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index ddbc2f9485..4685913dbd 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -174,23 +174,6 @@ void pme_gpu_prepare_computation(gmx_pme_t *pme, } } -void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t *pme, - const rvec *coordinatesHost, - gmx_wallcycle *wcycle) -{ - GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); - - PmeGpu *pmeGpu = pme->gpu; - - // The only spot of PME GPU where LAUNCH_GPU counter increases call-count - wallcycle_start(wcycle, ewcLAUNCH_GPU); - // The only spot of PME GPU where ewcsLAUNCH_GPU_PME subcounter increases call-count - wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME); - pme_gpu_copy_input_coordinates(pmeGpu, coordinatesHost); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); -} - void pme_gpu_launch_spread(gmx_pme_t *pme, gmx_wallcycle *wcycle) { @@ -444,6 +427,15 @@ void *pme_gpu_get_device_f(const gmx_pme_t *pme) return pme_gpu_get_kernelparam_forces(pme->gpu); } +void pme_gpu_set_device_x(const gmx_pme_t *pme, + DeviceBuffer d_x) +{ + GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function."); + GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); + + pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x); +} + void *pme_gpu_get_device_stream(const gmx_pme_t *pme) { if (!pme || !pme_gpu_active(pme)) @@ -453,6 +445,15 @@ void *pme_gpu_get_device_stream(const gmx_pme_t *pme) return pme_gpu_get_stream(pme->gpu); } +void *pme_gpu_get_device_context(const gmx_pme_t *pme) +{ + if (!pme || !pme_gpu_active(pme)) + { + return nullptr; + } + return pme_gpu_get_context(pme->gpu); +} + GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme) { if (!pme || !pme_gpu_active(pme)) diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index c11580aa88..47ec41bb3c 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -233,23 +233,6 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu) } } -void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates) -{ - GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU"); -#if GMX_DOUBLE - GMX_RELEASE_ASSERT(false, "Only single precision is supported"); - GMX_UNUSED_VALUE(h_coordinates); -#else - const float *h_coordinatesFloat = reinterpret_cast(h_coordinates); - copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat, - 0, pmeGpu->kernelParams->atoms.nAtoms * DIM, - pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr); - // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both - // Remove after adding the required event-based sync between the above H2D and the transform kernel - pme_gpu_synchronize(pmeGpu); -#endif -} - void pme_gpu_free_coordinates(const PmeGpu *pmeGpu) { freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates); @@ -967,7 +950,6 @@ void pme_gpu_destroy(PmeGpu *pmeGpu) pme_gpu_free_energy_virial(pmeGpu); pme_gpu_free_bspline_values(pmeGpu); pme_gpu_free_forces(pmeGpu); - pme_gpu_free_coordinates(pmeGpu); pme_gpu_free_coefficients(pmeGpu); pme_gpu_free_spline_data(pmeGpu); pme_gpu_free_grid_indices(pmeGpu); @@ -1002,7 +984,6 @@ void pme_gpu_reinit_atoms(PmeGpu *pmeGpu, const int nAtoms, const real *charges) if (haveToRealloc) { - pme_gpu_realloc_coordinates(pmeGpu); pme_gpu_realloc_forces(pmeGpu); pme_gpu_realloc_spline_data(pmeGpu); pme_gpu_realloc_grid_indices(pmeGpu); @@ -1317,6 +1298,44 @@ void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu) } } +/*! \brief Check the validity of the device buffer. + * + * Checks if the buffer is not nullptr and, when possible, if it is big enough. + * + * \todo Split and move this function to gpu_utils. + * + * \param[in] buffer Device buffer to be checked. + * \param[in] requiredSize Number of elements that the buffer will have to accommodate. + * + * \returns If the device buffer can be set. + */ +template +static bool checkDeviceBuffer(gmx_unused DeviceBuffer buffer, gmx_unused int requiredSize) +{ +#if GMX_GPU == GMX_GPU_CUDA + GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr"); + return buffer != nullptr; +#elif GMX_GPU == GMX_GPU_OPENCL + size_t size; + int retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, NULL); + GMX_ASSERT(retval == CL_SUCCESS, gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str()); + GMX_ASSERT(static_cast(size) >= requiredSize, "Number of atoms in device buffer is smaller then required size."); + return retval == CL_SUCCESS && static_cast(size) >= requiredSize; +#elif GMX_GPU == GMX_GPU_NONE + GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build."); + return false; +#endif +} + +void pme_gpu_set_kernelparam_coordinates(const PmeGpu *pmeGpu, DeviceBuffer d_x) +{ + GMX_ASSERT(pmeGpu && pmeGpu->kernelParams, "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was initialized."); + + GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms), "The device-side buffer can not be set."); + + pmeGpu->kernelParams->atoms.d_coordinates = d_x; +} + void * pme_gpu_get_stream(const PmeGpu *pmeGpu) { if (pmeGpu) @@ -1329,6 +1348,18 @@ void * pme_gpu_get_stream(const PmeGpu *pmeGpu) } } +void * pme_gpu_get_context(const PmeGpu *pmeGpu) +{ + if (pmeGpu) + { + return static_cast(&pmeGpu->archSpecific->context); + } + else + { + return nullptr; + } +} + GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu) { if (pmeGpu && pmeGpu->kernelParams) diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index d6aafdbc85..bdd92bb83b 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -185,17 +185,6 @@ bool pme_gpu_stream_query(const PmeGpu *pmeGpu); */ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu); -/*! \libinternal \brief - * Copies the input coordinates from the CPU buffer onto the GPU. - * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] h_coordinates Input coordinates (XYZ rvec array). - * - * Needs to be called for every PME computation. The coordinates are then used in the spline calculation. - */ -GPU_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu), - const rvec *GPU_FUNC_ARGUMENT(h_coordinates)) GPU_FUNC_TERM; - /*! \libinternal \brief * Frees the coordinates on the GPU. * @@ -448,6 +437,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *GPU_FUNC_ARGUMENT( */ GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); +/*! \brief Sets the device pointer to coordinate data + * \param[in] pmeGpu The PME GPU structure. + * \param[in] d_x Pointer to coordinate data + */ +GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu), + DeviceBuffer GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM; + /*! \brief Return pointer to device copy of force data. * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to force data @@ -460,6 +456,12 @@ GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_ */ GPU_FUNC_QUALIFIER void * pme_gpu_get_stream(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); +/*! \brief Return pointer to GPU context (for OpenCL builds). + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to context object. + */ +GPU_FUNC_QUALIFIER void * pme_gpu_get_context(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + /*! \brief Return pointer to the sync object triggered after the PME force calculation completion * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to sync object diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 1a872bb383..eb81ebb79f 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -84,6 +84,7 @@ #include "gromacs/mdtypes/commrec.h" #include "gromacs/mdtypes/forceoutput.h" #include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" #include "gromacs/timing/cyclecounter.h" #include "gromacs/timing/wallcycle.h" #include "gromacs/utility/fatalerror.h" @@ -543,15 +544,21 @@ int gmx_pmeonly(struct gmx_pme_t *pme, std::vector pmedata; pmedata.push_back(pme); - auto pme_pp = gmx_pme_pp_init(cr); + auto pme_pp = gmx_pme_pp_init(cr); //TODO the variable below should be queried from the task assignment info - const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed); + const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed); + const void *commandStream = useGpuForPme ? pme_gpu_get_device_context(pme) : nullptr; + const void *gpuContext = useGpuForPme ? pme_gpu_get_device_stream(pme) : nullptr; + const int paddingSize = pme_gpu_get_padding_size(pme); if (useGpuForPme) { changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy()); changePinningPolicy(&pme_pp->x, pme_get_pinning_policy()); } + // Unconditionally initialize the StatePropagatorDataGpu object to get more verbose message if it is used from CPU builds + auto stateGpu = std::make_unique(commandStream, gpuContext, GpuApiCallBehavior::Sync, paddingSize); + clear_nrnb(mynrnb); count = 0; @@ -585,6 +592,11 @@ int gmx_pmeonly(struct gmx_pme_t *pme, if (atomSetChanged) { gmx_pme_reinit_atoms(pme, natoms, pme_pp->chargeA.data()); + if (useGpuForPme) + { + stateGpu->reinit(natoms, natoms); + pme_gpu_set_device_x(pme, stateGpu->getCoordinates()); + } } if (ret == pmerecvqxRESETCOUNTERS) @@ -625,7 +637,8 @@ int gmx_pmeonly(struct gmx_pme_t *pme, //TODO this should be set properly by gmx_pme_recv_coeffs_coords, // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested! pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction); - pme_gpu_copy_coordinates_to_gpu(pme, as_rvec_array(pme_pp->x.data()), wcycle); + stateGpu->copyCoordinatesToGpu(gmx::ArrayRef(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All); + pme_gpu_launch_spread(pme, wcycle); pme_gpu_launch_complex_transforms(pme, wcycle); pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set); diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp index db72510636..0d4e94db6c 100644 --- a/src/gromacs/ewald/tests/pmegathertest.cpp +++ b/src/gromacs/ewald/tests/pmegathertest.cpp @@ -388,12 +388,13 @@ class PmeGatherTest : public ::testing::TestWithParam TestReferenceData refData; for (const auto &context : getPmeTestEnv()->getHardwareContexts()) { + std::shared_ptr stateGpu; CodePath codePath = context->getCodePath(); const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath); if (!supportedInput) { /* Testing the failure for the unsupported input */ - EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError); + EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box, stateGpu), NotImplementedError); continue; } @@ -408,7 +409,7 @@ class PmeGatherTest : public ::testing::TestWithParam )); PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(), - context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box); + context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box, stateGpu); /* Setting some more inputs */ pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues); diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp index eef1b9fe2d..a1c6eb4c1d 100644 --- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp +++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp @@ -122,12 +122,13 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParamgetHardwareContexts()) { + std::shared_ptr stateGpu; CodePath codePath = context->getCodePath(); const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath); if (!supportedInput) { /* Testing the failure for the unsupported input */ - EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError); + EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box, stateGpu), NotImplementedError); continue; } @@ -146,7 +147,7 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParamgetDeviceInfo(), - context->getPmeGpuProgram(), coordinates, charges, box); + context->getPmeGpuProgram(), coordinates, charges, box, stateGpu); const bool computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified); const bool spreadCharges = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified); diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index d5ce888798..61b28e587a 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -169,13 +169,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec, } //! PME initialization with atom data -PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec, - CodePath mode, - const gmx_device_info_t *gpuInfo, - PmeGpuProgramHandle pmeGpuProgram, - const CoordinatesVector &coordinates, - const ChargesVector &charges, - const Matrix3x3 &box +PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec, + CodePath mode, + const gmx_device_info_t *gpuInfo, + PmeGpuProgramHandle pmeGpuProgram, + const CoordinatesVector &coordinates, + const ChargesVector &charges, + const Matrix3x3 &box, + std::shared_ptr stateGpu ) { const index atomCount = coordinates.size(); @@ -199,7 +200,16 @@ PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec, // We need to set atc->n for passing the size in the tests atc->setNumAtoms(atomCount); gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data()); - pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data())); + + // TODO: Pin the host buffer and use async memory copies + stateGpu = std::make_unique(pme_gpu_get_device_stream(pmeSafe.get()), + pme_gpu_get_device_context(pmeSafe.get()), + GpuApiCallBehavior::Sync, + pme_gpu_get_padding_size(pmeSafe.get())); + stateGpu->reinit(atomCount, atomCount); + stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()), gmx::StatePropagatorDataGpu::AtomLocality::All); + pme_gpu_set_kernelparam_coordinates(pmeSafe->gpu, stateGpu->getCoordinates()); + break; default: diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h index e9290dbae9..b51551c29a 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.h +++ b/src/gromacs/ewald/tests/pmetestcommon.h @@ -51,6 +51,7 @@ #include "gromacs/ewald/pme.h" #include "gromacs/ewald/pme_gpu_internal.h" #include "gromacs/math/gmxcomplex.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/unique_cptr.h" @@ -125,13 +126,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec, const Matrix3x3 &box = {{1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F}}, real ewaldCoeff_q = 0.0F, real ewaldCoeff_lj = 0.0F); //! PME initialization with atom data and system box -PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec, - CodePath mode, - const gmx_device_info_t *gpuInfo, - PmeGpuProgramHandle pmeGpuProgram, - const CoordinatesVector &coordinates, - const ChargesVector &charges, - const Matrix3x3 &box +PmeSafePointer pmeInitAtoms(const t_inputrec *inputRec, + CodePath mode, + const gmx_device_info_t *gpuInfo, + PmeGpuProgramHandle pmeGpuProgram, + const CoordinatesVector &coordinates, + const ChargesVector &charges, + const Matrix3x3 &box, + std::shared_ptr stateGpu ); //! PME spline computation and charge spreading void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 16c891532d..a77e5751cf 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -86,6 +86,7 @@ #include "gromacs/mdtypes/md_enums.h" #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/mdtypes/state.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" #include "gromacs/nbnxm/atomdata.h" #include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/nbnxm/nbnxm.h" @@ -601,7 +602,6 @@ computeSpecialForces(FILE *fplog, * * \param[in] pmedata The PME structure * \param[in] box The box matrix - * \param[in] x Coordinate array * \param[in] stepWork Step schedule flags * \param[in] pmeFlags PME flags * \param[in] useGpuForceReduction True if GPU-based force reduction is active this step @@ -609,14 +609,12 @@ computeSpecialForces(FILE *fplog, */ static inline void launchPmeGpuSpread(gmx_pme_t *pmedata, const matrix box, - const rvec x[], const StepWorkload &stepWork, int pmeFlags, bool useGpuForceReduction, gmx_wallcycle_t wcycle) { pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction); - pme_gpu_copy_coordinates_to_gpu(pmedata, x, wcycle); pme_gpu_launch_spread(pmedata, wcycle); } @@ -889,12 +887,13 @@ void do_force(FILE *fplog, int legacyFlags, const DDBalanceRegionHandler &ddBalanceRegionHandler) { - int i, j; - double mu[2*DIM]; - gmx_bool bFillGrid, bCalcCGCM; - gmx_bool bUseGPU, bUseOrEmulGPU; - nonbonded_verlet_t *nbv = fr->nbv.get(); - interaction_const_t *ic = fr->ic; + int i, j; + double mu[2*DIM]; + gmx_bool bFillGrid, bCalcCGCM; + gmx_bool bUseGPU, bUseOrEmulGPU; + nonbonded_verlet_t *nbv = fr->nbv.get(); + interaction_const_t *ic = fr->ic; + gmx::StatePropagatorDataGpu *stateGpu = fr->stateGpu; // TODO remove the code below when the legacy flags are not in use anymore /* modify force flag if not doing nonbonded */ @@ -998,9 +997,27 @@ void do_force(FILE *fplog, } #endif /* GMX_MPI */ + // Coordinates on the device are needed if PME or BufferOps are offloaded. + // The local coordinates can be copied right away. + // NOTE: Consider moving this copy to right after they are updated and constrained, + // if the later is not offloaded. + if (useGpuPme || useGpuXBufOps == BufferOpsUseGpu::True) + { + if (stepWork.doNeighborSearch) + { + stateGpu->reinit(mdatoms->homenr, cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr); + if (useGpuPme) + { + // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) ) + pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates()); + } + } + stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local); + } + if (useGpuPme) { - launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), stepWork, pmeFlags, useGpuPmeFReduction, wcycle); + launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle); } /* do gridding for pair search */ @@ -1124,14 +1141,8 @@ void do_force(FILE *fplog, { if (useGpuXBufOps == BufferOpsUseGpu::True) { - // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr) - if (!useGpuPme) - { - nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::Local, false, - x.unpaddedArrayRef()); - } nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false, - useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates()); + stateGpu->getCoordinates()); } else { @@ -1210,9 +1221,7 @@ void do_force(FILE *fplog, wallcycle_stop(wcycle, ewcNS); if (ddUsesGpuDirectCommunication) { - rvec* d_x = static_cast (nbv->get_gpu_xrvec()); - rvec* d_f = static_cast (nbv->get_gpu_frvec()); - gpuHaloExchange->reinitHalo(d_x, d_f); + gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces()); } } else @@ -1226,7 +1235,7 @@ void do_force(FILE *fplog, if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork) { //non-local part of coordinate buffer must be copied back to host for CPU work - nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal); + stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); } } else @@ -1239,11 +1248,10 @@ void do_force(FILE *fplog, // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr) if (!useGpuPme && !ddUsesGpuDirectCommunication) { - nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false, - x.unpaddedArrayRef()); + stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); } nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false, - useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates()); + stateGpu->getCoordinates()); } else { @@ -1494,17 +1502,16 @@ void do_force(FILE *fplog, // which are a dependency for the GPU force reduction. bool haveNonLocalForceContribInCpuBuffer = domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork; - rvec *f = as_rvec_array(forceWithShiftForces.force().data()); if (haveNonLocalForceContribInCpuBuffer) { - nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal); + stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); } nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal, - nbv->getDeviceForces(), + stateGpu->getForces(), pme_gpu_get_device_f(fr->pmedata), pme_gpu_get_f_ready_synchronizer(fr->pmedata), useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer); - nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal); + stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); } else { @@ -1538,17 +1545,14 @@ void do_force(FILE *fplog, if (stepWork.computeForces) { - gmx::ArrayRef force = forceOut.forceWithShiftForces().force(); - rvec *f = as_rvec_array(force.data()); if (useGpuForcesHaloExchange) { if (haveCpuLocalForces) { - nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local); + stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::Local); } - bool accumulateHaloForces = haveCpuLocalForces; - gpuHaloExchange->communicateHaloForces(accumulateHaloForces); + gpuHaloExchange->communicateHaloForces(haveCpuLocalForces); } else { @@ -1643,10 +1647,9 @@ void do_force(FILE *fplog, // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result // of the halo exchange. In that case the copy is instead performed above, before the exchange. // These should be unified. - rvec *f = as_rvec_array(forceWithShift.data()); if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange) { - nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local); + stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local); } if (useGpuForcesHaloExchange) { @@ -1658,12 +1661,13 @@ void do_force(FILE *fplog, nbv->stream_local_wait_for_nonlocal(); } nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local, - nbv->getDeviceForces(), + stateGpu->getForces(), pme_gpu_get_device_f(fr->pmedata), pme_gpu_get_f_ready_synchronizer(fr->pmedata), useGpuPmeFReduction, haveLocalForceContribInCpuBuffer); - nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local); + // This function call synchronizes the local stream nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local); + stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local); } else { diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h index 46b173ddab..714ad7e824 100644 --- a/src/gromacs/mdlib/update_constrain_cuda.h +++ b/src/gromacs/mdlib/update_constrain_cuda.h @@ -48,6 +48,7 @@ #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H #define GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/mdtypes/group.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/classhelpers.h" @@ -84,7 +85,7 @@ class UpdateConstrainCuda * This will extract temperature scaling factors from tcstat, transform them into the plain * array and call the normal integrate method. * - * \param[in] dt Timestep + * \param[in] dt Timestep. * \param[in] updateVelocities If the velocities should be constrained. * \param[in] computeVirial If virial should be updated. * \param[out] virial Place to save virial tensor. @@ -104,16 +105,21 @@ class UpdateConstrainCuda float dtPressureCouple, const matrix velocityScalingMatrix); - /*! \brief - * Update data-structures (e.g. after NB search step). + /*! \brief Set the pointers and update data-structures (e.g. after NB search step). * - * \param[in] idef System topology - * \param[in] md Atoms data. - * \param[in] numTempScaleValues Number of temperature scaling groups. Zero for no temperature scaling. + * \param[in,out] d_x Device buffer with coordinates. + * \param[in,out] d_v Device buffer with velocities. + * \param[in] d_f Device buffer with forces. + * \param[in] idef System topology + * \param[in] md Atoms data. + * \param[in] numTempScaleValues Number of temperature scaling groups. Zero for no temperature scaling. */ - void set(const t_idef &idef, - const t_mdatoms &md, - int numTempScaleValues); + void set(DeviceBuffer d_x, + DeviceBuffer d_v, + DeviceBuffer d_f, + const t_idef &idef, + const t_mdatoms &md, + int numTempScaleValues); /*! \brief * Update PBC data. @@ -124,72 +130,6 @@ class UpdateConstrainCuda */ void setPbc(const t_pbc *pbc); - /*! \brief - * Copy coordinates from CPU to GPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_x CPU pointer where coordinates should be copied from. - */ - void copyCoordinatesToGpu(const rvec *h_x); - - /*! \brief - * Copy velocities from CPU to GPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_v CPU pointer where velocities should be copied from. - */ - void copyVelocitiesToGpu(const rvec *h_v); - - /*! \brief - * Copy forces from CPU to GPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_f CPU pointer where forces should be copied from. - */ - void copyForcesToGpu(const rvec *h_f); - - /*! \brief - * Copy coordinates from GPU to CPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[out] h_xp CPU pointer where coordinates should be copied to. - */ - void copyCoordinatesFromGpu(rvec *h_xp); - - /*! \brief - * Copy velocities from GPU to CPU. - * - * The velocities are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_v Pointer to velocities data. - */ - void copyVelocitiesFromGpu(rvec *h_v); - - /*! \brief - * Copy forces from GPU to CPU. - * - * The forces are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_f Pointer to forces data. - */ - void copyForcesFromGpu(rvec *h_f); - - /*! \brief - * Set the internal GPU-memory d_x, d_xp and d_v pointers. - * - * Data is not copied. The data are assumed to be in float3/fvec format - * (float3 is used internally, but the data layout should be identical). - * - * \param[in] d_x Pointer to the coordinates for the input (on GPU) - * \param[in] d_xp Pointer to the coordinates for the output (on GPU) - * \param[in] d_v Pointer to the velocities (on GPU) - * \param[in] d_f Pointer to the forces (on GPU) - */ - void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f); private: class Impl; diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp index 075ca163d5..b5823ed97f 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp @@ -78,9 +78,12 @@ void UpdateConstrainCuda::integrate(gmx_unused const real GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); } -void UpdateConstrainCuda::set(gmx_unused const t_idef &idef, - gmx_unused const t_mdatoms &md, - gmx_unused const int numTempScaleValues) +void UpdateConstrainCuda::set(gmx_unused DeviceBuffer d_x, + gmx_unused DeviceBuffer d_v, + gmx_unused const DeviceBuffer d_f, + gmx_unused const t_idef &idef, + gmx_unused const t_mdatoms &md, + gmx_unused const int numTempScaleValues) { GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); } @@ -90,44 +93,6 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc) GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); } -void UpdateConstrainCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - -void UpdateConstrainCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - -void UpdateConstrainCuda::copyForcesToGpu(gmx_unused const rvec *h_f) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - -void UpdateConstrainCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - -void UpdateConstrainCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - -void UpdateConstrainCuda::copyForcesFromGpu(gmx_unused rvec *h_f) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - -void UpdateConstrainCuda::setXVFPointers(gmx_unused rvec *d_x, - gmx_unused rvec *d_xp, - gmx_unused rvec *d_v, - gmx_unused rvec *d_f) -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation."); -} - } // namespace gmx #endif /* GMX_GPU != GMX_GPU_CUDA */ diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu index f373d27911..9ba9626738 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu @@ -105,6 +105,9 @@ void UpdateConstrainCuda::Impl::integrate(const real dt, } } + // TODO: This should be eliminated + cudaMemcpy(d_x_, d_xp_, numAtoms_*sizeof(float3), cudaMemcpyDeviceToDevice); + return; } @@ -124,16 +127,24 @@ UpdateConstrainCuda::Impl::~Impl() { } -void UpdateConstrainCuda::Impl::set(const t_idef &idef, - const t_mdatoms &md, - const int numTempScaleValues) +void UpdateConstrainCuda::Impl::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const t_idef &idef, + const t_mdatoms &md, + const int numTempScaleValues) { + GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null."); + GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null."); + GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null."); + + d_x_ = reinterpret_cast(d_x); + d_v_ = reinterpret_cast(d_v); + d_f_ = reinterpret_cast(d_f); + numAtoms_ = md.nr; - reallocateDeviceBuffer(&d_x_, numAtoms_, &numX_, &numXAlloc_, nullptr); reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr); - reallocateDeviceBuffer(&d_v_, numAtoms_, &numV_, &numVAlloc_, nullptr); - reallocateDeviceBuffer(&d_f_, numAtoms_, &numF_, &numFAlloc_, nullptr); reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, nullptr); @@ -152,44 +163,6 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc) settleCuda_->setPbc(pbc); } -void UpdateConstrainCuda::Impl::copyCoordinatesToGpu(const rvec *h_x) -{ - copyToDeviceBuffer(&d_x_, (float3*)h_x, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); -} - -void UpdateConstrainCuda::Impl::copyVelocitiesToGpu(const rvec *h_v) -{ - copyToDeviceBuffer(&d_v_, (float3*)h_v, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); -} - -void UpdateConstrainCuda::Impl::copyForcesToGpu(const rvec *h_f) -{ - copyToDeviceBuffer(&d_f_, (float3*)h_f, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); -} - -void UpdateConstrainCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp) -{ - copyFromDeviceBuffer((float3*)h_xp, &d_xp_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); -} - -void UpdateConstrainCuda::Impl::copyVelocitiesFromGpu(rvec *h_v) -{ - copyFromDeviceBuffer((float3*)h_v, &d_v_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); -} - -void UpdateConstrainCuda::Impl::copyForcesFromGpu(rvec *h_f) -{ - copyFromDeviceBuffer((float3*)h_f, &d_f_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr); -} - -void UpdateConstrainCuda::Impl::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f) -{ - d_x_ = (float3*)d_x; - d_xp_ = (float3*)d_xp; - d_v_ = (float3*)d_v; - d_f_ = (float3*)d_f; -} - UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec &ir, const gmx_mtop_t &mtop, const void *commandStream) @@ -207,18 +180,21 @@ void UpdateConstrainCuda::integrate(const real dt, gmx::ArrayRef tcstat, const bool doPressureCouple, const float dtPressureCouple, - const matrix pRVScalingMatrix) + const matrix velocityScalingMatrix) { impl_->integrate(dt, updateVelocities, computeVirial, virialScaled, doTempCouple, tcstat, - doPressureCouple, dtPressureCouple, pRVScalingMatrix); + doPressureCouple, dtPressureCouple, velocityScalingMatrix); } -void UpdateConstrainCuda::set(const t_idef &idef, - const t_mdatoms &md, - const int numTempScaleValues) +void UpdateConstrainCuda::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const t_idef &idef, + const t_mdatoms &md, + const int numTempScaleValues) { - impl_->set(idef, md, numTempScaleValues); + impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues); } void UpdateConstrainCuda::setPbc(const t_pbc *pbc) @@ -226,39 +202,4 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc) impl_->setPbc(pbc); } -void UpdateConstrainCuda::copyCoordinatesToGpu(const rvec *h_x) -{ - impl_->copyCoordinatesToGpu(h_x); -} - -void UpdateConstrainCuda::copyVelocitiesToGpu(const rvec *h_v) -{ - impl_->copyVelocitiesToGpu(h_v); -} - -void UpdateConstrainCuda::copyForcesToGpu(const rvec *h_f) -{ - impl_->copyForcesToGpu(h_f); -} - -void UpdateConstrainCuda::copyCoordinatesFromGpu(rvec *h_xp) -{ - impl_->copyCoordinatesFromGpu(h_xp); -} - -void UpdateConstrainCuda::copyVelocitiesFromGpu(rvec *h_v) -{ - impl_->copyVelocitiesFromGpu(h_v); -} - -void UpdateConstrainCuda::copyForcesFromGpu(rvec *h_f) -{ - impl_->copyForcesFromGpu(h_f); -} - -void UpdateConstrainCuda::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f) -{ - impl_->setXVFPointers(d_x, d_xp, d_v, d_f); -} - } //namespace gmx diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h index b2f4c0eb5e..652dd84eb6 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.h +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h @@ -87,7 +87,7 @@ class UpdateConstrainCuda::Impl * 2. This is the temperature coupling step. * Parameters virial/lambdas can be nullptr if computeVirial/doTempCouple are false. * - * \param[in] dt Timestep + * \param[in] dt Timestep. * \param[in] updateVelocities If the velocities should be constrained. * \param[in] computeVirial If virial should be updated. * \param[out] virial Place to save virial tensor. @@ -97,26 +97,31 @@ class UpdateConstrainCuda::Impl * \param[in] dtPressureCouple Period between pressure coupling steps * \param[in] velocityScalingMatrix Parrinello-Rahman velocity scaling matrix */ - void integrate(const real dt, - const bool updateVelocities, - const bool computeVirial, + void integrate(real dt, + bool updateVelocities, + bool computeVirial, tensor virial, - const bool doTempCouple, + bool doTempCouple, gmx::ArrayRef tcstat, - const bool doPressureCouple, - const float dtPressureCouple, + bool doPressureCouple, + float dtPressureCouple, const matrix velocityScalingMatrix); - /*! \brief - * Update data-structures (e.g. after NB search step). + /*! \brief Set the pointers and update data-structures (e.g. after NB search step). * - * \param[in] idef System topology - * \param[in] md Atoms data. - * \param[in] numTempScaleValues Number of temperature scaling groups. Set zero for no temperature coupling. + * \param[in,out] d_x Device buffer with coordinates. + * \param[in,out] d_v Device buffer with velocities. + * \param[in] d_f Device buffer with forces. + * \param[in] idef System topology + * \param[in] md Atoms data. + * \param[in] numTempScaleValues Number of temperature scaling groups. Set zero for no temperature coupling. */ - void set(const t_idef &idef, - const t_mdatoms &md, - const int numTempScaleValues); + void set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const t_idef &idef, + const t_mdatoms &md, + const int numTempScaleValues); /*! \brief * Update PBC data. @@ -127,73 +132,6 @@ class UpdateConstrainCuda::Impl */ void setPbc(const t_pbc *pbc); - /*! \brief - * Copy coordinates from CPU to GPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_x CPU pointer where coordinates should be copied from. - */ - void copyCoordinatesToGpu(const rvec *h_x); - - /*! \brief - * Copy velocities from CPU to GPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_v CPU pointer where velocities should be copied from. - */ - void copyVelocitiesToGpu(const rvec *h_v); - - /*! \brief - * Copy forces from CPU to GPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_f CPU pointer where forces should be copied from. - */ - void copyForcesToGpu(const rvec *h_f); - - /*! \brief - * Copy coordinates from GPU to CPU. - * - * The data are assumed to be in float3/fvec format (single precision). - * - * \param[out] h_xp CPU pointer where coordinates should be copied to. - */ - void copyCoordinatesFromGpu(rvec *h_xp); - - /*! \brief - * Copy velocities from GPU to CPU. - * - * The velocities are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_v Pointer to velocities data. - */ - void copyVelocitiesFromGpu(rvec *h_v); - - /*! \brief - * Copy forces from GPU to CPU. - * - * The forces are assumed to be in float3/fvec format (single precision). - * - * \param[in] h_f Pointer to forces data. - */ - void copyForcesFromGpu(rvec *h_f); - - /*! \brief - * Set the internal GPU-memory x, xprime and v pointers. - * - * Data is not copied. The data are assumed to be in float3/fvec format - * (float3 is used internally, but the data layout should be identical). - * - * \param[in] d_x Pointer to the coordinates for the input (on GPU) - * \param[in] d_xp Pointer to the coordinates for the output (on GPU) - * \param[in] d_v Pointer to the velocities (on GPU) - * \param[in] d_f Pointer to the forces (on GPU) - */ - void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f); - private: //! CUDA stream @@ -205,33 +143,20 @@ class UpdateConstrainCuda::Impl //! Number of atoms int numAtoms_; - //! Coordinates before the timestep (on GPU) + //! Local copy of the pointer to the device positions buffer float3 *d_x_; - //! Number of elements in coordinates buffer - int numX_ = -1; - //! Allocation size for the coordinates buffer - int numXAlloc_ = -1; + //! Local copy of the pointer to the device velocities buffer + float3 *d_v_; + //! Local copy of the pointer to the device forces buffer + float3 *d_f_; - //! Coordinates after the timestep (on GPU). + //! Device buffer for intermediate positions (maintained internally) float3 *d_xp_; //! Number of elements in shifted coordinates buffer int numXp_ = -1; //! Allocation size for the shifted coordinates buffer int numXpAlloc_ = -1; - //! Velocities of atoms (on GPU) - float3 *d_v_; - //! Number of elements in velocities buffer - int numV_ = -1; - //! Allocation size for the velocities buffer - int numVAlloc_ = -1; - - //! Forces, exerted by atoms (on GPU) - float3 *d_f_; - //! Number of elements in forces buffer - int numF_ = -1; - //! Allocation size for the forces buffer - int numFAlloc_ = -1; //! 1/mass for all atoms (GPU) real *d_inverseMasses_; diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 370cc51175..99f2a4f0de 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -117,7 +117,9 @@ #include "gromacs/mdtypes/observableshistory.h" #include "gromacs/mdtypes/pullhistory.h" #include "gromacs/mdtypes/state.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" #include "gromacs/modularsimulator/energyelement.h" +#include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/pbcutil/mshift.h" #include "gromacs/pbcutil/pbc.h" @@ -315,8 +317,15 @@ void gmx::LegacySimulator::do_md() upd.setNumAtoms(state->natoms); } +/*****************************************************************************************/ +// TODO: The following block of code should be refactored, once: +// 1. We have the useGpuForBufferOps variable set and available here and in do_force(...) +// 2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate +// stream owned by the StatePropagatorDataGpu bool useGpuForPme = (fr->pmedata != nullptr) && (pme_run_mode(fr->pmedata) != PmeRunMode::CPU); bool useGpuForNonbonded = fr->nbv->useGpu(); + // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded + bool useGpuForBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr); if (useGpuForUpdate) { @@ -346,10 +355,19 @@ void gmx::LegacySimulator::do_md() integrator = std::make_unique(*ir, *top_global, nullptr); } - if (fr->nbv->useGpu()) + if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) { - changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported); + changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported); } + if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate) + { + changePinningPolicy(&f, PinningPolicy::PinnedIfSupported); + } + if (useGpuForUpdate) + { + changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported); + } +/*****************************************************************************************/ // NOTE: The global state is no longer used at this point. // But state_global is still used as temporary storage space for writing @@ -1200,16 +1218,19 @@ void gmx::LegacySimulator::do_md() if (useGpuForUpdate) { + StatePropagatorDataGpu *stateGpu = fr->stateGpu; if (bNS) { - integrator->set(top.idef, *mdatoms, ekind->ngtc); + integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(), + top.idef, *mdatoms, ekind->ngtc); t_pbc pbc; set_pbc(&pbc, epbcXYZ, state->box); integrator->setPbc(&pbc); } - integrator->copyCoordinatesToGpu(state->x.rvec_array()); - integrator->copyVelocitiesToGpu(state->v.rvec_array()); - integrator->copyForcesToGpu(as_rvec_array(f.data())); + + stateGpu->copyCoordinatesToGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); + stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All); + stateGpu->copyForcesToGpu(ArrayRef(f), StatePropagatorDataGpu::AtomLocality::All); bool doTempCouple = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple)); bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple)); @@ -1218,9 +1239,9 @@ void gmx::LegacySimulator::do_md() integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir, doTempCouple, ekind->tcstat, doPressureCouple, ir->nstpcouple*ir->delta_t, M); - - integrator->copyCoordinatesFromGpu(state->x.rvec_array()); - integrator->copyVelocitiesFromGpu(state->v.rvec_array()); + stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); + stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All); + stateGpu->synchronizeStream(); } else { diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 1793b91d54..7bf64f9185 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -115,6 +115,7 @@ #include "gromacs/mdtypes/observableshistory.h" #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/mdtypes/state.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" #include "gromacs/nbnxm/gpu_data_mgmt.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/pairlist_tuning.h" @@ -1501,6 +1502,27 @@ int Mdrunner::mdrunner() fcd->orires.nr != 0, fcd->disres.nsystems != 0); + const void *commandStream = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_stream(fr->pmedata) : nullptr; + const void *gpuContext = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_context(fr->pmedata) : nullptr; + const int paddingSize = pme_gpu_get_padding_size(fr->pmedata); + + const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible( + false, + inputrec, doRerun, vsite.get(), ms, replExParams, + fcd, static_cast(filenames.size()), filenames.data(), + &observablesHistory, membed); + + const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr); + GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync; + + // We initialize GPU state even for the CPU runs so we will have a more verbose + // error if someone will try accessing it from the CPU codepath + gmx::StatePropagatorDataGpu stateGpu(commandStream, + gpuContext, + transferKind, + paddingSize); + fr->stateGpu = &stateGpu; + // TODO This is not the right place to manage the lifetime of // this data structure, but currently it's the easiest way to // make it work. @@ -1510,11 +1532,6 @@ int Mdrunner::mdrunner() SimulatorBuilder simulatorBuilder; // build and run simulator object based on user-input - const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible( - false, - inputrec, doRerun, vsite.get(), ms, replExParams, - fcd, static_cast(filenames.size()), filenames.data(), - &observablesHistory, membed); auto simulator = simulatorBuilder.build( inputIsCompatibleWithModularSimulator, fplog, cr, ms, mdlog, static_cast(filenames.size()), filenames.data(), diff --git a/src/gromacs/mdtypes/CMakeLists.txt b/src/gromacs/mdtypes/CMakeLists.txt index 788b6cf3bc..f9adf04c23 100644 --- a/src/gromacs/mdtypes/CMakeLists.txt +++ b/src/gromacs/mdtypes/CMakeLists.txt @@ -32,7 +32,31 @@ # To help us fund GROMACS development, we humbly ask that you cite # the research papers on the package. Check out http://www.gromacs.org. -file(GLOB MDTYPES_SOURCES *.cpp) +file(GLOB MDTYPES_SOURCES + df_history.cpp + group.cpp + iforceprovider.cpp + inputrec.cpp + md_enums.cpp + observableshistory.cpp + state.cpp) + +if(GMX_USE_CUDA OR GMX_USE_OPENCL) + gmx_add_libgromacs_sources( + state_propagator_data_gpu_impl_gpu.cpp + ) + if(GMX_USE_CUDA) + gmx_compile_cpp_as_cuda( + state_propagator_data_gpu_impl_gpu.cpp + ) + endif() +else() + gmx_add_libgromacs_sources( + state_propagator_data_gpu_impl.cpp + ) +endif() + + set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${MDTYPES_SOURCES} PARENT_SCOPE) if(GMX_INSTALL_LEGACY_API) diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h index ec01003d24..540d0d073e 100644 --- a/src/gromacs/mdtypes/forcerec.h +++ b/src/gromacs/mdtypes/forcerec.h @@ -60,6 +60,7 @@ namespace gmx { class GpuBonded; class ForceProviders; +class StatePropagatorDataGpu; } /* macros for the cginfo data in forcerec @@ -268,6 +269,11 @@ struct t_forcerec { // NOLINT (clang-analyzer-optin.performance.Padding) struct ewald_corr_thread_t *ewc_t = nullptr; gmx::ForceProviders *forceProviders = nullptr; + + // The stateGpu object is created in runner, forcerec just keeps the copy of the pointer. + // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of + // general StatePropagatorData object that is passed around + gmx::StatePropagatorDataGpu *stateGpu = nullptr; }; /* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h new file mode 100644 index 0000000000..cf20737bce --- /dev/null +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -0,0 +1,225 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declaration of interfaces for GPU state data propagator object. + * + * This object stores and manages positions, velocities and forces for + * all particles in the system on the GPU. + * + * \todo Add cycle counters. + * \todo Add synchronization points. + * + * \author Artem Zhmurov + * + * \inlibraryapi + * \ingroup module_mdtypes + */ +#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H +#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H + +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/classhelpers.h" + +namespace gmx +{ + +class StatePropagatorDataGpu +{ + public: + + /*! \brief Atom locality indicator: local, non-local, all. + * + * \todo This should be managed by a separate object, since the localities + * are used here and in buffer ops. + */ + enum class AtomLocality : int + { + Local = 0, //!< Local atoms + NonLocal = 1, //!< Non-local atoms + All = 2, //!< Both local and non-local atoms + Count = 3 //!< The number of atom locality types + }; + + /*! \brief Constructor + * + * The buffers are reallocated only at the reinit call, the padding is + * used there for the coordinates buffer. It is needed for PME and added at + * the end of the buffer. It is assumed that if the rank has PME duties on the + * GPU, all coordinates are copied to the GPU and hence, for this rank, the + * coordinates buffer is not split into local and non-local ranges. For other + * ranks, the padding size is zero. This works because only one rank ever does + * PME work on the GPU, and if that rank also does PP work that is the only + * rank. So all coordinates are always transferred. + * + * \note \p commandStream and \p gpuContext are allowed to be nullptr if + * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME + * does not run on the GPU). + * + * \todo Make \p CommandStream visible in the CPU parts of the code so we + * will not have to pass a void*. + * \todo Make \p Context visible in CPU parts of the code so we will not + * have to pass a void*. + * + * \param[in] commandStream GPU stream, nullptr allowed. + * \param[in] gpuContext GPU context, nullptr allowed. + * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). + * \param[in] paddingSize Padding size for coordinates buffer. + */ + StatePropagatorDataGpu(const void *commandStream, + const void *gpuContext, + GpuApiCallBehavior transferKind, + int paddingSize); + + ~StatePropagatorDataGpu(); + + /*! \brief Set the ranges for local and non-local atoms and reallocates buffers. + * + * The coordinates buffer is reallocated with the padding added at the end. The + * size of padding is set by the constructor. + * + * \param[in] numAtomsLocal Number of atoms in local domain. + * \param[in] numAtomsAll Total number of atoms to handle. + */ + void reinit(int numAtomsLocal, int numAtomsAll); + + /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local). + * + * \todo There are at least three versions of the function with this functionality in the code: + * this one and two more in NBNXM. These should be unified in a shape of a general function + * in DD. + * + * \param[in] atomLocality If all, local or non-local ranges are needed. + * + * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range. + */ + std::tuple getAtomRangesFromAtomLocality(AtomLocality atomLocality); + + + /*! \brief Get the positions buffer on the GPU. + * + * \returns GPU positions buffer. + */ + DeviceBuffer getCoordinates(); + + /*! \brief Copy positions to the GPU memory. + * + * \param[in] h_x Positions in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyCoordinatesToGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality); + + /*! \brief Copy positions from the GPU memory. + * + * \param[in] h_x Positions buffer in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality); + + + /*! \brief Get the velocities buffer on the GPU. + * + * \returns GPU velocities buffer. + */ + DeviceBuffer getVelocities(); + + /*! \brief Copy velocities to the GPU memory. + * + * \param[in] h_v Velocities in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyVelocitiesToGpu(gmx::ArrayRef h_v, + AtomLocality atomLocality); + + /*! \brief Copy velocities from the GPU memory. + * + * \param[in] h_v Velocities buffer in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyVelocitiesFromGpu(gmx::ArrayRef h_v, + AtomLocality atomLocality); + + + /*! \brief Get the force buffer on the GPU. + * + * \returns GPU force buffer. + */ + DeviceBuffer getForces(); + + /*! \brief Copy forces to the GPU memory. + * + * \param[in] h_f Forces in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyForcesToGpu(gmx::ArrayRef h_f, + AtomLocality atomLocality); + + /*! \brief Copy forces from the GPU memory. + * + * \param[in] h_f Forces buffer in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyForcesFromGpu(gmx::ArrayRef h_f, + AtomLocality atomLocality); + /*! \brief Synchronize the underlying GPU stream + */ + void synchronizeStream(); + + /*! \brief Getter for the number of local atoms. + * + * \returns The number of local atoms. + */ + int numAtomsLocal(); + + /*! \brief Getter for the total number of atoms. + * + * \returns The total number of atoms. + */ + int numAtomsAll(); + + private: + class Impl; + gmx::PrivateImplPointer impl_; + +}; + +} // namespace gmx + +#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp new file mode 100644 index 0000000000..dd0aa76f80 --- /dev/null +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -0,0 +1,156 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief The CPU stub for the state propagator data class. + * + * \author Artem Zhmurov + * + * \ingroup module_mdtypes + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/mdtypes/state_propagator_data_gpu.h" + +#if GMX_GPU == GMX_GPU_NONE +namespace gmx +{ + +class StatePropagatorDataGpu::Impl +{ +}; + +StatePropagatorDataGpu::StatePropagatorDataGpu(const void * /* commandStream */, + const void * /* gpuContext */, + GpuApiCallBehavior /* transferKind */, + int /* paddingSize */) + : impl_(nullptr) +{ +} + +StatePropagatorDataGpu::~StatePropagatorDataGpu() +{ +} + +void StatePropagatorDataGpu::reinit(int /* numAtomsLocal */, + int /* numAtomsAll */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + +std::tuple StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return std::make_tuple(0, 0); +} + +DeviceBuffer StatePropagatorDataGpu::getCoordinates() +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return DeviceBuffer {}; +} + +void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef /* h_x */, + AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + +void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef /* h_x */, + AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + + +DeviceBuffer StatePropagatorDataGpu::getVelocities() +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return DeviceBuffer {}; +} + +void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef /* h_v */, + AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + +void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef /* h_v */, + AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + + +DeviceBuffer StatePropagatorDataGpu::getForces() +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return DeviceBuffer {}; +} + +void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef /* h_f */, + AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + +void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef /* h_f */, + AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + +void StatePropagatorDataGpu::synchronizeStream() +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); +} + +int StatePropagatorDataGpu::numAtomsLocal() +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return 0; +} + +int StatePropagatorDataGpu::numAtomsAll() +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation."); + return 0; +} + +} // namespace gmx + +#endif // GMX_GPU == GMX_GPU_NONE diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h new file mode 100644 index 0000000000..fe8dca0456 --- /dev/null +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -0,0 +1,273 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declaration of low-level functions and fields of GPU state propagator object. + * + * \author Artem Zhmurov + * + * \ingroup module_mdtypes + */ +#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H +#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H + +#include "gmxpre.h" + +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" +#include "gromacs/utility/classhelpers.h" + +namespace gmx +{ + +class StatePropagatorDataGpu::Impl +{ + public: + + Impl(); + + + /*! \brief Constructor + * + * The buffers are reallocated only at the reinit call, the padding is + * used there for the coordinates buffer. It is needed for PME and added at + * the end of the buffer. It is assumed that if the rank has PME duties on the + * GPU, all coordinates are copied to the GPU and hence, for this rank, the + * coordinates buffer is not split into local and non-local ranges. For other + * ranks, the padding size is zero. This works because only one rank ever does + * PME work on the GPU, and if that rank also does PP work that is the only + * rank. So all coordinates are always transferred. + * + * \note \p commandStream and \p gpuContext are allowed to be nullptr if + * StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME + * does not run on the GPU). + * + * \todo Make CommandStream visible in the CPU parts of the code so we + * will not have to pass a void*. + * \todo Make a Context object visible in CPU parts of the code so we + * will not have to pass a void*. + * + * \param[in] commandStream GPU stream, nullptr allowed. + * \param[in] gpuContext GPU context, nullptr allowed. + * \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not). + * \param[in] paddingSize Padding size for coordinates buffer. + */ + Impl(const void *commandStream, + const void *gpuContext, + GpuApiCallBehavior transferKind, + int paddingSize); + + ~Impl(); + + + /*! \brief Set the ranges for local and non-local atoms and reallocates buffers. + * + * The coordinates buffer is reallocated with the padding added at the end. The + * size of padding is set by the constructor. + * + * \param[in] numAtomsLocal Number of atoms in local domain. + * \param[in] numAtomsAll Total number of atoms to handle. + */ + void reinit(int numAtomsLocal, int numAtomsAll); + + /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local). + * + * \todo There are at least three versions of the function with this functionality in the code: + * this one and two more in NBNXM. These should be unified in a shape of a general function + * in DD. + * + * \param[in] atomLocality If all, local or non-local ranges are needed. + * + * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range. + */ + std::tuple getAtomRangesFromAtomLocality(AtomLocality atomLocality); + + + /*! \brief Get the positions buffer on the GPU. + * + * \returns GPU positions buffer. + */ + DeviceBuffer getCoordinates(); + + /*! \brief Copy positions to the GPU memory. + * + * \param[in] h_x Positions in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyCoordinatesToGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality); + + /*! \brief Copy positions from the GPU memory. + * + * \param[in] h_x Positions buffer in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality); + + + /*! \brief Get the velocities buffer on the GPU. + * + * \returns GPU velocities buffer. + */ + DeviceBuffer getVelocities(); + + /*! \brief Copy velocities to the GPU memory. + * + * \param[in] h_v Velocities in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyVelocitiesToGpu(gmx::ArrayRef h_v, + AtomLocality atomLocality); + + /*! \brief Copy velocities from the GPU memory. + * + * \param[in] h_v Velocities buffer in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyVelocitiesFromGpu(gmx::ArrayRef h_v, + AtomLocality atomLocality); + + + /*! \brief Get the force buffer on the GPU. + * + * \returns GPU force buffer. + */ + DeviceBuffer getForces(); + + /*! \brief Copy forces to the GPU memory. + * + * \param[in] h_f Forces in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyForcesToGpu(gmx::ArrayRef h_f, + AtomLocality atomLocality); + + /*! \brief Copy forces from the GPU memory. + * + * \param[in] h_f Forces buffer in the host memory. + * \param[in] atomLocality Locality of the particles to copy. + */ + void copyForcesFromGpu(gmx::ArrayRef h_f, + AtomLocality atomLocality); + + /*! \brief Synchronize the underlying GPU stream + */ + void synchronizeStream(); + + /*! \brief Getter for the number of local atoms. + * + * \returns The number of local atoms. + */ + int numAtomsLocal(); + + /*! \brief Getter for the total number of atoms. + * + * \returns The total number of atoms. + */ + int numAtomsAll(); + + private: + + /*! \brief GPU stream. + * \todo The stream should be set to non-nullptr once the synchronization points are restored + */ + CommandStream commandStream_ = nullptr; + /*! \brief GPU context (for OpenCL builds) + * \todo Make a Context class usable in CPU code + */ + Context gpuContext_ = nullptr; + //! Default GPU calls behavior + GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async; + //! Padding size for the coordinates buffer + int paddingSize_ = 0; + + //! Number of local atoms + int numAtomsLocal_ = -1; + //! Total number of atoms + int numAtomsAll_ = -1; + + //! Device positions buffer + DeviceBuffer d_x_; + //! Number of particles saved in the positions buffer + int d_xSize_ = -1; + //! Allocation size for the positions buffer + int d_xCapacity_ = -1; + + //! Device velocities buffer + DeviceBuffer d_v_; + //! Number of particles saved in the velocities buffer + int d_vSize_ = -1; + //! Allocation size for the velocities buffer + int d_vCapacity_ = -1; + + //! Device force buffer + DeviceBuffer d_f_; + //! Number of particles saved in the force buffer + int d_fSize_ = -1; + //! Allocation size for the force buffer + int d_fCapacity_ = -1; + + /*! \brief Performs the copy of data from host to device buffer. + * + * \todo Template on locality. + * + * \param[in,out] d_data Device-side buffer. + * \param[in,out] h_data Host-side buffer. + * \param[in] dataSize Device-side data allocation size. + * \param[in] atomLocality If all, local or non-local ranges should be copied. + */ + void copyToDevice(DeviceBuffer d_data, + const gmx::ArrayRef h_data, + int dataSize, + AtomLocality atomLocality); + + /*! \brief Performs the copy of data from device to host buffer. + * + * \param[in,out] h_data Host-side buffer. + * \param[in,out] d_data Device-side buffer. + * \param[in] dataSize Device-side data allocation size. + * \param[in] atomLocality If all, local or non-local ranges should be copied. + */ + void copyFromDevice(gmx::ArrayRef h_data, + DeviceBuffer d_data, + int dataSize, + AtomLocality atomLocality); +}; + +} // namespace gmx + +#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp new file mode 100644 index 0000000000..d2c1e5d54f --- /dev/null +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -0,0 +1,381 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Definitions of interfaces for GPU state data propagator object. + * + * \author Artem Zhmurov + * + * \ingroup module_mdtypes + */ +#include "gmxpre.h" + +#include "config.h" + +#if GMX_GPU != GMX_GPU_NONE + +#if GMX_GPU == GMX_GPU_CUDA +#include "gromacs/gpu_utils/cudautils.cuh" +#endif +#include "gromacs/gpu_utils/devicebuffer.h" +#if GMX_GPU == GMX_GPU_OPENCL +#include "gromacs/gpu_utils/oclutils.h" +#endif +#include "gromacs/math/vectypes.h" +#include "gromacs/mdtypes/state_propagator_data_gpu.h" +#include "gromacs/utility/classhelpers.h" + +#include "state_propagator_data_gpu_impl.h" + +namespace gmx +{ + +StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream, + gmx_unused const void *gpuContext, + GpuApiCallBehavior transferKind, + int paddingSize) : + transferKind_(transferKind), + paddingSize_(paddingSize) +{ + + GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build."); + + // Set the stream-context pair for the OpenCL builds, + // use the nullptr stream for CUDA builds +#if GMX_GPU == GMX_GPU_OPENCL + if (commandStream != nullptr) + { + commandStream_ = *static_cast(commandStream); + } + if (gpuContext != nullptr) + { + gpuContext_ = *static_cast(gpuContext); + } +#endif + +} + +StatePropagatorDataGpu::Impl::~Impl() +{ +} + +void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) +{ +#if GMX_GPU == GMX_GPU_OPENCL + GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds."); +#endif + numAtomsLocal_ = numAtomsLocal; + numAtomsAll_ = numAtomsAll; + + int numAtomsPadded; + if (paddingSize_ > 0) + { + numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1 ) / paddingSize_ )*paddingSize_; + } + else + { + numAtomsPadded = numAtomsAll_; + } + + reallocateDeviceBuffer(&d_x_, DIM*numAtomsPadded, &d_xSize_, &d_xCapacity_, gpuContext_); + + const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_; + if (paddingAllocationSize > 0) + { + clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, commandStream_); + } + + reallocateDeviceBuffer(&d_v_, DIM*numAtomsAll_, &d_vSize_, &d_vCapacity_, gpuContext_); + reallocateDeviceBuffer(&d_f_, DIM*numAtomsAll_, &d_fSize_, &d_fCapacity_, gpuContext_); + +} + +std::tuple StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality) +{ + int atomsStartAt = 0; + int numAtomsToCopy = 0; + switch (atomLocality) + { + case AtomLocality::All: + atomsStartAt = 0; + numAtomsToCopy = numAtomsAll_; + break; + case AtomLocality::Local: + atomsStartAt = 0; + numAtomsToCopy = numAtomsLocal_; + break; + case AtomLocality::NonLocal: + atomsStartAt = numAtomsLocal_; + numAtomsToCopy = numAtomsAll_ - numAtomsLocal_; + break; + default: + GMX_RELEASE_ASSERT(false, "Wrong range of atoms requested in GPU state data manager. Should be All, Local or NonLocal."); + } + GMX_ASSERT(atomsStartAt >= 0, "The first elemtnt to copy has negative index. Probably, the GPU propagator state was not initialized."); + GMX_ASSERT(numAtomsToCopy >= 0, "Number of atoms to copy is negative. Probably, the GPU propagator state was not initialized."); + return std::make_tuple(atomsStartAt, numAtomsToCopy); +} + +void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer d_data, + const gmx::ArrayRef h_data, + int dataSize, + AtomLocality atomLocality) +{ + +#if GMX_GPU == GMX_GPU_OPENCL + GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds."); +#endif + + GMX_UNUSED_VALUE(dataSize); + + GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated."); + + int atomsStartAt, numAtomsToCopy; + std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality); + + int elementsStartAt = atomsStartAt*DIM; + int numElementsToCopy = numAtomsToCopy*DIM; + + if (numAtomsToCopy != 0) + { + GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range."); + GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range."); + + // TODO: Use the proper stream + copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), + elementsStartAt, numElementsToCopy, + commandStream_, transferKind_, nullptr); + } +} + +void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_data, + DeviceBuffer d_data, + int dataSize, + AtomLocality atomLocality) +{ + +#if GMX_GPU == GMX_GPU_OPENCL + GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds."); +#endif + + GMX_UNUSED_VALUE(dataSize); + + GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated."); + + int atomsStartAt, numAtomsToCopy; + std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality); + + int elementsStartAt = atomsStartAt*DIM; + int numElementsToCopy = numAtomsToCopy*DIM; + + if (numAtomsToCopy != 0) + { + GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range."); + GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range."); + + // TODO: Use the proper stream + copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, + elementsStartAt, numElementsToCopy, + commandStream_, transferKind_, nullptr); + + } +} + +DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() +{ + return d_x_; +} + +void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, + AtomLocality atomLocality) +{ + copyToDevice(d_x_, h_x, d_xSize_, atomLocality); +} + +void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality) +{ + copyFromDevice(h_x, d_x_, d_xSize_, atomLocality); +} + + +DeviceBuffer StatePropagatorDataGpu::Impl::getVelocities() +{ + return d_v_; +} + +void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef h_v, + AtomLocality atomLocality) +{ + copyToDevice(d_v_, h_v, d_vSize_, atomLocality); +} + +void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef h_v, + AtomLocality atomLocality) +{ + copyFromDevice(h_v, d_v_, d_vSize_, atomLocality); +} + + +DeviceBuffer StatePropagatorDataGpu::Impl::getForces() +{ + return d_f_; +} + +void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h_f, + AtomLocality atomLocality) +{ + copyToDevice(d_f_, h_f, d_fSize_, atomLocality); +} + +void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef h_f, + AtomLocality atomLocality) +{ + copyFromDevice(h_f, d_f_, d_fSize_, atomLocality); +} + +void StatePropagatorDataGpu::Impl::synchronizeStream() +{ + gpuStreamSynchronize(commandStream_); +} + +int StatePropagatorDataGpu::Impl::numAtomsLocal() +{ + return numAtomsLocal_; +} + +int StatePropagatorDataGpu::Impl::numAtomsAll() +{ + return numAtomsAll_; +} + + + +StatePropagatorDataGpu::StatePropagatorDataGpu(const void *commandStream, + const void *gpuContext, + GpuApiCallBehavior transferKind, + int paddingSize) + : impl_(new Impl(commandStream, + gpuContext, + transferKind, + paddingSize)) +{ +} + +StatePropagatorDataGpu::~StatePropagatorDataGpu() = default; + + +void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll) +{ + return impl_->reinit(numAtomsLocal, numAtomsAll); +} + +std::tuple StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality atomLocality) +{ + return impl_->getAtomRangesFromAtomLocality(atomLocality); +} + + +DeviceBuffer StatePropagatorDataGpu::getCoordinates() +{ + return impl_->getCoordinates(); +} + +void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef h_x, + AtomLocality atomLocality) +{ + return impl_->copyCoordinatesToGpu(h_x, atomLocality); +} + +void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality) +{ + return impl_->copyCoordinatesFromGpu(h_x, atomLocality); +} + + +DeviceBuffer StatePropagatorDataGpu::getVelocities() +{ + return impl_->getVelocities(); +} + +void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef h_v, + AtomLocality atomLocality) +{ + return impl_->copyVelocitiesToGpu(h_v, atomLocality); +} + +void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef h_v, + AtomLocality atomLocality) +{ + return impl_->copyVelocitiesFromGpu(h_v, atomLocality); +} + + +DeviceBuffer StatePropagatorDataGpu::getForces() +{ + return impl_->getForces(); +} + +void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef h_f, + AtomLocality atomLocality) +{ + return impl_->copyForcesToGpu(h_f, atomLocality); +} + +void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef h_f, + AtomLocality atomLocality) +{ + return impl_->copyForcesFromGpu(h_f, atomLocality); +} + +void StatePropagatorDataGpu::synchronizeStream() +{ + return impl_->synchronizeStream(); +} + +int StatePropagatorDataGpu::numAtomsLocal() +{ + return impl_->numAtomsLocal(); +} + +int StatePropagatorDataGpu::numAtomsAll() +{ + return impl_->numAtomsAll(); +} + +} // namespace gmx + +#endif // GMX_GPU == GMX_GPU_NONE diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index f6df6eb6b4..5df9633e27 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1084,42 +1084,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet, } } -void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet &gridSet, - const Nbnxm::AtomLocality locality, - bool fillLocal, - nbnxn_atomdata_t *nbat, - gmx_nbnxn_gpu_t *gpu_nbv, - const rvec *coordinatesHost) -{ - int gridBegin = 0; - int gridEnd = 0; - getAtomRanges(gridSet, locality, &gridBegin, &gridEnd); - - if (fillLocal) - { - nbat->natoms_local = gridSet.grids()[0].atomIndexEnd(); - } - - for (int g = gridBegin; g < gridEnd; g++) - { - nbnxn_gpu_copy_x_to_gpu(gridSet.grids()[g], - gpu_nbv, - locality, - coordinatesHost); - } -} - -DeviceBuffer nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv) -{ - return Nbnxm::nbnxn_gpu_get_x_gpu(gpu_nbv); -} - /* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet, const Nbnxm::AtomLocality locality, bool fillLocal, gmx_nbnxn_gpu_t *gpu_nbv, - DeviceBuffer coordinatesDevice) + DeviceBuffer d_x) { int gridBegin = 0; @@ -1131,7 +1101,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet, nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g], fillLocal && g == 0, gpu_nbv, - coordinatesDevice, + d_x, locality, g, gridSet.numColumnsMax()); @@ -1552,11 +1522,6 @@ void reduceForcesGpu(const Nbnxm::AtomLocality locality, accumulateForce); } -DeviceBuffer nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv) -{ - return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv); -} - void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat, gmx::ArrayRef fshift) { diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index ba644bd436..412d328bce 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -58,8 +58,6 @@ struct nonbonded_verlet_t; struct t_mdatoms; struct tMPI_Atomic; -enum class BufferOpsUseGpu; - class GpuEventSynchronizer; namespace Nbnxm @@ -326,46 +324,22 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet, const rvec *coordinates, nbnxn_atomdata_t *nbat); -/*! \brief Copies the coordinates to the GPU (in plain rvec format) - * - * This function copied data to the gpu so that the transformation to the NBNXM format can be done on the GPU. - * - * \param[in] gridSet The grids data. - * \param[in] locality If local or non local coordinates should be copied. - * \param[in] fillLocal If the local filler particle coordinates should be zeroed. - * \param[in] nbat Data in NBNXM format, used to zero coordinates of filler particles. - * \param[in] gpu_nbv The NBNXM GPU data structure. - * \param[in] coordinatesHost Coordinates to be copied (in plain rvec format). - */ -void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet &gridSet, - Nbnxm::AtomLocality locality, - bool fillLocal, - nbnxn_atomdata_t *nbat, - gmx_nbnxn_gpu_t *gpu_nbv, - const rvec *coordinatesHost); - -/*!\brief Getter for the GPU coordinates buffer - * - * \param[in] gpu_nbv The NBNXM GPU data structure. - */ -DeviceBuffer nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv); - /*! \brief Transform coordinates to xbat layout on GPU * * Creates a GPU copy of the coordinates buffer using short-range ordering. * As input, uses coordinates in plain rvec format in GPU memory. * - * \param[in] gridSet The grids data. - * \param[in] locality If the transformation should be applied to local or non local coordinates. - * \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed. - * \param[in,out] gpu_nbv The NBNXM GPU data structure. - * \param[in] coordinatesDevice Coordinates to be copied (in plain rvec format). + * \param[in] gridSet The grids data. + * \param[in] locality If the transformation should be applied to local or non local coordinates. + * \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed. + * \param[in,out] gpu_nbv The NBNXM GPU data structure. + * \param[in] d_x Coordinates to be copied (in plain rvec format). */ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet, Nbnxm::AtomLocality locality, bool fillLocal, gmx_nbnxn_gpu_t *gpu_nbv, - DeviceBuffer coordinatesDevice); + DeviceBuffer d_x); /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well * @@ -399,16 +373,6 @@ void reduceForcesGpu(Nbnxm::AtomLocality locality, bool useGpuFPmeReduction, bool accumulateForce); -/*!\brief Getter for the GPU forces buffer - * - * \todo Will be removed when the buffer management is lifted out of the NBNXM - * - * \param[in] gpu_nbv The NBNXM GPU data structure. - * - * \returns Device forces buffer - */ -DeviceBuffer nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv); - /* Add the fshift force stored in nbat to fshift */ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat, gmx::ArrayRef fshift); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 10a6f0a0d3..dba1ca8c82 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -743,60 +743,11 @@ void cuda_set_cacheconfig() } } -/* X buffer operations on GPU: copies coordinates to the GPU in rvec format. */ -void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid &grid, - gmx_nbnxn_gpu_t *nb, - const Nbnxm::AtomLocality locality, - const rvec *coordinatesHost) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - bool bDoTime = nb->bDoTime; - - Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality); - int numCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin(); - int copyAtomStart = grid.srcAtomBegin(); - - cudaStream_t stream = nb->stream[interactionLoc]; - - // empty domain avoid launching zero-byte copy - if (numCopyAtoms == 0) - { - return; - } - GMX_ASSERT(coordinatesHost, "Need a valid host pointer"); - - if (bDoTime) - { - nb->timers->xf[locality].nb_h2d.openTimingRegion(stream); - } - - rvec *devicePtrDest = reinterpret_cast (nb->xrvec[copyAtomStart]); - const rvec *devicePtrSrc = reinterpret_cast (coordinatesHost[copyAtomStart]); - copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms, - stream, GpuApiCallBehavior::Async, nullptr); - - if (interactionLoc == Nbnxm::InteractionLocality::Local) - { - nb->xAvailableOnDevice->markEvent(stream); - } - - if (bDoTime) - { - nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream); - } -} - -DeviceBuffer nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t *nb) -{ - return reinterpret_cast< DeviceBuffer >(nb->xrvec); -} - /* X buffer operations on GPU: performs conversion from rvec to nb format. */ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, bool setFillerCoords, gmx_nbnxn_gpu_t *nb, - DeviceBuffer coordinatesDevice, + DeviceBuffer d_x, const Nbnxm::AtomLocality locality, int gridId, int numColumnsMax) @@ -817,7 +768,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, if (numAtoms != 0) { // TODO: This will only work with CUDA - GMX_ASSERT(coordinatesDevice, "Need a valid device pointer"); + GMX_ASSERT(d_x, "Need a valid device pointer"); KernelLaunchConfig config; config.blockSize[0] = c_bufOpsThreadsPerBlock; @@ -839,7 +790,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, &numColumns, &xqPtr, &setFillerCoords, - &coordinatesDevice, + &d_x, &d_atomIndices, &d_cxy_na, &d_cxy_ind, @@ -920,142 +871,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, } -DeviceBuffer nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb) -{ - return reinterpret_cast< DeviceBuffer >(nb->frvec); -} - -void nbnxn_launch_copy_f_to_gpu(const AtomLocality atomLocality, - const Nbnxm::GridSet &gridSet, - gmx_nbnxn_gpu_t *nb, - rvec *f) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - cudaStream_t stream = nb->stream[iLocality]; - - bool bDoTime = nb->bDoTime; - cu_timers_t *t = nb->timers; - - int atomStart = 0, numCopyAtoms = 0; - - nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms); - - // Avoiding launching copy with no work - if (numCopyAtoms == 0) - { - return; - } - GMX_ASSERT(f, "Need a valid f pointer"); - - if (bDoTime) - { - t->xf[atomLocality].nb_h2d.openTimingRegion(stream); - } - - rvec *ptrDest = reinterpret_cast (nb->frvec[atomStart]); - rvec *ptrSrc = reinterpret_cast (f[atomStart]); - //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, numCopyAtoms, - // stream, GpuApiCallBehavior::Async, nullptr); - //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector - cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice, - stream); - - if (bDoTime) - { - t->xf[atomLocality].nb_h2d.closeTimingRegion(stream); - } - - return; -} - -void nbnxn_launch_copy_f_from_gpu(const AtomLocality atomLocality, - const Nbnxm::GridSet &gridSet, - gmx_nbnxn_gpu_t *nb, - rvec *f) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - cudaStream_t stream = nb->stream[iLocality]; - - bool bDoTime = nb->bDoTime; - cu_timers_t *t = nb->timers; - int atomStart, numCopyAtoms; - - nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms); - - // Avoiding launching copy with no work - if (numCopyAtoms == 0) - { - return; - } - GMX_ASSERT(f, "Need a valid f pointer"); - - if (bDoTime) - { - t->xf[atomLocality].nb_d2h.openTimingRegion(stream); - } - - GMX_ASSERT(nb->frvec, "Need a valid nb->frvec pointer"); - rvec *ptrDest = reinterpret_cast (f[atomStart]); - rvec *ptrSrc = reinterpret_cast (nb->frvec[atomStart]); - //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, numCopyAtoms, - // stream, GpuApiCallBehavior::Async, nullptr); - //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector - cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyDeviceToHost, - stream); - - if (bDoTime) - { - t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); - } - - return; -} - -void nbnxn_launch_copy_x_from_gpu(const AtomLocality atomLocality, - const Nbnxm::GridSet &gridSet, - gmx_nbnxn_gpu_t *nb, - rvec *x) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - GMX_ASSERT(x, "Need a valid x pointer"); - - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - cudaStream_t stream = nb->stream[iLocality]; - - bool bDoTime = nb->bDoTime; - cu_timers_t *t = nb->timers; - int atomStart, nAtoms; - - nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms); - - if (bDoTime) - { - t->xf[atomLocality].nb_d2h.openTimingRegion(stream); - } - - GMX_ASSERT(nb->xrvec, "Need a valid nb->xrvec pointer"); - rvec *ptrDest = reinterpret_cast (x[atomStart]); - rvec *ptrSrc = reinterpret_cast (nb->xrvec[atomStart]); - copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms, - stream, GpuApiCallBehavior::Async, stream); - - if (atomLocality == AtomLocality::NonLocal) - { - nb->xNonLocalCopyD2HDone->markEvent(stream); - } - - if (bDoTime) - { - t->xf[atomLocality].nb_d2h.closeTimingRegion(stream); - } - - return; -} - void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomLocality, gmx_nbnxn_gpu_t *nb) { @@ -1069,16 +884,6 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomL } -void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv) -{ - return static_cast (gpu_nbv->xrvec); -} - -void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv) -{ - return static_cast (gpu_nbv->frvec); -} - void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb) { return static_cast (nb->xAvailableOnDevice); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 1df63f01db..3641d5eb5f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -502,16 +502,12 @@ gpu_init(const gmx_device_info_t *deviceInfo, cuda_init_const(nb, ic, listParams, nbat->params()); - nb->natoms = 0; - nb->natoms_alloc = 0; nb->atomIndicesSize = 0; nb->atomIndicesSize_alloc = 0; nb->ncxy_na = 0; nb->ncxy_na_alloc = 0; nb->ncxy_ind = 0; nb->ncxy_ind_alloc = 0; - nb->nfrvec = 0; - nb->nfrvec_alloc = 0; nb->ncell = 0; nb->ncell_alloc = 0; @@ -903,10 +899,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet, const int atomIndicesSize = gridSet.atomIndices().size(); const int *cxy_na = grid.cxy_na().data(); const int *cxy_ind = grid.cxy_ind().data(); - // TODO Should be done once per gridset - const int numRealAtomsTotal = gridSet.numRealAtomsTotal(); - reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr); reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr); if (atomIndicesSize > 0) @@ -977,8 +970,6 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int *cell, cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local]; - reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr); - if (natoms_total > 0) { reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index ec4b2b8d33..f3fc0e8852 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -219,18 +219,6 @@ struct gmx_nbnxn_cuda_t bool bUseTwoStreams; //! atom data cu_atomdata_t *atdat; - //! coordinates in rvec format - rvec *xrvec; - //! number of atoms - int natoms; - //! number of atoms allocated in device buffer - int natoms_alloc; - //! force in rvec format - rvec *frvec; - //! number of atoms in force buffer - int nfrvec; - //! number of atoms allocated in force buffer - int nfrvec_alloc; //! f buf ops cell index mapping int *cell; //! number of indices in cell buffer diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 892098ae4a..5c6967bb65 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -147,37 +147,16 @@ void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality loca wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } - -void nonbonded_verlet_t::copyCoordinatesToGpu(const Nbnxm::AtomLocality locality, - const bool fillLocal, - gmx::ArrayRef coordinatesHost) -{ - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); - - nbnxn_atomdata_copy_x_to_gpu(pairSearch_->gridSet(), locality, fillLocal, - nbat.get(), gpu_nbv, - as_rvec_array(coordinatesHost.data())); - - wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); -} - -DeviceBuffer nonbonded_verlet_t::getDeviceCoordinates() -{ - return nbnxn_atomdata_get_x_gpu(gpu_nbv); -} - void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality, const bool fillLocal, - DeviceBuffer coordinatesDevice) + DeviceBuffer d_x) { wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal, gpu_nbv, - coordinatesDevice); + d_x); wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); @@ -254,11 +233,6 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu() wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } -DeviceBuffer nonbonded_verlet_t::getDeviceForces() -{ - return nbnxn_atomdata_get_f_gpu(gpu_nbv); -} - real nonbonded_verlet_t::pairlistInnerRadius() const { return pairlistSets_->params().rlistInner; @@ -286,40 +260,11 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality); } -void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality) -{ - nbnxn_launch_copy_f_to_gpu(locality, - pairSearch_->gridSet(), - gpu_nbv, - f); -} - -void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality) -{ - nbnxn_launch_copy_f_from_gpu(locality, - pairSearch_->gridSet(), - gpu_nbv, - f); -} - -void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality) -{ - nbnxn_launch_copy_x_from_gpu(locality, - pairSearch_->gridSet(), - gpu_nbv, - x); -} - void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality) { nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv); } -void* nonbonded_verlet_t::get_gpu_xrvec() -{ - return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv); -} - void* nonbonded_verlet_t::get_x_on_device_event() { return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv); @@ -330,11 +275,6 @@ void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done() Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv); } -void* nonbonded_verlet_t::get_gpu_frvec() -{ - return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv); -} - void nonbonded_verlet_t::stream_local_wait_for_nonlocal() { Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index c6d12d2e87..991f21692e 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -264,42 +264,17 @@ struct nonbonded_verlet_t bool fillLocal, gmx::ArrayRef coordinates); - /*!\brief Copy coordinates to the GPU memory. - * - * This function uses the internal NBNXM GPU pointer to copy coordinates in the plain rvec format - * into the GPU memory. - * - * \todo This function will be removed in future patches as the management of the device buffers - * is moved to a separate object. - * - * \param[in] locality Whether coordinates for local or non-local atoms should be transformed. - * \param[in] fillLocal If the coordinates for filler particles should be zeroed. - * \param[in] coordinatesHost Coordinates in plain rvec format to be transformed. - */ - void copyCoordinatesToGpu(Nbnxm::AtomLocality locality, - bool fillLocal, - gmx::ArrayRef coordinatesHost); - - /*!\brief Getter for the GPU coordinates buffer. - * - * \todo This function will be removed in future patches as the management of the device buffers - * is moved to a separate object. - * - * \returns The coordinates buffer in plain rvec format. - */ - DeviceBuffer getDeviceCoordinates(); - /*!\brief Convert the coordinates to NBNXM format on the GPU for the given locality * * The API function for the transformation of the coordinates from one layout to another in the GPU memory. * - * \param[in] locality Whether coordinates for local or non-local atoms should be transformed. - * \param[in] fillLocal If the coordinates for filler particles should be zeroed. - * \param[in] coordinatesDevice GPU coordinates buffer in plain rvec format to be transformed. + * \param[in] locality Whether coordinates for local or non-local atoms should be transformed. + * \param[in] fillLocal If the coordinates for filler particles should be zeroed. + * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed. */ void convertCoordinatesGpu(Nbnxm::AtomLocality locality, bool fillLocal, - DeviceBuffer coordinatesDevice); + DeviceBuffer d_x); //! Init for GPU version of setup coordinates in Nbnxm void atomdata_init_copy_x_to_nbat_x_gpu(); @@ -370,36 +345,15 @@ struct nonbonded_verlet_t bool useGpuFPmeReduction, bool accumulateForce); - /*!\brief Getter for the GPU force buffer. - * - * \todo This function will be removed in future patches as the management of the device buffers - * is moved to a separate object. - * - * \returns The force buffer in plain rvec format. - */ - DeviceBuffer getDeviceForces(); - /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */ void atomdata_init_add_nbat_f_to_f_gpu(); - /*! \brief H2D transfer of force buffer*/ - void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality); - - /*! \brief D2H transfer of force buffer*/ - void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality); - - /*! \brief D2H transfer of coordinate buffer*/ - void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality); - /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete * * FIXME: need more details: when should be called / after which operation, etc. */ void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality); - /*! \brief return GPU pointer to x in rvec format */ - void* get_gpu_xrvec(); - /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */ void* get_x_on_device_event(); diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index f300281802..f661a19ca1 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -229,48 +229,21 @@ CUDA_FUNC_QUALIFIER void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet, gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM; -/*! \brief Copy coordinates from host to device memory. - * - * \todo This will be removed as the management of the buffers is taken out of the NBNXM module. +/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format. * - * \param[in] grid Grid to be copied. + * \param[in] grid Grid to be converted. + * \param[in] setFillerCoords If the filler coordinates are used. * \param[in,out] gpu_nbv The nonbonded data GPU structure. + * \param[in] d_x Device-side coordinates in plain rvec format. * \param[in] locality Copy coordinates for local or non-local atoms. - * \param[in] coordinatesHost Host-side coordinates in plain rvec format. - */ -CUDA_FUNC_QUALIFIER -void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid gmx_unused &grid, - gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, - Nbnxm::AtomLocality gmx_unused locality, - const rvec gmx_unused *coordinatesHost) CUDA_FUNC_TERM; - -/*! \brief Getter for the device coordinates buffer. - * - * \todo This will be removed as the management of the buffers is taken out of the NBNXM module. - * - * \param[in] gpu_nbv The nonbonded data GPU structure. - * - * \returns Device coordinates buffer in plain rvec format. - */ -CUDA_FUNC_QUALIFIER -DeviceBuffer nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); - - -/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format. - * - * \param[in] grid Grid to be converted. - * \param[in] setFillerCoords If the filler coordinates are used. - * \param[in,out] gpu_nbv The nonbonded data GPU structure. - * \param[in] coordinatesDevice Device-side coordinates in plain rvec format. - * \param[in] locality Copy coordinates for local or non-local atoms. - * \param[in] gridId Index of the grid being converted. - * \param[in] numColumnsMax Maximum number of columns in the grid. + * \param[in] gridId Index of the grid being converted. + * \param[in] numColumnsMax Maximum number of columns in the grid. */ CUDA_FUNC_QUALIFIER void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid, bool gmx_unused setFillerCoords, gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, - DeviceBuffer gmx_unused coordinatesDevice, + DeviceBuffer gmx_unused d_x, Nbnxm::AtomLocality gmx_unused locality, int gmx_unused gridId, int gmx_unused numColumnsMax) CUDA_FUNC_TERM; @@ -346,43 +319,6 @@ void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused atomLoca bool gmx_unused useGpuFPmeReduction, bool gmx_unused accumulateForce) CUDA_FUNC_TERM; -/*! \brief Getter for the device coordinates buffer. - * - * \todo This will be removed as the management of the buffers is taken out of the NBNXM module. - * - * \param[in] gpu_nbv The nonbonded data GPU structure. - * - * \returns Device coordinates buffer in plain rvec format. - */ -CUDA_FUNC_QUALIFIER -DeviceBuffer nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); - -/*! \brief Copy force buffer from CPU to GPU */ -CUDA_FUNC_QUALIFIER -void nbnxn_launch_copy_f_to_gpu(AtomLocality gmx_unused atomLocality, - const Nbnxm::GridSet gmx_unused &gridSet, - gmx_nbnxn_gpu_t gmx_unused *nb, - rvec gmx_unused *f) CUDA_FUNC_TERM; - -/*! \brief Copy force buffer from GPU to CPU */ -CUDA_FUNC_QUALIFIER -void nbnxn_launch_copy_f_from_gpu(AtomLocality gmx_unused atomLocality, - const Nbnxm::GridSet gmx_unused &gridSet, - gmx_nbnxn_gpu_t gmx_unused *nb, - rvec gmx_unused *f) CUDA_FUNC_TERM; - -/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU - * \param[in] atomLocality Locality for data trasnfer - * \param[in] gridSet The Grid Set data object - * \param[in] nb The nonbonded data GPU structure - * \param[out] x Coordinate buffer on CPU - */ -CUDA_FUNC_QUALIFIER -void nbnxn_launch_copy_x_from_gpu(AtomLocality gmx_unused atomLocality, - const Nbnxm::GridSet gmx_unused &gridSet, - gmx_nbnxn_gpu_t gmx_unused *nb, - rvec gmx_unused *x) CUDA_FUNC_TERM; - /*! \brief Wait for GPU stream to complete */ CUDA_FUNC_QUALIFIER void nbnxn_wait_for_gpu_force_reduction(AtomLocality gmx_unused atomLocality, @@ -400,24 +336,12 @@ void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM; CUDA_FUNC_QUALIFIER void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); -/*! \brief return GPU pointer to x in rvec format - * \param[in] nb The nonbonded data GPU structure - */ -CUDA_FUNC_QUALIFIER -void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); - /*! \brief Wait for non-local copy of coordinate buffer from device to host * \param[in] nb The nonbonded data GPU structure */ CUDA_FUNC_QUALIFIER void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM; -/*! \brief return GPU pointer to f in rvec format - * \param[in] nb The nonbonded data GPU structure - */ -CUDA_FUNC_QUALIFIER -void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); - /*! \brief Ensure local stream waits for non-local stream * \param[in] nb The nonbonded data GPU structure */ -- 2.22.0