From c69e061d078f8a829c86e96c2d4144ec32c03eec Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 10 Sep 2019 22:52:37 +0200 Subject: [PATCH] Decouple coordinates buffer management from buffer ops in NBNXM To make transition to the new device-side state propagator data manager easier, the copy and plain buffers management have to be decoupled from the actual operations on data in the buffers. This commit splits the corresponding functions into two, and decouples CPU- and GPU-based coordinate buffer ops one from another. Since the coordinates are now clearly communicated between PME and NBNXM, the plain pointer on the CPU-side code is replaced with the DeviceBuffer. The getters for the device- side buffer now never return nullptr, but exit with the assertion if the corresponding data is not intialized. Change-Id: Ic793f815870a8b4f414a9b7ca0a5001e58f49c7a --- src/gromacs/ewald/pme.h | 2 +- src/gromacs/ewald/pme_gpu.cpp | 7 +- src/gromacs/ewald/pme_gpu_internal.cpp | 13 +- src/gromacs/ewald/pme_gpu_internal.h | 2 +- src/gromacs/mdlib/sim_util.cpp | 36 ++++- src/gromacs/nbnxm/atomdata.cpp | 190 +++++++++++++++---------- src/gromacs/nbnxm/atomdata.h | 77 ++++++---- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 87 ++++++----- src/gromacs/nbnxm/nbnxm.cpp | 52 +++++-- src/gromacs/nbnxm/nbnxm.h | 55 ++++++- src/gromacs/nbnxm/nbnxm_gpu.h | 56 ++++++-- 11 files changed, 391 insertions(+), 186 deletions(-) diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 1ca7dfbea9..3e173d217f 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -476,7 +476,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG * \param[in] pme The PME data structure. * \returns Pointer to coordinate data */ -GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); +GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); /*! \brief Get pointer to device copy of force data. * \param[in] pme The PME data structure. diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 1f43a9fe5f..2bf148a593 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -420,12 +420,9 @@ void pme_gpu_reinit_computation(const gmx_pme_t *pme, wallcycle_stop(wcycle, ewcLAUNCH_GPU); } -void *pme_gpu_get_device_x(const gmx_pme_t *pme) +DeviceBuffer pme_gpu_get_device_x(const gmx_pme_t *pme) { - if (!pme || !pme_gpu_active(pme)) - { - return nullptr; - } + GMX_ASSERT((pme && pme_gpu_active(pme)), "PME GPU coordinates buffer was requested from uninitialized PME module"); return pme_gpu_get_kernelparam_coordinates(pme->gpu); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index aef078d8d4..8960f584ef 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -1289,16 +1289,11 @@ void pme_gpu_gather(PmeGpu *pmeGpu, } } -void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu) +DeviceBuffer pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu) { - if (pmeGpu && pmeGpu->kernelParams) - { - return pmeGpu->kernelParams->atoms.d_coordinates; - } - else - { - return nullptr; - } + GMX_ASSERT(pmeGpu && pmeGpu->kernelParams, "PME GPU device buffer was requested in non-GPU build or before the GPU PME was initialized."); + + return pmeGpu->kernelParams->atoms.d_coordinates; } void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu) diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 8a8c54609a..3abbed5c4d 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -448,7 +448,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *GPU_FUNC_ARGUMENT( * \param[in] pmeGpu The PME GPU structure. * \returns Pointer to coordinate data */ -GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); +GPU_FUNC_QUALIFIER DeviceBuffer pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); /*! \brief Return pointer to device copy of force data. * \param[in] pmeGpu The PME GPU structure. diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 724160a472..853f97b021 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1120,8 +1120,22 @@ void do_force(FILE *fplog, } else { - nbv->setCoordinates(Nbnxm::AtomLocality::Local, false, - x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata)); + if (useGpuXBufOps == BufferOpsUseGpu::True) + { + // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr) + if (!useGpuPme) + { + nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::Local, false, + x.unpaddedArrayRef()); + } + nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false, + useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates()); + } + else + { + nbv->convertCoordinates(Nbnxm::AtomLocality::Local, false, + x.unpaddedArrayRef()); + } } if (bUseGPU) @@ -1187,8 +1201,22 @@ void do_force(FILE *fplog, { dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); - nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false, - x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata)); + if (useGpuXBufOps == BufferOpsUseGpu::True) + { + // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr) + if (!useGpuPme) + { + nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false, + x.unpaddedArrayRef()); + } + nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false, + useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates()); + } + else + { + nbv->convertCoordinates(Nbnxm::AtomLocality::NonLocal, false, + x.unpaddedArrayRef()); + } } diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index e78a65945e..6424b1558e 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -998,116 +998,148 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox, } } -/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */ -template -void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet, - const Nbnxm::AtomLocality locality, - gmx_bool FillLocal, - const rvec *x, - nbnxn_atomdata_t *nbat, - gmx_nbnxn_gpu_t *gpu_nbv, - void *xPmeDevicePtr) +// This is slightly different from nbnxn_get_atom_range(...) at the end of the file +// TODO: Combine if possible +static void getAtomRanges(const Nbnxm::GridSet &gridSet, + const Nbnxm::AtomLocality locality, + int *gridBegin, + int *gridEnd) { - int gridBegin = 0; - int gridEnd = 0; - switch (locality) { case Nbnxm::AtomLocality::All: - gridBegin = 0; - gridEnd = gridSet.grids().size(); + *gridBegin = 0; + *gridEnd = gridSet.grids().size(); break; case Nbnxm::AtomLocality::Local: - gridBegin = 0; - gridEnd = 1; + *gridBegin = 0; + *gridEnd = 1; break; case Nbnxm::AtomLocality::NonLocal: - gridBegin = 1; - gridEnd = gridSet.grids().size(); + *gridBegin = 1; + *gridEnd = gridSet.grids().size(); break; case Nbnxm::AtomLocality::Count: GMX_ASSERT(false, "Count is invalid locality specifier"); break; } +} - if (FillLocal) +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t */ +void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet, + const Nbnxm::AtomLocality locality, + bool fillLocal, + const rvec *coordinates, + nbnxn_atomdata_t *nbat) +{ + + int gridBegin = 0; + int gridEnd = 0; + getAtomRanges(gridSet, locality, &gridBegin, &gridEnd); + + if (fillLocal) { nbat->natoms_local = gridSet.grids()[0].atomIndexEnd(); } - if (useGpu) - { - for (int g = gridBegin; g < gridEnd; g++) - { - nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g], - FillLocal && g == 0, - gpu_nbv, - xPmeDevicePtr, - locality, - x, g, gridSet.numColumnsMax()); - } - } - else - { - const int nth = gmx_omp_nthreads_get(emntPairsearch); + const int nth = gmx_omp_nthreads_get(emntPairsearch); #pragma omp parallel for num_threads(nth) schedule(static) - for (int th = 0; th < nth; th++) + for (int th = 0; th < nth; th++) + { + try { - try + for (int g = gridBegin; g < gridEnd; g++) { - for (int g = gridBegin; g < gridEnd; g++) - { - const Nbnxm::Grid &grid = gridSet.grids()[g]; - const int numCellsXY = grid.numColumns(); + const Nbnxm::Grid &grid = gridSet.grids()[g]; + const int numCellsXY = grid.numColumns(); - const int cxy0 = (numCellsXY* th + nth - 1)/nth; - const int cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth; + const int cxy0 = (numCellsXY* th + nth - 1)/nth; + const int cxy1 = (numCellsXY*(th + 1) + nth - 1)/nth; - for (int cxy = cxy0; cxy < cxy1; cxy++) - { - const int na = grid.numAtomsInColumn(cxy); - const int ash = grid.firstAtomInColumn(cxy); + for (int cxy = cxy0; cxy < cxy1; cxy++) + { + const int na = grid.numAtomsInColumn(cxy); + const int ash = grid.firstAtomInColumn(cxy); - int na_fill; - if (g == 0 && FillLocal) - { - na_fill = grid.paddedNumAtomsInColumn(cxy); - } - else - { - /* We fill only the real particle locations. - * We assume the filling entries at the end have been - * properly set before during pair-list generation. - */ - na_fill = na; - } - copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash, - na, na_fill, x, - nbat->XFormat, nbat->x().data(), ash); + int na_fill; + if (g == 0 && fillLocal) + { + na_fill = grid.paddedNumAtomsInColumn(cxy); + } + else + { + /* We fill only the real particle locations. + * We assume the filling entries at the end have been + * properly set before during pair-list generation. + */ + na_fill = na; } + copy_rvec_to_nbat_real(gridSet.atomIndices().data() + ash, + na, na_fill, coordinates, + nbat->XFormat, nbat->x().data(), ash); } } - GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } } -template -void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &, - const Nbnxm::AtomLocality, - gmx_bool, - const rvec*, - nbnxn_atomdata_t *, - gmx_nbnxn_gpu_t*, - void *); -template -void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &, - const Nbnxm::AtomLocality, - gmx_bool, - const rvec*, - nbnxn_atomdata_t *, - gmx_nbnxn_gpu_t*, - void *); +void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet &gridSet, + const Nbnxm::AtomLocality locality, + bool fillLocal, + nbnxn_atomdata_t *nbat, + gmx_nbnxn_gpu_t *gpu_nbv, + const rvec *coordinatesHost) +{ + int gridBegin = 0; + int gridEnd = 0; + getAtomRanges(gridSet, locality, &gridBegin, &gridEnd); + + if (fillLocal) + { + nbat->natoms_local = gridSet.grids()[0].atomIndexEnd(); + } + + for (int g = gridBegin; g < gridEnd; g++) + { + nbnxn_gpu_copy_x_to_gpu(gridSet.grids()[g], + fillLocal && g == 0, + gpu_nbv, + locality, + coordinatesHost, + g, + gridSet.numColumnsMax()); + } +} + +DeviceBuffer nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv) +{ + return Nbnxm::nbnxn_gpu_get_x_gpu(gpu_nbv); +} + +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/ +void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet, + const Nbnxm::AtomLocality locality, + bool fillLocal, + gmx_nbnxn_gpu_t *gpu_nbv, + DeviceBuffer coordinatesDevice) +{ + + int gridBegin = 0; + int gridEnd = 0; + getAtomRanges(gridSet, locality, &gridBegin, &gridEnd); + + for (int g = gridBegin; g < gridEnd; g++) + { + nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g], + fillLocal && g == 0, + gpu_nbv, + coordinatesDevice, + locality, + g, + gridSet.numColumnsMax()); + } +} static void nbnxn_atomdata_clear_reals(gmx::ArrayRef dest, diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index 6bbe082c0b..488290bb5b 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -38,6 +38,7 @@ #include +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/basedefinitions.h" @@ -309,34 +310,62 @@ void nbnxn_atomdata_copy_shiftvec(gmx_bool dynamic_box, rvec *shift_vec, nbnxn_atomdata_t *nbat); -/* Copy x to nbat->x. - * FillLocal tells if the local filler particle coordinates should be zeroed. +/*! \brief Transform coordinates to xbat layout + * + * Creates a copy of the coordinates buffer using short-range ordering. + * + * \param[in] gridSet The grids data. + * \param[in] locality If the transformation should be applied to local or non local coordinates. + * \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed. + * \param[in] coordinates Coordinates in plain rvec format. + * \param[in,out] nbat Data in NBNXM format, used for mapping formats and to locate the output buffer. */ -template void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet, Nbnxm::AtomLocality locality, - gmx_bool FillLocal, - const rvec *x, - nbnxn_atomdata_t *nbat, - gmx_nbnxn_gpu_t *gpu_nbv, - void *xPmeDevicePtr); + bool fillLocal, + const rvec *coordinates, + nbnxn_atomdata_t *nbat); -extern template -void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &, - const Nbnxm::AtomLocality, - gmx_bool, - const rvec*, - nbnxn_atomdata_t *, - gmx_nbnxn_gpu_t*, - void *); -extern template -void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &, - const Nbnxm::AtomLocality, - gmx_bool, - const rvec*, - nbnxn_atomdata_t *, - gmx_nbnxn_gpu_t*, - void *); +/*! \brief Copies the coordinates to the GPU (in plain rvec format) + * + * This function copied data to the gpu so that the transformation to the NBNXM format can be done on the GPU. + * + * \param[in] gridSet The grids data. + * \param[in] locality If local or non local coordinates should be copied. + * \param[in] fillLocal If the local filler particle coordinates should be zeroed. + * \param[in] nbat Data in NBNXM format, used to zero coordinates of filler particles. + * \param[in] gpu_nbv The NBNXM GPU data structure. + * \param[in] coordinatesHost Coordinates to be copied (in plain rvec format). + */ +void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet &gridSet, + Nbnxm::AtomLocality locality, + bool fillLocal, + nbnxn_atomdata_t *nbat, + gmx_nbnxn_gpu_t *gpu_nbv, + const rvec *coordinatesHost); + +/*!\brief Getter for the GPU coordinates buffer + * + * \param[in] gpu_nbv The NBNXM GPU data structure. + */ +DeviceBuffer nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv); + +/*! \brief Transform coordinates to xbat layout on GPU + * + * Creates a GPU copy of the coordinates buffer using short-range ordering. + * As input, uses coordinates in plain rvec format in GPU memory. + * + * \param[in] gridSet The grids data. + * \param[in] locality If the transformation should be applied to local or non local coordinates. + * \param[in] fillLocal Tells if the local filler particle coordinates should be zeroed. + * \param[in,out] gpu_nbv The NBNXM GPU data structure. + * \param[in] coordinatesDevice Coordinates to be copied (in plain rvec format). + */ +void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet, + Nbnxm::AtomLocality locality, + bool fillLocal, + gmx_nbnxn_gpu_t *gpu_nbv, + DeviceBuffer coordinatesDevice); //! Add the computed forces to \p f, an internal reduction might be performed as well template diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 0b30c010b0..63051b7c43 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -744,20 +744,18 @@ void cuda_set_cacheconfig() } } -/* X buffer operations on GPU: performs conversion from rvec to nb format. */ -void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, - bool setFillerCoords, - gmx_nbnxn_gpu_t *nb, - void *xPmeDevicePtr, - const Nbnxm::AtomLocality locality, - const rvec *x, - int gridId, - int numColumnsMax) +/* X buffer operations on GPU: copies coordinates to the GPU in rvec format. */ +void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid &grid, + bool setFillerCoords, + gmx_nbnxn_gpu_t *nb, + const Nbnxm::AtomLocality locality, + const rvec *coordinatesHost, + int gridId, + int numColumnsMax) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - GMX_ASSERT(x, "Need a valid x pointer"); + GMX_ASSERT(coordinatesHost, "Need a valid host pointer"); - cu_atomdata_t *adat = nb->atdat; bool bDoTime = nb->bDoTime; const int numColumns = grid.numColumns(); @@ -780,38 +778,51 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, return; } - const rvec *d_x; + if (bDoTime) + { + nb->timers->xf[locality].nb_h2d.openTimingRegion(stream); + } - // copy of coordinates will be required if null pointer has been - // passed to function - // TODO improve this mechanism - bool copyCoord = (xPmeDevicePtr == nullptr); + rvec *devicePtrDest = reinterpret_cast (nb->xrvec[copyAtomStart]); + const rvec *devicePtrSrc = reinterpret_cast (coordinatesHost[copyAtomStart]); + copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, nCopyAtoms, + stream, GpuApiCallBehavior::Async, nullptr); - // copy X-coordinate data to device - if (copyCoord) + if (bDoTime) { - if (bDoTime) - { - nb->timers->xf[locality].nb_h2d.openTimingRegion(stream); - } + nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream); + } +} - rvec *devicePtrDest = reinterpret_cast (nb->xrvec[copyAtomStart]); - const rvec *devicePtrSrc = reinterpret_cast (x[copyAtomStart]); - copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, nCopyAtoms, - stream, GpuApiCallBehavior::Async, nullptr); +DeviceBuffer nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t *nb) +{ + return reinterpret_cast< DeviceBuffer >(nb->xrvec); +} - if (bDoTime) - { - nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream); - } +/* X buffer operations on GPU: performs conversion from rvec to nb format. */ +void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, + bool setFillerCoords, + gmx_nbnxn_gpu_t *nb, + DeviceBuffer coordinatesDevice, + const Nbnxm::AtomLocality locality, + int gridId, + int numColumnsMax) +{ + GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - d_x = nb->xrvec; - } - else //coordinates have already been copied by PME stream - { - d_x = (rvec*) xPmeDevicePtr; - } - GMX_ASSERT(d_x, "Need a valid d_x pointer"); + cu_atomdata_t *adat = nb->atdat; + + const int numColumns = grid.numColumns(); + const int cellOffset = grid.cellOffset(); + const int numAtomsPerCell = grid.numAtomsPerCell(); + Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality); + int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin(); + int copyAtomStart = grid.srcAtomBegin(); + + cudaStream_t stream = nb->stream[interactionLoc]; + + // TODO: This will only work with CUDA + GMX_ASSERT(coordinatesDevice, "Need a valid device pointer"); /* launch kernel on GPU */ @@ -835,7 +846,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, &numColumns, &xqPtr, &setFillerCoords, - &d_x, + &coordinatesDevice, &d_atomIndices, &d_cxy_na, &d_cxy_ind, diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 3e4990a6ac..794b184f36 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -132,22 +132,52 @@ void nonbonded_verlet_t::setAtomProperties(const t_mdatoms &mdatoms, nbnxn_atomdata_set(nbat.get(), pairSearch_->gridSet(), &mdatoms, atomInfo.data()); } -void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality locality, - const bool fillLocal, - gmx::ArrayRef x, - BufferOpsUseGpu useGpu, - void *xPmeDevicePtr) +void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality locality, + const bool fillLocal, + gmx::ArrayRef coordinates) { wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); - auto fnPtr = (useGpu == BufferOpsUseGpu::True) ? - nbnxn_atomdata_copy_x_to_nbat_x : - nbnxn_atomdata_copy_x_to_nbat_x; + nbnxn_atomdata_copy_x_to_nbat_x(pairSearch_->gridSet(), locality, fillLocal, + as_rvec_array(coordinates.data()), + nbat.get()); - fnPtr(pairSearch_->gridSet(), locality, fillLocal, - as_rvec_array(x.data()), - nbat.get(), gpu_nbv, xPmeDevicePtr); + wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); +} + + +void nonbonded_verlet_t::copyCoordinatesToGpu(const Nbnxm::AtomLocality locality, + const bool fillLocal, + gmx::ArrayRef coordinatesHost) +{ + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); + + nbnxn_atomdata_copy_x_to_gpu(pairSearch_->gridSet(), locality, fillLocal, + nbat.get(), gpu_nbv, + as_rvec_array(coordinatesHost.data())); + + wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); +} + +DeviceBuffer nonbonded_verlet_t::getDeviceCoordinates() +{ + return nbnxn_atomdata_get_x_gpu(gpu_nbv); +} + +void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality locality, + const bool fillLocal, + DeviceBuffer coordinatesDevice) +{ + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); + + nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal, + gpu_nbv, + coordinatesDevice); wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 1a2ce8cc0c..50ead8c655 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -100,6 +100,7 @@ #include +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/enumerationhelpers.h" @@ -251,12 +252,54 @@ struct nonbonded_verlet_t void setAtomProperties(const t_mdatoms &mdatoms, gmx::ArrayRef atomInfo); - //! Updates the coordinates in Nbnxm for the given locality - void setCoordinates(Nbnxm::AtomLocality locality, - bool fillLocal, - gmx::ArrayRef x, - BufferOpsUseGpu useGpu, - void *xPmeDevicePtr); + /*!\brief Convert the coordinates to NBNXM format for the given locality. + * + * The API function for the transformation of the coordinates from one layout to another. + * + * \param[in] locality Whether coordinates for local or non-local atoms should be transformed. + * \param[in] fillLocal If the coordinates for filler particles should be zeroed. + * \param[in] coordinates Coordinates in plain rvec format to be transformed. + */ + void convertCoordinates(Nbnxm::AtomLocality locality, + bool fillLocal, + gmx::ArrayRef coordinates); + + /*!\brief Copy coordinates to the GPU memory. + * + * This function uses the internal NBNXM GPU pointer to copy coordinates in the plain rvec format + * into the GPU memory. + * + * \todo This function will be removed in future patches as the management of the device buffers + * is moved to a separate object. + * + * \param[in] locality Whether coordinates for local or non-local atoms should be transformed. + * \param[in] fillLocal If the coordinates for filler particles should be zeroed. + * \param[in] coordinatesHost Coordinates in plain rvec format to be transformed. + */ + void copyCoordinatesToGpu(Nbnxm::AtomLocality locality, + bool fillLocal, + gmx::ArrayRef coordinatesHost); + + /*!\brief Getter for the GPU coordinates buffer. + * + * \todo This function will be removed in future patches as the management of the device buffers + * is moved to a separate object. + * + * \returns The coordinates buffer in plain rvec format. + */ + DeviceBuffer getDeviceCoordinates(); + + /*!\brief Convert the coordinates to NBNXM format on the GPU for the given locality + * + * The API function for the transformation of the coordinates from one layout to another in the GPU memory. + * + * \param[in] locality Whether coordinates for local or non-local atoms should be transformed. + * \param[in] fillLocal If the coordinates for filler particles should be zeroed. + * \param[in] coordinatesDevice GPU coordinates buffer in plain rvec format to be transformed. + */ + void convertCoordinatesGpu(Nbnxm::AtomLocality locality, + bool fillLocal, + DeviceBuffer coordinatesDevice); //! Init for GPU version of setup coordinates in Nbnxm void atomdata_init_copy_x_to_nbat_x_gpu(); diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 08f0aa9f96..4dfcfa15d4 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -228,17 +228,57 @@ CUDA_FUNC_QUALIFIER void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet, gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM; +/*! \brief Copy coordinates from host to device memory. + * + * \todo This will be removed as the management of the buffers is taken out of the NBNXM module. + * + * \param[in] grid Grid to be copied. + * \param[in] setFillerCoords If the filler coordinates are used. + * \param[in,out] gpu_nbv The nonbonded data GPU structure. + * \param[in] locality Copy coordinates for local or non-local atoms. + * \param[in] coordinatesHost Host-side coordinates in plain rvec format. + * \param[in] gridId Index of the grid being copied. + * \param[in] numColumnsMax Maximum number of columns in the grid. + */ +CUDA_FUNC_QUALIFIER +void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid gmx_unused &grid, + bool gmx_unused setFillerCoords, + gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, + Nbnxm::AtomLocality gmx_unused locality, + const rvec gmx_unused *coordinatesHost, + int gmx_unused gridId, + int gmx_unused numColumnsMax) CUDA_FUNC_TERM; + +/*! \brief Getter for the device coordinates buffer. + * + * \todo This will be removed as the management of the buffers is taken out of the NBNXM module. + * + * \param[in] gpu_nbv The nonbonded data GPU structure. + * + * \returns Device coordinates buffer in plain rvec format. + */ +CUDA_FUNC_QUALIFIER +DeviceBuffer nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer {}); + + /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format. + * + * \param[in] grid Grid to be converted. + * \param[in] setFillerCoords If the filler coordinates are used. + * \param[in,out] gpu_nbv The nonbonded data GPU structure. + * \param[in] coordinatesDevice Device-side coordinates in plain rvec format. + * \param[in] locality Copy coordinates for local or non-local atoms. + * \param[in] gridId Index of the grid being converted. + * \param[in] numColumnsMax Maximum number of columns in the grid. */ CUDA_FUNC_QUALIFIER -void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid, - bool gmx_unused setFillerCoords, - gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, - void gmx_unused *xPmeDevicePtr, - Nbnxm::AtomLocality gmx_unused locality, - const rvec gmx_unused *x, - int gmx_unused gridId, - int gmx_unused numColumnsMax) CUDA_FUNC_TERM; +void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid, + bool gmx_unused setFillerCoords, + gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, + DeviceBuffer gmx_unused coordinatesDevice, + Nbnxm::AtomLocality gmx_unused locality, + int gmx_unused gridId, + int gmx_unused numColumnsMax) CUDA_FUNC_TERM; /*! \brief Sync the nonlocal stream with dependent tasks in the local queue. * \param[in] nb The nonbonded data GPU structure -- 2.22.0