From 0e940b48fa4c7c8bfcdce3dbf077c791cf954c5a Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Wed, 24 Mar 2021 11:54:55 +0000 Subject: [PATCH] Unify nbnxn_gpu_init_x_to_nbat_x Previously, we had this function only for CUDA, but we will need it for SYCL (#3932). OpenCL implementation is unlikely to be needed, but should not hurt either. Refs. #2608 --- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 107 ----------------- src/gromacs/nbnxm/nbnxm_gpu.h | 4 +- src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 109 ++++++++++++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 20 ++++ src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h | 20 ++++ 5 files changed, 151 insertions(+), 109 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 274f40448f..be674962d6 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -228,111 +228,4 @@ DeviceBuffer gpu_get_fshift(NbnxmGpu* nb) return reinterpret_cast>(nb->atdat->fShift); } -/* Initialization for X buffer operations on GPU. 
*/ -/* TODO Remove explicit pinning from host arrays from here and manage in a more natural way*/ -void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv) -{ - const DeviceStream& localStream = *gpu_nbv->deviceStreams[InteractionLocality::Local]; - bool bDoTime = gpu_nbv->bDoTime; - const int maxNumColumns = gridSet.numColumnsMax(); - - reallocateDeviceBuffer(&gpu_nbv->cxy_na, - maxNumColumns * gridSet.grids().size(), - &gpu_nbv->ncxy_na, - &gpu_nbv->ncxy_na_alloc, - *gpu_nbv->deviceContext_); - reallocateDeviceBuffer(&gpu_nbv->cxy_ind, - maxNumColumns * gridSet.grids().size(), - &gpu_nbv->ncxy_ind, - &gpu_nbv->ncxy_ind_alloc, - *gpu_nbv->deviceContext_); - - for (unsigned int g = 0; g < gridSet.grids().size(); g++) - { - - const Nbnxm::Grid& grid = gridSet.grids()[g]; - - const int numColumns = grid.numColumns(); - const int* atomIndices = gridSet.atomIndices().data(); - const int atomIndicesSize = gridSet.atomIndices().size(); - const int* cxy_na = grid.cxy_na().data(); - const int* cxy_ind = grid.cxy_ind().data(); - - reallocateDeviceBuffer(&gpu_nbv->atomIndices, - atomIndicesSize, - &gpu_nbv->atomIndicesSize, - &gpu_nbv->atomIndicesSize_alloc, - *gpu_nbv->deviceContext_); - - if (atomIndicesSize > 0) - { - - if (bDoTime) - { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream); - } - - copyToDeviceBuffer(&gpu_nbv->atomIndices, - atomIndices, - 0, - atomIndicesSize, - localStream, - GpuApiCallBehavior::Async, - nullptr); - - if (bDoTime) - { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream); - } - } - - if (numColumns > 0) - { - if (bDoTime) - { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream); - } - - int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g]; - copyToDeviceBuffer( - &destPtr, cxy_na, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr); - - if (bDoTime) - { - 
gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream); - } - - if (bDoTime) - { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream); - } - - destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g]; - copyToDeviceBuffer( - &destPtr, cxy_ind, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr); - - if (bDoTime) - { - gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream); - } - } - } - - if (gpu_nbv->bUseTwoStreams) - { - // The above data is transferred on the local stream but is a - // dependency of the nonlocal stream (specifically the nonlocal X - // buf ops kernel). We therefore set a dependency to ensure - // that the nonlocal stream waits on the local stream here. - // This call records an event in the local stream: - gpu_nbv->misc_ops_and_local_H2D_done.markEvent( - *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]); - // ...and this call instructs the nonlocal stream to wait on that event: - gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent( - *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]); - } - - return; -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 621878d9eb..60a28c3fa4 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -233,9 +233,9 @@ float gpu_wait_finish_task(NbnxmGpu gmx_unused* nb, /*! \brief Initialization for X buffer operations on GPU. * Called on the NS step and performs (re-)allocations and memory copies. !*/ -CUDA_FUNC_QUALIFIER +GPU_FUNC_QUALIFIER void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet, - NbnxmGpu gmx_unused* gpu_nbv) CUDA_FUNC_TERM; + NbnxmGpu gmx_unused* gpu_nbv) GPU_FUNC_TERM; /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format. 
* diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp index 53c53e2528..ca39376595 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp +++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp @@ -69,6 +69,7 @@ #include "gromacs/mdtypes/simulation_workload.h" #include "gromacs/nbnxm/gpu_common_utils.h" #include "gromacs/nbnxm/gpu_data_mgmt.h" +#include "gromacs/nbnxm/gridset.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" #include "gromacs/pbcutil/ishift.h" @@ -973,4 +974,112 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom nbnxnInsertNonlocalGpuDependency(nb, iloc); } + +/* Initialization for X buffer operations on GPU. */ +void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv) +{ + const DeviceStream& localStream = *gpu_nbv->deviceStreams[InteractionLocality::Local]; + const bool bDoTime = gpu_nbv->bDoTime; + const int maxNumColumns = gridSet.numColumnsMax(); + + reallocateDeviceBuffer(&gpu_nbv->cxy_na, + maxNumColumns * gridSet.grids().size(), + &gpu_nbv->ncxy_na, + &gpu_nbv->ncxy_na_alloc, + *gpu_nbv->deviceContext_); + reallocateDeviceBuffer(&gpu_nbv->cxy_ind, + maxNumColumns * gridSet.grids().size(), + &gpu_nbv->ncxy_ind, + &gpu_nbv->ncxy_ind_alloc, + *gpu_nbv->deviceContext_); + + for (unsigned int g = 0; g < gridSet.grids().size(); g++) + { + const Nbnxm::Grid& grid = gridSet.grids()[g]; + + const int numColumns = grid.numColumns(); + const int* atomIndices = gridSet.atomIndices().data(); + const int atomIndicesSize = gridSet.atomIndices().size(); + const int* cxy_na = grid.cxy_na().data(); + const int* cxy_ind = grid.cxy_ind().data(); + + auto* timerH2D = bDoTime ? 
&gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr; + + reallocateDeviceBuffer(&gpu_nbv->atomIndices, + atomIndicesSize, + &gpu_nbv->atomIndicesSize, + &gpu_nbv->atomIndicesSize_alloc, + *gpu_nbv->deviceContext_); + + if (atomIndicesSize > 0) + { + if (bDoTime) + { + timerH2D->openTimingRegion(localStream); + } + + copyToDeviceBuffer(&gpu_nbv->atomIndices, + atomIndices, + 0, + atomIndicesSize, + localStream, + GpuApiCallBehavior::Async, + bDoTime ? timerH2D->fetchNextEvent() : nullptr); + + if (bDoTime) + { + timerH2D->closeTimingRegion(localStream); + } + } + + if (numColumns > 0) + { + if (bDoTime) + { + timerH2D->openTimingRegion(localStream); + } + + copyToDeviceBuffer(&gpu_nbv->cxy_na, + cxy_na, + maxNumColumns * g, + numColumns, + localStream, + GpuApiCallBehavior::Async, + bDoTime ? timerH2D->fetchNextEvent() : nullptr); + + if (bDoTime) + { + timerH2D->closeTimingRegion(localStream); + } + + if (bDoTime) + { + timerH2D->openTimingRegion(localStream); + } + + copyToDeviceBuffer(&gpu_nbv->cxy_ind, + cxy_ind, + maxNumColumns * g, + numColumns, + localStream, + GpuApiCallBehavior::Async, + bDoTime ? timerH2D->fetchNextEvent() : nullptr); + + if (bDoTime) + { + timerH2D->closeTimingRegion(localStream); + } + } + } + + // The above data is transferred on the local stream but is a + // dependency of the nonlocal stream (specifically the nonlocal X + // buf ops kernel). We therefore set a dependency to ensure + // that the nonlocal stream waits on the local stream here. 
+ // This call records an event in the local stream: + nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local); + // ...and this call instructs the nonlocal stream to wait on that event: + nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal); +} + } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 925f94b117..95558805bd 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -175,6 +175,26 @@ struct NbnxmGpu //! staging area where fshift/energies get downloaded NBStagingData nbst; + // Data for GPU-side coordinate conversion between integrator and NBNXM + /*! \brief array of atom indices */ + DeviceBuffer atomIndices; + /*! \brief size of atom indices */ + int atomIndicesSize = 0; + /*! \brief size of atom indices allocated in device buffer */ + int atomIndicesSize_alloc = 0; + /*! \brief x buf ops num of atoms */ + DeviceBuffer cxy_na; + /*! \brief number of elements in cxy_na */ + int ncxy_na = 0; + /*! \brief number of elements allocated in device buffer */ + int ncxy_na_alloc = 0; + /*! \brief x buf ops cell index mapping */ + DeviceBuffer cxy_ind; + /*! \brief number of elements in cxy_ind */ + int ncxy_ind = 0; + /*! \brief number of elements allocated in device buffer */ + int ncxy_ind_alloc = 0; + //! local and non-local GPU queues gmx::EnumerationArray deviceStreams; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h index fd1d655e3e..6a82823b37 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h @@ -74,6 +74,26 @@ struct NbnxmGpu /*! \brief atom data */ NBAtomData* atdat = nullptr; + // Data for GPU-side coordinate conversion between integrator and NBNXM + /*! \brief array of atom indices */ + DeviceBuffer atomIndices; + /*!
\brief size of atom indices */ + int atomIndicesSize = 0; + /*! \brief size of atom indices allocated in device buffer */ + int atomIndicesSize_alloc = 0; + /*! \brief x buf ops num of atoms */ + DeviceBuffer cxy_na; + /*! \brief number of elements in cxy_na */ + int ncxy_na = 0; + /*! \brief number of elements allocated in device buffer */ + int ncxy_na_alloc = 0; + /*! \brief x buf ops cell index mapping */ + DeviceBuffer cxy_ind; + /*! \brief number of elements in cxy_ind */ + int ncxy_ind = 0; + /*! \brief number of elements allocated in device buffer */ + int ncxy_ind_alloc = 0; + NBParamGpu* nbparam = nullptr; /*! \brief pair-list data structures (local and non-local) */ gmx::EnumerationArray plist = { { nullptr } }; -- 2.22.0