From 5f8899ba4bca7b8d158242e1b426fc7f9fcbe787 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov
Date: Mon, 4 May 2020 17:44:01 +0000
Subject: [PATCH] Unify CUDA and OpenCL lookup-table creation

In CUDA code, textures are used for the lookup tables, whereas in
OpenCL they are created as read-only buffers. This commit hides these
differences behind a unified wrapper.

Refs #3318
Refs #3311

Change-Id: I003e0c982c2452a2753e331b46fc59f0b7e1b711
---
 src/gromacs/gpu_utils/CMakeLists.txt          |  1 -
 src/gromacs/gpu_utils/cudautils.cuh           | 24 -------------
 src/gromacs/gpu_utils/devicebuffer.cuh        | 10 +++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          | 33 ++++++++++++------
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        | 34 ++++++++++++-------
 5 files changed, 49 insertions(+), 53 deletions(-)

diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt
index e8c02ccf31..2ad27010cf 100644
--- a/src/gromacs/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/CMakeLists.txt
@@ -54,7 +54,6 @@ if(GMX_USE_OPENCL)
         )
 elseif(GMX_USE_CUDA)
     gmx_add_libgromacs_sources(
-        cudautils.cu
         device_stream.cu
         gpu_utils.cu
         pinning.cu
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 02eec85bbd..2322dd8793 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -142,30 +142,6 @@ enum class GpuApiCallBehavior;
 
 #endif /* CHECK_CUDA_ERRORS */
 
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_D2H(void* h_dest, void* d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy in stream 0. */
-int cu_copy_D2H_sync(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_D2H_async(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_H2D(void* d_dest, const void* h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy. */
-int cu_copy_H2D_sync(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_H2D_async(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
 // TODO: the 2 functions below are pretty much a constructor/destructor of a simple
 // GPU table object. There is also almost self-contained fetchFromParamLookupTable()
 // in cuda_kernel_utils.cuh. They could all live in a separate class/struct file.
diff --git a/src/gromacs/gpu_utils/devicebuffer.cuh b/src/gromacs/gpu_utils/devicebuffer.cuh
index c9bce141fe..3aabf4285b 100644
--- a/src/gromacs/gpu_utils/devicebuffer.cuh
+++ b/src/gromacs/gpu_utils/devicebuffer.cuh
@@ -91,8 +91,6 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
 
 /*! \brief
  * Performs the host-to-device data copy, synchronous or asynchronously on request.
  *
- * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
- *
  * \tparam ValueType Raw value type of the \p buffer.
  * \param[in,out] buffer Pointer to the device-side buffer
@@ -114,7 +112,7 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
 {
     if (numValues == 0)
     {
-        return; // such calls are actually made with empty domains
+        return;
     }
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
@@ -144,8 +142,6 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
 /*! \brief
  * Performs the device-to-host data copy, synchronous or asynchronously on request.
  *
- * TODO: This is meant to gradually replace cu/ocl_copy_d2h.
- *
  * \tparam ValueType Raw value type of the \p buffer.
  * \param[in,out] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
  * \param[in] buffer Pointer to the device-side buffer
@@ -165,6 +161,10 @@ void copyFromDeviceBuffer(ValueType* hostBuffer,
                          GpuApiCallBehavior transferKind,
                          CommandEvent* /*timingEvent*/)
 {
+    if (numValues == 0)
+    {
+        return;
+    }
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 95aca5ba1d..8c76c4e95f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -54,7 +54,7 @@
 
 #include "nbnxm_cuda.h"
 
-#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -444,9 +444,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
-    cu_copy_H2D_async(adat->xq + adat_begin,
-                      static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
-                      adat_len * sizeof(*adat->xq), deviceStream.stream());
+    static_assert(sizeof(adat->xq[0]) == sizeof(float4),
+                  "The size of the xyzq buffer element should be equal to the size of float4.");
+    copyToDeviceBuffer(&adat->xq, reinterpret_cast<const float4*>(nbatom->x().data()) + adat_begin,
+                       adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
 
     if (bDoTime)
     {
@@ -761,8 +762,11 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
      */
    if (!stepWork.useGpuFBufferOps)
    {
-        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
-                          (adat_len) * sizeof(*adat->f), deviceStream.stream());
+        static_assert(
+                sizeof(adat->f[0]) == sizeof(float3),
+                "The size of the force buffer element should be equal to the size of float3.");
+        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adat_begin, &adat->f,
+                             adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
    }
 
    /* After the non-local D2H is launched the nonlocal_done event can be
@@ -781,15 +785,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
         /* DtoH fshift when virial is needed */
         if (stepWork.computeVirial)
         {
-            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift),
-                              deviceStream.stream());
+            static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+                          "Sizes of host- and device-side shift vectors should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
         }
 
         /* DtoH energies */
         if (stepWork.computeEnergy)
         {
-            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream());
-            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream());
+            static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
+            static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
+            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream,
+                                 GpuApiCallBehavior::Async, nullptr);
         }
     }
 }
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index d41ee7d2ac..c6c7850493 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -572,13 +572,16 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
-    cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
+    cu_atomdata_t*      adat        = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
     {
-        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), SHIFTS * sizeof(*adat->shift_vec), ls);
+        static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]),
+                      "Sizes of host- and device-side shift vectors should be the same.");
+        copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr);
         adat->bShiftVecUploaded = true;
     }
 }
@@ -625,10 +628,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     cudaError_t         stat;
     int                 nalloc, natoms;
     bool                realloced;
-    bool                bDoTime      = nb->bDoTime;
-    cu_timers_t*        timers       = nb->timers;
-    cu_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
+    bool                bDoTime     = nb->bDoTime;
+    cu_timers_t*        timers      = nb->timers;
+    cu_atomdata_t*      d_atdat     = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -636,7 +639,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (bDoTime)
     {
         /* time async copy */
-        timers->atdat.openTimingRegion(deviceStream);
+        timers->atdat.openTimingRegion(localStream);
     }
 
     /* need to reallocate if we have to copy more atoms than the amount of space
@@ -684,18 +687,23 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam))
     {
-        cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
-                          natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
+        static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(float2),
+                      "Size of the LJ parameters element should be equal to the size of float2.");
+        copyToDeviceBuffer(&d_atdat->lj_comb,
+                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()), 0,
+                           natoms, localStream, GpuApiCallBehavior::Async, nullptr);
    }
    else
    {
-        cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
-                          natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
+        static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]),
+                      "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, localStream,
+                           GpuApiCallBehavior::Async, nullptr);
    }
 
    if (bDoTime)
    {
-        timers->atdat.closeTimingRegion(deviceStream);
+        timers->atdat.closeTimingRegion(localStream);
    }
 }
 
-- 
2.22.0
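
Reviewer note (not part of the patch): the sketch below is a minimal, hypothetical
illustration of the wrapper call signatures that the hunks above switch to, i.e.
copyToDeviceBuffer(&deviceBuffer, hostPtr, offset, count, stream, behavior, timingEvent)
and the mirrored copyFromDeviceBuffer(hostPtr, &deviceBuffer, ...). The helper name
roundTripFloat3, its parameters, and the include set are assumptions made for the
example only; the nullptr timing event matches how the calls are made in nbnxm_cuda.cu.

#include "gromacs/gpu_utils/devicebuffer.cuh" // templated copy wrappers
#include "gromacs/gpu_utils/gpu_utils.h"      // same include the patch adds in nbnxm_cuda.cu

/* Hypothetical example only: round-trip numValues float3 elements between an
 * already-allocated device buffer and host memory, asynchronously in
 * deviceStream, without event-based timing (nullptr timing event). */
static void roundTripFloat3(DeviceBuffer<float3>* d_buffer,
                            float3*               h_buffer,
                            int                   numValues,
                            const DeviceStream&   deviceStream)
{
    // Host to device: (&device buffer, host pointer, starting offset, element count, ...)
    copyToDeviceBuffer(d_buffer, h_buffer, 0, numValues, deviceStream,
                       GpuApiCallBehavior::Async, nullptr);
    // Device to host: (host pointer, &device buffer, starting offset, element count, ...)
    copyFromDeviceBuffer(h_buffer, d_buffer, 0, numValues, deviceStream,
                         GpuApiCallBehavior::Async, nullptr);
}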