From: Artem Zhmurov Date: Mon, 11 May 2020 08:31:34 +0000 (+0000) Subject: Unify CUDA and OpenCL lookup-table creation X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=986b2bb150cd9e2d673e12ebfec2af4afb678069;p=alexxy%2Fgromacs.git Unify CUDA and OpenCL lookup-table creation In CUDA code, textures are used for the lookup-tables, whereas in OpenCL they are created as read-only buffers. This commit hides these differences behind a unified wrapper. Refs #3318 Refs #3311 Change-Id: I003e0c982c2452a2753e331b46fc59f0b7e1b711 --- diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index f1eac5a9d1..40d72761ba 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -101,7 +101,8 @@ void freeDeviceBuffer(DeviceBuffer* buffer) /*! \brief * Performs the host-to-device data copy, synchronous or asynchronously on request. * - * TODO: This is meant to gradually replace cu/ocl_copy_h2d. + * Note that synchronous copy will not synchronize the stream in case of zero \p numValues + * because of the early return. * * \tparam ValueType Raw value type of the \p buffer. * \param[in,out] buffer Pointer to the device-side buffer @@ -161,7 +162,8 @@ void copyToDeviceBuffer(DeviceBuffer* buffer, /*! \brief * Performs the device-to-host data copy, synchronous or asynchronously on request. * - * TODO: This is meant to gradually replace cu/ocl_copy_d2h. + * Note that synchronous copy will not synchronize the stream in case of zero \p numValues + * because of the early return. * * \tparam ValueType Raw value type of the \p buffer.
* \param[in,out] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType @@ -183,6 +185,10 @@ void copyFromDeviceBuffer(ValueType* hostBuffer, GpuApiCallBehavior transferKind, CommandEvent* timingEvent) { + if (numValues == 0) + { + return; // such calls are actually made with empty domains + } GMX_ASSERT(buffer, "needs a buffer pointer"); GMX_ASSERT(hostBuffer, "needs a host buffer pointer"); cl_int clError; diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp index f987ae00e7..726e4f2cff 100644 --- a/src/gromacs/gpu_utils/oclutils.cpp +++ b/src/gromacs/gpu_utils/oclutils.cpp @@ -53,116 +53,6 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/smalloc.h" -int ocl_copy_H2D(cl_mem d_dest, - const void* h_src, - size_t offset, - size_t bytes, - GpuApiCallBehavior transferKind, - cl_command_queue command_queue, - cl_event* copy_event) -{ - cl_int gmx_unused cl_error; - - if (d_dest == nullptr || h_src == nullptr || bytes == 0) - { - return -1; - } - - switch (transferKind) - { - case GpuApiCallBehavior::Async: - cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, - 0, nullptr, copy_event); - break; - - case GpuApiCallBehavior::Sync: - cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, - nullptr, copy_event); - break; - - default: throw; - } - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - - return 0; -} - -/*! \brief Launches asynchronous host to device memory copy. - * - * If copy_event is not nullptr, on return it will contain an event object - * identifying this particular host to device operation. The event can further - * be used to queue a wait for this operation or to query profiling information. 
- */ -int ocl_copy_H2D_async(cl_mem d_dest, - const void* h_src, - size_t offset, - size_t bytes, - cl_command_queue command_queue, - cl_event* copy_event) -{ - return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event); -} - -/*! \brief Launches synchronous host to device memory copy. - */ -int ocl_copy_H2D_sync(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, cl_command_queue command_queue) -{ - return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, nullptr); -} - -int ocl_copy_D2H(void* h_dest, - cl_mem d_src, - size_t offset, - size_t bytes, - GpuApiCallBehavior transferKind, - cl_command_queue command_queue, - cl_event* copy_event) -{ - cl_int gmx_unused cl_error; - - if (h_dest == nullptr || d_src == nullptr || bytes == 0) - { - return -1; - } - - switch (transferKind) - { - case GpuApiCallBehavior::Async: - cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, - nullptr, copy_event); - break; - - case GpuApiCallBehavior::Sync: - cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, - nullptr, copy_event); - break; - - default: throw; - } - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); - - - return 0; -} - -/*! \brief Launches asynchronous device to host memory copy. - * - * If copy_event is not nullptr, on return it will contain an event object - * identifying this particular host to device operation. The event can further - * be used to queue a wait for this operation or to query profiling information. - */ -int ocl_copy_D2H_async(void* h_dest, - cl_mem d_src, - size_t offset, - size_t bytes, - cl_command_queue command_queue, - cl_event* copy_event) -{ - return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event); -} - /*! \brief \brief Allocates nbytes of host memory. 
Use ocl_free to free memory allocated with this function. * * \todo diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index ee445047fa..333147d78f 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -69,54 +69,6 @@ struct gmx_device_runtime_data_t cl_program program; }; -/*! \brief Launches synchronous or asynchronous device to host memory copy. - * - * If copy_event is not NULL, on return it will contain an event object - * identifying this particular device to host operation. The event can further - * be used to queue a wait for this operation or to query profiling information. - */ -int ocl_copy_D2H(void* h_dest, - cl_mem d_src, - size_t offset, - size_t bytes, - GpuApiCallBehavior transferKind, - cl_command_queue command_queue, - cl_event* copy_event); - - -/*! \brief Launches asynchronous device to host memory copy. */ -int ocl_copy_D2H_async(void* h_dest, - cl_mem d_src, - size_t offset, - size_t bytes, - cl_command_queue command_queue, - cl_event* copy_event); - -/*! \brief Launches synchronous or asynchronous host to device memory copy. - * - * If copy_event is not NULL, on return it will contain an event object - * identifying this particular host to device operation. The event can further - * be used to queue a wait for this operation or to query profiling information. - */ -int ocl_copy_H2D(cl_mem d_dest, - const void* h_src, - size_t offset, - size_t bytes, - GpuApiCallBehavior transferKind, - cl_command_queue command_queue, - cl_event* copy_event); - -/*! \brief Launches asynchronous host to device memory copy. */ -int ocl_copy_H2D_async(cl_mem d_dest, - const void* h_src, - size_t offset, - size_t bytes, - cl_command_queue command_queue, - cl_event* copy_event); - -/*! \brief Launches synchronous host to device memory copy. */ -int ocl_copy_H2D_sync(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, cl_command_queue command_queue); - /*! 
\brief Allocate host memory in malloc style */ void pmalloc(void** h_ptr, size_t nbytes); diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index a35a188400..b5018808f0 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -526,8 +526,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom } /* HtoD x, q */ - ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4, - adat_len * sizeof(float) * 4, deviceStream.stream(), + GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()), + "The size of the xyzq buffer element should be equal to the size of float4."); + copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4, + deviceStream, GpuApiCallBehavior::Async, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); if (bDoTime) @@ -895,10 +897,11 @@ void gpu_launch_cpyback(NbnxmGpu* nb, } /* DtoH f */ - ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f, - adat_begin * DIM * sizeof(nbatom->out[0].f[0]), - adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(), - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float), + "The size of the force buffer element should be equal to the size of float3."); + copyFromDeviceBuffer(&nbatom->out[0].f.data()[adat_begin * DIM], &adat->f, adat_begin * DIM, + adat_len * DIM, deviceStream, GpuApiCallBehavior::Async, + bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(deviceStream.stream()); @@ -922,19 +925,25 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, - SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(), - bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float), + "Sizes of host- and device-side shift vectors should be the same."); + copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0, + SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async, + bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ if (stepWork.computeEnergy) { - ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(), - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); - - ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(), - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float), + "Sizes of host- and device-side LJ energy terms should be the same."); + copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async, + bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float), + "Sizes of host- and device-side electrostatic energy terms should be the " + "same."); + copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async, + bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 4f1e9fc65c..e0c25b2d73 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -677,14 +677,16 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte //!
This function is documented in the header file void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - cl_atomdata_t* adat = nb->atdat; - cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream(); + cl_atomdata_t* adat = nb->atdat; + const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) { - ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0, - SHIFTS * sizeof(nbatom->shift_vec[0]), ls, nullptr); + GMX_ASSERT(sizeof(float) * DIM == sizeof(*nbatom->shift_vec.data()), + "Sizes of host- and device-side shift vectors should be the same."); + copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float*>(nbatom->shift_vec.data()), + 0, SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async, nullptr); adat->bShiftVecUploaded = CL_TRUE; } } @@ -766,13 +768,18 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwtype)) { - ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2), - deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr); + GMX_ASSERT(sizeof(float) == sizeof(*nbat->params().lj_comb.data()), + "Size of the LJ parameters element should be equal to the size of float2."); + copyToDeviceBuffer(&d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, 2 * natoms, + deviceStream, GpuApiCallBehavior::Async, + bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { - ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int), - deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr); + GMX_ASSERT(sizeof(int) == sizeof(*nbat->params().type.data()), + "Sizes of host- and device-side atom types should be the same."); + copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, deviceStream, + GpuApiCallBehavior::Async, bDoTime ?
timers->atdat.fetchNextEvent() : nullptr); } if (bDoTime) diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index cda2294783..f447ce9d48 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -164,28 +164,28 @@ typedef struct cl_atomdata int nalloc; //! float4 buffer with atom coordinates + charges, size natoms - cl_mem xq; + DeviceBuffer xq; //! float3 buffer with force output array, size natoms - cl_mem f; + DeviceBuffer f; //! LJ energy output, size 1 - cl_mem e_lj; + DeviceBuffer e_lj; //! Electrostatics energy input, size 1 - cl_mem e_el; + DeviceBuffer e_el; //! float3 buffer with shift forces - cl_mem fshift; + DeviceBuffer fshift; //! number of atom types int ntypes; //! int buffer with atom type indices, size natoms - cl_mem atom_types; + DeviceBuffer atom_types; //! float2 buffer with sqrt(c6),sqrt(c12), size natoms - cl_mem lj_comb; + DeviceBuffer lj_comb; //! float3 buffer with shifts values - cl_mem shift_vec; + DeviceBuffer shift_vec; //! true if the shift vector has been uploaded bool bShiftVecUploaded;