From: Aleksei Iupinov
Date: Thu, 9 Nov 2017 18:01:43 +0000 (+0100)
Subject: Rename and expose "generic" GPU memory transfer functions
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a723f4a28689193a79f8887f5eea18c27882a72f;p=alexxy%2Fgromacs.git

Rename and expose "generic" GPU memory transfer functions

Dropped the "_generic" suffix from the names.
Made the sync/async argument an enum class instead of a boolean.
Made PME use synchronous versions of the functions for unit tests.

Change-Id: I5fd2490d58370d9f0405aea1a74237fa8107cbab
---
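In short, callers now state the transfer behavior explicitly instead of passing a bool.
A minimal sketch of the intended call pattern follows (illustrative only, not part of the
patch; the helper function and its local variables are hypothetical, while cu_copy_D2H and
GpuApiCallBehavior are the entities renamed/introduced in the diff below):

    #include "gromacs/gpu_utils/cudautils.cuh"
    #include "gromacs/gpu_utils/gpu_utils.h"

    /* Copy a device buffer back to the host, blocking only when the caller
     * (for example a test path) needs the data to be valid on return. */
    static void copyOutputToHost(float *h_buffer, float *d_buffer, size_t bytes,
                                 cudaStream_t stream, bool testing)
    {
        const GpuApiCallBehavior behavior = testing ? GpuApiCallBehavior::Sync
                                                    : GpuApiCallBehavior::Async;
        cu_copy_D2H(h_buffer, d_buffer, bytes, behavior, stream);
    }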
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h
index f4512c4204..6d1b88414e 100644
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -544,7 +544,8 @@ gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU)
 /*! \libinternal \brief
  * Enables or disables the testing mode.
- * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
+ * and also makes the copies synchronous.
  *
  * \param[in] pmeGPU         The PME GPU structure.
  * \param[in] testing        Should the testing mode be enabled, or disabled.
@@ -552,6 +553,7 @@ gmx_inline bool pme_gpu_performs_solve(const PmeGpu *pmeGPU)
 gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing)
 {
     pmeGPU->settings.copyAllOutputs = testing;
+    pmeGPU->settings.transferKind   = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
 }

 /*! \libinternal \brief
diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h
index e95fa882cc..4e768410b7 100644
--- a/src/gromacs/ewald/pme-gpu-types.h
+++ b/src/gromacs/ewald/pme-gpu-types.h
@@ -59,6 +59,7 @@
 #include

 #include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/basedefinitions.h"
@@ -242,9 +243,11 @@ struct PmeGpuSettings
     /*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
      * Only intended to be used by the test framework. */
-    bool copyAllOutputs;
+    bool               copyAllOutputs;
+    /*! \brief An enum which tells whether most PME GPU D2H/H2D data transfers should be synchronous. */
+    GpuApiCallBehavior transferKind;
     /*! \brief Various flags for the current PME computation, corresponding to the GMX_PME_ flags in pme.h. */
-    int currentFlags;
+    int                currentFlags;
 };

 /*! \internal \brief
diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu
index bf4db7b7a2..eceac20286 100644
--- a/src/gromacs/ewald/pme-solve.cu
+++ b/src/gromacs/ewald/pme-solve.cu
@@ -427,7 +427,8 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
     if (copyInputAndOutputGrid)
     {
-        cu_copy_H2D_async(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+        cu_copy_H2D(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+                    pmeGpu->settings.transferKind, stream);
     }

     int majorDim = -1, middleDim = -1, minorDim = -1;
@@ -489,12 +490,13 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,

     if (computeEnergyAndVirial)
     {
-        cu_copy_D2H_async(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
-                          c_virialAndEnergyCount * sizeof(float), stream);
+        cu_copy_D2H(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
+                    c_virialAndEnergyCount * sizeof(float), pmeGpu->settings.transferKind, stream);
     }

     if (copyInputAndOutputGrid)
     {
-        cu_copy_D2H_async(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+        cu_copy_D2H(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+                    pmeGpu->settings.transferKind, stream);
     }
 }
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index ca18afc6c4..a948730cde 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -126,8 +126,8 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGPU)
         memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
     }
     /* TODO: pin original buffer instead! */
-    cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
-                      newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
+                newSplineValuesSize * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }

 void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU)
@@ -155,7 +155,7 @@ void pme_gpu_copy_input_forces(const PmeGpu *pmeGPU, const float *h_forces)
     GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
     const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
     GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }

 void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
@@ -163,7 +163,7 @@ void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
     GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
     const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
     GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
+    cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }

 void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
@@ -191,8 +191,8 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGPU, const rvec *h_coordina
     GMX_RELEASE_ASSERT(false, "Only single precision is supported");
     GMX_UNUSED_VALUE(h_coordinates);
 #else
-    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
-                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
+                pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 #endif
 }

@@ -209,8 +209,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGPU, const flo
     cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
                         &pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
                         newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
-    cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
-                      pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
+                pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
     if (c_usePadding)
     {
         const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
@@ -376,13 +376,13 @@ void pme_gpu_free_fract_shifts(const PmeGpu *pmeGPU)
 void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
 {
     const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
-    cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
 }

 void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
 {
     const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
-    cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
     cudaError_t  stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
     CU_RET_ERR(stat, "PME spread grid sync event record failure");
 }
@@ -393,10 +393,10 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
     const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
     const size_t splinesSize  = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
     auto        *kernelParamsPtr = pmeGpu->kernelParams.get();
-    cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
-                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
 }

 void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
@@ -417,10 +417,10 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
         CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
                    "PME failed to clear the spline values");
     }
-    cu_copy_H2D_async(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_H2D_async(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
-    cu_copy_H2D_async(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
-                      kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+    cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
+                kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
 }

 void pme_gpu_sync_spread_grid(const PmeGpu *pmeGPU)
diff --git a/src/gromacs/gpu_utils/cudautils.cu b/src/gromacs/gpu_utils/cudautils.cu
index caba44f3cc..38409c39ad 100644
--- a/src/gromacs/gpu_utils/cudautils.cu
+++ b/src/gromacs/gpu_utils/cudautils.cu
@@ -41,16 +41,15 @@
 #include

 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/smalloc.h"

 /*** Generic CUDA data operation wrappers ***/

-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
-                               bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
+                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
 {
     cudaError_t stat;

@@ -59,16 +58,20 @@ static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
         return -1;
     }

-    if (bAsync)
-    {
-        stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
-        CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
-
-    }
-    else
+    switch (transferKind)
     {
-        stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
-        CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
+            CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
+            CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -76,7 +79,7 @@ static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,

 int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
 {
-    return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
+    return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
 }

 /*!
@@ -84,15 +87,12 @@ int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
  */
 int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
 {
-    return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
+    return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
 }

-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
-                               bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
+                GpuApiCallBehavior transferKind, cudaStream_t s = 0)
 {
     cudaError_t stat;

@@ -101,15 +101,20 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
         return -1;
     }

-    if (bAsync)
-    {
-        stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
-        CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
-    }
-    else
+    switch (transferKind)
     {
-        stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
-        CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+        case GpuApiCallBehavior::Async:
+            stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
+            CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
+            CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -117,7 +122,7 @@ static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,

 int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
 {
-    return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
+    return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
 }

 /*!
@@ -125,7 +130,7 @@ int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
  */
 int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
 {
-    return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
+    return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
 }

 /**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index b05cf65f99..f8d73c9697 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -46,6 +46,8 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/fatalerror.h"

+enum class GpuApiCallBehavior;
+
 /* TODO error checking needs to be rewritten. We have 2 types of error checks needed
    based on where they occur in the code:
    - non performance-critical: these errors are unsafe to be ignored and must be
@@ -134,6 +136,11 @@ struct gmx_device_info_t
 #endif /* HAVE_NVML */
 };

+/*! Launches synchronous or asynchronous device to host memory copy.
+ *
+ * The copy is launched in stream s or if not specified, in stream 0.
+ */
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/);
 /*! Launches synchronous host to device memory copy in stream 0. */
 int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);

@@ -141,6 +148,12 @@ int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
 /*! Launches asynchronous host to device memory copy in stream s. */
 int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);

+/*! Launches synchronous or asynchronous host to device memory copy.
+ *
+ * The copy is launched in stream s or if not specified, in stream 0.
+ */
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/);
+
 /*! Launches synchronous host to device memory copy. */
 int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index c85b1bed66..ad2297eae0 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -60,6 +60,13 @@ namespace gmx
 class MDLogger;
 }

+//! Enum which is only used to describe transfer calls at the moment
+enum class GpuApiCallBehavior
+{
+    Sync,
+    Async
+};
+
 /*! \brief Detect all GPUs in the system.
  *
  * Will detect every GPU supported by the device driver in use. Also
diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp
index 384242f827..eb270c0d1d 100644
--- a/src/gromacs/gpu_utils/oclutils.cpp
+++ b/src/gromacs/gpu_utils/oclutils.cpp
@@ -48,20 +48,15 @@
 #include

+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/smalloc.h"

-/*! \brief Launches synchronous or asynchronous host to device memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
-                                size_t offset, size_t bytes,
-                                bool bAsync /* = false*/,
-                                cl_command_queue command_queue,
-                                cl_event *copy_event)
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event)
 {
     cl_int gmx_unused cl_error;

@@ -70,17 +65,22 @@ static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
         return -1;
     }

-    if (bAsync)
+    switch (transferKind)
     {
-        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
-    }
-    else
-    {
-        cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
+        case GpuApiCallBehavior::Async:
+            cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -97,7 +97,7 @@ int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
                        cl_command_queue command_queue,
                        cl_event *copy_event)
 {
-    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+    return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
 }

 /*! \brief Launches synchronous host to device memory copy.
@@ -106,20 +106,14 @@ int ocl_copy_H2D_sync(cl_mem d_dest, void * h_src,
                       size_t offset, size_t bytes,
                       cl_command_queue command_queue)
 {
-    return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+    return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, NULL);
 }

-/*! \brief Launches synchronous or asynchronous device to host memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular device to host operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
-                                size_t offset, size_t bytes,
-                                bool bAsync,
-                                cl_command_queue command_queue,
-                                cl_event *copy_event)
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event)
 {
     cl_int gmx_unused cl_error;

@@ -128,17 +122,22 @@ static int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
         return -1;
     }

-    if (bAsync)
+    switch (transferKind)
     {
-        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
-    }
-    else
-    {
-        cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
-        assert(cl_error == CL_SUCCESS);
-        // TODO: handle errors
+        case GpuApiCallBehavior::Async:
+            cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        case GpuApiCallBehavior::Sync:
+            cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+            assert(cl_error == CL_SUCCESS);
+            // TODO: handle errors
+            break;
+
+        default:
+            throw;
     }

     return 0;
@@ -155,7 +154,7 @@ int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
                        cl_command_queue command_queue,
                        cl_event *copy_event)
 {
-    return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+    return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
 }

 /*! \brief \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 232ad4f5fe..184eb6dbfb 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -46,6 +46,8 @@
 #include "gromacs/gpu_utils/gmxopencl.h"
 #include "gromacs/utility/gmxassert.h"

+enum class GpuApiCallBehavior;
+
 /*! \brief OpenCL vendor IDs */
 typedef enum {
     OCL_VENDOR_NVIDIA = 0,
@@ -101,15 +103,39 @@ struct gmx_device_runtime_data_t
     cl_program program; /**< OpenCL program */
 };

+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular device to host operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event);
-/*! \brief Launches asynchronous host to device memory copy. */
-int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+
+/*! \brief Launches asynchronous device to host memory copy. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
                        size_t offset, size_t bytes,
                        cl_command_queue command_queue,
                        cl_event *copy_event);

-/*! \brief Launches asynchronous device to host memory copy. */
-int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular host to device operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+                 size_t offset, size_t bytes,
+                 GpuApiCallBehavior transferKind,
+                 cl_command_queue command_queue,
+                 cl_event *copy_event);
+
+/*! \brief Launches asynchronous host to device memory copy. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
                        size_t offset, size_t bytes,
                        cl_command_queue command_queue,
                        cl_event *copy_event);
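
The OpenCL wrappers follow the same pattern. A second minimal sketch, again illustrative
rather than part of the patch (the buffer, size and queue variables are hypothetical;
ocl_copy_H2D and GpuApiCallBehavior come from the headers changed above). The enum maps
to the blocking flag of clEnqueueWriteBuffer/clEnqueueReadBuffer; an event may be
requested in either case, and the synchronous wrapper in the patch simply passes NULL:

    #include "gromacs/gpu_utils/gpu_utils.h"
    #include "gromacs/gpu_utils/oclutils.h"

    static void uploadInput(cl_mem d_input, void *h_input, size_t bytes,
                            cl_command_queue queue)
    {
        cl_event copyDone;
        /* Non-blocking enqueue; copyDone can later be waited on or used for profiling. */
        ocl_copy_H2D(d_input, h_input, 0, bytes,
                     GpuApiCallBehavior::Async, queue, &copyDone);

        /* Blocking enqueue; no event is needed, so NULL is passed, as in
         * the ocl_copy_H2D_sync wrapper. */
        ocl_copy_H2D(d_input, h_input, 0, bytes,
                     GpuApiCallBehavior::Sync, queue, NULL);
    }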