/*! \libinternal \brief
* Enables or disables the testing mode.
- * Testing mode only implies copying all the outputs, even the intermediate ones, to the host.
+ * Testing mode only implies copying all the outputs, even the intermediate ones, to the host,
+ * and also makes the copies synchronous.
*
* \param[in] pmeGPU The PME GPU structure.
* \param[in] testing Should the testing mode be enabled or disabled.
gmx_inline void pme_gpu_set_testing(PmeGpu *pmeGPU, bool testing)
{
pmeGPU->settings.copyAllOutputs = testing;
+ pmeGPU->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
}
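/* A minimal usage sketch of the testing mode toggle (the test helper name
 * pmeGpuFromTestContext below is hypothetical, not part of this change):
 *
 * \code
 *     PmeGpu *pmeGPU = pmeGpuFromTestContext();
 *     pme_gpu_set_testing(pmeGPU, true);  // all outputs copied to the host, synchronously
 *     // ... run spread/solve/gather and inspect the staged host buffers ...
 *     pme_gpu_set_testing(pmeGPU, false); // restore asynchronous transfers
 * \endcode
 */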
/*! \libinternal \brief
#include <vector>
#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/basedefinitions.h"
/*! \brief A boolean which tells if any PME GPU stage should copy all of its outputs to the host.
* Only intended to be used by the test framework.
*/
- bool copyAllOutputs;
+ bool copyAllOutputs;
+ /*! \brief An enum which tells whether most PME GPU D2H/H2D data transfers should be synchronous. */
+ GpuApiCallBehavior transferKind;
/*! \brief Various flags for the current PME computation, corresponding to the GMX_PME_ flags in pme.h. */
- int currentFlags;
+ int currentFlags;
};
/*! \internal \brief
if (copyInputAndOutputGrid)
{
- cu_copy_H2D_async(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+ cu_copy_H2D(kernelParamsPtr->grid.d_fourierGrid, h_grid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+ pmeGpu->settings.transferKind, stream);
}
int majorDim = -1, middleDim = -1, minorDim = -1;
if (computeEnergyAndVirial)
{
- cu_copy_D2H_async(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
- c_virialAndEnergyCount * sizeof(float), stream);
+ cu_copy_D2H(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
+ c_virialAndEnergyCount * sizeof(float), pmeGpu->settings.transferKind, stream);
}
if (copyInputAndOutputGrid)
{
- cu_copy_D2H_async(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
+ cu_copy_D2H(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float),
+ pmeGpu->settings.transferKind, stream);
}
}
memcpy(pmeGPU->staging.h_splineModuli + splineValuesOffset[i], pmeGPU->common->bsp_mod[i].data(), pmeGPU->common->bsp_mod[i].size() * sizeof(float));
}
/* TODO: pin original buffer instead! */
- cu_copy_H2D_async(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
- newSplineValuesSize * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->grid.d_splineModuli, pmeGPU->staging.h_splineModuli,
+ newSplineValuesSize * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU)
GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
- cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
- cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
+ cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
}
void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
GMX_RELEASE_ASSERT(false, "Only single precision is supported");
GMX_UNUSED_VALUE(h_coordinates);
#else
- cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
- pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coordinates, const_cast<rvec *>(h_coordinates),
+ pmeGPU->kernelParams->atoms.nAtoms * sizeof(rvec), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
#endif
}
cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_coefficients, nullptr, sizeof(float),
&pmeGPU->archSpecific->coefficientsSize, &pmeGPU->archSpecific->coefficientsSizeAlloc,
newCoefficientsSize, pmeGPU->archSpecific->pmeStream, true);
- cu_copy_H2D_async(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
- pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGPU->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
+ pmeGPU->kernelParams->atoms.nAtoms * sizeof(float), pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
if (c_usePadding)
{
const size_t paddingIndex = pmeGPU->kernelParams->atoms.nAtoms;
void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
- cu_copy_H2D_async(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(pmeGpu->kernelParams->grid.d_realGrid, h_grid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_output_spread_grid(const PmeGpu *pmeGpu, float *h_grid)
{
const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
- cu_copy_D2H_async(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(h_grid, pmeGpu->kernelParams->grid.d_realGrid, gridSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSpreadGridD2H, pmeGpu->archSpecific->pmeStream);
CU_RET_ERR(stat, "PME spread grid sync event record failure");
}
const size_t nAtomsPadded = ((pmeGpu->nAtomsAlloc + alignment - 1) / alignment) * alignment;
const size_t splinesSize = DIM * nAtomsPadded * pmeGpu->common->pme_order * sizeof(float);
auto *kernelParamsPtr = pmeGpu->kernelParams.get();
- cu_copy_D2H_async(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
- kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGpu->staging.h_dtheta, kernelParamsPtr->atoms.d_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_D2H(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
+ kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
CU_RET_ERR(cudaMemsetAsync(kernelParamsPtr->atoms.d_theta, 0, pmeGpu->nAtomsAlloc * splineDataSizePerAtom, pmeGpu->archSpecific->pmeStream),
"PME failed to clear the spline values");
}
- cu_copy_H2D_async(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_H2D_async(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
- cu_copy_H2D_async(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
- kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(kernelParamsPtr->atoms.d_dtheta, pmeGpu->staging.h_dtheta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(kernelParamsPtr->atoms.d_theta, pmeGpu->staging.h_theta, splinesSize, pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
+ cu_copy_H2D(kernelParamsPtr->atoms.d_gridlineIndices, pmeGpu->staging.h_gridlineIndices,
+ kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->settings.transferKind, pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_sync_spread_grid(const PmeGpu *pmeGPU)
#include <cstdlib>
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"
/*** Generic CUDA data operation wrappers ***/
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_D2H_generic(void * h_dest, void * d_src, size_t bytes,
- bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes,
+ GpuApiCallBehavior transferKind, cudaStream_t s = 0)
{
cudaError_t stat;
return -1;
}
- if (bAsync)
- {
- stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
- CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
-
- }
- else
+ switch (transferKind)
{
- stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
- CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+ case GpuApiCallBehavior::Async:
+ stat = cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s);
+ CU_RET_ERR(stat, "DtoH cudaMemcpyAsync failed");
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ stat = cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
+ CU_RET_ERR(stat, "DtoH cudaMemcpy failed");
+ break;
+
+ default:
+ throw;
}
return 0;
int cu_copy_D2H_sync(void * h_dest, void * d_src, size_t bytes)
{
- return cu_copy_D2H_generic(h_dest, d_src, bytes, false);
+ return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Sync);
}
/*!
*/
int cu_copy_D2H_async(void * h_dest, void * d_src, size_t bytes, cudaStream_t s = 0)
{
- return cu_copy_D2H_generic(h_dest, d_src, bytes, true, s);
+ return cu_copy_D2H(h_dest, d_src, bytes, GpuApiCallBehavior::Async, s);
}
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-static int cu_copy_H2D_generic(void * d_dest, void * h_src, size_t bytes,
- bool bAsync = false, cudaStream_t s = 0)
+// TODO: template on transferKind to avoid runtime conditionals
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes,
+ GpuApiCallBehavior transferKind, cudaStream_t s = 0)
{
cudaError_t stat;
return -1;
}
- if (bAsync)
- {
- stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
- CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
- }
- else
+ switch (transferKind)
{
- stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
- CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+ case GpuApiCallBehavior::Async:
+ stat = cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice, s);
+ CU_RET_ERR(stat, "HtoD cudaMemcpyAsync failed");
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ stat = cudaMemcpy(d_dest, h_src, bytes, cudaMemcpyHostToDevice);
+ CU_RET_ERR(stat, "HtoD cudaMemcpy failed");
+ break;
+
+ default:
+ throw;
}
return 0;
int cu_copy_H2D_sync(void * d_dest, void * h_src, size_t bytes)
{
- return cu_copy_H2D_generic(d_dest, h_src, bytes, false);
+ return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Sync);
}
/*!
*/
int cu_copy_H2D_async(void * d_dest, void * h_src, size_t bytes, cudaStream_t s = 0)
{
- return cu_copy_H2D_generic(d_dest, h_src, bytes, true, s);
+ return cu_copy_H2D(d_dest, h_src, bytes, GpuApiCallBehavior::Async, s);
}
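/* A rough sketch of what the TODO notes above might look like: making transferKind a
 * compile-time template parameter so that the Sync/Async branch is resolved at compile
 * time. This is only an illustration, not part of this change; the name
 * cu_copy_D2H_impl is hypothetical.
 *
 *     template <GpuApiCallBehavior transferKind>
 *     static int cu_copy_D2H_impl(void *h_dest, void *d_src, size_t bytes, cudaStream_t s)
 *     {
 *         cudaError_t stat = (transferKind == GpuApiCallBehavior::Async) ?
 *             cudaMemcpyAsync(h_dest, d_src, bytes, cudaMemcpyDeviceToHost, s) :
 *             cudaMemcpy(h_dest, d_src, bytes, cudaMemcpyDeviceToHost);
 *         CU_RET_ERR(stat, "DtoH copy failed");
 *         return 0;
 *     }
 */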
/**** Operation on buffered arrays (arrays with "over-allocation" in gmx wording) *****/
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/fatalerror.h"
+enum class GpuApiCallBehavior;
+
/* TODO error checking needs to be rewritten. We have 2 types of error checks needed
based on where they occur in the code:
- non performance-critical: these errors are unsafe to be ignored and must be
#endif /* HAVE_NVML */
};
+/*! Launches synchronous or asynchronous device to host memory copy.
+ *
+ * The copy is launched in stream s or, if not specified, in stream 0.
+ */
+int cu_copy_D2H(void *h_dest, void *d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t s /*= 0*/);
/*! Launches synchronous device to host memory copy in stream 0. */
int cu_copy_D2H_sync(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/);
/*! Launches asynchronous device to host memory copy in stream s. */
int cu_copy_D2H_async(void * /*h_dest*/, void * /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = 0*/);
+/*! Launches synchronous or asynchronous host to device memory copy.
+ *
+ * The copy is launched in stream s or, if not specified, in stream 0.
+ */
+int cu_copy_H2D(void *d_dest, void *h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = 0*/);
+
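/* A rough usage sketch of the unified copy wrappers above, assuming the device pointer,
 * host buffer, size and stream (d_data, h_data, byteCount, stream: placeholder names)
 * have already been set up:
 *
 * \code
 *     // asynchronous H2D copy enqueued in the given stream
 *     cu_copy_H2D(d_data, h_data, byteCount, GpuApiCallBehavior::Async, stream);
 *     // ... launch kernels in the same stream ...
 *     // synchronous D2H copy; returns only after the data has arrived on the host
 *     cu_copy_D2H(h_data, d_data, byteCount, GpuApiCallBehavior::Sync, stream);
 * \endcode
 */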
/*! Launches synchronous host to device memory copy. */
int cu_copy_H2D_sync(void * /*d_dest*/, void * /*h_src*/, size_t /*bytes*/);
class MDLogger;
}
+//! Enum which is only used to describe transfer calls at the moment
+enum class GpuApiCallBehavior
+{
+ Sync,
+ Async
+};
+
/*! \brief Detect all GPUs in the system.
*
* Will detect every GPU supported by the device driver in use. Also
#include <string>
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
-/*! \brief Launches synchronous or asynchronous host to device memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_H2D_generic(cl_mem d_dest, void* h_src,
- size_t offset, size_t bytes,
- bool bAsync /* = false*/,
- cl_command_queue command_queue,
- cl_event *copy_event)
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event)
{
cl_int gmx_unused cl_error;
return -1;
}
- if (bAsync)
+ switch (transferKind)
{
- cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
- }
- else
- {
- cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
+ case GpuApiCallBehavior::Async:
+ cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ default:
+ throw;
}
return 0;
cl_command_queue command_queue,
cl_event *copy_event)
{
- return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, true, command_queue, copy_event);
+ return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
}
/*! \brief Launches synchronous host to device memory copy.
size_t offset, size_t bytes,
cl_command_queue command_queue)
{
- return ocl_copy_H2D_generic(d_dest, h_src, offset, bytes, false, command_queue, NULL);
+ return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, NULL);
}
-/*! \brief Launches synchronous or asynchronous device to host memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular device to host operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-static int ocl_copy_D2H_generic(void * h_dest, cl_mem d_src,
- size_t offset, size_t bytes,
- bool bAsync,
- cl_command_queue command_queue,
- cl_event *copy_event)
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event)
{
cl_int gmx_unused cl_error;
return -1;
}
- if (bAsync)
+ switch (transferKind)
{
- cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
- }
- else
- {
- cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
- assert(cl_error == CL_SUCCESS);
- // TODO: handle errors
+ case GpuApiCallBehavior::Async:
+ cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ case GpuApiCallBehavior::Sync:
+ cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, NULL, copy_event);
+ assert(cl_error == CL_SUCCESS);
+ // TODO: handle errors
+ break;
+
+ default:
+ throw;
}
return 0;
cl_command_queue command_queue,
cl_event *copy_event)
{
- return ocl_copy_D2H_generic(h_dest, d_src, offset, bytes, true, command_queue, copy_event);
+ return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
}
/*! \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
#include "gromacs/gpu_utils/gmxopencl.h"
#include "gromacs/utility/gmxassert.h"
+enum class GpuApiCallBehavior;
+
/*! \brief OpenCL vendor IDs */
typedef enum {
OCL_VENDOR_NVIDIA = 0,
cl_program program; /**< OpenCL program */
};
+/*! \brief Launches synchronous or asynchronous device to host memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular device to host operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_D2H(void * h_dest, cl_mem d_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event);
-/*! \brief Launches asynchronous host to device memory copy. */
-int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
+
+/*! \brief Launches asynchronous device to host memory copy. */
+int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
size_t offset, size_t bytes,
cl_command_queue command_queue,
cl_event *copy_event);
-/*! \brief Launches asynchronous device to host memory copy. */
-int ocl_copy_D2H_async(void * h_dest, cl_mem d_src,
+/*! \brief Launches synchronous or asynchronous host to device memory copy.
+ *
+ * If copy_event is not NULL, on return it will contain an event object
+ * identifying this particular host to device operation. The event can further
+ * be used to queue a wait for this operation or to query profiling information.
+ */
+int ocl_copy_H2D(cl_mem d_dest, void* h_src,
+ size_t offset, size_t bytes,
+ GpuApiCallBehavior transferKind,
+ cl_command_queue command_queue,
+ cl_event *copy_event);
+
+/*! \brief Launches asynchronous host to device memory copy. */
+int ocl_copy_H2D_async(cl_mem d_dest, void * h_src,
size_t offset, size_t bytes,
cl_command_queue command_queue,
cl_event *copy_event);
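/* A rough usage sketch of the OpenCL copy wrappers, mirroring the CUDA ones; d_buffer,
 * h_buffer, byteCount and queue are placeholder names for an already initialized cl_mem
 * object, host buffer, size and command queue. The event pointer may be NULL when the
 * caller does not need to wait on or profile the transfer.
 *
 * \code
 *     cl_event copyDone;
 *     ocl_copy_H2D(d_buffer, h_buffer, 0, byteCount,
 *                  GpuApiCallBehavior::Async, queue, &copyDone);
 *     // ... enqueue kernels on the same queue ...
 *     ocl_copy_D2H(h_buffer, d_buffer, 0, byteCount,
 *                  GpuApiCallBehavior::Sync, queue, NULL);
 * \endcode
 */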