/*! \brief
* Performs the host-to-device data copy, synchronously or asynchronously on request.
*
- * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
+ * Note that a synchronous copy will not synchronize the stream when \p numValues is zero,
+ * because of the early return.
*
* \tparam ValueType Raw value type of the \p buffer.
* \param[in,out] buffer Pointer to the device-side buffer
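// Illustrative sketch, not part of this change: a possible caller of the templated H2D
// copy documented above. allocateDeviceBuffer()/freeDeviceBuffer() and the exact argument
// order are assumptions modelled on the gpu_utils API used elsewhere in this patch.
#include <vector>

#include "gromacs/gpu_utils/devicebuffer.h"

static void uploadHostValues(const std::vector<float>& hostValues,
                             const DeviceContext&      deviceContext,
                             const DeviceStream&       deviceStream)
{
    DeviceBuffer<float> d_values;
    allocateDeviceBuffer(&d_values, hostValues.size(), deviceContext);
    // Synchronous copy: returns only after the transfer has completed, unless hostValues
    // is empty, in which case the early return documented above means that the stream is
    // not synchronized at all.
    copyToDeviceBuffer(&d_values, hostValues.data(), 0, hostValues.size(), deviceStream,
                       GpuApiCallBehavior::Sync, nullptr);
    freeDeviceBuffer(&d_values);
}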
/*! \brief
* Performs the device-to-host data copy, synchronously or asynchronously on request.
*
- * TODO: This is meant to gradually replace cu/ocl_copy_d2h.
+ * Note that a synchronous copy will not synchronize the stream when \p numValues is zero,
+ * because of the early return.
*
* \tparam ValueType Raw value type of the \p buffer.
* \param[in,out] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
GpuApiCallBehavior transferKind,
CommandEvent* timingEvent)
{
+ if (numValues == 0)
+ {
+ return; // such calls are actually made with empty domains
+ }
GMX_ASSERT(buffer, "needs a buffer pointer");
GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
cl_int clError;
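// Illustrative sketch, not part of this change: the caller-visible consequence of the
// early return above for a device-to-host copy. The helper below is hypothetical and is
// modelled on the copyFromDeviceBuffer() calls made later in this patch.
static void downloadValues(float*               h_values,
                           DeviceBuffer<float>* d_values,
                           size_t               numValues,
                           const DeviceStream&  deviceStream)
{
    // With GpuApiCallBehavior::Sync the copy normally blocks until the data is on the
    // host. With numValues == 0 nothing is enqueued and nothing is synchronized, so an
    // empty synchronous copy must not be relied upon as a stream barrier.
    copyFromDeviceBuffer(h_values, d_values, 0, numValues, deviceStream,
                         GpuApiCallBehavior::Sync, nullptr);
}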
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/smalloc.h"
-int ocl_copy_H2D(cl_mem d_dest,
- const void* h_src,
- size_t offset,
- size_t bytes,
- GpuApiCallBehavior transferKind,
- cl_command_queue command_queue,
- cl_event* copy_event)
-{
- cl_int gmx_unused cl_error;
-
- if (d_dest == nullptr || h_src == nullptr || bytes == 0)
- {
- return -1;
- }
-
- switch (transferKind)
- {
- case GpuApiCallBehavior::Async:
- cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src,
- 0, nullptr, copy_event);
- break;
-
- case GpuApiCallBehavior::Sync:
- cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0,
- nullptr, copy_event);
- break;
-
- default: throw;
- }
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
-
- return 0;
-}
-
-/*! \brief Launches asynchronous host to device memory copy.
- *
- * If copy_event is not nullptr, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_H2D_async(cl_mem d_dest,
- const void* h_src,
- size_t offset,
- size_t bytes,
- cl_command_queue command_queue,
- cl_event* copy_event)
-{
- return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
-}
-
-/*! \brief Launches synchronous host to device memory copy.
- */
-int ocl_copy_H2D_sync(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, cl_command_queue command_queue)
-{
- return ocl_copy_H2D(d_dest, h_src, offset, bytes, GpuApiCallBehavior::Sync, command_queue, nullptr);
-}
-
-int ocl_copy_D2H(void* h_dest,
- cl_mem d_src,
- size_t offset,
- size_t bytes,
- GpuApiCallBehavior transferKind,
- cl_command_queue command_queue,
- cl_event* copy_event)
-{
- cl_int gmx_unused cl_error;
-
- if (h_dest == nullptr || d_src == nullptr || bytes == 0)
- {
- return -1;
- }
-
- switch (transferKind)
- {
- case GpuApiCallBehavior::Async:
- cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0,
- nullptr, copy_event);
- break;
-
- case GpuApiCallBehavior::Sync:
- cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0,
- nullptr, copy_event);
- break;
-
- default: throw;
- }
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
-
-
- return 0;
-}
-
-/*! \brief Launches asynchronous device to host memory copy.
- *
- * If copy_event is not nullptr, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_D2H_async(void* h_dest,
- cl_mem d_src,
- size_t offset,
- size_t bytes,
- cl_command_queue command_queue,
- cl_event* copy_event)
-{
- return ocl_copy_D2H(h_dest, d_src, offset, bytes, GpuApiCallBehavior::Async, command_queue, copy_event);
-}
-
/*! \brief Allocates nbytes of host memory. Use ocl_free to free memory allocated with this function.
*
* \todo
cl_program program;
};
-/*! \brief Launches synchronous or asynchronous device to host memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular device to host operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_D2H(void* h_dest,
- cl_mem d_src,
- size_t offset,
- size_t bytes,
- GpuApiCallBehavior transferKind,
- cl_command_queue command_queue,
- cl_event* copy_event);
-
-
-/*! \brief Launches asynchronous device to host memory copy. */
-int ocl_copy_D2H_async(void* h_dest,
- cl_mem d_src,
- size_t offset,
- size_t bytes,
- cl_command_queue command_queue,
- cl_event* copy_event);
-
-/*! \brief Launches synchronous or asynchronous host to device memory copy.
- *
- * If copy_event is not NULL, on return it will contain an event object
- * identifying this particular host to device operation. The event can further
- * be used to queue a wait for this operation or to query profiling information.
- */
-int ocl_copy_H2D(cl_mem d_dest,
- const void* h_src,
- size_t offset,
- size_t bytes,
- GpuApiCallBehavior transferKind,
- cl_command_queue command_queue,
- cl_event* copy_event);
-
-/*! \brief Launches asynchronous host to device memory copy. */
-int ocl_copy_H2D_async(cl_mem d_dest,
- const void* h_src,
- size_t offset,
- size_t bytes,
- cl_command_queue command_queue,
- cl_event* copy_event);
-
-/*! \brief Launches synchronous host to device memory copy. */
-int ocl_copy_H2D_sync(cl_mem d_dest, const void* h_src, size_t offset, size_t bytes, cl_command_queue command_queue);
-
/*! \brief Allocate host memory in malloc style */
void pmalloc(void** h_ptr, size_t nbytes);
}
/* HtoD x, q */
- ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * sizeof(float) * 4,
- adat_len * sizeof(float) * 4, deviceStream.stream(),
+ GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
+            "The host xyzq buffer should be a flat float array, four values per atom, "
+            "matching the float4 layout of the device buffer.");
+ copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4,
+ deviceStream, GpuApiCallBehavior::Async,
bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
}
/* DtoH f */
- ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f,
- adat_begin * DIM * sizeof(nbatom->out[0].f[0]),
- adat_len * DIM * sizeof(nbatom->out[0].f[0]), deviceStream.stream(),
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+            "The host force buffer should be a flat float array, DIM values per atom, "
+            "matching the float3 layout of the device buffer.");
+ copyFromDeviceBuffer(&nbatom->out[0].f.data()[adat_begin * DIM], &adat->f, adat_begin * DIM,
+ adat_len * DIM, deviceStream, GpuApiCallBehavior::Async,
+ bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
/* kick off work */
cl_error = clFlush(deviceStream.stream());
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
- SHIFTS * sizeof(nb->nbst.fshift[0]), deviceStream.stream(),
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
+            "Sizes of host- and device-side shift force vectors should be the same.");
+ copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0,
+ SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async,
+ bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), deviceStream.stream(),
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
- ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), deviceStream.stream(),
- bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
+ "Sizes of host- and device-side LJ energy terms should be the same.");
+ copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async,
+ bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+ GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
+ "Sizes of host- and device-side electrostatic energy terms should be the "
+ "same.");
+ copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async,
+ bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
}
}
//! This function is documented in the header file
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
- cl_atomdata_t* adat = nb->atdat;
- cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream();
+ cl_atomdata_t* adat = nb->atdat;
+ const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
{
- ocl_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), 0,
- SHIFTS * sizeof(nbatom->shift_vec[0]), ls, nullptr);
+ GMX_ASSERT(sizeof(float) * DIM == sizeof(*nbatom->shift_vec.data()),
+ "Sizes of host- and device-side shift vectors should be the same.");
+ copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float*>(nbatom->shift_vec.data()),
+ 0, SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async, nullptr);
adat->bShiftVecUploaded = CL_TRUE;
}
}
if (useLjCombRule(nb->nbparam->vdwtype))
{
- ocl_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, natoms * sizeof(cl_float2),
- deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ GMX_ASSERT(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
+            "The host LJ combination-rule parameters should be a flat float array, two "
+            "values per atom, matching the float2 layout of the device buffer.");
+ copyToDeviceBuffer(&d_atdat->lj_comb, nbat->params().lj_comb.data(), 0, 2 * natoms,
+ deviceStream, GpuApiCallBehavior::Async,
+ bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
}
else
{
- ocl_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(), 0, natoms * sizeof(int),
- deviceStream.stream(), bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ GMX_ASSERT(sizeof(int) == sizeof(*nbat->params().type.data()),
+ "Sizes of host- and device-side atom types should be the same.");
+ copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, deviceStream,
+ GpuApiCallBehavior::Async, bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
}
if (bDoTime)
int nalloc;
//! float4 buffer with atom coordinates + charges, size natoms
- cl_mem xq;
+ DeviceBuffer<float> xq;
//! float3 buffer with force output array, size natoms
- cl_mem f;
+ DeviceBuffer<float> f;
//! LJ energy output, size 1
- cl_mem e_lj;
+ DeviceBuffer<float> e_lj;
//! Electrostatics energy output, size 1
- cl_mem e_el;
+ DeviceBuffer<float> e_el;
//! float3 buffer with shift forces
- cl_mem fshift;
+ DeviceBuffer<float> fshift;
//! number of atom types
int ntypes;
//! int buffer with atom type indices, size natoms
- cl_mem atom_types;
+ DeviceBuffer<int> atom_types;
//! float2 buffer with sqrt(c6),sqrt(c12), size natoms
- cl_mem lj_comb;
+ DeviceBuffer<float> lj_comb;
//! float3 buffer with shifts values
- cl_mem shift_vec;
+ DeviceBuffer<float> shift_vec;
//! true if the shift vector has been uploaded
bool bShiftVecUploaded;
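// Illustrative sketch, not part of this change: with the typed DeviceBuffer members above,
// offsets and sizes passed to the copy routines are counted in elements of the value type
// (float or int) rather than in bytes, which is why the calls earlier in this patch scale
// by 4 (xyzq), DIM (forces, shift vectors) and 2 (LJ combination parameters). The helper
// below is hypothetical and only shows the indexing convention for the xq buffer.
static void uploadXqRange(DeviceBuffer<float>* d_xq,       // float4-layout x,y,z,q data
                          const float*         h_xq,       // flat host array, 4 floats per atom
                          int                  atomsBegin, // first atom of the copied range
                          int                  numAtoms,   // number of atoms to copy
                          const DeviceStream&  deviceStream)
{
    // Offsets and counts are in floats: four per atom for the xyzq layout.
    copyToDeviceBuffer(d_xq, h_xq + atomsBegin * 4, atomsBegin * 4, numAtoms * 4,
                       deviceStream, GpuApiCallBehavior::Async, nullptr);
}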