)
elseif(GMX_USE_CUDA)
gmx_add_libgromacs_sources(
- cudautils.cu
device_stream.cu
gpu_utils.cu
pinning.cu
#endif /* CHECK_CUDA_ERRORS */
-/*! Launches synchronous or asynchronous device to host memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_D2H(void* h_dest, void* d_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous device to host memory copy in stream 0. */
-int cu_copy_D2H_sync(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous device to host memory copy in stream s. */
-int cu_copy_D2H_async(void* /*h_dest*/, void* /*d_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous or asynchronous host to device memory copy.
- *
- * The copy is launched in stream s or if not specified, in stream 0.
- */
-int cu_copy_H2D(void* d_dest, const void* h_src, size_t bytes, GpuApiCallBehavior transferKind, cudaStream_t /*s = nullptr*/);
-
-/*! Launches synchronous host to device memory copy in stream 0. */
-int cu_copy_H2D_sync(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/);
-
-/*! Launches asynchronous host to device memory copy in stream s. */
-int cu_copy_H2D_async(void* /*d_dest*/, const void* /*h_src*/, size_t /*bytes*/, cudaStream_t /*s = nullptr*/);
-
// TODO: the two functions below are pretty much a constructor/destructor of a simple
// GPU table object. There is also an almost self-contained fetchFromParamLookupTable()
// in cuda_kernel_utils.cuh. They could all live in a separate class/struct file;
// a possible shape is sketched below.
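// A minimal sketch of what such a class could look like, assuming that
// initParamLookupTable()/destroyParamLookupTable() keep their current
// parameters (device pointer, texture object, host data, element count).
// The class name and member layout are hypothetical, not existing GROMACS API.
template<typename T>
class GpuParamLookupTable
{
public:
    GpuParamLookupTable(const T* hostData, int numElements)
    {
        // Assumed to allocate device memory, copy hostData, and create the
        // texture object later read by fetchFromParamLookupTable().
        initParamLookupTable(&deviceData_, &texObj_, hostData, numElements);
    }
    ~GpuParamLookupTable() { destroyParamLookupTable(deviceData_, texObj_); }
    // The object owns device resources, so copying is disabled.
    GpuParamLookupTable(const GpuParamLookupTable&) = delete;
    GpuParamLookupTable& operator=(const GpuParamLookupTable&) = delete;

private:
    T*                  deviceData_ = nullptr;
    cudaTextureObject_t texObj_     = 0;
};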
/*! \brief
 * Performs the host-to-device data copy, synchronously or asynchronously on request.
*
- * TODO: This is meant to gradually replace cu/ocl_copy_h2d.
- *
* \tparam ValueType Raw value type of the \p buffer.
* \param[in,out] buffer Pointer to the device-side buffer
* \param[in] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
{
if (numValues == 0)
{
- return; // such calls are actually made with empty domains
+ return;
}
GMX_ASSERT(buffer, "needs a buffer pointer");
GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
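// For reference, a hypothetical call site for this helper. The names below
// (h_data, d_data, deviceContext, deviceStream) are illustrative only, and
// allocateDeviceBuffer() is assumed to be the companion allocation helper
// from this header. A plain std::vector is not pinned, so the copy is made
// synchronous; GpuApiCallBehavior::Async would require pinned host memory.
std::vector<float>  h_data(numValues, 1.0F);
DeviceBuffer<float> d_data;
allocateDeviceBuffer(&d_data, numValues, deviceContext);
copyToDeviceBuffer(&d_data, h_data.data(), 0, numValues, deviceStream,
                   GpuApiCallBehavior::Sync, nullptr);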
/*! \brief
 * Performs the device-to-host data copy, synchronously or asynchronously on request.
*
- * TODO: This is meant to gradually replace cu/ocl_copy_d2h.
- *
* \tparam ValueType Raw value type of the \p buffer.
* \param[in,out] hostBuffer Pointer to the raw host-side memory, also typed \p ValueType
* \param[in] buffer Pointer to the device-side buffer
GpuApiCallBehavior transferKind,
CommandEvent* /*timingEvent*/)
{
+ if (numValues == 0)
+ {
+ return;
+ }
GMX_ASSERT(buffer, "needs a buffer pointer");
GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
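// The reverse transfer mirrors the call sketched above. Continuing the same
// hypothetical example: read the results back synchronously, so that h_data
// is valid as soon as the call returns, then release the device buffer.
copyFromDeviceBuffer(h_data.data(), &d_data, 0, numValues, deviceStream,
                     GpuApiCallBehavior::Sync, nullptr);
freeDeviceBuffer(&d_data);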
#include "nbnxm_cuda.h"
-#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
}
- cu_copy_H2D_async(adat->xq + adat_begin,
- static_cast<const void*>(nbatom->x().data() + adat_begin * 4),
- adat_len * sizeof(*adat->xq), deviceStream.stream());
+ static_assert(sizeof(adat->xq[0]) == sizeof(float4),
+ "The size of the xyzq buffer element should be equal to the size of float4.");
+ copyToDeviceBuffer(&adat->xq, reinterpret_cast<const float4*>(nbatom->x().data()) + adat_begin,
+ adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
*/
if (!stepWork.useGpuFBufferOps)
{
- cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
- (adat_len) * sizeof(*adat->f), deviceStream.stream());
+ static_assert(sizeof(adat->f[0]) == sizeof(float3),
+ "The size of the force buffer element should be equal to the size of float3.");
+ copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adat_begin, &adat->f,
+ adat_begin, adat_len, deviceStream, GpuApiCallBehavior::Async, nullptr);
}
/* After the non-local D2H is launched the nonlocal_done event can be
/* DtoH fshift when virial is needed */
if (stepWork.computeVirial)
{
- cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift),
- deviceStream.stream());
+ static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+ "Sizes of host- and device-side shift vectors should be the same.");
+ copyFromDeviceBuffer(nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
}
/* DtoH energies */
if (stepWork.computeEnergy)
{
- cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), deviceStream.stream());
- cu_copy_D2H_async(nb->nbst.e_el, adat->e_el, sizeof(*nb->nbst.e_el), deviceStream.stream());
+ static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+ "Sizes of host- and device-side LJ energy terms should be the same.");
+ copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
+ static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+ "Sizes of host- and device-side electrostatic energy terms should be the same.");
+ copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream,
+ GpuApiCallBehavior::Async, nullptr);
}
}
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
- cu_atomdata_t* adat = nb->atdat;
- cudaStream_t ls = nb->deviceStreams[InteractionLocality::Local]->stream();
+ cu_atomdata_t* adat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/* only if we have a dynamic box */
if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
{
- cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec.data(), SHIFTS * sizeof(*adat->shift_vec), ls);
+ static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]),
+ "Sizes of host- and device-side shift vectors should be the same.");
+ copyToDeviceBuffer(&adat->shift_vec, reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+ 0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr);
adat->bShiftVecUploaded = true;
}
}
cudaError_t stat;
int nalloc, natoms;
bool realloced;
- bool bDoTime = nb->bDoTime;
- cu_timers_t* timers = nb->timers;
- cu_atomdata_t* d_atdat = nb->atdat;
- const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t* timers = nb->timers;
+ cu_atomdata_t* d_atdat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
natoms = nbat->numAtoms();
realloced = false;
if (bDoTime)
{
/* time async copy */
- timers->atdat.openTimingRegion(deviceStream);
+ timers->atdat.openTimingRegion(localStream);
}
/* need to reallocate if we have to copy more atoms than the amount of space
if (useLjCombRule(nb->nbparam))
{
- cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
- natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
+ static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(float2),
+ "Size of the LJ parameters element should be equal to the size of float2.");
+ copyToDeviceBuffer(&d_atdat->lj_comb,
+ reinterpret_cast<const float2*>(nbat->params().lj_comb.data()), 0,
+ natoms, localStream, GpuApiCallBehavior::Async, nullptr);
}
else
{
- cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
- natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
+ static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]),
+ "Sizes of host- and device-side atom types should be the same.");
+ copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, natoms, localStream,
+ GpuApiCallBehavior::Async, nullptr);
}
if (bDoTime)
{
- timers->atdat.closeTimingRegion(deviceStream);
+ timers->atdat.closeTimingRegion(localStream);
}
}