}
}
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- validateGpuAtomLocality(atomLocality);
-
- const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
- NBAtomData* adat = nb->atdat;
- gpu_plist* plist = nb->plist[iloc];
- const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
- /* Don't launch the non-local H2D copy if there is no dependent
- work to do: neither non-local nor other (e.g. bonded) work
- to do that has as input the nbnxn coordinates.
- Doing the same for the local kernel is more complicated, since the
- local part of the force array also depends on the non-local kernel.
- So to avoid complicating the code and to reduce the risk of bugs,
- we always call the local local x+q copy (and the rest of the local
- work in nbnxn_gpu_launch_kernel().
- */
- if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
- {
- plist->haveFreshList = false;
-
- // The event is marked for Local interactions unconditionally,
- // so it has to be released here because of the early return
- // for NonLocal interactions.
- nb->misc_ops_and_local_H2D_done.reset();
-
- return;
- }
-
- /* local/nonlocal offset and length used for xq and f */
- auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
- /* HtoD x, q */
- GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4),
- "The size of the xyzq buffer element should be equal to the size of float4.");
- copyToDeviceBuffer(&adat->xq,
- reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
- atomsRange.begin(),
- atomsRange.size(),
- deviceStream,
- GpuApiCallBehavior::Async,
- nullptr);
-
- /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL.
- * Runtime should do the scheduling correctly based on data dependencies.
- * But for consistency's sake, we do it anyway. */
- /* When we get here all misc operations issued in the local stream as well as
- * the local xq H2D are done, so we record that in the local stream and wait for it in the
- * nonlocal one. This wait needs to precede any PP tasks, bonded or nonbonded, that may
- * compute on interactions between local and nonlocal atoms. */
- nbnxnInsertNonlocalGpuDependency(nb, iloc);
-}
-
void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
{
gpu_plist* plist = nb->plist[iloc];