Unify gpu_launch_cpyback(...) function in NBNXM
[alexxy/gromacs.git] / src / gromacs / nbnxm / cuda / nbnxm_cuda.cu
index 6e3ad5cdfe1f87e7b39996a6b55218654ae629a4..bd5fa8de5dcc74499f7722ddd14d46dce4be5fa1 100644 (file)
@@ -692,110 +692,6 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     }
 }
 
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        nbnxn_atomdata_t*        nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-    GMX_ASSERT(iloc == InteractionLocality::Local
-                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
-               "Non-local stream is indicating that the copy back event is enqueued at the "
-               "beginning of the copy back function.");
-
-    /* extract the data */
-    NBAtomData*         adat         = nb->atdat;
-    Nbnxm::GpuTimers*   timers       = nb->timers;
-    bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        nb->bNonLocalStreamDoneMarked = false;
-        return;
-    }
-
-    /* local/nonlocal offset and length used for xq and f */
-    auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       kernel has finished. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
-    {
-        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = false;
-    }
-
-    /* DtoH f
-     * Skip if buffer ops / reduction is offloaded to the GPU.
-     */
-    if (!stepWork.useGpuFBufferOps)
-    {
-        static_assert(
-                sizeof(adat->f[0]) == sizeof(Float3),
-                "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                             &adat->f,
-                             atomsRange.begin(),
-                             atomsRange.size(),
-                             deviceStream,
-                             GpuApiCallBehavior::Async,
-                             nullptr);
-    }
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we want the non-local data
-       back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        nb->nonlocal_done.markEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = true;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
-                          "Sizes of host- and device-side shift vectors should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
-                          "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-            static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
-                          "Sizes of host- and device-side electrostatic energy terms should be the "
-                          "same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
-    }
-}
-
 void cuda_set_cacheconfig()
 {
     cudaError_t stat;