From d2d4a50b4c636c203028c5bff311924ec15e7825 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Tue, 16 Mar 2021 16:26:37 +0300
Subject: [PATCH] Unify gpu_launch_cpyback(...) function in NBNXM

Refs #2608
---
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu      | 104 -----------------
 src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 130 +++++++++++++++++++++
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp    | 132 ----------------------
 src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp     |  91 ---------------
 4 files changed, 130 insertions(+), 327 deletions(-)

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 6e3ad5cdfe..bd5fa8de5d 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -692,110 +692,6 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     }
 }
 
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        nbnxn_atomdata_t*        nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-    GMX_ASSERT(iloc == InteractionLocality::Local
-                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
-               "Non-local stream is indicating that the copy back event is enqueued at the "
-               "beginning of the copy back function.");
-
-    /* extract the data */
-    NBAtomData*         adat         = nb->atdat;
-    Nbnxm::GpuTimers*   timers       = nb->timers;
-    bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        nb->bNonLocalStreamDoneMarked = false;
-        return;
-    }
-
-    /* local/nonlocal offset and length used for xq and f */
-    auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       kernel has finished. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
-    {
-        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = false;
-    }
-
-    /* DtoH f
-     * Skip if buffer ops / reduction is offloaded to the GPU.
-     */
-    if (!stepWork.useGpuFBufferOps)
-    {
-        static_assert(
-                sizeof(adat->f[0]) == sizeof(Float3),
-                "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                             &adat->f,
-                             atomsRange.begin(),
-                             atomsRange.size(),
-                             deviceStream,
-                             GpuApiCallBehavior::Async,
-                             nullptr);
-    }
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we want the non-local data
-       back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        nb->nonlocal_done.markEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = true;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
-                          "Sizes of host- and device-side shift vectors should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
-                          "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-            static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
-                          "Sizes of host- and device-side electrostatic energy terms should be the "
-                          "same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
-    }
-}
-
 void cuda_set_cacheconfig()
 {
     cudaError_t stat;
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index f92f636e19..50519ced6d 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -64,8 +64,10 @@
 
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/exceptions.h"
@@ -541,6 +543,150 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 }
 
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ *
+ * The force download is skipped when \c stepWork.useGpuFBufferOps is set, i.e. when
+ * buffer ops / reduction is offloaded to the GPU.
+ *
+ * \param[in,out] nb            Nonbonded GPU data, must not be null; its non-local "done" state is updated.
+ * \param[out]    nbatom        Host atom data; receives the forces in \c nbatom->out[0].f.
+ * \param[in]     stepWork      Step workload flags (force buffer ops, virial and energy are read).
+ * \param[in]     atomLocality  Atom locality selecting the atom range and the stream to use.
+ */
+void gpu_launch_cpyback(NbnxmGpu*                nb,
+                        struct nbnxn_atomdata_t* nbatom,
+                        const gmx::StepWorkload& stepWork,
+                        const AtomLocality       atomLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    /* determine interaction locality from atom locality */
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
+
+    /* extract the data */
+    NBAtomData*         adat         = nb->atdat;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    bool                bDoTime      = nb->bDoTime;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+    {
+        /* TODO An alternative way to signal that non-local work is
+           complete is to use a clEnqueueMarker+clEnqueueBarrier
+           pair. However, the use of bNonLocalStreamDoneMarked has the
+           advantage of being local to the host, so probably minimizes
+           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
+           test case, overall simulation performance was higher with
+           the API calls, but this has not been tested on AMD OpenCL,
+           so could be worth considering in future. */
+        nb->bNonLocalStreamDoneMarked = false;
+        return;
+    }
+
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
+
+    /* beginning of timed D2H section */
+    if (bDoTime)
+    {
+        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+    }
+
+    /* With DD the local D2H transfer can only start after the non-local
+       has been launched. */
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
+    {
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
+    }
+
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
+    {
+        static_assert(
+                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+                "The host force buffer should be in single precision to match device data size.");
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
+                             &adat->f,
+                             atomsRange.begin(),
+                             atomsRange.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Async,
+                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+
+        /* kick off work */
+        issueClFlushInStream(deviceStream);
+    }
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we want the non-local data
+       back first. */
+    if (iloc == InteractionLocality::NonLocal)
+    {
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
+    }
+
+    /* only transfer energies in the local stream */
+    if (iloc == InteractionLocality::Local)
+    {
+        /* DtoH fshift when virial is needed */
+        if (stepWork.computeVirial)
+        {
+            static_assert(
+                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
+                    "Sizes of host- and device-side shift vector elements should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fShift,
+                                 &adat->fShift,
+                                 0,
+                                 SHIFTS,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+        }
+
+        /* DtoH energies */
+        if (stepWork.computeEnergy)
+        {
+            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.eLJ,
+                                 &adat->eLJ,
+                                 0,
+                                 1,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
+            copyFromDeviceBuffer(nb->nbst.eElec,
+                                 &adat->eElec,
+                                 0,
+                                 1,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+        }
+    }
+
+    /* end of timed D2H section */
+    if (bDoTime)
+    {
+        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+    }
+}
+
 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index af7f4ad86f..d14aad3da1 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -81,7 +81,6 @@
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/fatalerror.h"
@@ -812,135 +811,4 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     }
 }
 
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        struct nbnxn_atomdata_t* nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    cl_int gmx_unused cl_error;
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-    GMX_ASSERT(iloc == InteractionLocality::Local
-                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
-               "Non-local stream is indicating that the copy back event is enqueued at the "
-               "beginning of the copy back function.");
-
-    NBAtomData*         adat         = nb->atdat;
-    Nbnxm::GpuTimers*   timers       = nb->timers;
-    bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        /* TODO An alternative way to signal that non-local work is
-           complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamDoneMarked has the
-           advantage of being local to the host, so probably minimizes
-           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
-           test case, overall simulation performance was higher with
-           the API calls, but this has not been tested on AMD OpenCL,
-           so could be worth considering in future. */
-        nb->bNonLocalStreamDoneMarked = false;
-        return;
-    }
-
-    /* local/nonlocal offset and length used for xq and f */
-    auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
-    {
-        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = false;
-    }
-
-    /* DtoH f */
-    GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
-               "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                         &adat->f,
-                         atomsRange.begin(),
-                         atomsRange.size(),
-                         deviceStream,
-                         GpuApiCallBehavior::Async,
-                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-
-    /* kick off work */
-    cl_error = clFlush(deviceStream.stream());
-    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we first need the non-local
-       data back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        nb->nonlocal_done.markEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = true;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            static_assert(
-                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
-                    "Sizes of host- and device-side shift vector elements should be the same.");
-            copyFromDeviceBuffer(nb->nbst.fShift,
-                                 &adat->fShift,
-                                 0,
-                                 SHIFTS,
-                                 deviceStream,
-                                 GpuApiCallBehavior::Async,
-                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
-                          "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(nb->nbst.eLJ,
-                                 &adat->eLJ,
-                                 0,
-                                 1,
-                                 deviceStream,
-                                 GpuApiCallBehavior::Async,
-                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
-                          "Sizes of host- and device-side electrostatic energy terms should be the "
-                          "same.");
-            copyFromDeviceBuffer(nb->nbst.eElec,
-                                 &adat->eElec,
-                                 0,
-                                 1,
-                                 deviceStream,
-                                 GpuApiCallBehavior::Async,
-                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
-    }
-}
-
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
index 1a130a6fb5..d508a20ef1 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
@@ -51,97 +51,6 @@
 namespace Nbnxm
 {
 
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        struct nbnxn_atomdata_t* nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-    GMX_ASSERT(iloc == InteractionLocality::Local
-                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
-               "Non-local stream is indicating that the copy back event is enqueued at the "
-               "beginning of the copy back function.");
-
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-    NBAtomData*         adat         = nb->atdat;
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        nb->bNonLocalStreamDoneMarked = false;
-        return;
-    }
-
-    /* local/nonlocal offset and length used for xq and f */
-    auto atomsRange = getGpuAtomRange(adat, atomLocality);
-
-    // With DD the local D2H transfer can only start after the non-local kernel has finished.
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
-    {
-        nb->nonlocal_done.waitForEvent();
-        nb->bNonLocalStreamDoneMarked = false;
-    }
-
-    /* DtoH f
-     * Skip if buffer ops / reduction is offloaded to the GPU.
-     */
-    if (!stepWork.useGpuFBufferOps)
-    {
-        GMX_ASSERT(adat->f.elementSize() == sizeof(Float3),
-                   "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                             &adat->f,
-                             atomsRange.begin(),
-                             atomsRange.size(),
-                             deviceStream,
-                             GpuApiCallBehavior::Async,
-                             nullptr);
-    }
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we want the non-local data
-       back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        nb->nonlocal_done.markEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = true;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            GMX_ASSERT(sizeof(*nb->nbst.fShift) == adat->fShift.elementSize(),
-                       "Sizes of host- and device-side shift vector elements should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            GMX_ASSERT(sizeof(*nb->nbst.eLJ) == sizeof(float),
-                       "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-            GMX_ASSERT(sizeof(*nb->nbst.eElec) == sizeof(float),
-                       "Sizes of host- and device-side electrostatic energy terms should be the "
-                       "same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-    }
-}
-
 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
 {
     gpu_plist* plist = nb->plist[iloc];
-- 
2.22.0