From 3d3b09a81638135f3025883c46a87faca03fa417 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Sat, 13 Mar 2021 16:13:14 +0300
Subject: [PATCH] Unify gpu_init_atomdata(...) function

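Replace the three backend-specific copies of gpu_init_atomdata(...)
(CUDA, OpenCL, SYCL) with a single implementation in
nbnxm_gpu_data_mgmt.cpp, based on the OpenCL version.

Consequences for the individual backends:
- CUDA and OpenCL clear the force buffer with clearDeviceBufferAsync(...)
  instead of nbnxn_cuda_clear_f(...) / nbnxn_ocl_clear_f(...).
- CUDA passes timers->atdat.fetchNextEvent() to the copies when timing
  is enabled instead of nullptr.
- OpenCL reports a failed clFlush through GMX_THROW instead of
  GMX_RELEASE_ASSERT, by reusing issueClFlushInStream(...), which is
  moved up in the file so that it is defined before its first use.
- SYCL no longer asserts that timing is disabled.

Refs #2608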
---
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  89 -------------
 src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp     | 126 +++++++++++++++---
 .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp      | 100 --------------
 .../nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp       |  72 ----------
 4 files changed, 111 insertions(+), 276 deletions(-)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index ac77badc41..e6eb1c2d51 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -319,95 +319,6 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
     }
 }
 
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
-    int                  nalloc, natoms;
-    bool                 realloced;
-    bool                 bDoTime       = nb->bDoTime;
-    Nbnxm::GpuTimers*    timers        = nb->timers;
-    NBAtomData*          d_atdat       = nb->atdat;
-    const DeviceContext& deviceContext = *nb->deviceContext_;
-    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
-
-    natoms    = nbat->numAtoms();
-    realloced = false;
-
-    if (bDoTime)
-    {
-        /* time async copy */
-        timers->atdat.openTimingRegion(localStream);
-    }
-
-    /* need to reallocate if we have to copy more atoms than the amount of space
-       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
-    if (natoms > d_atdat->numAtomsAlloc)
-    {
-        nalloc = over_alloc_small(natoms);
-
-        /* free up first if the arrays have already been initialized */
-        if (d_atdat->numAtomsAlloc != -1)
-        {
-            freeDeviceBuffer(&d_atdat->f);
-            freeDeviceBuffer(&d_atdat->xq);
-            freeDeviceBuffer(&d_atdat->atomTypes);
-            freeDeviceBuffer(&d_atdat->ljComb);
-        }
-
-        allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
-        allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
-        if (useLjCombRule(nb->nbparam->vdwType))
-        {
-            allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
-        }
-        else
-        {
-            allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
-        }
-
-        d_atdat->numAtomsAlloc = nalloc;
-        realloced              = true;
-    }
-
-    d_atdat->numAtoms      = natoms;
-    d_atdat->numAtomsLocal = nbat->natoms_local;
-
-    /* need to clear GPU f output if realloc happened */
-    if (realloced)
-    {
-        nbnxn_cuda_clear_f(nb, nalloc);
-    }
-
-    if (useLjCombRule(nb->nbparam->vdwType))
-    {
-        static_assert(sizeof(d_atdat->ljComb[0]) == sizeof(Float2),
-                      "Size of the LJ parameters element should be equal to the size of float2.");
-        copyToDeviceBuffer(&d_atdat->ljComb,
-                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-    else
-    {
-        static_assert(sizeof(d_atdat->atomTypes[0]) == sizeof(nbat->params().type[0]),
-                      "Sizes of host- and device-side atom types should be the same.");
-        copyToDeviceBuffer(&d_atdat->atomTypes,
-                           nbat->params().type.data(),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-
-    if (bDoTime)
-    {
-        timers->atdat.closeTimingRegion(localStream);
-    }
-}
-
 void gpu_free(NbnxmGpu* nb)
 {
     if (nb == nullptr)
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index bd056bcee8..f92f636e19 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -77,6 +77,23 @@
 namespace Nbnxm
 {
 
+inline void issueClFlushInStream(const DeviceStream& deviceStream)
+{
+#if GMX_GPU_OPENCL
+    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+     * in the stream after marking an event in it in order to be able to sync with
+     * the event from another stream. A failed flush is reported as gmx::InternalError.
+     */
+    cl_int cl_error = clFlush(deviceStream.stream());
+    if (cl_error != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
+    }
+#else
+    GMX_UNUSED_VALUE(deviceStream); // non-OpenCL builds: no flush is issued
+#endif
+}
+
 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                     NBParamGpu*                  nbp,
                                     const DeviceContext&         deviceContext)
@@ -316,6 +333,100 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     d_plist->haveFreshList = true;
 }
 
+void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
+{
+    bool                 bDoTime       = nb->bDoTime;
+    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
+    NBAtomData*          atdat         = nb->atdat;
+    const DeviceContext& deviceContext = *nb->deviceContext_;
+    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
+
+    int  numAtoms  = nbat->numAtoms();
+    bool realloced = false;
+
+    if (bDoTime)
+    {
+        /* time the asynchronous copies to the device buffers issued below */
+        timers->atdat.openTimingRegion(localStream);
+    }
+
+    /* need to reallocate if we have to copy more atoms than the amount of space available;
+       numAtomsAlloc == -1 means that the buffers have not been allocated yet */
+    if (numAtoms > atdat->numAtomsAlloc)
+    {
+        int numAlloc = over_alloc_small(numAtoms);
+
+        /* free up first if the arrays have already been initialized */
+        if (atdat->numAtomsAlloc != -1)
+        {
+            freeDeviceBuffer(&atdat->f);
+            freeDeviceBuffer(&atdat->xq);
+            freeDeviceBuffer(&atdat->ljComb);
+            freeDeviceBuffer(&atdat->atomTypes);
+        }
+
+        /* allocate the buffers with the size returned by over_alloc_small() */
+        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
+        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
+
+        if (useLjCombRule(nb->nbparam->vdwType))
+        {
+            // Two Lennard-Jones parameters per atom, stored as one Float2 element
+            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
+        }
+        else
+        {
+            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
+        }
+
+        atdat->numAtomsAlloc = numAlloc;
+        realloced            = true;
+    }
+
+    atdat->numAtoms      = numAtoms;
+    atdat->numAtomsLocal = nbat->natoms_local;
+
+    /* need to clear the whole GPU f output buffer if realloc happened */
+    if (realloced)
+    {
+        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
+    }
+
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        static_assert(
+                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
+        copyToDeviceBuffer(&atdat->ljComb,
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
+                           0,
+                           numAtoms,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+    }
+    else
+    {
+        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
+                      "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&atdat->atomTypes,
+                           nbat->params().type.data(),
+                           0,
+                           numAtoms,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+    }
+
+    if (bDoTime)
+    {
+        timers->atdat.closeTimingRegion(localStream);
+    }
+
+    /* kick off the tasks enqueued above to ensure concurrency with the search (OpenCL only) */
+    issueClFlushInStream(localStream);
+}
+
 //! This function is documented in the header file
 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 {
@@ -430,21 +541,6 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 }
 
-inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
-{
-#if GMX_GPU_OPENCL
-    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-     * in the stream after marking an event in it in order to be able to sync with
-     * the event from another stream.
-     */
-    cl_int cl_error = clFlush(deviceStream.stream());
-    if (cl_error != CL_SUCCESS)
-    {
-        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
-    }
-#endif
-}
-
 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index b927bd3196..36e538d22b 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -426,106 +426,6 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
     }
 }
 
-//! This function is documented in the header file
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
-    cl_int               cl_error;
-    int                  nalloc, natoms;
-    bool                 realloced;
-    bool                 bDoTime       = nb->bDoTime;
-    Nbnxm::GpuTimers*    timers        = nb->timers;
-    NBAtomData*          d_atdat       = nb->atdat;
-    const DeviceContext& deviceContext = *nb->deviceContext_;
-    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
-
-    natoms    = nbat->numAtoms();
-    realloced = false;
-
-    if (bDoTime)
-    {
-        /* time async copy */
-        timers->atdat.openTimingRegion(localStream);
-    }
-
-    /* need to reallocate if we have to copy more atoms than the amount of space
-       available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */
-    if (natoms > d_atdat->numAtomsAlloc)
-    {
-        nalloc = over_alloc_small(natoms);
-
-        /* free up first if the arrays have already been initialized */
-        if (d_atdat->numAtomsAlloc != -1)
-        {
-            freeDeviceBuffer(&d_atdat->f);
-            freeDeviceBuffer(&d_atdat->xq);
-            freeDeviceBuffer(&d_atdat->ljComb);
-            freeDeviceBuffer(&d_atdat->atomTypes);
-        }
-
-
-        allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
-        allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
-
-        if (useLjCombRule(nb->nbparam->vdwType))
-        {
-            // Two Lennard-Jones parameters per atom
-            allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext);
-        }
-        else
-        {
-            allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext);
-        }
-
-        d_atdat->numAtomsAlloc = nalloc;
-        realloced              = true;
-    }
-
-    d_atdat->numAtoms      = natoms;
-    d_atdat->numAtomsLocal = nbat->natoms_local;
-
-    /* need to clear GPU f output if realloc happened */
-    if (realloced)
-    {
-        nbnxn_ocl_clear_f(nb, nalloc);
-    }
-
-    if (useLjCombRule(nb->nbparam->vdwType))
-    {
-        static_assert(
-                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
-                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
-        copyToDeviceBuffer(&d_atdat->ljComb,
-                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
-    }
-    else
-    {
-        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
-                      "Sizes of host- and device-side atom types should be the same.");
-        copyToDeviceBuffer(&d_atdat->atomTypes,
-                           nbat->params().type.data(),
-                           0,
-                           natoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
-    }
-
-    if (bDoTime)
-    {
-        timers->atdat.closeTimingRegion(localStream);
-    }
-
-    /* kick off the tasks enqueued above to ensure concurrency with the search */
-    cl_error = clFlush(localStream.stream());
-    GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-}
-
 /*! \brief Releases an OpenCL kernel pointer */
 static void free_kernel(cl_kernel* kernel_ptr)
 {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
index e5a1a4e344..cc4f9f3a6b 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
@@ -226,78 +226,6 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
     }
 }
 
-void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
-{
-    GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
-    NBAtomData*          atdat         = nb->atdat;
-    const DeviceContext& deviceContext = *nb->deviceContext_;
-    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
-
-    int  numAtoms    = nbat->numAtoms();
-    bool reallocated = false;
-    if (numAtoms > atdat->numAtomsAlloc)
-    {
-        int numAlloc = over_alloc_small(numAtoms);
-
-        /* free up first if the arrays have already been initialized */
-        if (atdat->numAtomsAlloc != -1)
-        {
-            freeDeviceBuffer(&atdat->f);
-            freeDeviceBuffer(&atdat->xq);
-            freeDeviceBuffer(&atdat->atomTypes);
-            freeDeviceBuffer(&atdat->ljComb);
-        }
-
-        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
-        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
-        if (useLjCombRule(nb->nbparam->vdwType))
-        {
-            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
-        }
-        else
-        {
-            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
-        }
-
-        atdat->numAtomsAlloc = numAlloc;
-        reallocated          = true;
-    }
-
-    atdat->numAtoms      = numAtoms;
-    atdat->numAtomsLocal = nbat->natoms_local;
-
-    /* need to clear GPU f output if realloc happened */
-    if (reallocated)
-    {
-        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
-    }
-
-    if (useLjCombRule(nb->nbparam->vdwType))
-    {
-        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(Float2),
-                   "Size of the LJ parameters element should be equal to the size of float2.");
-        copyToDeviceBuffer(&atdat->ljComb,
-                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
-                           0,
-                           numAtoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-    else
-    {
-        GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
-                   "Sizes of host- and device-side atom types should be the same.");
-        copyToDeviceBuffer(&atdat->atomTypes,
-                           nbat->params().type.data(),
-                           0,
-                           numAtoms,
-                           localStream,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
-}
-
 void gpu_free(NbnxmGpu* nb)
 {
     if (nb == nullptr)
-- 
2.22.0