From 0e940b48fa4c7c8bfcdce3dbf077c791cf954c5a Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Wed, 24 Mar 2021 11:54:55 +0000
Subject: [PATCH] Unify nbnxn_gpu_init_x_to_nbat_x

Previously, we had this function only for CUDA, but we will need it for
SYCL (#3932). OpenCL implementation is unlikely to be needed, but should
not hurt either.

Refs. #2608
---
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        | 107 -----------------
 src/gromacs/nbnxm/nbnxm_gpu.h                 |   4 +-
 src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp     | 109 ++++++++++++++++++
 src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h    |  20 ++++
 src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h     |  20 ++++
 5 files changed, 151 insertions(+), 109 deletions(-)

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 274f40448f..be674962d6 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -228,111 +228,4 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
     return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fShift);
 }
 
-/* Initialization for X buffer operations on GPU. */
-/* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
-void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
-{
-    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
-    bool                bDoTime       = gpu_nbv->bDoTime;
-    const int           maxNumColumns = gridSet.numColumnsMax();
-
-    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
-                           maxNumColumns * gridSet.grids().size(),
-                           &gpu_nbv->ncxy_na,
-                           &gpu_nbv->ncxy_na_alloc,
-                           *gpu_nbv->deviceContext_);
-    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
-                           maxNumColumns * gridSet.grids().size(),
-                           &gpu_nbv->ncxy_ind,
-                           &gpu_nbv->ncxy_ind_alloc,
-                           *gpu_nbv->deviceContext_);
-
-    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
-    {
-
-        const Nbnxm::Grid& grid = gridSet.grids()[g];
-
-        const int  numColumns      = grid.numColumns();
-        const int* atomIndices     = gridSet.atomIndices().data();
-        const int  atomIndicesSize = gridSet.atomIndices().size();
-        const int* cxy_na          = grid.cxy_na().data();
-        const int* cxy_ind         = grid.cxy_ind().data();
-
-        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
-                               atomIndicesSize,
-                               &gpu_nbv->atomIndicesSize,
-                               &gpu_nbv->atomIndicesSize_alloc,
-                               *gpu_nbv->deviceContext_);
-
-        if (atomIndicesSize > 0)
-        {
-
-            if (bDoTime)
-            {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
-            }
-
-            copyToDeviceBuffer(&gpu_nbv->atomIndices,
-                               atomIndices,
-                               0,
-                               atomIndicesSize,
-                               localStream,
-                               GpuApiCallBehavior::Async,
-                               nullptr);
-
-            if (bDoTime)
-            {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
-            }
-        }
-
-        if (numColumns > 0)
-        {
-            if (bDoTime)
-            {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
-            }
-
-            int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
-            copyToDeviceBuffer(
-                    &destPtr, cxy_na, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr);
-
-            if (bDoTime)
-            {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
-            }
-
-            if (bDoTime)
-            {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
-            }
-
-            destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
-            copyToDeviceBuffer(
-                    &destPtr, cxy_ind, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr);
-
-            if (bDoTime)
-            {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
-            }
-        }
-    }
-
-    if (gpu_nbv->bUseTwoStreams)
-    {
-        // The above data is transferred on the local stream but is a
-        // dependency of the nonlocal stream (specifically the nonlocal X
-        // buf ops kernel).  We therefore set a dependency to ensure
-        // that the nonlocal stream waits on the local stream here.
-        // This call records an event in the local stream:
-        gpu_nbv->misc_ops_and_local_H2D_done.markEvent(
-                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]);
-        // ...and this call instructs the nonlocal stream to wait on that event:
-        gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent(
-                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]);
-    }
-
-    return;
-}
-
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 621878d9eb..60a28c3fa4 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -233,9 +233,9 @@ float gpu_wait_finish_task(NbnxmGpu gmx_unused*    nb,
 
 /*! \brief Initialization for X buffer operations on GPU.
  * Called on the NS step and performs (re-)allocations and memory copies. !*/
-CUDA_FUNC_QUALIFIER
+GPU_FUNC_QUALIFIER
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
-                                NbnxmGpu gmx_unused* gpu_nbv) CUDA_FUNC_TERM;
+                                NbnxmGpu gmx_unused* gpu_nbv) GPU_FUNC_TERM;
 
 /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  *
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index 53c53e2528..ca39376595 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -69,6 +69,7 @@
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/pbcutil/ishift.h"
@@ -973,4 +974,112 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     nbnxnInsertNonlocalGpuDependency(nb, iloc);
 }
 
+
+/* Initialization for X buffer operations on GPU: (re-)allocates and uploads grid index data. */
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
+{
+    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const bool          bDoTime       = gpu_nbv->bDoTime;
+    const int           maxNumColumns = gridSet.numColumnsMax();
+
+    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
+                           maxNumColumns * gridSet.grids().size(),
+                           &gpu_nbv->ncxy_na,
+                           &gpu_nbv->ncxy_na_alloc,
+                           *gpu_nbv->deviceContext_);
+    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
+                           maxNumColumns * gridSet.grids().size(),
+                           &gpu_nbv->ncxy_ind,
+                           &gpu_nbv->ncxy_ind_alloc,
+                           *gpu_nbv->deviceContext_);
+
+    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
+    {
+        const Nbnxm::Grid& grid = gridSet.grids()[g];
+
+        const int  numColumns      = grid.numColumns();
+        const int* atomIndices     = gridSet.atomIndices().data();
+        const int  atomIndicesSize = gridSet.atomIndices().size();
+        const int* cxy_na          = grid.cxy_na().data();
+        const int* cxy_ind         = grid.cxy_ind().data();
+
+        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
+
+        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
+                               atomIndicesSize,
+                               &gpu_nbv->atomIndicesSize,
+                               &gpu_nbv->atomIndicesSize_alloc,
+                               *gpu_nbv->deviceContext_);
+
+        if (atomIndicesSize > 0)
+        {
+            if (bDoTime)
+            {
+                timerH2D->openTimingRegion(localStream);
+            }
+
+            copyToDeviceBuffer(&gpu_nbv->atomIndices,
+                               atomIndices,
+                               0,
+                               atomIndicesSize,
+                               localStream,
+                               GpuApiCallBehavior::Async,
+                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+            if (bDoTime)
+            {
+                timerH2D->closeTimingRegion(localStream);
+            }
+        }
+
+        if (numColumns > 0)
+        {
+            if (bDoTime)
+            {
+                timerH2D->openTimingRegion(localStream);
+            }
+
+            copyToDeviceBuffer(&gpu_nbv->cxy_na,
+                               cxy_na,
+                               maxNumColumns * g,
+                               numColumns,
+                               localStream,
+                               GpuApiCallBehavior::Async,
+                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+            if (bDoTime)
+            {
+                timerH2D->closeTimingRegion(localStream);
+            }
+
+            if (bDoTime)
+            {
+                timerH2D->openTimingRegion(localStream);
+            }
+
+            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
+                               cxy_ind,
+                               maxNumColumns * g,
+                               numColumns,
+                               localStream,
+                               GpuApiCallBehavior::Async,
+                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+            if (bDoTime)
+            {
+                timerH2D->closeTimingRegion(localStream);
+            }
+        }
+    }
+
+    // The data above is transferred on the local stream but is a dependency of the nonlocal
+    // stream (specifically the nonlocal X buf ops kernel), so the nonlocal stream must wait on
+    // the local stream here. NOTE(review): the CUDA-only version this replaces guarded the
+    // calls with bUseTwoStreams; presumably the helper checks that itself - TODO confirm.
+    // This call records an event in the local stream:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+    // ...and this call instructs the nonlocal stream to wait on that event:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+}
+
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 925f94b117..95558805bd 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -175,6 +175,26 @@ struct NbnxmGpu
     //! staging area where fshift/energies get downloaded
     NBStagingData nbst;
 
+    // Data for GPU-side coordinate conversion between integrator and NBNXM
+    /*! \brief array of atom indices */
+    DeviceBuffer<int> atomIndices;
+    /*! \brief size of atom indices */
+    int atomIndicesSize = 0;
+    /*! \brief size of atom indices allocated in device buffer */
+    int atomIndicesSize_alloc = 0;
+    /*! \brief x buf ops num of atoms */
+    DeviceBuffer<int> cxy_na;
+    /*! \brief number of elements in cxy_na */
+    int ncxy_na = 0;
+    /*! \brief number of elements allocated in device buffer */
+    int ncxy_na_alloc = 0;
+    /*! \brief x buf ops cell index mapping */
+    DeviceBuffer<int> cxy_ind;
+    /*! \brief number of elements in cxy_ind */
+    int ncxy_ind = 0;
+    /*! \brief number of elements allocated in device buffer */
+    int ncxy_ind_alloc = 0;
+
     //! local and non-local GPU queues
     gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
 
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
index fd1d655e3e..6a82823b37 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
@@ -74,6 +74,26 @@ struct NbnxmGpu
     /*! \brief atom data */
     NBAtomData* atdat = nullptr;
 
+    // Data for GPU-side coordinate conversion between integrator and NBNXM
+    /*! \brief array of atom indices */
+    DeviceBuffer<int> atomIndices;
+    /*! \brief size of atom indices */
+    int atomIndicesSize = 0;
+    /*! \brief size of atom indices allocated in device buffer */
+    int atomIndicesSize_alloc = 0;
+    /*! \brief x buf ops num of atoms */
+    DeviceBuffer<int> cxy_na;
+    /*! \brief number of elements in cxy_na */
+    int ncxy_na = 0;
+    /*! \brief number of elements allocated in device buffer */
+    int ncxy_na_alloc = 0;
+    /*! \brief x buf ops cell index mapping */
+    DeviceBuffer<int> cxy_ind;
+    /*! \brief number of elements in cxy_ind */
+    int ncxy_ind = 0;
+    /*! \brief number of elements allocated in device buffer */
+    int ncxy_ind_alloc = 0;
+
     NBParamGpu* nbparam = nullptr;
     /*! \brief pair-list data structures (local and non-local) */
     gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { { nullptr } };
-- 
2.22.0