From 32b7891450622add82abfac2f44437b3a5f6e92e Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Wed, 12 Jun 2019 08:03:27 -0700
Subject: [PATCH] Bug fix and simplification for CUDA X Buffer Ops

Fixes a bug triggered when there are more than two grids (i.e. when the
grids do not map naively to local/nonlocal) by using a separate GPU
memory space for the cxy_na and cxy_ind data of each grid. Previously
there was only one local and one nonlocal space, so with more than two
grids the data was overwritten. Also simplifies the code: the init
function is now called only once per NS step to set up the data for all
grids, where previously it was called separately for the local and
nonlocal localities.

Change-Id: Ia2b97d22324aa97dca34b05da2eca2e2090372af
---
 src/gromacs/mdlib/sim_util.cpp                |   8 +-
 src/gromacs/mdrun/md.cpp                      |   9 +-
 src/gromacs/nbnxm/atomdata.cpp                |   2 +-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          |  60 +++++----
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        | 119 ++++++++----------
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |  22 ++--
 src/gromacs/nbnxm/gridset.cpp                 |   8 ++
 src/gromacs/nbnxm/gridset.h                   |  15 +++
 src/gromacs/nbnxm/nbnxm.cpp                   |  12 +-
 src/gromacs/nbnxm/nbnxm.h                     |   6 +-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  15 ++-
 11 files changed, 144 insertions(+), 132 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index baecb96ba1..ad61dee88a 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1040,7 +1040,7 @@ void do_force(FILE *fplog,
 
         if (useGpuXBufOps)
         {
-            nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+            nbv->atomdata_init_copy_x_to_nbat_x_gpu();
         }
     }
 
@@ -1107,12 +1107,6 @@ void do_force(FILE *fplog,
                                &top->excls, step, nrnb);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
-
-            if (useGpuXBufOps)
-            {
-
-                nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
-            }
         }
         else
         {
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index aba6ba9ca1..b5869d6521 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -300,10 +300,6 @@ void gmx::Simulator::do_md()
         stateInstance = std::make_unique<t_state>();
         state         = stateInstance.get();
-        if (fr->nbv->useGpu())
-        {
-            changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
-        }
         dd_init_local_state(cr->dd, state_global, state);
 
         /* Distribute the charge groups over the nodes from the master node */
@@ -346,6 +342,11 @@ void gmx::Simulator::do_md()
     }
 
+    if (fr->nbv->useGpu())
+    {
+        changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+    }
+
     // NOTE: The global state is no longer used at this point.
     // But state_global is still used as temporary storage space for writing
     // the global state to file and potentially for replica exchange.
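The fix hinges on a flat per-grid layout for the cxy_na and cxy_ind device
arrays: a single allocation of numColumnsMax*numGrids ints, in which grid g
owns the slot starting at numColumnsMax*g. A minimal standalone sketch of
that addressing scheme (DeviceIntBuffer and gridSlot are illustrative names,
not GROMACS API):

    // One device allocation shared by all grids; grid g owns the
    // half-open element range [numColumnsMax*g, numColumnsMax*(g+1)).
    struct DeviceIntBuffer
    {
        int *data; // device pointer to numColumnsMax*numGrids ints
    };

    // A fixed stride means grids can never overwrite each other's slots,
    // however many grids exist, which is the bug this patch fixes.
    static const int *gridSlot(const DeviceIntBuffer &buf, int gridId, int numColumnsMax)
    {
        return &buf.data[numColumnsMax * gridId];
    }

This is the same indexing the kernel launch below uses when forming the
d_cxy_na and d_cxy_ind arguments.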
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 55084763f3..c1271d8bba 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1043,7 +1043,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
                                       gpu_nbv,
                                       xPmeDevicePtr,
                                       locality,
-                                      x);
+                                      x, g, gridSet.numColumnsMax());
             }
         }
         else
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 7162c9f593..a0117a5cb1 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -277,11 +277,11 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g
  *
  * As the point where the local stream tasks can be considered complete happens
  * at the same call point where the nonlocal stream should be synced with the
- * the local, this function recrds the event if called with the local stream as
+ * the local, this function records the event if called with the local stream as
  * argument and inserts in the GPU stream a wait on the event on the nonlocal.
  */
-static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
-                                        const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
+                                      const InteractionLocality interactionLocality)
 {
     cudaStream_t stream = nb->stream[interactionLocality];
@@ -375,7 +375,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
        This wait needs to precede any PP tasks, bonded or nonbonded, that may
        compute on interactions between local and nonlocal atoms.
      */
-    insertNonlocalGpuDependency(nb, iloc);
+    nbnxnInsertNonlocalGpuDependency(nb, iloc);
 }
 
 /*! As we execute nonbonded workload in separate streams, before launching
@@ -743,7 +743,9 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid        &grid,
                            gmx_nbnxn_gpu_t          *nb,
                            void                     *xPmeDevicePtr,
                            const Nbnxm::AtomLocality locality,
-                           const rvec               *x)
+                           const rvec               *x,
+                           int                       gridId,
+                           int                       numColumnsMax)
 {
     cu_atomdata_t *adat    = nb->atdat;
     bool           bDoTime = nb->bDoTime;
@@ -751,17 +753,11 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     const int numColumns      = grid.numColumns();
     const int cellOffset      = grid.cellOffset();
     const int numAtomsPerCell = grid.numAtomsPerCell();
-    // TODO: Document this, one can not infer the interaction locality from the atom locality
-    Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
-    int nCopyAtoms    = grid.srcAtomEnd() - grid.srcAtomBegin();
-    int copyAtomStart = grid.srcAtomBegin();
+    Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
+    int nCopyAtoms    = grid.srcAtomEnd() - grid.srcAtomBegin();
+    int copyAtomStart = grid.srcAtomBegin();
 
-    if (locality == Nbnxm::AtomLocality::NonLocal)
-    {
-        interactionLoc = Nbnxm::InteractionLocality::NonLocal;
-    }
-
-    cudaStream_t stream = nb->stream[interactionLoc];
+    cudaStream_t stream = nb->stream[interactionLoc];
 
     // FIXME: need to either let the local stream get to the
     // insertNonlocalGpuDependency call or call it separately here
@@ -769,7 +765,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     {
         if (interactionLoc == Nbnxm::InteractionLocality::Local)
        {
-            insertNonlocalGpuDependency(nb, interactionLoc);
+            nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
         }
         return;
     }
@@ -820,24 +816,24 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
         config.sharedMemorySize = 0;
         config.stream           = stream;
 
-        auto       kernelFn      = nbnxn_gpu_x_to_nbat_x_kernel;
-        float     *xqPtr         = &(adat->xq->x);
-        const int *d_atomIndices = nb->atomIndices;
-        const int *d_cxy_na      = nb->cxy_na[locality];
-        const int *d_cxy_ind     = nb->cxy_ind[locality];
-        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config,
-                                                             &numColumns,
-                                                             &xqPtr,
-                                                             &setFillerCoords,
-                                                             &d_x,
-                                                             &d_atomIndices,
-                                                             &d_cxy_na,
-                                                             &d_cxy_ind,
-                                                             &cellOffset,
-                                                             &numAtomsPerCell);
+        auto       kernelFn      = nbnxn_gpu_x_to_nbat_x_kernel;
+        float     *xqPtr         = &(adat->xq->x);
+        const int *d_atomIndices = nb->atomIndices;
+        const int *d_cxy_na      = &nb->cxy_na[numColumnsMax*gridId];
+        const int *d_cxy_ind     = &nb->cxy_ind[numColumnsMax*gridId];
+        const auto kernelArgs    = prepareGpuKernelArguments(kernelFn, config,
+                                                             &numColumns,
+                                                             &xqPtr,
+                                                             &setFillerCoords,
+                                                             &d_x,
+                                                             &d_atomIndices,
+                                                             &d_cxy_na,
+                                                             &d_cxy_ind,
+                                                             &cellOffset,
+                                                             &numAtomsPerCell);
 
         launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
 
-        insertNonlocalGpuDependency(nb, interactionLoc);
+        nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
     }
 } // namespace Nbnxm
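As the comment on nbnxnInsertNonlocalGpuDependency above describes, the
function records an event when called for the local interaction locality and
makes the nonlocal stream wait on that event otherwise. A self-contained
sketch of that standard CUDA record/wait idiom (the names are illustrative
and error checking is omitted):

    #include <cuda_runtime.h>

    // Record a completion marker in the local stream, or make the nonlocal
    // stream wait for it, mirroring the two call directions in this patch.
    static void insertDependency(cudaStream_t localStream,
                                 cudaStream_t nonlocalStream,
                                 cudaEvent_t  localDone,
                                 bool         calledForLocal)
    {
        if (calledForLocal)
        {
            // Marks the point in the local stream that nonlocal work depends on.
            cudaEventRecord(localDone, localStream);
        }
        else
        {
            // Work submitted to nonlocalStream after this call waits for the event.
            cudaStreamWaitEvent(nonlocalStream, localDone, 0);
        }
    }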
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 7af2d94cb0..fd2da0cef5 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -495,18 +495,14 @@ gpu_init(const gmx_device_info_t *deviceInfo,
 
     cuda_init_const(nb, ic, listParams, nbat->params());
 
-    nb->natoms                                  = 0;
-    nb->natoms_alloc                            = 0;
-    nb->atomIndicesSize                         = 0;
-    nb->atomIndicesSize_alloc                   = 0;
-    nb->ncxy_na[AtomLocality::Local]            = 0;
-    nb->ncxy_na[AtomLocality::NonLocal]         = 0;
-    nb->ncxy_na_alloc[AtomLocality::Local]      = 0;
-    nb->ncxy_na_alloc[AtomLocality::NonLocal]   = 0;
-    nb->ncxy_ind[AtomLocality::Local]           = 0;
-    nb->ncxy_ind[AtomLocality::NonLocal]        = 0;
-    nb->ncxy_ind_alloc[AtomLocality::Local]     = 0;
-    nb->ncxy_ind_alloc[AtomLocality::NonLocal]  = 0;
+    nb->natoms                = 0;
+    nb->natoms_alloc          = 0;
+    nb->atomIndicesSize       = 0;
+    nb->atomIndicesSize_alloc = 0;
+    nb->ncxy_na               = 0;
+    nb->ncxy_na_alloc         = 0;
+    nb->ncxy_ind              = 0;
+    nb->ncxy_ind_alloc        = 0;
 
     if (debug)
     {
@@ -874,36 +870,20 @@ rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
 /* Initialization for X buffer operations on GPU. */
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
-                                gmx_nbnxn_gpu_t      *gpu_nbv,
-                                const Nbnxm::AtomLocality locality)
+                                gmx_nbnxn_gpu_t      *gpu_nbv)
 {
     cudaError_t stat;
-    const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
-                                             InteractionLocality::Local : InteractionLocality::NonLocal);
-    cudaStream_t stream = gpu_nbv->stream[iloc];
+    cudaStream_t stream  = gpu_nbv->stream[InteractionLocality::Local];
     bool         bDoTime = gpu_nbv->bDoTime;
-    int          gridBegin = 0, gridEnd = 0;
+    const int    maxNumColumns = gridSet.numColumnsMax();
 
-    switch (locality)
-    {
-        case Nbnxm::AtomLocality::All:
-            gridBegin = 0;
-            gridEnd   = gridSet.grids().size();
-            break;
-        case Nbnxm::AtomLocality::Local:
-            gridBegin = 0;
-            gridEnd   = 1;
-            break;
-        case Nbnxm::AtomLocality::NonLocal:
-            gridBegin = 1;
-            gridEnd   = gridSet.grids().size();
-            break;
-        case Nbnxm::AtomLocality::Count:
-            GMX_ASSERT(false, "Count is invalid locality specifier");
-            break;
-    }
-
-    for (int g = gridBegin; g < gridEnd; g++)
+    reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+    reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+
+    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
     {
         const Nbnxm::Grid &grid = gridSet.grids()[g];
@@ -915,37 +895,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
         const int *cxy_ind           = grid.cxy_ind().data();
         const int  numRealAtomsTotal = gridSet.numRealAtomsTotal();
 
-        if (iloc == Nbnxm::InteractionLocality::Local)
-        {
+        reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+        reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
 
-            reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
-            reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+        if (atomIndicesSize > 0)
+        {
+            // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+            stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
+            CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
 
-            if (atomIndicesSize > 0)
+            if (bDoTime)
             {
-                // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
-                stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
-                CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
-
-                if (bDoTime)
-                {
-                    gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
-                }
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+            }
 
-                copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
 
-                if (bDoTime)
-                {
-                    gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
-                }
-
-                stat = cudaHostUnregister((void*) atomIndices);
-                CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
-        }
 
-        reallocateDeviceBuffer(&gpu_nbv->cxy_na[locality], numColumns, &gpu_nbv->ncxy_na[locality], &gpu_nbv->ncxy_na_alloc[locality], nullptr);
-        reallocateDeviceBuffer(&gpu_nbv->cxy_ind[locality], numColumns, &gpu_nbv->ncxy_ind[locality], &gpu_nbv->ncxy_ind_alloc[locality], nullptr);
+            stat = cudaHostUnregister((void*) atomIndices);
+            CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+        }
 
         if (numColumns > 0)
         {
@@ -955,14 +928,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->cxy_na[locality], cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
 
             stat = cudaHostUnregister((void*) cxy_na);
@@ -974,20 +948,31 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->cxy_ind[locality], cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
 
             stat = cudaHostUnregister((void*) cxy_ind);
             CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
         }
     }
+
+    // The above data is transferred on the local stream but is a
+    // dependency of the nonlocal stream (specifically the nonlocal X
+    // buf ops kernel). We therefore set a dependency to ensure
+    // that the nonlocal stream waits on the local stream here.
+    // This call records an event in the local stream:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+    // ...and this call instructs the nonlocal stream to wait on that event:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+
+    return;
 }
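The init function above still pins the host arrays on the fly, as its TODO
notes, so that the asynchronous H2D copies are valid; each grid's columns
land at offset maxNumColumns*g in the shared device buffers. A standalone
sketch of that register/copy/unregister sequence follows; it synchronizes
the stream before unpinning to keep the sketch self-evidently safe, whereas
the real code goes through copyToDeviceBuffer() and the event dependency set
up at the end of the function (uploadGridSlot is an illustrative helper, not
GROMACS API):

    #include <cuda_runtime.h>

    // Transiently pin a host array, copy it into one grid's slot of the
    // shared device buffer, then unpin.
    static void uploadGridSlot(int *d_base, const int *h_src, int numColumns,
                               int gridId, int maxNumColumns, cudaStream_t stream)
    {
        cudaHostRegister(const_cast<int *>(h_src),
                         numColumns * sizeof(int), cudaHostRegisterDefault);
        cudaMemcpyAsync(d_base + maxNumColumns * gridId, h_src,
                        numColumns * sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream); // ensure the copy finished before unpinning
        cudaHostUnregister(const_cast<int *>(h_src));
    }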
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index bd038c6f03..48c8776c79 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -229,18 +229,18 @@ struct gmx_nbnxn_cuda_t
     int atomIndicesSize;
     //! size of atom indices allocated in device buffer
     int atomIndicesSize_alloc;
-    //! x buf ops num of atoms (local and non-local)
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_na;
+    //! x buf ops num of atoms
+    int *cxy_na;
     //! number of elements in cxy_na
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_na;
+    int  ncxy_na;
     //! number of elements allocated allocated in device buffer
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_na_alloc;
-    //! x buf ops cell index mapping (local and non-local)
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_ind;
+    int  ncxy_na_alloc;
+    //! x buf ops cell index mapping
+    int *cxy_ind;
     //! number of elements in cxy_ind
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_ind;
+    int  ncxy_ind;
     //! number of elements allocated allocated in device buffer
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int> ncxy_ind_alloc;
+    int  ncxy_ind_alloc;
     //! parameters required for the non-bonded calc.
     cu_nbparam_t *nbparam;
     //! pair-list data structures (local and non-local)
@@ -255,8 +255,10 @@ struct gmx_nbnxn_cuda_t
                                                 is done (and the local transfer can proceed) */
     cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in the local
                                                 stream that need to precede the
-                                                non-local force calculations are done
-                                                (e.g. f buffer 0-ing, local x/q H2D) */
+                                                non-local force or buffer operation calculations are done
+                                                (e.g. f buffer 0-ing, local x/q H2D, buffer op
+                                                initialization in local stream that is required also
+                                                by nonlocal stream ) */
 
 /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
  * concurrent streams, so we won't time if both l/nl work is done on GPUs.
diff --git a/src/gromacs/nbnxm/gridset.cpp b/src/gromacs/nbnxm/gridset.cpp
index 168fdd8792..feeb005594 100644
--- a/src/gromacs/nbnxm/gridset.cpp
+++ b/src/gromacs/nbnxm/gridset.cpp
@@ -224,6 +224,14 @@ void GridSet::putOnGrid(const matrix box,
         /* We are done setting up all grids, we can resize the force buffers */
         nbat->resizeForceBuffers();
     }
+
+    int maxNumColumns = 0;
+    for (const auto &grid : grids())
+    {
+        maxNumColumns = std::max(maxNumColumns, grid.numColumns());
+    }
+    setNumColumnsMax(maxNumColumns);
+
 }
 
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/gridset.h b/src/gromacs/nbnxm/gridset.h
index e59a8584c1..a4eaf6256e 100644
--- a/src/gromacs/nbnxm/gridset.h
+++ b/src/gromacs/nbnxm/gridset.h
@@ -185,6 +185,18 @@ class GridSet
             copy_mat(box_, box);
         }
 
+        //! Returns the maximum number of columns across all grids
+        int numColumnsMax() const
+        {
+            return numColumnsMax_;
+        }
+
+        //! Sets the maximum number of columns across all grids
+        void setNumColumnsMax(int numColumnsMax)
+        {
+            numColumnsMax_ = numColumnsMax;
+        }
+
     private:
         //! Returns collection of the data that covers all grids
         const GridSetData getGridSetData()
@@ -213,6 +225,9 @@ class GridSet
         int numRealAtomsTotal_;
         //! Working data for constructing a single grid, one entry per thread
         std::vector<GridWork> gridWork_;
+        //! Maximum number of columns across all grids
+        int numColumnsMax_;
+
 };
 
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 31037b2476..54c9ff9864 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -190,13 +190,13 @@ void nonbonded_verlet_t::changePairlistRadii(real rlistOuter,
 }
 
 void
-nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality locality)
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
 {
+    Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
+}
 
-    nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
-                               gpu_nbv,
-                               locality);
-
-
+void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLocality interactionLocality)
+{
+    Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
 /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index b8bdb853cb..72eb98ae59 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -248,9 +248,11 @@ struct nonbonded_verlet_t
                                           void           *xPmeDevicePtr,
                                           gmx_wallcycle  *wcycle);
 
-        //! Init for GPU version of setup coordinates in Nbnxm, for the given locality
-        void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality locality);
+        //! Init for GPU version of setup coordinates in Nbnxm
+        void atomdata_init_copy_x_to_nbat_x_gpu();
 
+        //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
+        void insertNonlocalGpuDependency(Nbnxm::InteractionLocality interactionLocality);
 
         //! Returns a reference to the pairlist sets
         const PairlistSets &pairlistSets() const
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 49dc1bfce2..7e88129f4e 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -220,8 +220,7 @@ int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETU
  * Called on the NS step and performs (re-)allocations and memory copies. !*/
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
-                                gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv,
-                                Nbnxm::AtomLocality gmx_unused   locality) CUDA_FUNC_TERM
+                                gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv) CUDA_FUNC_TERM
 
 /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  */
 CUDA_FUNC_QUALIFIER
@@ -231,7 +230,17 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused &grid,
                            gmx_nbnxn_gpu_t gmx_unused     *gpu_nbv,
                            void gmx_unused                *xPmeDevicePtr,
                            Nbnxm::AtomLocality gmx_unused  locality,
-                           const rvec gmx_unused          *x) CUDA_FUNC_TERM
+                           const rvec gmx_unused          *x,
+                           int gmx_unused                  gridId,
+                           int gmx_unused                  numColumnsMax) CUDA_FUNC_TERM
+
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ * \param[in] nb                   The nonbonded data GPU structure
+ * \param[in] interactionLocality  Local or NonLocal sync point
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused    *nb,
+                                      const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
 
 } // namespace Nbnxm
-- 
2.22.0
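For reference, the stride shared by all the per-grid slots is the column
count of the widest grid, recomputed at the end of GridSet::putOnGrid() as
shown in the gridset.cpp hunk above. A minimal standalone version of that
reduction (the Grid stand-in is illustrative):

    #include <algorithm>
    #include <vector>

    // Stand-in for Nbnxm::Grid exposing only what the reduction needs.
    struct Grid
    {
        int columns;
        int numColumns() const { return columns; }
    };

    // Same loop as the one added to GridSet::putOnGrid(): the per-grid
    // slot stride must accommodate the widest grid in the set.
    static int computeNumColumnsMax(const std::vector<Grid> &grids)
    {
        int numColumnsMax = 0;
        for (const Grid &grid : grids)
        {
            numColumnsMax = std::max(numColumnsMax, grid.numColumns());
        }
        return numColumnsMax;
    }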