Bug fix and simplification for CUDA X Buffer Ops
author    Alan Gray <alang@nvidia.com>
Wed, 12 Jun 2019 15:03:27 +0000 (08:03 -0700)
committer    Szilárd Páll <pall.szilard@gmail.com>
Mon, 1 Jul 2019 21:44:33 +0000 (23:44 +0200)
Fixes a bug triggered when there are more than two grids (i.e. when
the grids do not map naively to local/nonlocal) by using a separate
GPU memory space for the cxy_na and cxy_ind data of each grid;
previously there was only one local and one nonlocal space, so data
from different grids was overwritten. Also simplifies the setup: the
init function is now called only once per NS step to set up data for
all grids, where previously it was called separately for local and
nonlocal. Because this data is transferred on the local stream but is
needed by the nonlocal X buffer ops kernel, the init function now
records an event in the local stream and makes the nonlocal stream
wait on it. See the illustrative sketch below.
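As an illustration, a minimal standalone sketch of the new flat
per-grid layout (not GROMACS code; the names cxy_na, gridId and
numColumnsMax follow the patch below, and the values are made up):

    // One flat buffer holds numGrids slices of numColumnsMax
    // elements, so grid g only touches its own slice and grids can
    // no longer overwrite each other.
    #include <cstdio>
    #include <vector>

    int main()
    {
        const int numGrids      = 3; // more than the old two (local/nonlocal) slots
        const int numColumnsMax = 4; // max of grid.numColumns() over all grids

        // Stand-in for the single device allocation of cxy_na.
        std::vector<int> cxy_na(numGrids * numColumnsMax, -1);

        for (int gridId = 0; gridId < numGrids; gridId++)
        {
            // Same offset arithmetic as used for the kernel arguments.
            int *slice = &cxy_na[numColumnsMax * gridId];
            for (int c = 0; c < numColumnsMax; c++)
            {
                slice[c] = 100 * gridId + c; // per-grid data, no clobbering
            }
        }

        printf("grid 2, column 1 -> %d\n", cxy_na[numColumnsMax * 2 + 1]);
        return 0;
    }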

Change-Id: Ia2b97d22324aa97dca34b05da2eca2e2090372af

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gridset.cpp
src/gromacs/nbnxm/gridset.h
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index baecb96ba1d9e562842b76d7da9996440a6cf627..ad61dee88acadbafcb2aa3f878076ad09b28cd8a 100644 (file)
@@ -1040,7 +1040,7 @@ void do_force(FILE                                     *fplog,
 
         if (useGpuXBufOps)
         {
-            nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+            nbv->atomdata_init_copy_x_to_nbat_x_gpu();
         }
 
     }
@@ -1107,12 +1107,6 @@ void do_force(FILE                                     *fplog,
                                    &top->excls, step, nrnb);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
-
-            if (useGpuXBufOps)
-            {
-
-                nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
-            }
         }
         else
         {
index aba6ba9ca13fd7e87a27011928425be89112b2d0..b5869d65210fe182bb42fbde2cdaaba9519fbfe7 100644 (file)
@@ -300,10 +300,6 @@ void gmx::Simulator::do_md()
 
         stateInstance = std::make_unique<t_state>();
         state         = stateInstance.get();
-        if (fr->nbv->useGpu())
-        {
-            changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
-        }
         dd_init_local_state(cr->dd, state_global, state);
 
         /* Distribute the charge groups over the nodes from the master node */
@@ -346,6 +342,11 @@ void gmx::Simulator::do_md()
 
     }
 
+    if (fr->nbv->useGpu())
+    {
+        changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+    }
+
     // NOTE: The global state is no longer used at this point.
     // But state_global is still used as temporary storage space for writing
     // the global state to file and potentially for replica exchange.
index 55084763f3805b9c4067d3785adb6d50b4d060f2..c1271d8bbac34f0fc51b8cd673e0eefbd8dabf48 100644 (file)
@@ -1043,7 +1043,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
                                   gpu_nbv,
                                   xPmeDevicePtr,
                                   locality,
-                                  x);
+                                  x, g, gridSet.numColumnsMax());
         }
     }
     else
index 7162c9f593944e68682e9c0e85ef14ba67379a7a..a0117a5cb1a23725798898d7b66e4ef7e1dafb8b 100644 (file)
@@ -277,11 +277,11 @@ static inline int calc_shmem_required_nonbonded(const int num_threads_z, const g
  *
  *  As the point where the local stream tasks can be considered complete happens
  *  at the same call point where the nonlocal stream should be synced with the
- *  the local, this function recrds the event if called with the local stream as
+ *  the local, this function records the event if called with the local stream as
  *  argument and inserts in the GPU stream a wait on the event on the nonlocal.
  */
-static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
-                                        const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
+                                      const InteractionLocality interactionLocality)
 {
     cudaStream_t stream  = nb->stream[interactionLocality];
 
@@ -375,7 +375,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
        This wait needs to precede any PP tasks, bonded or nonbonded, that may
        compute on interactions between local and nonlocal atoms.
      */
-    insertNonlocalGpuDependency(nb, iloc);
+    nbnxnInsertNonlocalGpuDependency(nb, iloc);
 }
 
 /*! As we execute nonbonded workload in separate streams, before launching
@@ -743,7 +743,9 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                            gmx_nbnxn_gpu_t                 *nb,
                            void                            *xPmeDevicePtr,
                            const Nbnxm::AtomLocality        locality,
-                           const rvec                      *x)
+                           const rvec                      *x,
+                           int                              gridId,
+                           int                              numColumnsMax)
 {
     cu_atomdata_t             *adat    = nb->atdat;
     bool                       bDoTime = nb->bDoTime;
@@ -751,17 +753,11 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
     const int                  numColumns                = grid.numColumns();
     const int                  cellOffset                = grid.cellOffset();
     const int                  numAtomsPerCell           = grid.numAtomsPerCell();
-    // TODO: Document this, one can not infer the interaction locality from the atom locality
-    Nbnxm::InteractionLocality interactionLoc            = Nbnxm::InteractionLocality::Local;
-    int nCopyAtoms                                       = grid.srcAtomEnd() - grid.srcAtomBegin();
-    int copyAtomStart                                    = grid.srcAtomBegin();
+    Nbnxm::InteractionLocality interactionLoc            = gpuAtomToInteractionLocality(locality);
+    int                        nCopyAtoms                = grid.srcAtomEnd() - grid.srcAtomBegin();
+    int                        copyAtomStart             = grid.srcAtomBegin();
 
-    if (locality == Nbnxm::AtomLocality::NonLocal)
-    {
-        interactionLoc          = Nbnxm::InteractionLocality::NonLocal;
-    }
-
-    cudaStream_t   stream  = nb->stream[interactionLoc];
+    cudaStream_t               stream  = nb->stream[interactionLoc];
 
     // FIXME: need to either let the local stream get to the
     // insertNonlocalGpuDependency call or call it separately here
@@ -769,7 +765,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
     {
         if (interactionLoc == Nbnxm::InteractionLocality::Local)
         {
-            insertNonlocalGpuDependency(nb, interactionLoc);
+            nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
         }
         return;
     }
@@ -820,24 +816,24 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
     config.sharedMemorySize = 0;
     config.stream           = stream;
 
-    auto       kernelFn           = nbnxn_gpu_x_to_nbat_x_kernel;
-    float     *xqPtr              = &(adat->xq->x);
-    const int *d_atomIndices      = nb->atomIndices;
-    const int *d_cxy_na           = nb->cxy_na[locality];
-    const int *d_cxy_ind          = nb->cxy_ind[locality];
-    const auto kernelArgs         = prepareGpuKernelArguments(kernelFn, config,
-                                                              &numColumns,
-                                                              &xqPtr,
-                                                              &setFillerCoords,
-                                                              &d_x,
-                                                              &d_atomIndices,
-                                                              &d_cxy_na,
-                                                              &d_cxy_ind,
-                                                              &cellOffset,
-                                                              &numAtomsPerCell);
+    auto       kernelFn            = nbnxn_gpu_x_to_nbat_x_kernel;
+    float     *xqPtr               = &(adat->xq->x);
+    const int *d_atomIndices       = nb->atomIndices;
+    const int *d_cxy_na            = &nb->cxy_na[numColumnsMax*gridId];
+    const int *d_cxy_ind           = &nb->cxy_ind[numColumnsMax*gridId];
+    const auto kernelArgs          = prepareGpuKernelArguments(kernelFn, config,
+                                                               &numColumns,
+                                                               &xqPtr,
+                                                               &setFillerCoords,
+                                                               &d_x,
+                                                               &d_atomIndices,
+                                                               &d_cxy_na,
+                                                               &d_cxy_ind,
+                                                               &cellOffset,
+                                                               &numAtomsPerCell);
     launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
 
-    insertNonlocalGpuDependency(nb, interactionLoc);
+    nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
 } // namespace Nbnxm
index 7af2d94cb0e335fa01cdaa8236b4cd2fabd96fcf..fd2da0cef5066c6f5342a377b113a1ab38cd1d3c 100644 (file)
@@ -495,18 +495,14 @@ gpu_init(const gmx_device_info_t   *deviceInfo,
 
     cuda_init_const(nb, ic, listParams, nbat->params());
 
-    nb->natoms                = 0;
-    nb->natoms_alloc          = 0;
-    nb->atomIndicesSize       = 0;
-    nb->atomIndicesSize_alloc = 0;
-    nb->ncxy_na[AtomLocality::Local]                  = 0;
-    nb->ncxy_na[AtomLocality::NonLocal]               = 0;
-    nb->ncxy_na_alloc[AtomLocality::Local]            = 0;
-    nb->ncxy_na_alloc[AtomLocality::NonLocal]         = 0;
-    nb->ncxy_ind[AtomLocality::Local]                 = 0;
-    nb->ncxy_ind[AtomLocality::NonLocal]              = 0;
-    nb->ncxy_ind_alloc[AtomLocality::Local]           = 0;
-    nb->ncxy_ind_alloc[AtomLocality::NonLocal]        = 0;
+    nb->natoms                   = 0;
+    nb->natoms_alloc             = 0;
+    nb->atomIndicesSize          = 0;
+    nb->atomIndicesSize_alloc    = 0;
+    nb->ncxy_na                  = 0;
+    nb->ncxy_na_alloc            = 0;
+    nb->ncxy_ind                 = 0;
+    nb->ncxy_ind_alloc           = 0;
 
     if (debug)
     {
@@ -874,36 +870,20 @@ rvec *gpu_get_fshift(gmx_nbnxn_gpu_t *nb)
 /* Initialization for X buffer operations on GPU. */
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
-                                gmx_nbnxn_gpu_t                 *gpu_nbv,
-                                const Nbnxm::AtomLocality        locality)
+                                gmx_nbnxn_gpu_t                 *gpu_nbv)
 {
     cudaError_t                      stat;
-    const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
-                                             InteractionLocality::Local : InteractionLocality::NonLocal);
-    cudaStream_t                     stream    = gpu_nbv->stream[iloc];
+    cudaStream_t                     stream    = gpu_nbv->stream[InteractionLocality::Local];
     bool                             bDoTime   = gpu_nbv->bDoTime;
-    int                              gridBegin = 0, gridEnd = 0;
+    const int maxNumColumns                    = gridSet.numColumnsMax();
 
-    switch (locality)
-    {
-        case Nbnxm::AtomLocality::All:
-            gridBegin = 0;
-            gridEnd   = gridSet.grids().size();
-            break;
-        case Nbnxm::AtomLocality::Local:
-            gridBegin = 0;
-            gridEnd   = 1;
-            break;
-        case Nbnxm::AtomLocality::NonLocal:
-            gridBegin = 1;
-            gridEnd   = gridSet.grids().size();
-            break;
-        case Nbnxm::AtomLocality::Count:
-            GMX_ASSERT(false, "Count is invalid locality specifier");
-            break;
-    }
 
-    for (int g = gridBegin; g < gridEnd; g++)
+    reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+    reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+
+    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
     {
 
         const Nbnxm::Grid  &grid       = gridSet.grids()[g];
@@ -915,37 +895,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
         const int          *cxy_ind           = grid.cxy_ind().data();
         const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
 
-        if (iloc == Nbnxm::InteractionLocality::Local)
-        {
+        reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+        reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
 
-            reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
-            reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+        if (atomIndicesSize > 0)
+        {
+            // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+            stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
+            CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
 
-            if (atomIndicesSize > 0)
+            if (bDoTime)
             {
-                // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
-                stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
-                CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
-
-                if (bDoTime)
-                {
-                    gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
-                }
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+            }
 
-                copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
 
-                if (bDoTime)
-                {
-                    gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
-                }
-
-                stat = cudaHostUnregister((void*) atomIndices);
-                CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+            if (bDoTime)
+            {
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
-        }
 
-        reallocateDeviceBuffer(&gpu_nbv->cxy_na[locality], numColumns, &gpu_nbv->ncxy_na[locality], &gpu_nbv->ncxy_na_alloc[locality], nullptr);
-        reallocateDeviceBuffer(&gpu_nbv->cxy_ind[locality], numColumns, &gpu_nbv->ncxy_ind[locality], &gpu_nbv->ncxy_ind_alloc[locality], nullptr);
+            stat = cudaHostUnregister((void*) atomIndices);
+            CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+        }
 
         if (numColumns > 0)
         {
@@ -955,14 +928,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->cxy_na[locality], cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
 
             stat = cudaHostUnregister((void*) cxy_na);
@@ -974,20 +948,31 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->cxy_ind[locality], cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
             }
 
             stat = cudaHostUnregister((void*) cxy_ind);
             CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
         }
     }
+
+    // The above data is transferred on the local stream but is a
+    // dependency of the nonlocal stream (specifically the nonlocal X
+    // buf ops kernel).  We therefore set a dependency to ensure
+    // that the nonlocal stream waits on the local stream here.
+    // This call records an event in the local stream:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+    // ...and this call instructs the nonlocal stream to wait on that event:
+    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+
     return;
 }
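For context, a minimal standalone sketch (assumed stream/event names,
not GROMACS code) of the record-then-wait pattern that the two
nbnxnInsertNonlocalGpuDependency calls above implement:

    // Record an event on the local stream, then make the nonlocal
    // stream wait on it, so nonlocal work enqueued after this point
    // cannot start before the local H2D copies have completed.
    #include <cuda_runtime.h>

    int main()
    {
        cudaStream_t localStream, nonLocalStream;
        cudaEvent_t  localDone;
        cudaStreamCreate(&localStream);
        cudaStreamCreate(&nonLocalStream);
        cudaEventCreateWithFlags(&localDone, cudaEventDisableTiming);

        // ... enqueue H2D copies on localStream here ...

        cudaEventRecord(localDone, localStream);           // the "Local" call records
        cudaStreamWaitEvent(nonLocalStream, localDone, 0); // the "NonLocal" call waits

        // ... nonlocal kernels enqueued here now wait on the copies ...

        cudaStreamDestroy(localStream);
        cudaStreamDestroy(nonLocalStream);
        cudaEventDestroy(localDone);
        return 0;
    }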
 
index bd038c6f03de3000cb26c99e90ae4e7e8b2f8b54..48c8776c79f68bebf8b8769f2e10c07494110af7 100644 (file)
@@ -229,18 +229,18 @@ struct gmx_nbnxn_cuda_t
     int                                                             atomIndicesSize;
     //! size of atom indices allocated in device buffer
     int                                                             atomIndicesSize_alloc;
-    //! x buf ops num of atoms (local and non-local)
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int *>               cxy_na;
+    //! x buf ops num of atoms
+    int                                                            *cxy_na;
     //! number of elements in cxy_na
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int >                ncxy_na;
+    int                                                             ncxy_na;
     //! number of elements allocated in device buffer
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int >                ncxy_na_alloc;
-    //! x buf ops cell index mapping (local and non-local)
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int *>               cxy_ind;
+    int                                                             ncxy_na_alloc;
+    //! x buf ops cell index mapping
+    int                                                            *cxy_ind;
     //! number of elements in cxy_ind
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int >                ncxy_ind;
+    int                                                             ncxy_ind;
     //! number of elements allocated in device buffer
-    gmx::EnumerationArray<Nbnxm::AtomLocality, int >                ncxy_ind_alloc;
+    int                                                             ncxy_ind_alloc;
     //! parameters required for the non-bonded calc.
     cu_nbparam_t                                                   *nbparam;
     //! pair-list data structures (local and non-local)
@@ -255,8 +255,10 @@ struct gmx_nbnxn_cuda_t
                                                    is done (and the local transfer can proceed)           */
     cudaEvent_t    misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
                                                    the local stream that need to precede the
-                                                   non-local force calculations are done
-                                                   (e.g. f buffer 0-ing, local x/q H2D) */
+                                                   non-local force or buffer operation calculations are done
+                                                   (e.g. f buffer 0-ing, local x/q H2D, buffer op
+                                                   initialization in local stream that is required also
+                                                   by nonlocal stream ) */
 
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
index 168fdd87924acbdb9735273bafcd4c9301c207e3..feeb005594d4fa277fd20f29bb5ed457bd1ea2b5 100644 (file)
@@ -224,6 +224,14 @@ void GridSet::putOnGrid(const matrix                    box,
         /* We are done setting up all grids, we can resize the force buffers */
         nbat->resizeForceBuffers();
     }
+
+    int maxNumColumns = 0;
+    for (const auto &grid : grids())
+    {
+        maxNumColumns = std::max(maxNumColumns, grid.numColumns());
+    }
+    setNumColumnsMax(maxNumColumns);
+
 }
 
 } // namespace Nbnxm
index e59a8584c1582dbb5c34f4bc46607fb396ee6937..a4eaf6256e8189e20f6d890a5a4f8a011ddf2bb1 100644 (file)
@@ -185,6 +185,18 @@ class GridSet
             copy_mat(box_, box);
         }
 
+        //! Returns the maximum number of columns across all grids
+        int numColumnsMax() const
+        {
+            return numColumnsMax_;
+        }
+
+        //! Sets the maximum number of columns across all grids
+        void setNumColumnsMax(int numColumnsMax)
+        {
+            numColumnsMax_ = numColumnsMax;
+        }
+
     private:
         //! Returns collection of the data that covers all grids
         const GridSetData getGridSetData()
@@ -213,6 +225,9 @@ class GridSet
         int                   numRealAtomsTotal_;
         //! Working data for constructing a single grid, one entry per thread
         std::vector<GridWork> gridWork_;
+        //! Maximum number of columns across all grids
+        int                   numColumnsMax_;
+
 };
 
 } // namespace Nbnxm
index 31037b24769ba4cae12d7986497fd36de4706605..54c9ff98640a9cd3262911d8d30ef9d454e39774 100644 (file)
@@ -190,13 +190,13 @@ void nonbonded_verlet_t::changePairlistRadii(real rlistOuter,
 }
 
 void
-nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality        locality)
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
 {
+    Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
+}
 
-    nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
-                               gpu_nbv,
-                               locality);
-
-
+void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLocality interactionLocality)
+{
+    Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 /*! \endcond */
index b8bdb853cb631c447d69cf8fa646df29753eb4ed..72eb98ae59866efbbc456818c50c18a7da2cb878 100644 (file)
@@ -248,9 +248,11 @@ struct nonbonded_verlet_t
                             void                           *xPmeDevicePtr,
                             gmx_wallcycle                  *wcycle);
 
-        //! Init for GPU version of setup coordinates in Nbnxm, for the given locality
-        void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality        locality);
+        //! Init for GPU version of setup coordinates in Nbnxm
+        void atomdata_init_copy_x_to_nbat_x_gpu();
 
+        //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
+        void insertNonlocalGpuDependency(Nbnxm::InteractionLocality interactionLocality);
 
         //! Returns a reference to the pairlist sets
         const PairlistSets &pairlistSets() const
index 49dc1bfce2fc6a4a045b07d37e56e4c1853155f1..7e88129f4ee89728bebc38427d0d5617c2f0e863 100644 (file)
@@ -220,8 +220,7 @@ int gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETU
  * Called on the NS step and performs (re-)allocations and memory copies. !*/
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
-                                gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv,
-                                Nbnxm::AtomLocality gmx_unused locality) CUDA_FUNC_TERM
+                                gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv) CUDA_FUNC_TERM
 
 /*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
  */
@@ -231,7 +230,17 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid  gmx_unused &grid,
                            gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv,
                            void               gmx_unused *xPmeDevicePtr,
                            Nbnxm::AtomLocality gmx_unused locality,
-                           const rvec         gmx_unused *x) CUDA_FUNC_TERM
+                           const rvec         gmx_unused *x,
+                           int                gmx_unused  gridId,
+                           int                gmx_unused  numColumnsMax) CUDA_FUNC_TERM
+
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ * \param[in] nb                   The nonbonded data GPU structure
+ * \param[in] interactionLocality  Local or NonLocal sync point
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused    *nb,
+                                      const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
 
 } // namespace Nbnxm