if (useGpuXBufOps)
{
- nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::Local);
+ nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
}
&top->excls, step, nrnb);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
-
- if (useGpuXBufOps)
- {
-
- nbv->atomdata_init_copy_x_to_nbat_x_gpu( Nbnxm::AtomLocality::NonLocal);
- }
}
else
{
stateInstance = std::make_unique<t_state>();
state = stateInstance.get();
- if (fr->nbv->useGpu())
- {
- changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
- }
dd_init_local_state(cr->dd, state_global, state);
/* Distribute the charge groups over the nodes from the master node */
}
+ if (fr->nbv->useGpu())
+ {
+ changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+ }
+
// NOTE: The global state is no longer used at this point.
// But state_global is still used as temporary storage space for writing
// the global state to file and potentially for replica exchange.
gpu_nbv,
xPmeDevicePtr,
locality,
- x);
+ x, g, gridSet.numColumnsMax());
}
}
else
*
* As the point where the local stream tasks can be considered complete happens
* at the same call point where the nonlocal stream should be synced with
- * the local, this function recrds the event if called with the local stream as
+ * the local, this function records the event if called with the local stream as
* argument, and inserts a wait on that event into the nonlocal stream otherwise.
*/
-static void insertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
- const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
+ const InteractionLocality interactionLocality)
{
cudaStream_t stream = nb->stream[interactionLocality];
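For orientation, the body reduces to the standard CUDA record/wait idiom. A minimal sketch, assuming (per the struct-member documentation further below) that misc_ops_and_local_H2D_done is the event in play:

// Sketch only; mirrors what this function does with the stream selected above.
if (interactionLocality == InteractionLocality::Local)
{
    // Local stream: mark the point that later nonlocal work must not precede.
    cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, stream);
    CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
}
else
{
    // Nonlocal stream: hold it until the recorded local-stream event fires.
    cudaError_t stat = cudaStreamWaitEvent(stream, nb->misc_ops_and_local_H2D_done, 0);
    CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
}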
This wait needs to precede any PP tasks, bonded or nonbonded, that may
compute on interactions between local and nonlocal atoms.
*/
- insertNonlocalGpuDependency(nb, iloc);
+ nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/*! As we execute nonbonded workload in separate streams, before launching
gmx_nbnxn_gpu_t *nb,
void *xPmeDevicePtr,
const Nbnxm::AtomLocality locality,
- const rvec *x)
+ const rvec *x,
+ int gridId,
+ int numColumnsMax)
{
cu_atomdata_t *adat = nb->atdat;
bool bDoTime = nb->bDoTime;
const int numColumns = grid.numColumns();
const int cellOffset = grid.cellOffset();
const int numAtomsPerCell = grid.numAtomsPerCell();
- // TODO: Document this, one can not infer the interaction locality from the atom locality
- Nbnxm::InteractionLocality interactionLoc = Nbnxm::InteractionLocality::Local;
- int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
- int copyAtomStart = grid.srcAtomBegin();
- if (locality == Nbnxm::AtomLocality::NonLocal)
- {
- interactionLoc = Nbnxm::InteractionLocality::NonLocal;
- }
-
- cudaStream_t stream = nb->stream[interactionLoc];
+ Nbnxm::InteractionLocality interactionLoc = gpuAtomToInteractionLocality(locality);
+ int nCopyAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
+ int copyAtomStart = grid.srcAtomBegin();
+
+ cudaStream_t stream = nb->stream[interactionLoc];
// FIXME: need to either let the local stream get to the
// nbnxnInsertNonlocalGpuDependency call or call it separately here
{
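/* Empty-domain early return: even with nothing to copy, the call below still
   records the dependency event when invoked for the local stream, because the
   nonlocal stream will later wait on that event. */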
if (interactionLoc == Nbnxm::InteractionLocality::Local)
{
- insertNonlocalGpuDependency(nb, interactionLoc);
+ nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
return;
}
config.sharedMemorySize = 0;
config.stream = stream;
- auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
- float *xqPtr = &(adat->xq->x);
- const int *d_atomIndices = nb->atomIndices;
- const int *d_cxy_na = nb->cxy_na[locality];
- const int *d_cxy_ind = nb->cxy_ind[locality];
- const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
- &numColumns,
- &xqPtr,
- &setFillerCoords,
- &d_x,
- &d_atomIndices,
- &d_cxy_na,
- &d_cxy_ind,
- &cellOffset,
- &numAtomsPerCell);
+ auto kernelFn = nbnxn_gpu_x_to_nbat_x_kernel;
+ float *xqPtr = &(adat->xq->x);
+ const int *d_atomIndices = nb->atomIndices;
+ const int *d_cxy_na = &nb->cxy_na[numColumnsMax*gridId];
+ const int *d_cxy_ind = &nb->cxy_ind[numColumnsMax*gridId];
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+ &numColumns,
+ &xqPtr,
+ &setFillerCoords,
+ &d_x,
+ &d_atomIndices,
+ &d_cxy_na,
+ &d_cxy_ind,
+ &cellOffset,
+ &numAtomsPerCell);
launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
- insertNonlocalGpuDependency(nb, interactionLoc);
+ nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
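The two pointer computations above carry the core of this change: the per-locality device arrays are replaced by one flat buffer per quantity, strided by numColumnsMax. Hypothetical helpers (illustrative names only, not part of the change) make the indexing explicit:

// Illustrative only: grid gridId's column data lives at a fixed stride offset.
static inline const int *gridCxyNa(const gmx_nbnxn_cuda_t *nb, int gridId, int numColumnsMax)
{
    return &nb->cxy_na[numColumnsMax*gridId];    // per-column atom counts of this grid
}
static inline const int *gridCxyInd(const gmx_nbnxn_cuda_t *nb, int gridId, int numColumnsMax)
{
    return &nb->cxy_ind[numColumnsMax*gridId];   // cell-index mapping of this grid
}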
} // namespace Nbnxm
cuda_init_const(nb, ic, listParams, nbat->params());
- nb->natoms = 0;
- nb->natoms_alloc = 0;
- nb->atomIndicesSize = 0;
- nb->atomIndicesSize_alloc = 0;
- nb->ncxy_na[AtomLocality::Local] = 0;
- nb->ncxy_na[AtomLocality::NonLocal] = 0;
- nb->ncxy_na_alloc[AtomLocality::Local] = 0;
- nb->ncxy_na_alloc[AtomLocality::NonLocal] = 0;
- nb->ncxy_ind[AtomLocality::Local] = 0;
- nb->ncxy_ind[AtomLocality::NonLocal] = 0;
- nb->ncxy_ind_alloc[AtomLocality::Local] = 0;
- nb->ncxy_ind_alloc[AtomLocality::NonLocal] = 0;
+ nb->natoms = 0;
+ nb->natoms_alloc = 0;
+ nb->atomIndicesSize = 0;
+ nb->atomIndicesSize_alloc = 0;
+ nb->ncxy_na = 0;
+ nb->ncxy_na_alloc = 0;
+ nb->ncxy_ind = 0;
+ nb->ncxy_ind_alloc = 0;
if (debug)
{
/* Initialization for X buffer operations on GPU. */
/* TODO Remove the explicit pinning of host arrays from here and manage it in a more natural way */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
- gmx_nbnxn_gpu_t *gpu_nbv,
- const Nbnxm::AtomLocality locality)
+ gmx_nbnxn_gpu_t *gpu_nbv)
{
cudaError_t stat;
- const Nbnxm::InteractionLocality iloc = ((locality == AtomLocality::Local) ?
- InteractionLocality::Local : InteractionLocality::NonLocal);
- cudaStream_t stream = gpu_nbv->stream[iloc];
+ cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
bool bDoTime = gpu_nbv->bDoTime;
- int gridBegin = 0, gridEnd = 0;
+ const int maxNumColumns = gridSet.numColumnsMax();
- switch (locality)
- {
- case Nbnxm::AtomLocality::All:
- gridBegin = 0;
- gridEnd = gridSet.grids().size();
- break;
- case Nbnxm::AtomLocality::Local:
- gridBegin = 0;
- gridEnd = 1;
- break;
- case Nbnxm::AtomLocality::NonLocal:
- gridBegin = 1;
- gridEnd = gridSet.grids().size();
- break;
- case Nbnxm::AtomLocality::Count:
- GMX_ASSERT(false, "Count is invalid locality specifier");
- break;
- }
- for (int g = gridBegin; g < gridEnd; g++)
+ reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns*gridSet.grids().size(),
+ &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns*gridSet.grids().size(),
+ &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+
+ for (unsigned int g = 0; g < gridSet.grids().size(); g++)
{
const Nbnxm::Grid &grid = gridSet.grids()[g];
const int *cxy_ind = grid.cxy_ind().data();
const int numRealAtomsTotal = gridSet.numRealAtomsTotal();
- if (iloc == Nbnxm::InteractionLocality::Local)
- {
- reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
- reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
- if (atomIndicesSize > 0)
- {
- // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
- stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
- CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
-
- if (bDoTime)
- {
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
- }
- copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
- if (bDoTime)
- {
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
- }
-
- stat = cudaHostUnregister((void*) atomIndices);
- CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
- }
- }
- reallocateDeviceBuffer(&gpu_nbv->cxy_na[locality], numColumns, &gpu_nbv->ncxy_na[locality], &gpu_nbv->ncxy_na_alloc[locality], nullptr);
- reallocateDeviceBuffer(&gpu_nbv->cxy_ind[locality], numColumns, &gpu_nbv->ncxy_ind[locality], &gpu_nbv->ncxy_ind_alloc[locality], nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
+ reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
+ if (atomIndicesSize > 0)
+ {
+ // source data must be pinned for H2D assertion. This should be moved into place where data is (re-)alloced.
+ stat = cudaHostRegister((void*) atomIndices, atomIndicesSize*sizeof(int), cudaHostRegisterDefault);
+ CU_RET_ERR(stat, "cudaHostRegister failed on atomIndices");
+
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+ }
+ copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream, GpuApiCallBehavior::Async, nullptr);
+ if (bDoTime)
+ {
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+ }
+
+ stat = cudaHostUnregister((void*) atomIndices);
+ CU_RET_ERR(stat, "cudaHostUnRegister failed on atomIndices");
+ }
if (numColumns > 0)
{
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
}
- copyToDeviceBuffer(&gpu_nbv->cxy_na[locality], cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ int* destPtr = &gpu_nbv->cxy_na[maxNumColumns*g];
+ copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
}
stat = cudaHostUnregister((void*) cxy_na);
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.openTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
}
- copyToDeviceBuffer(&gpu_nbv->cxy_ind[locality], cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+ destPtr = &gpu_nbv->cxy_ind[maxNumColumns*g];
+ copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
if (bDoTime)
{
- gpu_nbv->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
+ gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
}
stat = cudaHostUnregister((void*) cxy_ind);
CU_RET_ERR(stat, "cudaHostUnRegister failed on cxy_ind");
}
}
+
+ // The above data is transferred on the local stream but is a
+ // dependency of the nonlocal stream (specifically the nonlocal X
+ // buf ops kernel). We therefore set a dependency to ensure
+ // that the nonlocal stream waits on the local stream here.
+ // This call records an event in the local stream:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+ // ...and this call instructs the nonlocal stream to wait on that event:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+
return;
}
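To make the allocation arithmetic above concrete, a worked example with purely hypothetical grid sizes:

// Hypothetical: a local grid plus two nonlocal zones.
const int numGrids         = 3;
const int columnsPerGrid[] = { 96, 64, 80 };         // per-grid column counts
const int numColumnsMax    = 96;                     // max over all grids
const int flatBufferSize   = numColumnsMax*numGrids; // 288 ints each for cxy_na and cxy_ind
// Grid g's columns occupy [numColumnsMax*g, numColumnsMax*g + columnsPerGrid[g]);
// the tail of each stride remains unused padding.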
int atomIndicesSize;
//! size of atom indices allocated in device buffer
int atomIndicesSize_alloc;
- //! x buf ops num of atoms (local and non-local)
- gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_na;
+ //! x buf ops num of atoms
+ int *cxy_na;
//! number of elements in cxy_na
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_na;
+ int ncxy_na;
//! number of elements allocated in device buffer
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_na_alloc;
- //! x buf ops cell index mapping (local and non-local)
- gmx::EnumerationArray<Nbnxm::AtomLocality, int *> cxy_ind;
+ int ncxy_na_alloc;
+ //! x buf ops cell index mapping
+ int *cxy_ind;
//! number of elements in cxy_ind
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_ind;
+ int ncxy_ind;
//! number of elements allocated in device buffer
- gmx::EnumerationArray<Nbnxm::AtomLocality, int > ncxy_ind_alloc;
+ int ncxy_ind_alloc;
//! parameters required for the non-bonded calc.
cu_nbparam_t *nbparam;
//! pair-list data structures (local and non-local)
is done (and the local transfer can proceed) */
cudaEvent_t misc_ops_and_local_H2D_done; /**< event triggered when the tasks issued in
the local stream that need to precede the
- non-local force calculations are done
- (e.g. f buffer 0-ing, local x/q H2D) */
+ non-local force or buffer-ops calculations are done
+ (e.g. f buffer 0-ing, local x/q H2D, and buffer-ops
+ initialization in the local stream that is also
+ required by the nonlocal stream) */
/* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
/* We are done setting up all grids, we can resize the force buffers */
nbat->resizeForceBuffers();
}
+
+ int maxNumColumns = 0;
+ for (const auto &grid : grids())
+ {
+ maxNumColumns = std::max(maxNumColumns, grid.numColumns());
+ }
+ setNumColumnsMax(maxNumColumns);
+
}
} // namespace Nbnxm
copy_mat(box_, box);
}
+ //! Returns the maximum number of columns across all grids
+ int numColumnsMax() const
+ {
+ return numColumnsMax_;
+ }
+
+ //! Sets the maximum number of columns across all grids
+ void setNumColumnsMax(int numColumnsMax)
+ {
+ numColumnsMax_ = numColumnsMax;
+ }
+
private:
//! Returns collection of the data that covers all grids
const GridSetData getGridSetData()
int numRealAtomsTotal_;
//! Working data for constructing a single grid, one entry per thread
std::vector<GridWork> gridWork_;
+ //! Maximum number of columns across all grids
+ int numColumnsMax_;
+
};
} // namespace Nbnxm
}
void
-nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu(const Nbnxm::AtomLocality locality)
+nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu()
{
- nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(),
- gpu_nbv,
- locality);
-
-
+ Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
+}
+
+void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLocality interactionLocality)
+{
+ Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
}
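A hypothetical call site for the new wrapper, mirroring the record-then-wait pair issued at the end of nbnxn_gpu_init_x_to_nbat_x above:

// Hypothetical usage from stepping code (not part of this change):
nbv->insertNonlocalGpuDependency(Nbnxm::InteractionLocality::Local);    // record event on the local stream
nbv->insertNonlocalGpuDependency(Nbnxm::InteractionLocality::NonLocal); // make the nonlocal stream wait on it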
/*! \endcond */
void *xPmeDevicePtr,
gmx_wallcycle *wcycle);
- //! Init for GPU version of setup coordinates in Nbnxm, for the given locality
- void atomdata_init_copy_x_to_nbat_x_gpu(Nbnxm::AtomLocality locality);
+ //! Init for GPU version of setup coordinates in Nbnxm
+ void atomdata_init_copy_x_to_nbat_x_gpu();
+ //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
+ void insertNonlocalGpuDependency(Nbnxm::InteractionLocality interactionLocality);
//! Returns a reference to the pairlist sets
const PairlistSets &pairlistSets() const
* Called on the NS step and performs (re-)allocations and memory copies. */
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- Nbnxm::AtomLocality gmx_unused locality) CUDA_FUNC_TERM
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM
/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
*/
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
void gmx_unused *xPmeDevicePtr,
Nbnxm::AtomLocality gmx_unused locality,
- const rvec gmx_unused *x) CUDA_FUNC_TERM
+ const rvec gmx_unused *x,
+ int gmx_unused gridId,
+ int gmx_unused numColumnsMax) CUDA_FUNC_TERM
+
+/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[in] interactionLocality Local or NonLocal sync point
+ */
+CUDA_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused *nb,
+ const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
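
For readers unfamiliar with the convention: in non-CUDA builds these declarations collapse to empty stubs, roughly as follows (an assumption based on the usual gpu_macros.h definitions, which are not part of this change):

// Approximate expansion without CUDA (illustrative):
//   CUDA_FUNC_QUALIFIER -> static
//   CUDA_FUNC_TERM      -> {}
static void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t * /*nb*/,
                                             const InteractionLocality /*interactionLocality*/) {}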
} // namespace Nbnxm