Avoid enqueueing the same event multiple times in nbnxn_atomdata_x_to_nbat_x_gpu
author     Andrey Alekseenko <al42and@gmail.com>
           Sat, 27 Mar 2021 19:24:34 +0000 (19:24 +0000)
committer  Artem Zhmurov <zhmurov@gmail.com>
           Sat, 27 Mar 2021 19:24:34 +0000 (19:24 +0000)
With several non-local grids, we were calling
nbnxnInsertNonlocalGpuDependency(nb, NonLocal) in nbnxn_gpu_x_to_nbat_x
after enqueueing the transformation kernel for each grid.

Now we call it only once, after submitting all the kernels for the
given locality (marking the event for Local, enqueueing the wait on it
for NonLocal).
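
To make the new ordering concrete, here is a minimal sketch of the
enqueueing pattern after this change. StubStream, StubEvent, and
launchTransformKernel are hypothetical stand-ins, not the GROMACS API;
only the placement of the wait and of the dependency insertion mirrors
the patch below.

    #include <cstdio>

    struct StubStream { const char* name; };

    struct StubEvent
    {
        void markEvent(const StubStream& s) { std::printf("mark event in %s stream\n", s.name); }
        void enqueueWaitEvent(const StubStream& s) { std::printf("enqueue wait in %s stream\n", s.name); }
    };

    void launchTransformKernel(const StubStream& s, int grid)
    {
        std::printf("launch x -> nbat kernel for grid %d in %s stream\n", grid, s.name);
    }

    int main()
    {
        StubStream stream{ "non-local" };
        StubEvent  xReadyOnDevice;
        const int  gridBegin = 1, gridEnd = 4; // several non-local grids

        for (int g = gridBegin; g < gridEnd; g++)
        {
            if (g == gridBegin)
            {
                // Wait for the coordinates once, before the first kernel of this locality
                xReadyOnDevice.enqueueWaitEvent(stream);
            }
            launchTransformKernel(stream, g);
            if (g == gridEnd - 1)
            {
                // Insert the local/non-local dependency once, after the last grid
                std::printf("insert non-local dependency\n");
            }
        }
        return 0;
    }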

Refs #3988.

src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm_gpu.h

index 4048bea672b9d80432e3ccb7ae49590dd25018e7..d9639e9b5be8627cb8bd7a6b30f0367a1412a4f9 100644 (file)
@@ -1033,6 +1033,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
                                     DeviceBuffer<RVec>      d_x,
                                     GpuEventSynchronizer*   xReadyOnDevice)
 {
+    GMX_ASSERT(xReadyOnDevice != nullptr, "Need a valid GpuEventSynchronizer object");
 
     int gridBegin = 0;
     int gridEnd   = 0;
@@ -1040,8 +1041,14 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
 
     for (int g = gridBegin; g < gridEnd; g++)
     {
-        nbnxn_gpu_x_to_nbat_x(
-                gridSet.grids()[g], gpu_nbv, d_x, xReadyOnDevice, locality, g, gridSet.numColumnsMax());
+        nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
+                              gpu_nbv,
+                              d_x,
+                              (g == gridBegin) ? xReadyOnDevice : nullptr, // Sync on first iteration only
+                              locality,
+                              g,
+                              gridSet.numColumnsMax(),
+                              (g == gridEnd - 1));
     }
 }
 
index 6a35676acff4ed2c1cf24b5b9876918b064400fd..190721e2d8b5878a2f7cdeb3fc6d3039fcc31205 100644 (file)
@@ -717,7 +717,8 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                            GpuEventSynchronizer*     xReadyOnDevice,
                            const Nbnxm::AtomLocality locality,
                            int                       gridId,
-                           int                       numColumnsMax)
+                           int                       numColumnsMax,
+                           bool                      mustInsertNonLocalDependency)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -730,6 +731,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
 
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
 
+    if (xReadyOnDevice != nullptr)
+    {
+        // We only need to wait on the first iteration of the loop
+        xReadyOnDevice->enqueueWaitEvent(deviceStream);
+    }
+
     int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
     // avoid empty kernel launch, skip to inserting stream dependency
     if (numAtoms != 0)
@@ -737,9 +744,6 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         // TODO: This will only work with CUDA
         GMX_ASSERT(d_x, "Need a valid device pointer");
 
-        // ensure that coordinates are ready on the device before launching the kernel
-        GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
-        xReadyOnDevice->enqueueWaitEvent(deviceStream);
 
         KernelLaunchConfig config;
         config.blockSize[0] = c_bufOpsThreadsPerBlock;
@@ -772,10 +776,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
     }
 
-    // TODO: note that this is not necessary when there are no local atoms, that is:
-    // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
-    // but for now we avoid that optimization
-    nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+    if (mustInsertNonLocalDependency)
+    {
+        Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+    }
 }
 
 DeviceBuffer<Float3> getGpuForces(NbnxmGpu* nb)
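
For context, the commit message above describes nbnxnInsertNonlocalGpuDependency
as marking an event in the local stream and enqueueing a wait on it in the
non-local stream. Below is a hedged sketch of that behavior, reusing the stub
types from the earlier sketch; the real function's signature and internals may
differ.

    enum class Locality { Local, NonLocal };

    // Illustrative only: models the mark-for-Local / wait-for-NonLocal behavior
    // described in the commit message, not the actual GROMACS implementation.
    void insertNonlocalDependencySketch(StubEvent&        localDone,
                                        const StubStream& localStream,
                                        const StubStream& nonLocalStream,
                                        Locality          locality)
    {
        if (locality == Locality::Local)
        {
            localDone.markEvent(localStream); // local work enqueued so far is the dependency
        }
        else
        {
            localDone.enqueueWaitEvent(nonLocalStream); // non-local stream waits on it
        }
    }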
index 60a28c3fa4abe2380b673fd71f9f40ec813d0ee9..7ea060af495603074283b717469b9880f5d1b4ea 100644 (file)
@@ -247,6 +247,8 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
  * \param[in]     locality         Copy coordinates for local or non-local atoms.
  * \param[in]     gridId           Index of the grid being converted.
  * \param[in]     numColumnsMax    Maximum number of columns in the grid.
+ * \param[in]     mustInsertNonLocalDependency Whether synchronization between the local and
+ * non-local streams should be added. Typically true if and only if this is the last grid in the grid set.
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
@@ -255,7 +257,8 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            GpuEventSynchronizer gmx_unused* xReadyOnDevice,
                            gmx::AtomLocality gmx_unused locality,
                            int gmx_unused gridId,
-                           int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
+                           int gmx_unused numColumnsMax,
+                           bool gmx_unused mustInsertNonLocalDependency) CUDA_FUNC_TERM;
 
 /*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
  *