Avoid enqueueing the same event multiple times in nbnxn_atomdata_x_to_nbat_x_gpu
author     Andrey Alekseenko <al42and@gmail.com>
           Sat, 27 Mar 2021 19:24:34 +0000 (19:24 +0000)
committer  Artem Zhmurov <zhmurov@gmail.com>
           Sat, 27 Mar 2021 19:24:34 +0000 (19:24 +0000)
With several non-local grids, we were calling
nbnxnInsertNonlocalGpuDependency(nb, NonLocal) in nbnxn_gpu_x_to_nbat_x
after enqueueing the transformation kernel for each grid.

Now we call it only once, after submitting all the kernels for the
given locality (marking the event for Local, enqueueing the wait on it
for NonLocal).
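
To make the new ordering concrete, here is a minimal sketch of the
enqueueing pattern after this change. StubStream, StubEvent, and
launchTransformKernel are hypothetical stand-ins, not the GROMACS API;
only the placement of the wait and of the dependency insertion mirrors
the patch below.

    #include <cstdio>

    struct StubStream { const char* name; };

    struct StubEvent
    {
        void markEvent(const StubStream& s) { std::printf("mark event in %s stream\n", s.name); }
        void enqueueWaitEvent(const StubStream& s) { std::printf("enqueue wait in %s stream\n", s.name); }
    };

    void launchTransformKernel(const StubStream& s, int grid)
    {
        std::printf("launch x -> nbat kernel for grid %d in %s stream\n", grid, s.name);
    }

    int main()
    {
        StubStream stream{ "non-local" };
        StubEvent  xReadyOnDevice;
        const int  gridBegin = 1, gridEnd = 4; // several non-local grids

        for (int g = gridBegin; g < gridEnd; g++)
        {
            if (g == gridBegin)
            {
                // Wait for the coordinates once, before the first kernel of this locality
                xReadyOnDevice.enqueueWaitEvent(stream);
            }
            launchTransformKernel(stream, g);
            if (g == gridEnd - 1)
            {
                // Insert the local/non-local dependency once, after the last grid
                std::printf("insert non-local dependency\n");
            }
        }
        return 0;
    }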

Refs #3988.

src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm_gpu.h

index 4048bea672b9d80432e3ccb7ae49590dd25018e7..d9639e9b5be8627cb8bd7a6b30f0367a1412a4f9 100644 (file)
@@ -1033,6 +1033,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
                                     DeviceBuffer<RVec>      d_x,
                                     GpuEventSynchronizer*   xReadyOnDevice)
 {
+    GMX_ASSERT(xReadyOnDevice != nullptr, "Need a valid GpuEventSynchronizer object");
 
     int gridBegin = 0;
     int gridEnd   = 0;
@@ -1040,8 +1041,14 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet&   gridSet,
 
     for (int g = gridBegin; g < gridEnd; g++)
     {
-        nbnxn_gpu_x_to_nbat_x(
-                gridSet.grids()[g], gpu_nbv, d_x, xReadyOnDevice, locality, g, gridSet.numColumnsMax());
+        nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
+                              gpu_nbv,
+                              d_x,
+                              (g == gridBegin) ? xReadyOnDevice : nullptr, // Sync on first iteration only
+                              locality,
+                              g,
+                              gridSet.numColumnsMax(),
+                              (g == gridEnd - 1));
     }
 }
 
index 6a35676acff4ed2c1cf24b5b9876918b064400fd..190721e2d8b5878a2f7cdeb3fc6d3039fcc31205 100644 (file)
@@ -717,7 +717,8 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
                            GpuEventSynchronizer*     xReadyOnDevice,
                            const Nbnxm::AtomLocality locality,
                            int                       gridId,
-                           int                       numColumnsMax)
+                           int                       numColumnsMax,
+                           bool                      mustInsertNonLocalDependency)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -730,6 +731,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
 
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
 
+    if (xReadyOnDevice != nullptr)
+    {
+        // We only need to wait on the first iteration of the loop
+        xReadyOnDevice->enqueueWaitEvent(deviceStream);
+    }
+
     int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
     // avoid empty kernel launch, skip to inserting stream dependency
     if (numAtoms != 0)
@@ -737,9 +744,6 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         // TODO: This will only work with CUDA
         GMX_ASSERT(d_x, "Need a valid device pointer");
 
-        // ensure that coordinates are ready on the device before launching the kernel
-        GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
-        xReadyOnDevice->enqueueWaitEvent(deviceStream);
 
         KernelLaunchConfig config;
         config.blockSize[0] = c_bufOpsThreadsPerBlock;
@@ -772,10 +776,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
         launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
     }
 
-    // TODO: note that this is not necessary when there are no local atoms, that is:
-    // (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
-    // but for now we avoid that optimization
-    nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+    if (mustInsertNonLocalDependency)
+    {
+        Nbnxm::nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
+    }
 }
 
 DeviceBuffer<Float3> getGpuForces(NbnxmGpu* nb)
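
For context, the commit message above describes nbnxnInsertNonlocalGpuDependency
as marking an event in the local stream and enqueueing a wait on it in the
non-local stream. Below is a hedged sketch of that behavior, reusing the stub
types from the earlier sketch; the real function's signature and internals may
differ.

    enum class Locality { Local, NonLocal };

    // Illustrative only: models the mark-for-Local / wait-for-NonLocal behavior
    // described in the commit message, not the actual GROMACS implementation.
    void insertNonlocalDependencySketch(StubEvent&        localDone,
                                        const StubStream& localStream,
                                        const StubStream& nonLocalStream,
                                        Locality          locality)
    {
        if (locality == Locality::Local)
        {
            localDone.markEvent(localStream); // local work enqueued so far is the dependency
        }
        else
        {
            localDone.enqueueWaitEvent(nonLocalStream); // non-local stream waits on it
        }
    }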
index 60a28c3fa4abe2380b673fd71f9f40ec813d0ee9..7ea060af495603074283b717469b9880f5d1b4ea 100644 (file)
@@ -247,6 +247,8 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet,
  * \param[in]     locality         Copy coordinates for local or non-local atoms.
  * \param[in]     gridId           Index of the grid being converted.
  * \param[in]     numColumnsMax    Maximum number of columns in the grid.
+ * \param[in]     mustInsertNonLocalDependency Whether synchronization between the local and
+ * non-local streams should be added. Typically true if and only if this is the last grid in the grid set.
  */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
@@ -255,7 +257,8 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
                            GpuEventSynchronizer gmx_unused* xReadyOnDevice,
                            gmx::AtomLocality gmx_unused locality,
                            int gmx_unused gridId,
-                           int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
+                           int gmx_unused numColumnsMax,
+                           bool gmx_unused mustInsertNonLocalDependency) CUDA_FUNC_TERM;
 
 /*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
  *