From: Szilárd Páll
Date: Thu, 10 Oct 2019 16:10:38 +0000 (+0200)
Subject: Link GPU coordinate producer and consumer tasks
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=13f5fac20132d264da019c27d32b1c2502da669f;p=alexxy%2Fgromacs.git

Link GPU coordinate producer and consumer tasks

The event synchronizer indicating that coordinates are ready on the
GPU is now passed to the two tasks that depend on this input: PME and
X buffer ops. Both enqueue a wait on the passed event prior to kernel
launch, ensuring that the coordinates are ready before the kernels
start executing. On separate PME ranks and in tests, where a single
stream is used, no synchronization is necessary.

With the on-device sync in place, this change also removes the
streamSynchronize call from copyCoordinatesToGpu.

Refs. #2816, #3126.

Change-Id: I3457f01f44ca6d6ad08e0118d8b1def2ab0b381b
---

diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index c6ec4e6fca..1af591c33d 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -368,10 +368,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *GPU_FUNC_ARG
  * Launches first stage of PME on GPU - spreading kernel.
  *
  * \param[in] pme            The PME data structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in device memory.
  * \param[in] wcycle         The wallclock counter.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
-                                              gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
+                                              GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+                                              gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;

 /*! \brief
  * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
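For context, the pattern the rest of this diff wires up can be summarized in
plain CUDA. The sketch below is illustrative only and is not GROMACS code: the
stream, buffer, and kernel names are invented for the example. The producer
stream records an event once the coordinate H2D copy is enqueued; each
consumer stream then waits on that event on-device before launching its
kernel, which is what makes the host-blocking stream synchronization
removable.

#include <cuda_runtime.h>

// Toy consumer kernel standing in for PME spread / X buffer ops.
__global__ void consumeCoordinates(const float4 *x, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        out[i] = x[i].x + x[i].y + x[i].z;
    }
}

int main()
{
    const int n = 1 << 16;
    float4   *h_x, *d_x;
    float    *d_pmeOut, *d_nbOut;
    cudaMallocHost(&h_x, n * sizeof(float4));
    cudaMalloc(&d_x, n * sizeof(float4));
    cudaMalloc(&d_pmeOut, n * sizeof(float));
    cudaMalloc(&d_nbOut, n * sizeof(float));
    for (int i = 0; i < n; i++)
    {
        h_x[i] = make_float4(float(i), float(i), float(i), 0.0f);
    }

    cudaStream_t copyStream, pmeStream, nbStream;
    cudaStreamCreate(&copyStream);
    cudaStreamCreate(&pmeStream);
    cudaStreamCreate(&nbStream);
    cudaEvent_t xReadyOnDevice;
    // Timing is disabled: the event is used purely for ordering.
    cudaEventCreateWithFlags(&xReadyOnDevice, cudaEventDisableTiming);

    // Producer: asynchronous H2D copy, then record the event in the same stream.
    cudaMemcpyAsync(d_x, h_x, n * sizeof(float4), cudaMemcpyHostToDevice, copyStream);
    cudaEventRecord(xReadyOnDevice, copyStream);

    // Consumers: each stream waits on the event on-device; the host never blocks.
    cudaStreamWaitEvent(pmeStream, xReadyOnDevice, 0);
    consumeCoordinates<<<(n + 127) / 128, 128, 0, pmeStream>>>(d_x, d_pmeOut, n);
    cudaStreamWaitEvent(nbStream, xReadyOnDevice, 0);
    consumeCoordinates<<<(n + 127) / 128, 128, 0, nbStream>>>(d_x, d_nbOut, n);

    cudaDeviceSynchronize(); // error checking omitted for brevity
    cudaEventDestroy(xReadyOnDevice);
    cudaStreamDestroy(copyStream);
    cudaStreamDestroy(pmeStream);
    cudaStreamDestroy(nbStream);
    cudaFree(d_x);
    cudaFree(d_pmeOut);
    cudaFree(d_nbOut);
    cudaFreeHost(h_x);
    return 0;
}

Note that when producer and consumer share one stream (the separate-PME-rank
and test cases below), stream ordering already guarantees the dependency and
no event is needed.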
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 4685913dbd..8bbd184ac8 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -175,6 +175,7 @@ void pme_gpu_prepare_computation(gmx_pme_t *pme,
 }

 void pme_gpu_launch_spread(gmx_pme_t            *pme,
+                           GpuEventSynchronizer *xReadyOnDevice,
                            gmx_wallcycle        *wcycle)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
@@ -190,7 +191,7 @@ void pme_gpu_launch_spread(gmx_pme_t *pme,
     const bool spreadCharges = true;
     wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
     wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
-    pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+    pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }

diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 7d71782183..e98ae3a223 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1014,11 +1014,12 @@ std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount
     return std::pair<int, int>(colCount, minRowCount);
 }

-void pme_gpu_spread(const PmeGpu         *pmeGpu,
-                    int gmx_unused        gridIndex,
-                    real                 *h_grid,
-                    bool                  computeSplines,
-                    bool                  spreadCharges)
+void pme_gpu_spread(const PmeGpu         *pmeGpu,
+                    GpuEventSynchronizer *xReadyOnDevice,
+                    int gmx_unused        gridIndex,
+                    real                 *h_grid,
+                    bool                  computeSplines,
+                    bool                  spreadCharges)
 {
     GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
     const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
@@ -1037,6 +1038,16 @@ void pme_gpu_spread(const PmeGpu *pmeGpu,
     //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");

+    // Ensure that coordinates are ready on the device before launching spread;
+    // this is only needed with CUDA on PP+PME ranks: separate PME ranks,
+    // unit tests, and OpenCL builds use a single stream and therefore
+    // pass xReadyOnDevice == nullptr.
+    // Note: consider adding an assertion on xReadyOnDevice once separate PME ranks can be detected here.
+    if (xReadyOnDevice)
+    {
+        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+    }
+
     const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto      dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);

diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h
index bdd92bb83b..0f66915b0d 100644
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -383,17 +383,20 @@ void pme_gpu_get_timings(const PmeGpu *pmeGpu,
  * A GPU spline computation and charge spreading function.
  *
  * \param[in]  pmeGpu          The PME GPU structure.
+ * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory;
+ *                             can be nullptr when invoked on a separate PME rank or from PME tests.
  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
  *                             e.g. testing or host-side FFT)
  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
 * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
 */
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu         *GPU_FUNC_ARGUMENT(pmeGpu),
-                                       int                   GPU_FUNC_ARGUMENT(gridIndex),
-                                       real                 *GPU_FUNC_ARGUMENT(h_grid),
-                                       bool                  GPU_FUNC_ARGUMENT(computeSplines),
-                                       bool                  GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu         *GPU_FUNC_ARGUMENT(pmeGpu),
+                                       GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+                                       int                   GPU_FUNC_ARGUMENT(gridIndex),
+                                       real                 *GPU_FUNC_ARGUMENT(h_grid),
+                                       bool                  GPU_FUNC_ARGUMENT(computeSplines),
+                                       bool                  GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;

 /*! \libinternal \brief
  * 3D FFT R2C/C2R routine.

diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 10ceb2ce3b..8e5ac1de84 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -644,8 +644,10 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
                 // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
                 pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
                 stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+                // On a separate PME rank no synchronizer is needed, as everything is scheduled in a single stream.
+                auto xReadyOnDevice = nullptr;

-                pme_gpu_launch_spread(pme, wcycle);
+                pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
                 pme_gpu_launch_complex_transforms(pme, wcycle);
                 pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
                 output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);

diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp
index 2153e2c139..1311ab30c4 100644
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -311,8 +311,12 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
             break;

         case CodePath::GPU:
-            pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
-            break;
+        {
+            // no synchronization needed, as x is transferred in the PME stream
+            GpuEventSynchronizer *xReadyOnDevice = nullptr;
+            pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+        }
+        break;

         default:
             GMX_THROW(InternalError("Test not implemented for this mode"));

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 4c8323444d..8aa2d195f9 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -613,16 +613,18 @@ static int makePmeFlags(const StepWorkload &stepWork)
 * \param[in]  box             The box matrix
 * \param[in]  stepWork        Step schedule flags
 * \param[in]  pmeFlags        PME flags
+ * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
 * \param[in]  wcycle          The wallcycle structure
 */
-static inline void launchPmeGpuSpread(gmx_pme_t            *pmedata,
-                                      const matrix          box,
-                                      const StepWorkload   &stepWork,
-                                      int                   pmeFlags,
-                                      gmx_wallcycle_t       wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t            *pmedata,
+                                      const matrix          box,
+                                      const StepWorkload   &stepWork,
+                                      int                   pmeFlags,
+                                      GpuEventSynchronizer *xReadyOnDevice,
+                                      gmx_wallcycle_t       wcycle)
 {
     pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
-    pme_gpu_launch_spread(pmedata, wcycle);
+    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
 }

 /*! \brief Launch the FFT and gather stages of PME GPU
@@ -1028,9 +1030,12 @@ void do_force(FILE *fplog,
         stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
     }

+    const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+                                                                                                        simulationWork, stepWork) : nullptr;
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
+                           localXReadyOnDevice, wcycle);
     }

     /* do gridding for pair search */
@@ -1148,7 +1153,8 @@ void do_force(FILE *fplog,
             if (useGpuXBufOps == BufferOpsUseGpu::True)
             {
                 nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
-                                           stateGpu->getCoordinates());
+                                           stateGpu->getCoordinates(),
+                                           localXReadyOnDevice);
             }
             else
             {
@@ -1259,7 +1265,9 @@ void do_force(FILE *fplog,
                 stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
             }
             nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                       stateGpu->getCoordinates());
+                                       stateGpu->getCoordinates(),
+                                       stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+                                                                                  simulationWork, stepWork));
         }
         else
         {

diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 70c73a1657..fd591bc984 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -72,7 +72,6 @@ StatePropagatorDataGpu::Impl::Impl(const void *pmeStream,
     paddingSize_(paddingSize)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
-    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");

     // TODO: Refactor when the StreamManager is introduced.
     if (GMX_GPU == GMX_GPU_OPENCL)
@@ -136,7 +135,6 @@ StatePropagatorDataGpu::Impl::Impl(const void *pmeStream,
     paddingSize_(paddingSize)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
-    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");

     if (GMX_GPU == GMX_GPU_OPENCL)
     {
@@ -303,8 +301,6 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef

diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet      &gridSet,
-                                    const Nbnxm::AtomLocality  locality,
-                                    bool                       fillLocal,
-                                    gmx_nbnxn_gpu_t           *gpu_nbv,
-                                    DeviceBuffer<float>        d_x)
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet      &gridSet,
+                                    const Nbnxm::AtomLocality  locality,
+                                    bool                       fillLocal,
+                                    gmx_nbnxn_gpu_t           *gpu_nbv,
+                                    DeviceBuffer<float>        d_x,
+                                    GpuEventSynchronizer      *xReadyOnDevice)
 {
     int gridBegin = 0;
@@ -1102,6 +1103,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
                               fillLocal && g == 0,
                               gpu_nbv,
                               d_x,
+                              xReadyOnDevice,
                               locality,
                               g,
                               gridSet.numColumnsMax());

diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 35d711beff..bb3984d4d3 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -334,12 +334,14 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 * \param[in]     fillLocal       Tells if the local filler particle coordinates should be zeroed.
 * \param[in,out] gpu_nbv         The NBNXM GPU data structure.
 * \param[in]     d_x             Coordinates to be copied (in plain rvec format).
+ * \param[in]     xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
 */
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
-                                    Nbnxm::AtomLocality   locality,
-                                    bool                  fillLocal,
-                                    gmx_nbnxn_gpu_t      *gpu_nbv,
-                                    DeviceBuffer<float>   d_x);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
+                                    Nbnxm::AtomLocality   locality,
+                                    bool                  fillLocal,
+                                    gmx_nbnxn_gpu_t      *gpu_nbv,
+                                    DeviceBuffer<float>   d_x,
+                                    GpuEventSynchronizer *xReadyOnDevice);

/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
 *

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e6cd8253b7..7b1241e5e2 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -748,6 +748,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
                            bool                       setFillerCoords,
                            gmx_nbnxn_gpu_t           *nb,
                            DeviceBuffer<float>        d_x,
+                           GpuEventSynchronizer      *xReadyOnDevice,
                            const Nbnxm::AtomLocality  locality,
                            int                        gridId,
                            int                        numColumnsMax)
@@ -770,6 +771,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
         // TODO: This will only work with CUDA
         GMX_ASSERT(d_x, "Need a valid device pointer");

+        // ensure that coordinates are ready on the device before launching the kernel
+        GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
+        xReadyOnDevice->enqueueWaitEvent(stream);
+
         KernelLaunchConfig config;
         config.blockSize[0] = c_bufOpsThreadsPerBlock;
         config.blockSize[1] = 1;

diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 4fb8cbacda..3a796b504e 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -147,16 +147,18 @@ void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality loca
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }

-void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality  locality,
-                                               const bool                 fillLocal,
-                                               DeviceBuffer<float>        d_x)
+void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality  locality,
+                                               const bool                 fillLocal,
+                                               DeviceBuffer<float>        d_x,
+                                               GpuEventSynchronizer      *xReadyOnDevice)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);

     nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
                                    gpu_nbv,
-                                   d_x);
+                                   d_x,
+                                   xReadyOnDevice);

     wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);

diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index a5ef5baff3..4378663b93 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -280,13 +280,15 @@ struct nonbonded_verlet_t
      *
      * The API function for the transformation of the coordinates from one layout to another in the GPU memory.
      *
-     * \param[in] locality        Whether coordinates for local or non-local atoms should be transformed.
-     * \param[in] fillLocal       If the coordinates for filler particles should be zeroed.
-     * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
+     * \param[in] locality        Whether coordinates for local or non-local atoms should be transformed.
+     * \param[in] fillLocal       If the coordinates for filler particles should be zeroed.
+     * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
+     * \param[in] xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
      */
-    void convertCoordinatesGpu(Nbnxm::AtomLocality   locality,
-                               bool                  fillLocal,
-                               DeviceBuffer<float>   d_x);
+    void convertCoordinatesGpu(Nbnxm::AtomLocality   locality,
+                               bool                  fillLocal,
+                               DeviceBuffer<float>   d_x,
+                               GpuEventSynchronizer *xReadyOnDevice);

    //! Init for GPU version of setup coordinates in Nbnxm
    void atomdata_init_copy_x_to_nbat_x_gpu();

diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index afdafb80ff..1061292b2d 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -235,18 +235,20 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
 * \param[in]     setFillerCoords  If the filler coordinates are used.
 * \param[in,out] gpu_nbv          The nonbonded data GPU structure.
 * \param[in]     d_x              Device-side coordinates in plain rvec format.
+ * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in device memory.
 * \param[in]     locality         Copy coordinates for local or non-local atoms.
 * \param[in]     gridId           Index of the grid being converted.
 * \param[in]     numColumnsMax    Maximum number of columns in the grid.
 */
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused    &grid,
-                           bool gmx_unused                  setFillerCoords,
-                           gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv,
-                           DeviceBuffer<float> gmx_unused   d_x,
-                           Nbnxm::AtomLocality gmx_unused   locality,
-                           int gmx_unused                   gridId,
-                           int gmx_unused                   numColumnsMax) CUDA_FUNC_TERM;
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused    &grid,
+                           bool gmx_unused                  setFillerCoords,
+                           gmx_nbnxn_gpu_t gmx_unused      *gpu_nbv,
+                           DeviceBuffer<float> gmx_unused   d_x,
+                           GpuEventSynchronizer gmx_unused *xReadyOnDevice,
+                           Nbnxm::AtomLocality gmx_unused   locality,
+                           int gmx_unused                   gridId,
+                           int gmx_unused                   numColumnsMax) CUDA_FUNC_TERM;

/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
 * \param[in] nb The nonbonded data GPU structure
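For reference, the enqueueWaitEvent() calls used throughout this change map
onto the CUDA event API roughly as in the sketch below. This is a simplified
stand-in for GROMACS's GpuEventSynchronizer, not the actual class: only
enqueueWaitEvent() appears in the diff above, and the markEvent() producer
method is assumed here for symmetry.

#include <cuda_runtime.h>

// Simplified GpuEventSynchronizer-like wrapper (illustrative only).
class GpuEventSynchronizerSketch
{
    public:
        GpuEventSynchronizerSketch()
        {
            // Timing is disabled: the event exists purely to order streams.
            cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
        }
        ~GpuEventSynchronizerSketch()
        {
            cudaEventDestroy(event_);
        }
        // Producer side: record the event in the stream that produces the
        // data, e.g. the stream performing the coordinate H2D copy.
        void markEvent(cudaStream_t stream)
        {
            cudaEventRecord(event_, stream);
        }
        // Consumer side: make the given stream wait on the event on-device;
        // the host thread does not block.
        void enqueueWaitEvent(cudaStream_t stream)
        {
            cudaStreamWaitEvent(stream, event_, 0);
        }
    private:
        cudaEvent_t event_;
};

With this contract, correctness no longer depends on a host-blocking
streamSynchronize in copyCoordinatesToGpu: the ordering moves into the
device-side dependency between the producer and consumer streams.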