Link GPU coordinate producer and consumer tasks
author    Szilárd Páll <pall.szilard@gmail.com>    Thu, 10 Oct 2019 16:10:38 +0000 (18:10 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>    Tue, 15 Oct 2019 13:08:12 +0000 (15:08 +0200)
The event synchronizer indicating that the coordinates are ready on
the GPU is now passed to the two tasks that depend on this input: PME
and X buffer ops. Both enqueue a wait on the passed event prior to
kernel launch, ensuring that the coordinates are available before the
kernels start executing.
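
For reference, this producer/consumer link maps onto plain CUDA roughly
as follows (a minimal sketch, not GROMACS code: the stream names, the
dummy kernel, and the launch sizes are invented for illustration; the
GpuEventSynchronizer class wraps exactly this record/wait pair):

    // The producer records an event in its stream after the coordinate
    // H2D copy; the consumer enqueues a stream wait on that event before
    // launching a kernel that reads the coordinates.
    #include <cuda_runtime.h>

    __global__ void spreadKernel(const float3* x) { (void)x; /* reads coordinates */ }

    void launchWithDependency(float3* d_x, const float3* h_x, int numAtoms)
    {
        cudaStream_t updateStream, pmeStream;
        cudaEvent_t  xReadyOnDevice;
        cudaStreamCreate(&updateStream);
        cudaStreamCreate(&pmeStream);
        cudaEventCreateWithFlags(&xReadyOnDevice, cudaEventDisableTiming);

        // Producer side: asynchronous copy, then mark completion in the same stream.
        cudaMemcpyAsync(d_x, h_x, numAtoms * sizeof(float3),
                        cudaMemcpyHostToDevice, updateStream);
        cudaEventRecord(xReadyOnDevice, updateStream);

        // Consumer side: the wait is enqueued on the device; the host does not block.
        cudaStreamWaitEvent(pmeStream, xReadyOnDevice, 0);
        spreadKernel<<<(numAtoms + 127) / 128, 128, 0, pmeStream>>>(d_x);
    }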

On separate PME ranks and in tests, where a single stream is used,
no synchronization is necessary.
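
This relies on the in-order execution guarantee of a single stream:
work enqueued later cannot start before earlier work in the same
stream has completed, so an explicit event would be redundant there.
Schematically, reusing the hypothetical names from the sketch above:

    // Single-stream case: the stream itself orders the copy before the
    // kernel, so nothing is recorded or waited on (xReadyOnDevice == nullptr).
    cudaMemcpyAsync(d_x, h_x, numAtoms * sizeof(float3),
                    cudaMemcpyHostToDevice, pmeStream);
    spreadKernel<<<(numAtoms + 127) / 128, 128, 0, pmeStream>>>(d_x);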

With the on-device sync in place, this change also removes the
streamSynchronize call from copyCoordinatesToGpu.
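
That call made the copy blocking for the host; with the consumers
waiting on the event instead, the copy routine only needs to record
it. A sketch with a simplified, hypothetical signature (not the actual
StatePropagatorDataGpu API):

    // Event-based completion marking replaces the host-blocking sync.
    void copyCoordinatesToGpu(float3* d_x, const float3* h_x, int numAtoms,
                              cudaStream_t stream, cudaEvent_t xReadyOnDevice)
    {
        cudaMemcpyAsync(d_x, h_x, numAtoms * sizeof(float3),
                        cudaMemcpyHostToDevice, stream);
        cudaEventRecord(xReadyOnDevice, stream); // consumers wait on this event
        // previously here: cudaStreamSynchronize(stream); -- now removed
    }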

Refs. #2816, #3126.

Change-Id: I3457f01f44ca6d6ad08e0118d8b1def2ab0b381b

14 files changed:
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/atomdata.cpp
src/gromacs/nbnxm/atomdata.h
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index c6ec4e6fca8d1c0d8110b4a0bf9bcbfcdf4ffaba..1af591c33d2e833217404991292b9935c9fa7ed6 100644 (file)
@@ -368,10 +368,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t      *GPU_FUNC_ARG
  * Launches first stage of PME on GPU - spreading kernel.
  *
  * \param[in] pme                The PME data structure.
+ * \param[in] xReadyOnDevice     Event synchronizer indicating that the coordinates are ready in device memory.
  * \param[in] wcycle             The wallclock counter.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t      *GPU_FUNC_ARGUMENT(pme),
-                                              gmx_wallcycle  *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
+                                              GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+                                              gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
 
 /*! \brief
  * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
index 4685913dbde8e7922d0bff0e983f875b96fb4b94..8bbd184ac8a48d97e98e034c6c1415b7dcae504f 100644 (file)
@@ -175,6 +175,7 @@ void pme_gpu_prepare_computation(gmx_pme_t            *pme,
 }
 
 void pme_gpu_launch_spread(gmx_pme_t            *pme,
+                           GpuEventSynchronizer *xReadyOnDevice,
                            gmx_wallcycle        *wcycle)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
@@ -190,7 +191,7 @@ void pme_gpu_launch_spread(gmx_pme_t            *pme,
         const bool spreadCharges  = true;
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
-        pme_gpu_spread(pmeGpu, gridIndex, fftgrid, computeSplines, spreadCharges);
+        pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }
index 7d71782183dafc9b4c5caec0637aac125b5ad8d6..e98ae3a223c8be7ee88cd7aad5682b32866f2442 100644 (file)
@@ -1014,11 +1014,12 @@ std::pair<int, int> inline pmeGpuCreateGrid(const PmeGpu *pmeGpu, int blockCount
     return std::pair<int, int>(colCount, minRowCount);
 }
 
-void pme_gpu_spread(const PmeGpu    *pmeGpu,
-                    int gmx_unused   gridIndex,
-                    real            *h_grid,
-                    bool             computeSplines,
-                    bool             spreadCharges)
+void pme_gpu_spread(const PmeGpu         *pmeGpu,
+                    GpuEventSynchronizer *xReadyOnDevice,
+                    int gmx_unused        gridIndex,
+                    real                 *h_grid,
+                    bool                  computeSplines,
+                    bool                  spreadCharges)
 {
     GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
     const auto   *kernelParamsPtr = pmeGpu->kernelParams.get();
@@ -1037,6 +1038,16 @@ void pme_gpu_spread(const PmeGpu    *pmeGpu,
     //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
     GMX_ASSERT(!c_usePadding || !(c_pmeAtomDataAlignment % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
 
+    // Ensure that the coordinates are ready on the device before launching spread;
+    // this is only needed with CUDA on PP+PME ranks. On separate PME ranks, in unit
+    // tests, and with OpenCL a single stream is used, so no wait is needed there
+    // (hence xReadyOnDevice == nullptr).
+    // Note: consider adding an assertion on xReadyOnDevice once separate PME ranks can be detected here.
+    if (xReadyOnDevice)
+    {
+        xReadyOnDevice->enqueueWaitEvent(pmeGpu->archSpecific->pmeStream);
+    }
+
     const int          blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
     auto               dimGrid    = pmeGpuCreateGrid(pmeGpu, blockCount);
 
index bdd92bb83be24a3bcd10bc147c8a638bf9cd5640..0f66915b0d65374f52ae7972b61272c0d2cc2cd5 100644 (file)
@@ -383,17 +383,20 @@ void pme_gpu_get_timings(const PmeGpu            *pmeGpu,
  * A GPU spline computation and charge spreading function.
  *
  * \param[in]  pmeGpu          The PME GPU structure.
+ * \param[in]  xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory;
+ *                             can be nullptr when invoked on a separate PME rank or from PME tests.
  * \param[in]  gridIndex       Index of the PME grid - unused, assumed to be 0.
  * \param[out] h_grid          The host-side grid buffer (used only if the result of the spread is expected on the host,
  *                             e.g. testing or host-side FFT)
  * \param[in]  computeSplines  Should the computation of spline parameters and gridline indices be performed.
  * \param[in]  spreadCharges   Should the charges/coefficients be spread on the grid.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu    *GPU_FUNC_ARGUMENT(pmeGpu),
-                                       int              GPU_FUNC_ARGUMENT(gridIndex),
-                                       real            *GPU_FUNC_ARGUMENT(h_grid),
-                                       bool             GPU_FUNC_ARGUMENT(computeSplines),
-                                       bool             GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu         *GPU_FUNC_ARGUMENT(pmeGpu),
+                                       GpuEventSynchronizer *GPU_FUNC_ARGUMENT(xReadyOnDevice),
+                                       int                   GPU_FUNC_ARGUMENT(gridIndex),
+                                       real                 *GPU_FUNC_ARGUMENT(h_grid),
+                                       bool                  GPU_FUNC_ARGUMENT(computeSplines),
+                                       bool                  GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
 
 /*! \libinternal \brief
  * 3D FFT R2C/C2R routine.
index 10ceb2ce3b8be2e2890c67d1af8681885374285d..8e5ac1de84ce4faf8bcd223b3db43738ccb3d994 100644 (file)
@@ -644,8 +644,10 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
             pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
             stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+            // On a separate PME rank no synchronizer is needed, as everything is scheduled in a single stream
+            GpuEventSynchronizer *xReadyOnDevice = nullptr;
 
-            pme_gpu_launch_spread(pme, wcycle);
+            pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
             pme_gpu_launch_complex_transforms(pme, wcycle);
             pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
             output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);
index 2153e2c1395b554b397c784b099c49747b7bf0b8..1311ab30c46b22860670043f7e3607b14d52701f 100644 (file)
@@ -311,8 +311,12 @@ void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode, // TODO const qual
             break;
 
         case CodePath::GPU:
-            pme_gpu_spread(pme->gpu, gridIndex, fftgrid, computeSplines, spreadCharges);
-            break;
+        {
+            // no synchronization needed as x is transferred in the PME stream
+            GpuEventSynchronizer *xReadyOnDevice = nullptr;
+            pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+        }
+        break;
 
         default:
             GMX_THROW(InternalError("Test not implemented for this mode"));
index 4c8323444d1d37180d6c61af1fed89376668edb7..8aa2d195f9ed0b49552eb957d4a540249f1f8a5f 100644 (file)
@@ -613,16 +613,18 @@ static int makePmeFlags(const StepWorkload &stepWork)
  * \param[in]  box                  The box matrix
  * \param[in]  stepWork             Step schedule flags
  * \param[in]  pmeFlags             PME flags
+ * \param[in]  xReadyOnDevice       Event synchronizer indicating that the coordinates are ready in device memory.
  * \param[in]  wcycle               The wallcycle structure
  */
-static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
-                                      const matrix        box,
-                                      const StepWorkload &stepWork,
-                                      int                 pmeFlags,
-                                      gmx_wallcycle_t     wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t            *pmedata,
+                                      const matrix          box,
+                                      const StepWorkload   &stepWork,
+                                      int                   pmeFlags,
+                                      GpuEventSynchronizer *xReadyOnDevice,
+                                      gmx_wallcycle_t       wcycle)
 {
     pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
-    pme_gpu_launch_spread(pmedata, wcycle);
+    pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
 }
 
 /*! \brief Launch the FFT and gather stages of PME GPU
@@ -1028,9 +1030,12 @@ void do_force(FILE                                     *fplog,
         stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
     }
 
+    const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+                                                                                                        simulationWork, stepWork) : nullptr;
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
+                           localXReadyOnDevice, wcycle);
     }
 
     /* do gridding for pair search */
@@ -1148,7 +1153,8 @@ void do_force(FILE                                     *fplog,
         if (useGpuXBufOps == BufferOpsUseGpu::True)
         {
             nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
-                                       stateGpu->getCoordinates());
+                                       stateGpu->getCoordinates(),
+                                       localXReadyOnDevice);
         }
         else
         {
@@ -1259,7 +1265,9 @@ void do_force(FILE                                     *fplog,
                     stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                 }
                 nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                           stateGpu->getCoordinates());
+                                           stateGpu->getCoordinates(),
+                                           stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+                                                                                      simulationWork, stepWork));
             }
             else
             {
index 70c73a1657d38fb21041e6b1a9cabd3d231ff04f..fd591bc9846c822617efc7a4d0999b389c0ec81d 100644 (file)
@@ -72,7 +72,6 @@ StatePropagatorDataGpu::Impl::Impl(const void            *pmeStream,
     paddingSize_(paddingSize)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
-    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
 
     // TODO: Refactor when the StreamManager is introduced.
     if (GMX_GPU == GMX_GPU_OPENCL)
@@ -136,7 +135,6 @@ StatePropagatorDataGpu::Impl::Impl(const void            *pmeStream,
     paddingSize_(paddingSize)
 {
     static_assert(GMX_GPU != GMX_GPU_NONE, "This object should only be constructed on the GPU code-paths.");
-    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
 
     if (GMX_GPU == GMX_GPU_OPENCL)
     {
@@ -303,8 +301,6 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
     if (GMX_GPU == GMX_GPU_CUDA)
     {
         xReadyOnDevice_[atomLocality].markEvent(commandStream);
-        // TODO: Remove When event-based synchronization is introduced
-        gpuStreamSynchronize(commandStream);
     }
 }
 
@@ -319,6 +315,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE
     //
     // TODO: This should be reconsidered to support the halo exchange.
     //
+    // In OpenCL no events are used, as coordinate synchronization is not necessary
+    if (GMX_GPU == GMX_GPU_OPENCL)
+    {
+        return nullptr;
+    }
     if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
     {
         return &xUpdatedOnDevice_;
index 4a22cebe83e32e21dbde7514f7341881c3e0f8bf..99b69b0f9a49bd5d9e997a73be468356c3b54066 100644 (file)
@@ -1085,11 +1085,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
 }
 
 /* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
-                                    const Nbnxm::AtomLocality locality,
-                                    bool                      fillLocal,
-                                    gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    DeviceBuffer<float>       d_x)
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet       &gridSet,
+                                    const Nbnxm::AtomLocality   locality,
+                                    bool                        fillLocal,
+                                    gmx_nbnxn_gpu_t            *gpu_nbv,
+                                    DeviceBuffer<float>         d_x,
+                                    GpuEventSynchronizer       *xReadyOnDevice)
 {
 
     int gridBegin = 0;
@@ -1102,6 +1103,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
                               fillLocal && g == 0,
                               gpu_nbv,
                               d_x,
+                              xReadyOnDevice,
                               locality,
                               g,
                               gridSet.numColumnsMax());
index 35d711beffc3902949d18d1c086528abe57e8d35..bb3984d4d3535a73a27d08957e63fd431db70bc6 100644 (file)
@@ -334,12 +334,14 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet       &gridSet,
  * \param[in]     fillLocal  Tells if the local filler particle coordinates should be zeroed.
  * \param[in,out] gpu_nbv    The NBNXM GPU data structure.
  * \param[in]     d_x        Coordinates to be copied (in plain rvec format).
+ * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in device memory.
  */
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
-                                    Nbnxm::AtomLocality       locality,
-                                    bool                      fillLocal,
-                                    gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    DeviceBuffer<float>       d_x);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet      &gridSet,
+                                    Nbnxm::AtomLocality        locality,
+                                    bool                       fillLocal,
+                                    gmx_nbnxn_gpu_t           *gpu_nbv,
+                                    DeviceBuffer<float>        d_x,
+                                    GpuEventSynchronizer      *xReadyOnDevice);
 
 /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
  *
index e6cd8253b752af1a6faad70e83dd79139e91bae4..7b1241e5e2ca56f1bdfee93a950964d11692d3e0 100644 (file)
@@ -748,6 +748,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                            bool                             setFillerCoords,
                            gmx_nbnxn_gpu_t                 *nb,
                            DeviceBuffer<float>              d_x,
+                           GpuEventSynchronizer            *xReadyOnDevice,
                            const Nbnxm::AtomLocality        locality,
                            int                              gridId,
                            int                              numColumnsMax)
@@ -770,6 +771,10 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
         // TODO: This will only work with CUDA
         GMX_ASSERT(d_x, "Need a valid device pointer");
 
+        // ensure that coordinates are ready on the device before launching the kernel
+        GMX_ASSERT(xReadyOnDevice, "Need a valid GpuEventSynchronizer object");
+        xReadyOnDevice->enqueueWaitEvent(stream);
+
         KernelLaunchConfig config;
         config.blockSize[0]     = c_bufOpsThreadsPerBlock;
         config.blockSize[1]     = 1;
index 4fb8cbacdafaf2028d967349034c2f40ec9bc138..3a796b504e67ada819190c51ffcd665e649e591e 100644 (file)
@@ -147,16 +147,18 @@ void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality       loca
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }
 
-void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality       locality,
-                                               const bool                      fillLocal,
-                                               DeviceBuffer<float>             d_x)
+void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality        locality,
+                                               const bool                       fillLocal,
+                                               DeviceBuffer<float>              d_x,
+                                               GpuEventSynchronizer            *xReadyOnDevice)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
 
     nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
                                    gpu_nbv,
-                                   d_x);
+                                   d_x,
+                                   xReadyOnDevice);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
index a5ef5baff3f74d6cf2558c369fe0bf587b4e01bf..4378663b933cea0b1139e7596a12ce59cb7d2865 100644 (file)
@@ -280,13 +280,15 @@ struct nonbonded_verlet_t
          *
          * The API function for the transformation of the coordinates from one layout to another in the GPU memory.
          *
-         * \param[in] locality   Whether coordinates for local or non-local atoms should be transformed.
-         * \param[in] fillLocal  If the coordinates for filler particles should be zeroed.
-         * \param[in] d_x        GPU coordinates buffer in plain rvec format to be transformed.
+         * \param[in] locality        Whether coordinates for local or non-local atoms should be transformed.
+         * \param[in] fillLocal       If the coordinates for filler particles should be zeroed.
+         * \param[in] d_x             GPU coordinates buffer in plain rvec format to be transformed.
+         * \param[in] xReadyOnDevice  Event synchronizer indicating that the coordinates are ready in device memory.
          */
-        void convertCoordinatesGpu(Nbnxm::AtomLocality             locality,
-                                   bool                            fillLocal,
-                                   DeviceBuffer<float>             d_x);
+        void convertCoordinatesGpu(Nbnxm::AtomLocality              locality,
+                                   bool                             fillLocal,
+                                   DeviceBuffer<float>              d_x,
+                                   GpuEventSynchronizer            *xReadyOnDevice);
 
         //! Init for GPU version of setup coordinates in Nbnxm
         void atomdata_init_copy_x_to_nbat_x_gpu();
index afdafb80ff8fa343726676861f0ccb19d4be9c2e..1061292b2da57a9dd50eb8db0332a2736eeaecbd 100644 (file)
@@ -235,18 +235,20 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
  * \param[in]     setFillerCoords  If the filler coordinates are used.
  * \param[in,out] gpu_nbv          The nonbonded data GPU structure.
  * \param[in]     d_x              Device-side coordinates in plain rvec format.
+ * \param[in]     xReadyOnDevice   Event synchronizer indicating that the coordinates are ready in device memory.
  * \param[in]     locality         Copy coordinates for local or non-local atoms.
  * \param[in]     gridId           Index of the grid being converted.
  * \param[in]     numColumnsMax    Maximum number of columns in the grid.
  */
 CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid   gmx_unused &grid,
-                           bool                gmx_unused  setFillerCoords,
-                           gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
-                           DeviceBuffer<float> gmx_unused  d_x,
-                           Nbnxm::AtomLocality gmx_unused  locality,
-                           int                 gmx_unused  gridId,
-                           int                 gmx_unused  numColumnsMax) CUDA_FUNC_TERM;
+void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid    gmx_unused &grid,
+                           bool                 gmx_unused  setFillerCoords,
+                           gmx_nbnxn_gpu_t      gmx_unused *gpu_nbv,
+                           DeviceBuffer<float>  gmx_unused  d_x,
+                           GpuEventSynchronizer gmx_unused *xReadyOnDevice,
+                           Nbnxm::AtomLocality  gmx_unused  locality,
+                           int                  gmx_unused  gridId,
+                           int                  gmx_unused  numColumnsMax) CUDA_FUNC_TERM;
 
 /*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
  * \param[in] nb                   The nonbonded data GPU structure