From 27eef47200e4bd955962596212e7f6c52368a75b Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Fri, 27 Oct 2017 18:24:36 +0200
Subject: [PATCH] Implement alternating GPU wait

When both PME and nonbonded tasks are offloaded, instead of waiting in
a blocking call for each task in a predefined order, we poll the GPU
streams and start the reduction of the forces of the task that finishes
first. This allows overlapping one of the reductions with the GPU
compute/transfer of the task arriving second.

Change-Id: I612a0c5cae54bee04c1d587b98b6fc534e766de6
---
 docs/user-guide/environment-variables.rst |   6 ++
 src/gromacs/ewald/pme-gpu-internal.cpp    |  11 +-
 src/gromacs/ewald/pme-gpu-internal.h      |  20 +++-
 src/gromacs/ewald/pme-gpu.cpp             |  78 +++++++++++---
 src/gromacs/ewald/pme-only.cpp            |   2 +-
 src/gromacs/ewald/pme.cu                  |   5 +
 src/gromacs/ewald/pme.h                   |  40 ++++++-
 src/gromacs/gpu_utils/cudautils.cuh       |  28 ++++-
 src/gromacs/gpu_utils/gpu_utils.h         |   7 ++
 src/gromacs/gpu_utils/oclutils.h          |  12 +++
 src/gromacs/mdlib/nbnxn_gpu.h             |  57 ++++++++--
 src/gromacs/mdlib/nbnxn_gpu_common.h      |  71 +++++++++---
 src/gromacs/mdlib/sim_util.cpp            | 125 +++++++++++++++++++---
 13 files changed, 394 insertions(+), 68 deletions(-)

diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst
index 4cbd001f1e..5ed6d03587 100644
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -113,6 +113,12 @@ Debugging
         over-ride the number of DD pulses used
         (default 0, meaning no over-ride). Normally 1 or 2.
 
+``GMX_DISABLE_ALTERNATING_GPU_WAIT``
+    disables the specialized polling wait used when both the PME and nonbonded
+    tasks run on a GPU, in which the force reduction of whichever task finishes
+    first is overlapped with the still-running GPU work of the other. Setting
+    this variable switches to the generic path with a fixed waiting order.
+
 There are a number of extra environment variables like these
 that are used in debugging - check the code!
 
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp
index 869283b3c9..7a033ca7ff 100644
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -133,21 +133,12 @@ void pme_gpu_update_input_box(PmeGpu gmx_unused *pmeGpu,
  *
  * \param[in] pmeGpu            The PME GPU structure.
  */
-static void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
+void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
 {
     pme_gpu_clear_grids(pmeGpu);
     pme_gpu_clear_energy_virial(pmeGpu);
 }
 
-void pme_gpu_finish_computation(const PmeGpu *pmeGpu)
-{
-    // Synchronize the whole PME stream at once, including D2H result transfers.
-    pme_gpu_synchronize(pmeGpu);
-
-    pme_gpu_update_timings(pmeGpu);
-    pme_gpu_reinit_computation(pmeGpu);
-}
-
 /*! \brief \libinternal
  * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
  *
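The removal of the monolithic pme_gpu_finish_computation() above builds on two CUDA runtime primitives: the old path's blocking synchronization, and the non-blocking query that polling relies on. A minimal standalone sketch of the two (not part of the patch; error handling omitted for brevity):

    #include <cuda_runtime.h>

    // Blocking wait: returns only once all work enqueued in the stream has completed.
    static void waitForStream(cudaStream_t s)
    {
        cudaStreamSynchronize(s);
    }

    // Non-blocking poll: cudaSuccess means the stream is idle,
    // cudaErrorNotReady means work is still in flight.
    static bool isStreamIdle(cudaStream_t s)
    {
        return cudaStreamQuery(s) == cudaSuccess;
    }
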
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h
index b9d0b51484..454a44cfda 100644
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -197,6 +197,15 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pm
  */
 CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
 
+/*! \libinternal \brief
+ * Checks whether work in the PME GPU stream has completed.
+ *
+ * \param[in] pmeGpu            The PME GPU structure.
+ *
+ * \returns                     True if all work in the PME stream has completed.
+ */
+CUDA_FUNC_QUALIFIER bool pme_gpu_stream_query(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(0)
+
 /*! \libinternal \brief
  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
  *
@@ -573,7 +582,6 @@ gmx::ArrayRef<const gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGpu);
 
 /*! \libinternal \brief
  * Returns the output virial and energy of the PME solving.
- * Should be called after pme_gpu_finish_computation.
  *
  * \param[in] pmeGpu            The PME GPU structure.
  * \param[out] energy           The output energy.
@@ -660,4 +668,14 @@ void pme_gpu_reinit_atoms(PmeGpu            *pmeGpu,
                           const int          nAtoms,
                           const real        *charges);
 
+/*! \brief \libinternal
+ * The PME GPU reinitialization function that is called both at the end of any
+ * PME computation and on any load balancing.
+ *
+ * This clears the device-side working buffers in preparation for new computation.
+ *
+ * \param[in] pmeGpu            The PME GPU structure.
+ */
+void pme_gpu_reinit_computation(const PmeGpu *pmeGpu);
+
 #endif
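The pme_gpu_stream_query() declaration above is the building block of a check-or-wait completion contract. A self-contained model of that contract, with std::async standing in for a GPU stream (all names here are illustrative, not GROMACS API):

    #include <chrono>
    #include <future>
    #include <thread>

    enum class TaskCompletion { Wait, Check };

    // Returns true once the task has finished; in Check mode it never blocks.
    static bool tryFinishTask(std::future<void> &task, TaskCompletion kind)
    {
        if (kind == TaskCompletion::Check)
        {
            return task.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
        }
        task.wait(); // blocking path, mirroring GpuTaskCompletion::Wait
        return true;
    }

    int main()
    {
        auto task = std::async(std::launch::async, []
                               { std::this_thread::sleep_for(std::chrono::milliseconds(10)); });
        while (!tryFinishTask(task, TaskCompletion::Check))
        {
            // overlapped CPU work would go here
        }
    }
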
diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp
index 2a29e25be2..45948a2e02 100644
--- a/src/gromacs/ewald/pme-gpu.cpp
+++ b/src/gromacs/ewald/pme-gpu.cpp
@@ -316,21 +316,19 @@ void pme_gpu_launch_gather(const gmx_pme_t *pme,
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
 
-void
-pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
-                     gmx_wallcycle_t                 wcycle,
-                     gmx::ArrayRef<const gmx::RVec> *forces,
-                     matrix                          virial,
-                     real                           *energy)
+/*! \brief Reduce staged virial and energy outputs.
+ *
+ * \param[in]  pme    The PME data structure.
+ * \param[out] forces Output forces pointer; the internal ArrayRef gets assigned to it.
+ * \param[out] virial The output virial matrix.
+ * \param[out] energy The output energy.
+ */
+static void pme_gpu_get_staged_results(const gmx_pme_t                *pme,
+                                       gmx::ArrayRef<const gmx::RVec> *forces,
+                                       matrix                          virial,
+                                       real                           *energy)
 {
-    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
     const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
-
-    wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
-    pme_gpu_finish_computation(pme->gpu);
-    wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
-
     *forces = pme_gpu_get_forces(pme->gpu);
 
     if (haveComputedEnergyAndVirial)
@@ -345,3 +343,57 @@ pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
         }
     }
 }
+
+bool pme_gpu_try_finish_task(const gmx_pme_t                *pme,
+                             gmx_wallcycle_t                 wcycle,
+                             gmx::ArrayRef<const gmx::RVec> *forces,
+                             matrix                          virial,
+                             real                           *energy,
+                             GpuTaskCompletion               completionKind)
+{
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+    if (completionKind == GpuTaskCompletion::Check)
+    {
+        // Query the PME stream for completion of all tasks enqueued and,
+        // if we're not done, stop the timer before the early return.
+        if (!pme_gpu_stream_query(pme->gpu))
+        {
+            wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+            return false;
+        }
+    }
+    else
+    {
+        // Synchronize the whole PME stream at once, including D2H result transfers.
+        pme_gpu_synchronize(pme->gpu);
+    }
+    wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+    // Time the final staged data handling separately with a counting call to get
+    // the call count right.
+    wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+    // The computation has completed; do the timing accounting and reset the buffers.
+    pme_gpu_update_timings(pme->gpu);
+    // TODO: move this later and launch it together with the other
+    // non-bonded tasks at the end of the step
+    pme_gpu_reinit_computation(pme->gpu);
+
+    pme_gpu_get_staged_results(pme, forces, virial, energy);
+
+    wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+    return true;
+}
+
+void pme_gpu_wait_finish_task(const gmx_pme_t                *pme,
+                              gmx_wallcycle_t                 wcycle,
+                              gmx::ArrayRef<const gmx::RVec> *forces,
+                              matrix                          virial,
+                              real                           *energy)
+{
+    pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);
+}
diff --git a/src/gromacs/ewald/pme-only.cpp b/src/gromacs/ewald/pme-only.cpp
index 3f219be173..51807c72f8 100644
--- a/src/gromacs/ewald/pme-only.cpp
+++ b/src/gromacs/ewald/pme-only.cpp
@@ -637,7 +637,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
             pme_gpu_launch_complex_transforms(pme, wcycle);
             pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
-            pme_gpu_wait_for_gpu(pme, wcycle, &forces, vir_q, &energy_q);
+            pme_gpu_wait_finish_task(pme, wcycle, &forces, vir_q, &energy_q);
         }
         else
         {
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu
index aec0074d2d..f677c7de37 100644
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -373,6 +373,11 @@ void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
                          pmeGpu->deviceInfo);
 }
 
+bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
+{
+    return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
+}
+
 void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
 {
     const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
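pme_gpu_try_finish_task() above keeps the ewcWAIT_GPU_PME_GATHER call count correct even though polling may enter the function many times per step: polling entries use a non-counting timer start, and only the completing entry issues a counting one. A toy model of that accounting pattern (Counter is illustrative, not the GROMACS wallcycle API):

    #include <chrono>

    // A wait timer that can be entered many times (polling) but counted once.
    struct Counter
    {
        int    calls   = 0;
        double seconds = 0;
        std::chrono::steady_clock::time_point t0;

        void startNoCount() { t0 = std::chrono::steady_clock::now(); }          // accumulate time only
        void start()        { ++calls; t0 = std::chrono::steady_clock::now(); } // time and call count
        void stop()
        {
            seconds += std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
        }
    };
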
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index 7dbf2a19be..d8d8838ddb 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -66,6 +66,8 @@ struct gmx_wallclock_gpu_pme_t;
 struct gmx_device_info_t;
 struct gmx_pme_t;
 
+enum class GpuTaskCompletion;
+
 namespace gmx
 {
 class ForceWithVirial;
@@ -340,10 +342,38 @@ void pme_gpu_launch_gather(const gmx_pme_t        *pme,
  * \param[out] virial         The output virial matrix.
  * \param[out] energy         The output energy.
  */
-void pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
-                          gmx_wallcycle_t                 wcycle,
-                          gmx::ArrayRef<const gmx::RVec> *forces,
-                          matrix                          virial,
-                          real                           *energy);
+void pme_gpu_wait_finish_task(const gmx_pme_t                *pme,
+                              gmx_wallcycle_t                 wcycle,
+                              gmx::ArrayRef<const gmx::RVec> *forces,
+                              matrix                          virial,
+                              real                           *energy);
+
+/*! \brief
+ * Attempts to complete PME GPU tasks.
+ *
+ * The \p completionKind argument controls whether the function blocks until all
+ * enqueued PME GPU tasks have completed (as pme_gpu_wait_finish_task() does) or
+ * only checks and returns immediately if they have not.
+ * When blocking, or when the tasks have completed, it also gets the output forces
+ * by assigning the ArrayRef to the \p forces pointer passed in.
+ * Virial/energy are also outputs if they were to be computed.
+ *
+ * Note: also launches the reinitialization of the PME output buffers.
+ * TODO: this should be moved out to avoid miscounting its wall-time (as wait instead of launch).
+ *
+ * \param[in]  pme            The PME data structure.
+ * \param[in]  wcycle         The wallclock counter.
+ * \param[out] forces         The output forces.
+ * \param[out] virial         The output virial matrix.
+ * \param[out] energy         The output energy.
+ * \param[in]  completionKind Indicates whether PME task completion should only be checked rather than waited for.
+ * \returns                   True if the PME GPU tasks have completed.
+ */
+bool pme_gpu_try_finish_task(const gmx_pme_t                *pme,
+                             gmx_wallcycle_t                 wcycle,
+                             gmx::ArrayRef<const gmx::RVec> *forces,
+                             matrix                          virial,
+                             real                           *energy,
+                             GpuTaskCompletion               completionKind);
+
 #endif
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index dae3548fdd..b4cca13bb5 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -266,7 +266,7 @@ static inline void rvec_inc(rvec a, const float3 b)
     rvec_inc(a, tmp);
 }
 
-/*! \brief Calls cudaStreamSynchronize() in the stream \p s.
+/*! \brief Waits for all tasks in stream \p s to complete.
  *
  * \param[in] s stream to synchronize with
  */
@@ -276,4 +276,30 @@ static inline void gpuStreamSynchronize(cudaStream_t s)
     CU_RET_ERR(stat, "cudaStreamSynchronize failed");
 }
 
+/*! \brief Returns true if all tasks in \p s have completed.
+ *
+ * \param[in] s stream to check
+ *
+ * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ */
+static inline bool haveStreamTasksCompleted(cudaStream_t s)
+{
+    cudaError_t stat = cudaStreamQuery(s);
+
+    if (stat == cudaErrorNotReady)
+    {
+        // work is still in progress in the stream
+        return false;
+    }
+
+    GMX_ASSERT(stat != cudaErrorInvalidResourceHandle, "Stream identifier not valid");
+
+    // cudaSuccess and cudaErrorNotReady are the expected return values
+    CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure");
+
+    GMX_ASSERT(stat == cudaSuccess, "Values other than cudaSuccess should have been explicitly handled");
+
+    return true;
+}
+
 #endif
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index a3254bd467..88323f8762 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -67,6 +67,13 @@ enum class GpuApiCallBehavior
     Async
 };
 
+//! Types of actions associated with waiting or checking the completion of GPU tasks
+enum class GpuTaskCompletion
+{
+    Wait, /*!< Issue a blocking wait for the task */
+    Check /*!< Only check whether the task has completed */
+};
+
 /*! \brief Detect all GPUs in the system.
  *
  * Will detect every GPU supported by the device driver in use. Also
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 184eb6dbfb..2b2c82a88a 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -165,4 +165,16 @@ static inline void gpuStreamSynchronize(cl_command_queue s)
                        ("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
 }
 
+/*! \brief Dummy stub for the stream-completion query; not implemented for OpenCL.
+ *
+ * \param[in] s queue to check
+ *
+ * \returns Never returns; calling this stub triggers a release assertion.
+ */
+static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
+{
+    GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL");
+    return false;
+}
+
 #endif
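The OpenCL stub above deliberately asserts, since OpenCL offers no direct queue-wide counterpart of cudaStreamQuery(). One possible future implementation, sketched here under the assumption that the caller tracks the last event enqueued in the queue, could poll that event's execution status:

    #include <CL/cl.h>

    // Polls the execution status of the given event; returns true only when the
    // command it tracks has completed.
    static bool haveEventTasksCompleted(cl_event lastEnqueuedEvent)
    {
        cl_int status = 0;
        cl_int err    = clGetEventInfo(lastEnqueuedEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
                                       sizeof(status), &status, nullptr);
        return (err == CL_SUCCESS) && (status == CL_COMPLETE);
    }
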
diff --git a/src/gromacs/mdlib/nbnxn_gpu.h b/src/gromacs/mdlib/nbnxn_gpu.h
index cdb68597af..d5823c4edc 100644
--- a/src/gromacs/mdlib/nbnxn_gpu.h
+++ b/src/gromacs/mdlib/nbnxn_gpu.h
@@ -54,6 +54,7 @@ extern "C" {
 #endif
 
 struct nbnxn_atomdata_t;
+enum class GpuTaskCompletion;
 
 /*! \brief
  * Launch asynchronously the nonbonded force calculations.
@@ -121,8 +122,48 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
                               int              gmx_unused  flags,
                               int              gmx_unused  aloc) GPU_FUNC_TERM
 
-/*! \brief
- * Wait for the asynchronously launched nonbonded tasks and data
+/*! \brief Attempts to complete the nonbonded GPU task.
+ *
+ * This function attempts to complete the nonbonded task (both GPU and CPU auxiliary work).
+ * Success, i.e. that the tasks completed and results are ready to be consumed, is signaled
+ * by the return value (always true if the blocking wait mode is requested).
+ *
+ * The \p completionKind parameter controls whether the behavior is non-blocking
+ * (achieved by passing GpuTaskCompletion::Check) or a blocking wait until the results
+ * are ready (when GpuTaskCompletion::Wait is passed).
+ * As in "Check" mode the function returns immediately if the GPU stream
+ * still contains tasks that have not completed, it allows more flexible overlapping
+ * of work on the CPU with GPU execution.
+ *
+ * Note that it is only safe to use the results, and to continue to the next MD
+ * step, when this function has returned true, which indicates successful completion of
+ * - all nonbonded GPU tasks: both compute and device transfer(s),
+ * - auxiliary tasks: updating the internal module state (timing accumulation, list pruning states), and
+ * - internal staging reduction of (\p fshift, \p e_el, \p e_lj).
+ *
+ * TODO: improve the handling of outputs e.g. by ensuring that this function explicitly returns the
+ * force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
+ * the energy and Fshift contributions for some external/centralized reduction.
+ *
+ * \param[in]  nb             The nonbonded data GPU structure
+ * \param[in]  flags          Force flags
+ * \param[in]  aloc           Atom locality identifier
+ * \param[out] e_lj           Pointer to the LJ energy output to accumulate into
+ * \param[out] e_el           Pointer to the electrostatics energy output to accumulate into
+ * \param[out] fshift         Pointer to the shift force buffer to accumulate into
+ * \param[in]  completionKind Indicates whether nonbonded task completion should only be checked rather than waited for
+ * \returns                   True if the nonbonded tasks associated with \p aloc locality have completed
+ */
+GPU_FUNC_QUALIFIER
+bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t   gmx_unused *nb,
+                               int               gmx_unused  flags,
+                               int               gmx_unused  aloc,
+                               real              gmx_unused *e_lj,
+                               real              gmx_unused *e_el,
+                               rvec              gmx_unused *fshift,
+                               GpuTaskCompletion gmx_unused  completionKind) GPU_FUNC_TERM_WITH_RETURN(false)
+
+/*! \brief Blocking wait for the nonbonded GPU tasks and data
  * transfers to finish.
  *
  * Also does timing accounting and reduction of the internal staging buffers.
@@ -137,12 +178,12 @@ void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
  * \param[out] fshift Pointer to the shift force buffer to accumulate into
  */
 GPU_FUNC_QUALIFIER
-void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t gmx_unused *nb,
-                            int             gmx_unused  flags,
-                            int             gmx_unused  aloc,
-                            real            gmx_unused *e_lj,
-                            real            gmx_unused *e_el,
-                            rvec            gmx_unused *fshift) GPU_FUNC_TERM
+void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+                                int             gmx_unused  flags,
+                                int             gmx_unused  aloc,
+                                real            gmx_unused *e_lj,
+                                real            gmx_unused *e_el,
+                                rvec            gmx_unused *fshift) GPU_FUNC_TERM
 
 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off.
  */
 GPU_FUNC_QUALIFIER
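nbnxn_gpu_wait_finish_task() is now just the blocking special case of nbnxn_gpu_try_finish_task(), as the implementation in nbnxn_gpu_common.h below shows. A self-contained model of this try/wait pairing, with an atomic flag set by a worker thread standing in for the GPU stream (names are illustrative):

    #include <atomic>
    #include <thread>

    enum class Completion { Wait, Check };

    // Check never blocks; Wait spins until the worker signals completion.
    static bool tryFinish(const std::atomic<bool> &done, Completion kind)
    {
        if (kind == Completion::Check)
        {
            return done.load(std::memory_order_acquire);
        }
        while (!done.load(std::memory_order_acquire))
        {
            std::this_thread::yield();
        }
        return true;
    }

    // The blocking API is a thin wrapper over the polling one, exactly like
    // nbnxn_gpu_wait_finish_task() forwarding to nbnxn_gpu_try_finish_task().
    static void waitFinish(const std::atomic<bool> &done)
    {
        tryFinish(done, Completion::Wait);
    }
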
diff --git a/src/gromacs/mdlib/nbnxn_gpu_common.h b/src/gromacs/mdlib/nbnxn_gpu_common.h
index 63519e0daa..b11be12411 100644
--- a/src/gromacs/mdlib/nbnxn_gpu_common.h
+++ b/src/gromacs/mdlib/nbnxn_gpu_common.h
@@ -55,6 +55,7 @@
 #include "nbnxn_ocl/nbnxn_ocl_types.h"
 #endif
 
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/nbnxn_gpu_types.h"
 #include "gromacs/pbcutil/ishift.h"
@@ -227,6 +228,10 @@ static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
  * nonbonded tasks have completed with the exception of the rolling pruning kernels
  * that are accounted for during the following step.
  *
+ * NOTE: if timing with multiple GPUs (streams) becomes possible, the
+ * counters could end up being inconsistent due to not being incremented
+ * on some of the nodes when this is skipped on empty local domains!
+ *
 * \tparam GpuTimers GPU timers type
 * \tparam GpuPairlist Pair list type
 * \param[out] timings Pointer to the NB GPU timings data
@@ -294,27 +299,35 @@ static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timin
     }
 }
 
-// Documented in nbnxn_gpu.h
-void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t *nb,
-                            int              flags,
-                            int              aloc,
-                            real            *e_lj,
-                            real            *e_el,
-                            rvec            *fshift)
+bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
+                               int               flags,
+                               int               aloc,
+                               real             *e_lj,
+                               real             *e_el,
+                               rvec             *fshift,
+                               GpuTaskCompletion completionKind)
 {
     /* determine interaction locality from atom locality */
     int iLocality = gpuAtomToInteractionLocality(aloc);
 
-    /* Launch wait/update timers & counters and do reduction into staging buffers
-       BUT skip it when during the non-local phase there was actually no work to do.
-       This is consistent with nbnxn_gpu_launch_kernel.
-
-       NOTE: if timing with multiple GPUs (streams) becomes possible, the
-       counters could end up being inconsistent due to not being incremented
-       on some of the nodes! */
+    // We skip it when during the non-local phase there was actually no work to do.
+    // This is consistent with nbnxn_gpu_launch_kernel.
     if (!canSkipWork(nb, iLocality))
     {
-        gpuStreamSynchronize(nb->stream[iLocality]);
+        // Query the state of the GPU stream and return early if we're not done
+        if (completionKind == GpuTaskCompletion::Check)
+        {
+            if (!haveStreamTasksCompleted(nb->stream[iLocality]))
+            {
+                // Early return to skip the steps below that we have to do only
+                // after the NB task completed
+                return false;
+            }
+        }
+        else
+        {
+            gpuStreamSynchronize(nb->stream[iLocality]);
+        }
 
         bool calcEner   = flags & GMX_FORCE_ENERGY;
         bool calcFshift = flags & GMX_FORCE_VIRIAL;
@@ -329,6 +342,34 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t *nb,
 
     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
     nb->plist[iLocality]->haveFreshList = false;
+
+    return true;
+}
+
+/*! \brief
+ * Wait for the asynchronously launched nonbonded tasks and data
+ * transfers to finish.
+ *
+ * Also does timing accounting and reduction of the internal staging buffers.
+ * As this is called at the end of the step, it also resets the pair list and
+ * pruning flags.
+ *
+ * \param[in]  nb     The nonbonded data GPU structure
+ * \param[in]  flags  Force flags
+ * \param[in]  aloc   Atom locality identifier
+ * \param[out] e_lj   Pointer to the LJ energy output to accumulate into
+ * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
+ * \param[out] fshift Pointer to the shift force buffer to accumulate into
+ */
+void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
+                                int              flags,
+                                int              aloc,
+                                real            *e_lj,
+                                real            *e_el,
+                                rvec            *fshift)
+{
+    nbnxn_gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
+                              GpuTaskCompletion::Wait);
+}
 
 #endif
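The reduction inside nbnxn_gpu_try_finish_task() only touches the staged outputs that the step actually computed, keyed off the force flags. A minimal sketch of that flag handling, with illustrative enum values standing in for the GMX_FORCE_* bits:

    // Illustrative stand-ins for the GMX_FORCE_* bits tested above.
    enum ForceFlag
    {
        ForceFlagEnergy = 1 << 0,
        ForceFlagVirial = 1 << 1
    };

    // Accumulate only the staged outputs that this step actually computed.
    static void reduceStagedOutputs(int flags, double stagedELj, double stagedEEl,
                                    double *eLj, double *eEl)
    {
        const bool calcEner   = (flags & ForceFlagEnergy) != 0;
        const bool calcFshift = (flags & ForceFlagVirial) != 0; // shift forces feed the virial
        if (calcEner)
        {
            *eLj += stagedELj;
            *eEl += stagedEEl;
        }
        if (calcFshift)
        {
            // staged shift forces would be accumulated here
        }
    }
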
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8af5b86bdc..c147ebbf97 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -60,6 +60,7 @@
 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/imd/imd.h"
 #include "gromacs/listed-forces/bonded.h"
 #include "gromacs/listed-forces/disre.h"
@@ -115,6 +116,13 @@
 #include "nbnxn_kernels/nbnxn_kernel_cpu.h"
 #include "nbnxn_kernels/nbnxn_kernel_prune.h"
 
+// TODO: this environment variable allows us to verify before release
+// that on less common architectures the total cost of polling is not larger than
+// a blocking wait (so polling does not introduce overhead when the static
+// PME-first ordering would suffice).
+static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
+
 void print_time(FILE                     *out,
                 gmx_walltime_accounting_t walltime_accounting,
                 gmx_int64_t               step,
@@ -850,6 +858,86 @@ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
     pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
 }
 
+/*! \brief
+ *  Polling wait for either of the PME or nonbonded GPU tasks.
+ *
+ * Instead of a static order in waiting for GPU tasks, this function
+ * polls to check which of the two tasks completes first, and does the
+ * associated force buffer reduction overlapped with the other task.
+ * By doing that, unlike with a static scheduling order, it can always overlap
+ * one of the reductions, regardless of the GPU task completion order.
+ *
+ * \param[in]     nbv              Nonbonded verlet structure
+ * \param[in]     pmedata          PME module data
+ * \param[in,out] force            Force array to reduce task outputs into.
+ * \param[in,out] forceWithVirial  Force and virial buffers
+ * \param[in,out] fshift           Shift force output vector results are reduced into
+ * \param[in,out] enerd            Energy data structure results are reduced into
+ * \param[in]     flags            Force flags
+ * \param[in]     wcycle           The wallcycle structure
+ */
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t             *nbv,
+                                        const gmx_pme_t                *pmedata,
+                                        gmx::PaddedArrayRef<gmx::RVec> *force,
+                                        ForceWithVirial                *forceWithVirial,
+                                        rvec                            fshift[],
+                                        gmx_enerdata_t                 *enerd,
+                                        int                             flags,
+                                        gmx_wallcycle_t                 wcycle)
+{
+    bool isPmeGpuDone = false;
+    bool isNbGpuDone  = false;
+
+
+    gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+
+    while (!isPmeGpuDone || !isNbGpuDone)
+    {
+        if (!isPmeGpuDone)
+        {
+            matrix vir_Q;
+            real   Vlr_q;
+
+            GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+            isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
+                                                   vir_Q, &Vlr_q, completionType);
+
+            if (isPmeGpuDone)
+            {
+                pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
+                                       enerd, vir_Q, Vlr_q);
+            }
+        }
+
+        if (!isNbGpuDone)
+        {
+            GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+            isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
+                                                    flags, eatLocal,
+                                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                                    fshift, completionType);
+            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            // To get the call count right, when the task finishes we
+            // issue a start/stop.
+            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
+            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+            if (isNbGpuDone)
+            {
+                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+                wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+                wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+                nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+                                               nbv->nbat, as_rvec_array(force->data()));
+                wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+                wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+            }
+        }
+    }
+}
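The loop above is the heart of the change: whichever task is still pending is polled, and as soon as the other task is known to be done the poll switches to a blocking wait. A self-contained model of the same control flow, with two std::async tasks standing in for the PME and nonbonded streams (illustrative names, not GROMACS API):

    #include <chrono>
    #include <future>

    // Returns the completion state; blocks only when asked to.
    static bool ready(std::future<void> &f, bool blocking)
    {
        if (blocking)
        {
            f.wait();
            return true;
        }
        return f.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
    }

    static void alternatingWait(std::future<void> &pmeTask, std::future<void> &nbTask)
    {
        bool pmeDone = false;
        bool nbDone  = false;
        while (!pmeDone || !nbDone)
        {
            if (!pmeDone)
            {
                // Block only if the other task already finished, mirroring the
                // completionType selection above.
                pmeDone = ready(pmeTask, /* blocking = */ nbDone);
                if (pmeDone)
                {
                    // PME force reduction would run here, overlapped with the NB task.
                }
            }
            if (!nbDone)
            {
                nbDone = ready(nbTask, /* blocking = */ pmeDone);
                if (nbDone)
                {
                    // NB force reduction would run here, overlapped with the PME task.
                }
            }
        }
    }
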
 
 /*! \brief
  * Launch the dynamic rolling pruning GPU task.
  *
@@ -1405,10 +1493,10 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
             if (bUseGPU)
             {
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
-                                       flags, eatNonlocal,
-                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                       fr->fshift);
+                nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
+                                           flags, eatNonlocal,
+                                           enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                           fr->fshift);
                 cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
             }
             else
@@ -1450,17 +1538,25 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         }
     }
 
-    if (useGpuPme)
+    // With both nonbonded and PME offloaded to a GPU on the same rank, we use
+    // an alternating wait/reduction scheme.
+    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+    if (alternateGpuWait)
+    {
+        alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
+    }
+
+    if (!alternateGpuWait && useGpuPme)
     {
         gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
         matrix vir_Q;
         real   Vlr_q;
-        pme_gpu_wait_for_gpu(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+        pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
         pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
     }
 
-    /* Wait for local NB forces */
-    if (bUseGPU)
+    /* Wait for local GPU NB outputs on the non-alternating wait path */
+    if (!alternateGpuWait && bUseGPU)
     {
         /* Measured overhead on CUDA and OpenCL with(out) GPU sharing
          * is between 0.5 and 1.5 Mcycles.
So 2 MCycles is an overestimate, @@ -1470,10 +1566,10 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L); - nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv, - flags, eatLocal, - enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], - fr->fshift); + nbnxn_gpu_wait_finish_task(nbv->gpu_nbv, + flags, eatLocal, + enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR], + fr->fshift); float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes) @@ -1522,8 +1618,9 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation() } - /* Do the nonbonded GPU (or emulation) force buffer reduction. */ - if (bUseOrEmulGPU) + /* Do the nonbonded GPU (or emulation) force buffer reduction + * on the non-alternating path. */ + if (bUseOrEmulGPU && !alternateGpuWait) { wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); -- 2.22.0
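
For reference, the step-level gating added to do_force_cutsVERLET() reduces to a single predicate, which is also what GMX_DISABLE_ALTERNATING_GPU_WAIT toggles off via c_disableAlternatingWait. A condensed, illustrative restatement (parameter names are not GROMACS identifiers):

    // Condensed restatement of the gating condition
    //     !c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr)
    // with illustrative parameter names:
    static bool useAlternatingGpuWait(bool disabledByEnv, bool pmeOnGpu,
                                      bool nonbondedOnGpu, bool haveDomainDecomposition)
    {
        return !disabledByEnv && pmeOnGpu && nonbondedOnGpu && !haveDomainDecomposition;
    }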