From 19ceb46678f66831acca3359f43a6be3241893ad Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Thu, 27 Jun 2019 19:53:08 +0200
Subject: [PATCH] Refactor tracking of GPU short-range work/skipping

This change introduces a set of flags that indicate, for each
interaction locality, whether there are short-range interactions to
compute, and exposes a query for them in the nonbonded module's API.
This allows consistent checks both for whether work has been done and
for whether results need to be reduced.

Refs #2986

Change-Id: I15020d83f73a132d9b8e93d7339529176396089a
---
 src/gromacs/listed_forces/gpubonded_impl.cu |  1 +
 src/gromacs/mdlib/sim_util.cpp              | 23 ++++----
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu        | 16 +++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h   |  6 +++
 src/gromacs/nbnxm/gpu_common.h              | 59 ++++++++++++++++++---
 src/gromacs/nbnxm/gpu_common_utils.h        |  4 +-
 src/gromacs/nbnxm/nbnxm.cpp                 |  7 +++
 src/gromacs/nbnxm/nbnxm.h                   | 17 ++++++
 src/gromacs/nbnxm/nbnxm_gpu.h               | 51 +++++++++++++-----
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp      | 16 +++---
 src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h  |  6 +++
 11 files changed, 160 insertions(+), 46 deletions(-)

diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu
index 065e88e253..d269c6918b 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -266,6 +266,7 @@ GpuBonded::Impl::launchEnergyTransfer()
     // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_,
                "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
+    // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
     float *h_vTot = vTot_.data();
     copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index ad61dee88a..5323c187a9 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -646,7 +646,6 @@ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
  * \param[in,out] enerd         Energy data structure results are reduced into
  * \param[in]     flags         Force flags
  * \param[in]     pmeFlags      PME flags
- * \param[in]     haveOtherWork Tells whether there is other work than non-bonded in the stream(s)
  * \param[in]     wcycle        The wallcycle structure
  */
 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
@@ -657,7 +656,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv
                                         gmx_enerdata_t     *enerd,
                                         int                 flags,
                                         int                 pmeFlags,
-                                        bool                haveOtherWork,
                                         gmx_wallcycle_t     wcycle)
 {
     bool isPmeGpuDone = false;
@@ -681,7 +679,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv
             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                      flags,
                                                      Nbnxm::AtomLocality::Local,
-                                                     haveOtherWork,
                                                      enerd->grpp.ener[egLJSR].data(),
                                                      enerd->grpp.ener[egCOULSR].data(),
                                                      fshift, completionType);
@@ -1035,6 +1032,9 @@ void do_force(FILE *fplog,
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
             nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
                                    &top->excls, step, nrnb);
+
+            nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::Local);
+
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
             wallcycle_stop(wcycle, ewcNS);
@@ -1061,8 +1061,7 @@ void do_force(FILE *fplog,
         if (bNS || !useGpuXBufOps)
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                      Nbnxm::AtomLocality::Local,
-                                      ppForceWorkload->haveGpuBondedWork);
+                                      Nbnxm::AtomLocality::Local);
         }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         // with X buffer ops offloaded to the GPU on all but the search steps
@@ -1105,6 +1104,8 @@ void do_force(FILE *fplog,
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
             nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
                                    &top->excls, step, nrnb);
+
+            nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
         }
@@ -1125,8 +1126,7 @@ void do_force(FILE *fplog,
         {
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                      Nbnxm::AtomLocality::NonLocal,
-                                      ppForceWorkload->haveGpuBondedWork);
+                                      Nbnxm::AtomLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         }
@@ -1155,10 +1155,10 @@ void do_force(FILE *fplog,
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
+                                      flags, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
+                                  flags, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1323,7 +1323,6 @@ void do_force(FILE *fplog,
             wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
             Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                         flags, Nbnxm::AtomLocality::NonLocal,
-                                        ppForceWorkload->haveGpuBondedWork,
                                         enerd->grpp.ener[egLJSR].data(),
                                         enerd->grpp.ener[egCOULSR].data(),
                                         fr->fshift);
@@ -1369,7 +1368,7 @@ void do_force(FILE *fplog,
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
-                                    flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
+                                    flags, pmeFlags, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPme)
@@ -1389,7 +1388,7 @@ void do_force(FILE *fplog,
         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+                                    flags, Nbnxm::AtomLocality::Local,
                                     enerd->grpp.ener[egLJSR].data(),
                                     enerd->grpp.ener[egCOULSR].data(),
                                     fr->fshift);
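To make the do_force() pattern above concrete: the per-locality flags are written once per search/DD step and read at every later launch/copy-back/wait decision, replacing the haveOtherWork argument that used to be threaded through each call. A minimal, self-contained sketch of that life cycle follows; NbnxmGpuStub, GpuBondedStub and their members are illustrative stand-ins, not the real GROMACS types.

    #include <array>
    #include <cstddef>

    // Stand-ins for the real GROMACS types; only the control flow is real.
    enum class InteractionLocality : int { Local = 0, NonLocal = 1, Count = 2 };

    struct GpuPairlistStub { int  nsci = 0; };               // pair-list entry count
    struct GpuBondedStub   { bool haveInteractions = false; };

    struct NbnxmGpuStub
    {
        std::array<GpuPairlistStub, static_cast<std::size_t>(InteractionLocality::Count)> plist;
        std::array<bool, static_cast<std::size_t>(InteractionLocality::Count)>            haveWork{};

        // Mirrors setupGpuShortRangeWork(): called at search/DD steps, when
        // the work distribution can change.
        void setupShortRangeWork(const GpuBondedStub *bonded, InteractionLocality iloc)
        {
            const auto i = static_cast<std::size_t>(iloc);
            haveWork[i]  = (plist[i].nsci != 0)
                           || (bonded != nullptr && bonded->haveInteractions);
        }

        // Mirrors haveGpuShortRangeWork(): queried wherever a skip decision
        // used to depend on the threaded-through haveOtherWork flag.
        bool haveShortRangeWork(InteractionLocality iloc) const
        {
            return haveWork[static_cast<std::size_t>(iloc)];
        }
    };

Because bonded work is not split by locality, any offloaded bonded task marks both localities as busy; that is the behavior the nonlocal copy-back and wait paths in the following files rely on.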
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index a0117a5cb1..e8e6c5b5cc 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -309,9 +309,10 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t *nb,
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
                         const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     GMX_ASSERT(atomLocality == AtomLocality::Local ||
                atomLocality == AtomLocality::NonLocal,
                "Only local and non-local xq transfers are supported");
@@ -335,7 +336,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb,
        we always call the local local x+q copy (and the rest of the local
        work in nbnxn_gpu_launch_kernel().
      */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -418,7 +419,7 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb,
        clearing. All these operations, except for the local interaction kernel, are needed
        for the non-local interactions. The skip of the local kernel call is taken care of later
        in this function. */
-    if (canSkipWork(*nb, iloc))
+    if (canSkipNonbondedWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -639,9 +640,10 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
                         nbnxn_atomdata_t *nbatom,
                         const int         flags,
-                        const AtomLocality atomLocality,
-                        const bool         haveOtherWork)
+                        const AtomLocality atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     cudaError_t stat;
     int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
@@ -658,7 +660,7 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
     /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         return;
     }
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index 48c8776c79..ff8705df59 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -260,6 +260,12 @@ struct gmx_nbnxn_cuda_t
                                          initialization in local stream that is required also
                                          by nonlocal stream ) */
 
+    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
+    //  to be executed in the current domain. As long as bonded work is not split up into
+    //  local/nonlocal, if there is bonded GPU work, both flags will be true.
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
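The haveWork member added above is an enum-indexed array, so call sites use the locality enum directly instead of casting to int. A hand-rolled stand-in showing the access pattern (the real gmx::EnumerationArray, in the utility headers, offers more, e.g. iteration; this is only a sketch):

    #include <array>
    #include <cstddef>

    enum class InteractionLocality : int { Local, NonLocal, Count };

    // Minimal stand-in for gmx::EnumerationArray<InteractionLocality, bool>.
    template<typename Enum, typename T>
    struct EnumerationArrayStub
    {
        std::array<T, static_cast<std::size_t>(Enum::Count)> storage{};

        T       &operator[](Enum e)       { return storage[static_cast<std::size_t>(e)]; }
        const T &operator[](Enum e) const { return storage[static_cast<std::size_t>(e)]; }
    };

    int main()
    {
        EnumerationArrayStub<InteractionLocality, bool> haveWork;
        haveWork[InteractionLocality::Local]    = true;  // e.g. nonempty local list
        haveWork[InteractionLocality::NonLocal] = true;  // bonded work flips this too
        return haveWork[InteractionLocality::NonLocal] ? 0 : 1;
    }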
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index 1624c56c8c..4f4edddbeb 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -56,6 +56,7 @@
 #endif
 
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/nbnxm/nbnxm.h"
@@ -67,6 +68,11 @@
 #include "gpu_common_utils.h"
 #include "nbnxm_gpu.h"
 
+namespace gmx
+{
+class GpuBonded;
+}
+
 namespace Nbnxm
 {
 
@@ -117,6 +123,49 @@ gpuAtomToInteractionLocality(const AtomLocality atomLocality)
     }
 }
 
+
+void
+setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  *nb,
+                       const gmx::GpuBonded             *gpuBonded,
+                       const Nbnxm::InteractionLocality  iLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    // There is short-range work if the pair list for the provided
+    // interaction locality contains entries or if there is any
+    // bonded work (as this is not split into local/nonlocal).
+    nb->haveWork[iLocality] =
+        ((nb->plist[iLocality]->nsci != 0) ||
+         (gpuBonded != nullptr && gpuBonded->haveInteractions()));
+}
+
+/*! \brief Returns true if there is GPU short-range work for the given interaction locality.
+ *
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal;
+ * therefore, if there are GPU-offloaded bonded interactions, this function returns
+ * true for all interaction localities.
+ *
+ * \param[inout] nb        Pointer to the nonbonded GPU data structure
+ * \param[in]    iLocality Interaction locality identifier
+ */
+static bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t            &nb,
+                      const Nbnxm::InteractionLocality  iLocality)
+{
+    return nb.haveWork[iLocality];
+}
+
+bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     *nb,
+                      const Nbnxm::AtomLocality  aLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
+}
+
+
+
 /*! \brief Calculate atom range and return start index and length.
  *
  * \param[in] atomData Atom descriptor data structure
@@ -319,7 +368,6 @@ gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 bool gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
                          const int         flags,
                          const AtomLocality aloc,
-                         const bool        haveOtherWork,
                          real             *e_lj,
                          real             *e_el,
                          rvec             *fshift,
@@ -331,8 +379,9 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 
     // We skip when during the non-local phase there was actually no work to do.
-    // This is consistent with nbnxn_gpu_launch_kernel.
-    if (haveOtherWork || !canSkipWork(*nb, iLocality))
+    // This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
+    // bonded GPU work.
+    if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
     {
         // Query the state of the GPU stream and return early if we're not done
         if (completionKind == GpuTaskCompletion::Check)
@@ -378,7 +427,6 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 * \param[in]  nb     The nonbonded data GPU structure
 * \param[in]  flags  Force flags
 * \param[in]  aloc   Atom locality identifier
- * \param[in]  haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
 * \param[out] e_lj   Pointer to the LJ energy output to accumulate into
 * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
 * \param[out] fshift Pointer to the shift force buffer to accumulate into
@@ -387,12 +435,11 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
                           int              flags,
                           AtomLocality     aloc,
-                          bool             haveOtherWork,
                           real            *e_lj,
                           real            *e_el,
                           rvec            *fshift)
 {
-    gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
+    gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
                         GpuTaskCompletion::Wait);
 }
 
diff --git a/src/gromacs/nbnxm/gpu_common_utils.h b/src/gromacs/nbnxm/gpu_common_utils.h
index 02febb47ab..77d3b08e96 100644
--- a/src/gromacs/nbnxm/gpu_common_utils.h
+++ b/src/gromacs/nbnxm/gpu_common_utils.h
@@ -64,8 +64,8 @@ namespace Nbnxm
  * local part of the force array also depends on the non-local kernel.
 * The skip of the local kernel is taken care of separately.
 */
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb,
-                               InteractionLocality    iloc)
+static inline bool canSkipNonbondedWork(const gmx_nbnxn_gpu_t &nb,
+                                        InteractionLocality    iloc)
 {
     assert(nb.plist[iloc]);
     return (iloc == InteractionLocality::NonLocal &&
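Two predicates now coexist on purpose: the renamed canSkipNonbondedWork() above answers the narrower, nonbonded-only question asked inside kernel launch, while haveGpuShortRangeWork() additionally folds in bonded work. Roughly, in terms of the stand-ins from the first sketch (the real canSkipNonbondedWork() also accounts for fresh-list pruning bookkeeping, elided here):

    // Nonbonded-only: an empty nonlocal pair list means the NB kernel proper
    // has nothing to do (pruning bookkeeping aside).
    bool canSkipNonbonded(const NbnxmGpuStub &nb, InteractionLocality iloc)
    {
        const auto i = static_cast<std::size_t>(iloc);
        return (iloc == InteractionLocality::NonLocal) && (nb.plist[i].nsci == 0);
    }

    // Short-range: the nonlocal stages (xq H2D, copy-back, wait) may be skipped
    // only when neither nonbonded nor bonded work was scheduled there.
    bool canSkipShortRange(const NbnxmGpuStub &nb, InteractionLocality iloc)
    {
        return (iloc == InteractionLocality::NonLocal) && !nb.haveShortRangeWork(iloc);
    }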
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 54c9ff9864..a81a6b400e 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -164,6 +164,13 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality
                                              rvec                       *f,
                                              gmx_wallcycle              *wcycle)
 {
+    /* Skip the reduction if there was no short-range GPU work to do
+     * (either NB or both NB and bonded work).
+     */
+    if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+    {
+        return;
+    }
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 72eb98ae59..7e3869b451 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -317,6 +317,23 @@ struct nonbonded_verlet_t
 
         void changePairlistRadii(real rlistOuter, real rlistInner);
 
+        //! Set up internal flags that indicate what type of short-range work there is.
+        void setupGpuShortRangeWork(const gmx::GpuBonded             *gpuBonded,
+                                    const Nbnxm::InteractionLocality  iLocality)
+        {
+            if (useGpu() && !emulateGpu())
+            {
+                Nbnxm::setupGpuShortRangeWork(gpu_nbv, gpuBonded, iLocality);
+            }
+        }
+
+        //! Returns true if there is GPU short-range work for the given atom locality.
+        bool haveGpuShortRangeWork(const Nbnxm::AtomLocality aLocality)
+        {
+            return ((useGpu() && !emulateGpu()) &&
+                    Nbnxm::haveGpuShortRangeWork(gpu_nbv, aLocality));
+        }
+
         // TODO: Make all data members private
     public:
         //! All data related to the pair lists
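The wrapper methods above centralize the "real GPU, not emulation" guard, so do_force() can call them unconditionally; the F-buffer reduction then keys off the same query. A compact stand-alone sketch of that early-out (stand-in class and hypothetical names; the real methods live on nonbonded_verlet_t as shown above):

    #include <cstdio>

    struct NonbondedVerletStub
    {
        bool useGpuNb       = true;   // real GPU path, not emulation
        bool simplePairlist = false;  // GPU layouts are not "simple"
        bool haveWorkLocal  = false;  // recorded at the last search step

        bool pairlistIsSimple() const { return simplePairlist; }
        bool haveGpuShortRangeWork() const { return useGpuNb && haveWorkLocal; }
    };

    // Mirrors the early-out added to atomdata_add_nbat_f_to_f() above.
    void addNbatForcesToF(const NonbondedVerletStub &nbv)
    {
        if (!nbv.pairlistIsSimple() && !nbv.haveGpuShortRangeWork())
        {
            return;  // nothing was computed on the GPU, nothing to reduce
        }
        std::puts("reducing nbat forces into the rvec force buffer");
    }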
* @@ -171,7 +171,6 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[in] nb The nonbonded data GPU structure * \param[in] flags Force flags * \param[in] aloc Atom locality identifier - * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s) * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into * \param[out] fshift Pointer to the shift force buffer to accumulate into @@ -182,7 +181,6 @@ GPU_FUNC_QUALIFIER bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, int gmx_unused flags, AtomLocality gmx_unused aloc, - bool gmx_unused haveOtherWork, real gmx_unused *e_lj, real gmx_unused *e_el, rvec gmx_unused *fshift, @@ -198,7 +196,6 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, * \param[in] nb The nonbonded data GPU structure * \param[in] flags Force flags * \param[in] aloc Atom locality identifier - * \param[in] haveOtherWork Tells whether there is other work than non-bonded work in the nbnxn stream(s) * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into * \param[out] fshift Pointer to the shift force buffer to accumulate into @@ -207,7 +204,6 @@ GPU_FUNC_QUALIFIER void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, int gmx_unused flags, AtomLocality gmx_unused aloc, - bool gmx_unused haveOtherWork, real gmx_unused *e_lj, real gmx_unused *e_el, rvec gmx_unused *fshift) GPU_FUNC_TERM @@ -242,6 +238,37 @@ CUDA_FUNC_QUALIFIER void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused *nb, const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM +/*! \brief Set up internal flags that indicate what type of short-range work there is. + * + * As nonbondeds and bondeds share input/output buffers and GPU queues, + * both are considered when checking for work in the current domain. + * + * This function is expected to be called every time the work-distribution + * can change (i.e. at search/domain decomposition steps). + * + * \param[inout] nb Pointer to the nonbonded GPU data structure + * \param[in] gpuBonded Pointer to the GPU bonded data structure + * \param[in] iLocality Interaction locality identifier + */ +GPU_FUNC_QUALIFIER +void setupGpuShortRangeWork(gmx_nbnxn_gpu_t gmx_unused *nb, + const gmx::GpuBonded gmx_unused *gpuBonded, + const Nbnxm::InteractionLocality gmx_unused iLocality) GPU_FUNC_TERM + +/*! \brief Returns true if there is GPU short-range work for the given atom locality. + * + * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal, + * and therefore if there are GPU offloaded bonded interactions, this function will return + * true for both local and nonlocal atom range. + * + * \param[inout] nb Pointer to the nonbonded GPU data structure + * \param[in] aLocality Atom locality identifier + */ +GPU_FUNC_QUALIFIER +bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb, + const Nbnxm::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false) + + } // namespace Nbnxm #endif diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index d3b860e6e6..659c484287 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -365,9 +365,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event) /*! 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index d3b860e6e6..659c484287 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -365,9 +365,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
                         const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
     /* local/nonlocal offset and length used for xq and f */
@@ -389,7 +390,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb,
        we always call the local local x+q copy (and the rest of the local
        work in nbnxn_gpu_launch_kernel().
      */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
         plist->haveFreshList = false;
 
@@ -491,7 +492,7 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb,
        clearing. All these operations, except for the local interaction kernel, are needed
        for the non-local interactions. The skip of the local kernel call is taken care of later
        in this function. */
-    if (canSkipWork(*nb, iloc))
+    if (canSkipNonbondedWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -733,9 +734,10 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t         *nb,
                         struct nbnxn_atomdata_t *nbatom,
                         const int                flags,
-                        const AtomLocality       aloc,
-                        const bool               haveOtherWork)
+                        const AtomLocality       aloc)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     cl_int gmx_unused cl_error;
     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
@@ -752,7 +754,7 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
 
     /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         /* TODO An alternative way to signal that non-local work is
            complete is to use a clEnqueueMarker+clEnqueueBarrier
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 57d945bdcb..55d93e74f5 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -298,6 +298,12 @@ struct gmx_nbnxn_ocl_t
                                            non-local force calculations are done
                                            (e.g. f buffer 0-ing, local x/q H2D) */
 
+    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled
+    //  to be executed in the current domain. As long as bonded work is not split up into
+    //  local/nonlocal, if there is bonded GPU work, both flags will be true.
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
     cl_bool                           bDoTime;  /**< True if event-based timing is enabled. */
     cl_timers_t                      *timers;   /**< OpenCL event-based timers. */
     struct gmx_wallclock_gpu_nbnxn_t *timings;  /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */
-- 
2.22.0
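Tying the pieces together, the semantics the patch is after can be exercised end to end with the stand-ins from the first sketch: even an empty nonlocal pair list must not skip the nonlocal copy-back/wait when bonded work shares the stream (hypothetical values, illustration only):

    int main()
    {
        NbnxmGpuStub  nb;
        GpuBondedStub bonded;
        bonded.haveInteractions = true;  // bonded interactions are offloaded

        // Search/DD step: local list has entries, nonlocal list is empty.
        nb.plist[0].nsci = 128;
        nb.plist[1].nsci = 0;
        nb.setupShortRangeWork(&bonded, InteractionLocality::Local);
        nb.setupShortRangeWork(&bonded, InteractionLocality::NonLocal);

        // MD step: the nonlocal stages must still run, because bonded results
        // share buffers and streams with the nonbonded ones.
        const bool skip = !nb.haveShortRangeWork(InteractionLocality::NonLocal);
        return skip ? 1 : 0;  // expect 0: bonded work forbids the skip
    }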