Refactor tracking of GPU short-range work/skipping
authorSzilárd Páll <pall.szilard@gmail.com>
Thu, 27 Jun 2019 17:53:08 +0000 (19:53 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Mon, 1 Jul 2019 21:55:31 +0000 (23:55 +0200)
This change introduces a set of flags that indicate, for each
interaction locality, whether short-range interactions are computed,
and exposes a query in the nonbonded module's API.
This allows consistent checks both for whether work has been done
and for whether results need to be reduced.
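
The intended call pattern, abbreviated from the do_force() changes in
this patch (surrounding control flow omitted):

    // At search/domain-decomposition steps, record what short-range
    // work exists for each interaction locality; gpuBonded may be
    // nullptr when no bonded work is offloaded.
    nbv->setupGpuShortRangeWork(fr->gpuBonded,
                                Nbnxm::InteractionLocality::Local);

    // Consumers query the flag instead of threading a haveOtherWork
    // argument through every call:
    if (nbv->haveGpuShortRangeWork(Nbnxm::AtomLocality::Local))
    {
        // wait for / reduce the GPU short-range results
    }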

Refs #2986

Change-Id: I15020d83f73a132d9b8e93d7339529176396089a

src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/gpu_common_utils.h
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h

index 065e88e253b049c5b08da31af00a82da6d2fe02d..d269c6918bcf6757f204f2dd58b2e1b049e6aa84 100644 (file)
@@ -266,6 +266,7 @@ GpuBonded::Impl::launchEnergyTransfer()
     // TODO should wrap with ewcLAUNCH_GPU
     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
 
+    // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
     float *h_vTot   = vTot_.data();
     copyFromDeviceBuffer(h_vTot, &d_vTot_,
                          0, F_NRE,
index ad61dee88acadbafcb2aa3f878076ad09b28cd8a..5323c187a92b75ba72ad1191dcafd4b6264dbbba 100644 (file)
@@ -646,7 +646,6 @@ static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
  * \param[in,out] enerd            Energy data structure results are reduced into
  * \param[in]     flags            Force flags
  * \param[in]     pmeFlags         PME flags
- * \param[in]     haveOtherWork    Tells whether there is other work than non-bonded in the stream(s)
  * \param[in]     wcycle           The wallcycle structure
  */
 static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv,
@@ -657,7 +656,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
                                         gmx_enerdata_t                      *enerd,
                                         int                                  flags,
                                         int                                  pmeFlags,
-                                        bool                                 haveOtherWork,
                                         gmx_wallcycle_t                      wcycle)
 {
     bool isPmeGpuDone = false;
@@ -681,7 +679,6 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                      flags,
                                                      Nbnxm::AtomLocality::Local,
-                                                     haveOtherWork,
                                                      enerd->grpp.ener[egLJSR].data(),
                                                      enerd->grpp.ener[egCOULSR].data(),
                                                      fshift, completionType);
@@ -1035,6 +1032,9 @@ void do_force(FILE                                     *fplog,
         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
         nbv->constructPairlist(Nbnxm::InteractionLocality::Local,
                                &top->excls, step, nrnb);
+
+        nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::Local);
+
         wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
         wallcycle_stop(wcycle, ewcNS);
 
@@ -1061,8 +1061,7 @@ void do_force(FILE                                     *fplog,
         if (bNS || !useGpuXBufOps)
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                      Nbnxm::AtomLocality::Local,
-                                      ppForceWorkload->haveGpuBondedWork);
+                                      Nbnxm::AtomLocality::Local);
         }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         // with X buffer ops offloaded to the GPU on all but the search steps
@@ -1105,6 +1104,8 @@ void do_force(FILE                                     *fplog,
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
             nbv->constructPairlist(Nbnxm::InteractionLocality::NonLocal,
                                    &top->excls, step, nrnb);
+
+            nbv->setupGpuShortRangeWork(fr->gpuBonded, Nbnxm::InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
         }
@@ -1125,8 +1126,7 @@ void do_force(FILE                                     *fplog,
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
-                                          Nbnxm::AtomLocality::NonLocal,
-                                          ppForceWorkload->haveGpuBondedWork);
+                                          Nbnxm::AtomLocality::NonLocal);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             }
 
@@ -1155,10 +1155,10 @@ void do_force(FILE                                     *fplog,
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      flags, Nbnxm::AtomLocality::NonLocal, ppForceWorkload->haveGpuBondedWork);
+                                      flags, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork);
+                                  flags, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1323,7 +1323,6 @@ void do_force(FILE                                     *fplog,
                 wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                 Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
                                             flags, Nbnxm::AtomLocality::NonLocal,
-                                            ppForceWorkload->haveGpuBondedWork,
                                             enerd->grpp.ener[egLJSR].data(),
                                             enerd->grpp.ener[egCOULSR].data(),
                                             fr->fshift);
@@ -1369,7 +1368,7 @@ void do_force(FILE                                     *fplog,
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
-                                    flags, pmeFlags, ppForceWorkload->haveGpuBondedWork, wcycle);
+                                    flags, pmeFlags, wcycle);
     }
 
     if (!alternateGpuWait && useGpuPme)
@@ -1389,7 +1388,7 @@ void do_force(FILE                                     *fplog,
 
         wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
         Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                    flags, Nbnxm::AtomLocality::Local, ppForceWorkload->haveGpuBondedWork,
+                                    flags, Nbnxm::AtomLocality::Local,
                                     enerd->grpp.ener[egLJSR].data(),
                                     enerd->grpp.ener[egCOULSR].data(),
                                     fr->fshift);
index a0117a5cb1a23725798898d7b66e4ef7e1dafb8b..e8e6c5b5cc1748701beff7a8145c402db2e80ea5 100644 (file)
@@ -309,9 +309,10 @@ void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_cuda_t   *nb,
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
                         const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
                "Only local and non-local xq transfers are supported");
 
@@ -335,7 +336,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t       *nb,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
      */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -418,7 +419,7 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t          *nb,
        clearing. All these operations, except for the local interaction kernel,
        are needed for the non-local interactions. The skip of the local kernel
        call is taken care of later in this function. */
-    if (canSkipWork(*nb, iloc))
+    if (canSkipNonbondedWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -639,9 +640,10 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
                         nbnxn_atomdata_t       *nbatom,
                         const int               flags,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     cudaError_t stat;
     int         adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
@@ -658,7 +660,7 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
     bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
 
     /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         return;
     }
index 48c8776c79f68bebf8b8769f2e10c07494110af7..ff8705df599c178d2e5361f9763fdf08f54845c0 100644 (file)
@@ -260,6 +260,12 @@ struct gmx_nbnxn_cuda_t
                                                    initialization in local stream that is required also
                                                    by nonlocal stream ) */
 
+    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded,
+    //! scheduled to be executed in the current domain. As long as bonded work is not
+    //! split up into local/nonlocal, if there is bonded GPU work, both flags will be true.
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
     /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
      * concurrent streams, so we won't time if both l/nl work is done on GPUs.
      * Timer init/uninit is still done even with timing off so only the condition
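
A minimal standalone sketch (not GROMACS code) of the per-locality flag
scheme the new haveWork member implements; std::array and a local enum
stand in for gmx::EnumerationArray and Nbnxm::InteractionLocality:

    #include <array>

    enum class InteractionLocality : int { Local = 0, NonLocal = 1, Count = 2 };

    struct GpuWorkFlags
    {
        // One flag per interaction locality, mirroring nb->haveWork.
        std::array<bool, static_cast<int>(InteractionLocality::Count)> flags = {};

        // Set at search/domain-decomposition steps: there is work if the
        // pair list has entries or if any bonded work is offloaded.
        void setup(InteractionLocality iloc, int numPairlistEntries, bool haveGpuBondedWork)
        {
            flags[static_cast<int>(iloc)] = (numPairlistEntries != 0) || haveGpuBondedWork;
        }

        // Queried before launching dependent tasks or reducing results.
        bool haveWork(InteractionLocality iloc) const
        {
            return flags[static_cast<int>(iloc)];
        }
    };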
index 1624c56c8c5bb91aba3248330e2e78857ab82148..4f4edddbeb447e4b12c41b6a4e8c2285e6311c6a 100644 (file)
@@ -56,6 +56,7 @@
 #endif
 
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gpu_common_utils.h"
 #include "nbnxm_gpu.h"
 
+namespace gmx
+{
+class GpuBonded;
+}
+
 namespace Nbnxm
 {
 
@@ -117,6 +123,49 @@ gpuAtomToInteractionLocality(const AtomLocality atomLocality)
     }
 }
 
+
+void
+setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  *nb,
+                       const gmx::GpuBonded             *gpuBonded,
+                       const Nbnxm::InteractionLocality  iLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    // There is short-range work if the pair list for the provided
+    // interaction locality contains entries or if there is any
+    // bonded work (as this is not split into local/nonlocal).
+    nb->haveWork[iLocality] =
+        ((nb->plist[iLocality]->nsci != 0) ||
+         (gpuBonded != nullptr && gpuBonded->haveInteractions()));
+}
+
+/*! \brief Returns true if there is GPU short-range work for the given interaction locality.
+ *
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into
+ * local/nonlocal; therefore, if there are GPU-offloaded bonded interactions,
+ * this function will return true for all interaction localities.
+ *
+ * \param[in]     nb        Reference to the nonbonded GPU data structure
+ * \param[in]     iLocality Interaction locality identifier
+ */
+static bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t            &nb,
+                      const Nbnxm::InteractionLocality  iLocality)
+{
+    return nb.haveWork[iLocality];
+}
+
+bool
+haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     *nb,
+                      const Nbnxm::AtomLocality  aLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
+}
+
 /*! \brief Calculate atom range and return start index and length.
  *
  * \param[in] atomData Atom descriptor data structure
@@ -319,7 +368,6 @@ gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
                          const int           flags,
                          const AtomLocality  aloc,
-                         const bool          haveOtherWork,
                          real               *e_lj,
                          real               *e_el,
                          rvec               *fshift,
@@ -331,8 +379,9 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 
     //  We skip when, during the non-local phase, there was actually no work to do.
-    //  This is consistent with nbnxn_gpu_launch_kernel.
-    if (haveOtherWork || !canSkipWork(*nb, iLocality))
+    //  This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
+    //  bonded GPU work.
+    if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
     {
         // Query the state of the GPU stream and return early if we're not done
         if (completionKind == GpuTaskCompletion::Check)
@@ -378,7 +427,6 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
  * \param[in] nb The nonbonded data GPU structure
  * \param[in] flags Force flags
  * \param[in] aloc Atom locality identifier
- * \param[in] haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
  * \param[out] fshift Pointer to the shift force buffer to accumulate into
@@ -387,12 +435,11 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
 void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
                           int              flags,
                           AtomLocality     aloc,
-                          bool             haveOtherWork,
                           real            *e_lj,
                           real            *e_el,
                           rvec            *fshift)
 {
-    gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
+    gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
                         GpuTaskCompletion::Wait);
 }
 
index 02febb47ab082b0570f4944971aecc9dbf87d1df..77d3b08e9649e3cd0dea7a62d12c3eb3519799ba 100644 (file)
@@ -64,8 +64,8 @@ namespace Nbnxm
  * local part of the force array also depends on the non-local kernel.
  * The skip of the local kernel is taken care of separately.
  */
-static inline bool canSkipWork(const gmx_nbnxn_gpu_t &nb,
-                               InteractionLocality    iloc)
+static inline bool canSkipNonbondedWork(const gmx_nbnxn_gpu_t &nb,
+                                        InteractionLocality    iloc)
 {
     assert(nb.plist[iloc]);
     return (iloc == InteractionLocality::NonLocal &&
index 54c9ff98640a9cd3262911d8d30ef9d454e39774..a81a6b400eab0d2cc90b078086fa6dc86b82b1e1 100644 (file)
@@ -164,6 +164,13 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality  locality
                                              rvec                      *f,
                                              gmx_wallcycle             *wcycle)
 {
+    /* Skip the reduction if there was no short-range GPU work to do
+     * (neither nonbonded nor bonded work). */
+    if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+    {
+        return;
+    }
+
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
 
index 72eb98ae59866efbbc456818c50c18a7da2cb878..7e3869b451e3e9a7bfe69ad03dba3ca8491e2b80 100644 (file)
@@ -317,6 +317,23 @@ struct nonbonded_verlet_t
         void changePairlistRadii(real rlistOuter,
                                  real rlistInner);
 
+        //! Set up internal flags that indicate what type of short-range work there is.
+        void setupGpuShortRangeWork(const gmx::GpuBonded             *gpuBonded,
+                                    const Nbnxm::InteractionLocality  iLocality)
+        {
+            if (useGpu() && !emulateGpu())
+            {
+                Nbnxm::setupGpuShortRangeWork(gpu_nbv, gpuBonded, iLocality);
+            }
+        }
+
+        //! Returns true if there is GPU short-range work for the given atom locality.
+        bool haveGpuShortRangeWork(const Nbnxm::AtomLocality aLocality)
+        {
+            return ((useGpu() && !emulateGpu()) &&
+                    Nbnxm::haveGpuShortRangeWork(gpu_nbv, aLocality));
+        }
+
         // TODO: Make all data members private
     public:
         //! All data related to the pair lists
index 7e88129f4ee89728bebc38427d0d5617c2f0e863..a1e480045680829d8fd67106d4ce8f8dbdef1324 100644 (file)
 struct nbnxn_atomdata_t;
 enum class GpuTaskCompletion;
 
+namespace gmx
+{
+class GpuBonded;
+}
+
 namespace Nbnxm
 {
 
@@ -69,13 +74,11 @@ class Grid;
  * \param [in]    nb        GPU nonbonded data.
  * \param [in]    nbdata    Host-side atom data structure.
  * \param [in]    aloc      Atom locality flag.
- * \param [in]    haveOtherWork  True if there are other tasks that require the nbnxn coordinate input.
  */
 GPU_FUNC_QUALIFIER
 void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
                         const struct nbnxn_atomdata_t gmx_unused *nbdata,
-                        AtomLocality gmx_unused                   aloc,
-                        bool gmx_unused                           haveOtherWork) GPU_FUNC_TERM
+                        AtomLocality gmx_unused                   aloc) GPU_FUNC_TERM
 
 /*! \brief
  * Launch asynchronously the nonbonded force calculations.
@@ -133,17 +136,14 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused     *nb,
                                  int gmx_unused                  numParts) GPU_FUNC_TERM
 
 /*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
+ * Launch asynchronously the download of short-range forces from the GPU
  * (and energies/shift forces if required).
- * When haveOtherWork=true, the copy-back is done even when there was
- * no non-bonded work.
  */
 GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
                         nbnxn_atomdata_t gmx_unused *nbatom,
                         int              gmx_unused  flags,
-                        AtomLocality     gmx_unused  aloc,
-                        bool             gmx_unused  haveOtherWork) GPU_FUNC_TERM
+                        AtomLocality     gmx_unused  aloc) GPU_FUNC_TERM
 
 /*! \brief Attempts to complete nonbonded GPU task.
  *
@@ -171,7 +171,6 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
  * \param[in]  nb     The nonbonded data GPU structure
  * \param[in]  flags  Force flags
  * \param[in]  aloc   Atom locality identifier
- * \param[in]  haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
  * \param[out] e_lj   Pointer to the LJ energy output to accumulate into
  * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
  * \param[out] fshift Pointer to the shift force buffer to accumulate into
@@ -182,7 +181,6 @@ GPU_FUNC_QUALIFIER
 bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
                          int             gmx_unused   flags,
                          AtomLocality    gmx_unused   aloc,
-                         bool            gmx_unused   haveOtherWork,
                          real            gmx_unused  *e_lj,
                          real            gmx_unused  *e_el,
                          rvec            gmx_unused  *fshift,
@@ -198,7 +196,6 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
  * \param[in] nb The nonbonded data GPU structure
  * \param[in] flags Force flags
  * \param[in] aloc Atom locality identifier
- * \param[in]  haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
  * \param[out] fshift Pointer to the shift force buffer to accumulate into
@@ -207,7 +204,6 @@ GPU_FUNC_QUALIFIER
 void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
                           int             gmx_unused  flags,
                           AtomLocality    gmx_unused  aloc,
-                          bool            gmx_unused  haveOtherWork,
                           real            gmx_unused *e_lj,
                           real            gmx_unused *e_el,
                           rvec            gmx_unused *fshift) GPU_FUNC_TERM
@@ -242,6 +238,37 @@ CUDA_FUNC_QUALIFIER
 void nbnxnInsertNonlocalGpuDependency(const gmx_nbnxn_gpu_t gmx_unused    *nb,
                                       const InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM
 
+/*! \brief Set up internal flags that indicate what type of short-range work there is.
+ *
+ * As nonbonded and bonded tasks share input/output buffers and GPU queues,
+ * both are considered when checking for work in the current domain.
+ *
+ * This function is expected to be called every time the work-distribution
+ * can change (i.e. at search/domain decomposition steps).
+ *
+ * \param[in,out] nb         Pointer to the nonbonded GPU data structure
+ * \param[in]     gpuBonded  Pointer to the GPU bonded data structure
+ * \param[in]     iLocality  Interaction locality identifier
+ */
+GPU_FUNC_QUALIFIER
+void setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  gmx_unused *nb,
+                            const gmx::GpuBonded             gmx_unused *gpuBonded,
+                            const Nbnxm::InteractionLocality gmx_unused  iLocality) GPU_FUNC_TERM
+
+/*! \brief Returns true if there is GPU short-range work for the given atom locality.
+ *
+ * Note that, unlike nonbonded tasks, bonded tasks are not split into
+ * local/nonlocal; therefore, if there are GPU-offloaded bonded interactions,
+ * this function will return true for both local and nonlocal atom ranges.
+ *
+ * \param[in]     nb        Pointer to the nonbonded GPU data structure
+ * \param[in]     aLocality Atom locality identifier
+ */
+GPU_FUNC_QUALIFIER
+bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     gmx_unused *nb,
+                           const Nbnxm::AtomLocality gmx_unused  aLocality) GPU_FUNC_TERM_WITH_RETURN(false)
+
 } // namespace Nbnxm
 
 #endif
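
Taken together, the two declarations above replace the haveOtherWork
plumbing. A sketch of the skip decision as it now appears at the
launch/copy-back call sites (condition shape taken from the
nbnxm_cuda.cu and nbnxm_ocl.cpp hunks; surrounding code abbreviated):

    // Only the non-local part may be skipped: the local stream also
    // carries shared setup work, so it is always issued. With GPU
    // bonded work present, haveGpuShortRangeWork() returns true for
    // both localities, so nothing is skipped.
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;
        return;
    }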
index d3b860e6e6f30389d059de0fc892a565fb3d4456..659c4842874f53f48bad6b0e702c0f13c40f18fc 100644 (file)
@@ -365,9 +365,10 @@ static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
                         const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality,
-                        const bool              haveOtherWork)
+                        const AtomLocality      atomLocality)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
     /* local/nonlocal offset and length used for xq and f */
@@ -389,7 +390,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
      */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -491,7 +492,7 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
        clearing. All these operations, except for the local interaction kernel,
        are needed for the non-local interactions. The skip of the local kernel
        call is taken care of later in this function. */
-    if (canSkipWork(*nb, iloc))
+    if (canSkipNonbondedWork(*nb, iloc))
     {
         plist->haveFreshList = false;
 
@@ -733,9 +734,10 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
                         struct nbnxn_atomdata_t       *nbatom,
                         const int                      flags,
-                        const AtomLocality             aloc,
-                        const bool                     haveOtherWork)
+                        const AtomLocality             aloc)
 {
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
     cl_int gmx_unused cl_error;
     int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
@@ -752,7 +754,7 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t               *nb,
 
 
     /* don't launch non-local copy-back if there was no non-local work to do */
-    if (!haveOtherWork && canSkipWork(*nb, iloc))
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
     {
         /* TODO An alternative way to signal that non-local work is
            complete is to use a clEnqueueMarker+clEnqueueBarrier
index 57d945bdcb870531bb69add3e861eafe194c8814..55d93e74f57e15cc0e8a2e1a88d8d85d4cc724f4 100644 (file)
@@ -298,6 +298,12 @@ struct gmx_nbnxn_ocl_t
                                                    non-local force calculations are done
                                                    (e.g. f buffer 0-ing, local x/q H2D) */
 
+    //! True if there has been local/nonlocal GPU work, either bonded or nonbonded,
+    //! scheduled to be executed in the current domain. As long as bonded work is not
+    //! split up into local/nonlocal, if there is bonded GPU work, both flags will be true.
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork;
+
     cl_bool                           bDoTime;  /**< True if event-based timing is enabled.                     */
     cl_timers_t                      *timers;   /**< OpenCL event-based timers.                                 */
     struct gmx_wallclock_gpu_nbnxn_t *timings;  /**< Timing data. TODO: deprecate this and query timers for accumulated data instead */