+/*! \brief
+ * Polling wait for either of the PME or nonbonded GPU tasks.
+ *
+ * Instead of a static order in waiting for GPU tasks, this function
+ * polls to check which of the two tasks completes first, and performs the
+ * associated force buffer reduction overlapped with the other task.
+ * By doing that, unlike static scheduling order, it can always overlap
+ * one of the reductions, regardless of the GPU task completion order.
+ *
+ * \param[in] nbv Nonbonded verlet structure
+ * \param[in] pmedata PME module data
+ * \param[in,out] force Force array to reduce task outputs into.
+ * \param[in,out] forceWithVirial Force and virial buffers
+ * \param[in,out] fshift Shift force output vector results are reduced into
+ * \param[in,out] enerd Energy data structure results are reduced into
+ * \param[in] flags Force flags
+ * \param[in] wcycle The wallcycle structure
+ */
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
+ const gmx_pme_t *pmedata,
+ gmx::PaddedArrayRef<gmx::RVec> *force,
+ ForceWithVirial *forceWithVirial,
+ rvec fshift[],
+ gmx_enerdata_t *enerd,
+ int flags,
+ gmx_wallcycle_t wcycle)
+{
+ // Completion flags for the two GPU tasks; the loop below runs until
+ // both tasks have finished and had their outputs reduced.
+ bool isPmeGpuDone = false;
+ bool isNbGpuDone = false;
+
+
+ // Receives the PME GPU task's output forces; filled in by
+ // pme_gpu_try_finish_task() when that task completes.
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+
+ while (!isPmeGpuDone || !isNbGpuDone)
+ {
+ if (!isPmeGpuDone)
+ {
+ // Virial and energy outputs of the PME task; written by
+ // pme_gpu_try_finish_task() on completion.
+ matrix vir_Q;
+ real Vlr_q;
+
+ // If the nonbonded task has already finished there is nothing
+ // left to overlap with, so block (Wait) instead of polling (Check).
+ GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
+ vir_Q, &Vlr_q, completionType);
+
+ if (isPmeGpuDone)
+ {
+ // Reduce the PME outputs (forces, virial, energy) into the
+ // caller-provided buffers.
+ pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
+ enerd, vir_Q, Vlr_q);
+ }
+ }
+
+ if (!isNbGpuDone)
+ {
+ // Mirror of the logic above: only block once PME is already done.
+ GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+ isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ // To get the call count right, when the task finished we
+ // issue a start/stop.
+ // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
+ // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+ if (isNbGpuDone)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ // Reduce the nonbonded forces from the internal nbat layout
+ // into the rvec force buffer for the local atoms.
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->nbat, as_rvec_array(force->data()));
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+ }
+}
+