Move the PME GPU clearing to the end of the step
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 18 Apr 2018 15:59:39 +0000 (17:59 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Thu, 19 Apr 2018 09:37:51 +0000 (11:37 +0200)
This places the call at the end of the force computation, next to the
nbnxn output clearing. It also fixes the cycle counting, which
previously incorrectly assigned the cost of launching the clearing to
the wait-for-gather cycle counter.
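
A minimal sketch of the resulting end-of-step ordering in
do_force_cutsVERLET() (the surrounding force-schedule logic is elided;
useGpuPme, bUseGPU, fr and wcycle are the names used in this patch):

    if (useGpuPme)
    {
        // Launches the PME grid and energy/virial buffer clearing,
        // now timed under the ewcLAUNCH_GPU/ewcsLAUNCH_GPU_PME
        // counters instead of ewcWAIT_GPU_PME_GATHER.
        pme_gpu_reinit_computation(fr->pmedata, wcycle);
    }

    if (bUseGPU)
    {
        // nbnxn output clearing, bracketed by the same ewcLAUNCH_GPU
        // counter (see the sim_util.cpp hunk below).
    }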

Change-Id: I63b27a91d88a54d701831edda592b5d6277e3fb9

src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu.cpp
src/gromacs/ewald/pme-only.cpp
src/gromacs/ewald/pme.h
src/gromacs/mdlib/sim_util.cpp

index 2a6fa22f0d12e72b8a0fbfb0f21339a7c97d786d..9a285f20e6dffb55e4dcfef954cc72dd9e6b869e 100644 (file)
@@ -126,17 +126,6 @@ void pme_gpu_update_input_box(PmeGpu gmx_unused       *pmeGpu,
 #endif
 }
 
-/*! \brief \libinternal
- * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
- *
- * \param[in] pmeGpu            The PME GPU structure.
- */
-void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
-{
-    pme_gpu_clear_grids(pmeGpu);
-    pme_gpu_clear_energy_virial(pmeGpu);
-}
-
 /*! \brief \libinternal
  * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
  *
@@ -348,7 +337,9 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
     pme_gpu_reinit_timings(pme->gpu);
 
     pme_gpu_reinit_grids(pme->gpu);
-    pme_gpu_reinit_computation(pme->gpu);
+    // Note: if timing the reinit launch overhead becomes more relevant
+    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
+    pme_gpu_reinit_computation(pme, nullptr);
     /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
      * update for mixed mode on grid switch. TODO: use shared recipbox field.
      */
index 4f6a4dee8f7f3734d5e91591c72eedf1e3fafd0e..29c357d3ca821e6e255d55279ad795e4ff4f52d1 100644 (file)
@@ -163,8 +163,9 @@ void pme_gpu_launch_spread(gmx_pme_t            *pme,
 
     PmeGpu *pmeGpu = pme->gpu;
 
-    // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
+    // The only spot in the PME GPU code where the LAUNCH_GPU counter's call count increases
     wallcycle_start(wcycle, ewcLAUNCH_GPU);
+    // The only spot in the PME GPU code where the ewcsLAUNCH_GPU_PME subcounter's call count increases
     wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
     pme_gpu_copy_input_coordinates(pmeGpu, x);
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
@@ -216,7 +217,7 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
             {
                 const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
                 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-                wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
+                wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
                 pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
                 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -320,15 +321,8 @@ bool pme_gpu_try_finish_task(const gmx_pme_t                *pme,
     // Time the final staged data handling separately with a counting call to get
     // the call count right.
     wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
-
-    // The computation has completed, do timing accounting and resetting buffers
     pme_gpu_update_timings(pme->gpu);
-    // TODO: move this later and launch it together with the other
-    // non-bonded tasks at the end of the step
-    pme_gpu_reinit_computation(pme->gpu);
-
     pme_gpu_get_staged_results(pme, forces, virial, energy);
-
     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 
     return true;
@@ -342,3 +336,18 @@ void pme_gpu_wait_finish_task(const gmx_pme_t                *pme,
 {
     pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);
 }
+
+void pme_gpu_reinit_computation(const gmx_pme_t *pme,
+                                gmx_wallcycle   *wcycle)
+{
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+
+    pme_gpu_clear_grids(pme->gpu);
+    pme_gpu_clear_energy_virial(pme->gpu);
+
+    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+}
index fc25c547b15dc43dd8a80a04cff1cd34f46dd599..136659bf7692575914c70c5e69bd61fc5eff6de4 100644 (file)
@@ -644,6 +644,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             pme_gpu_launch_complex_transforms(pme, wcycle);
             pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
             pme_gpu_wait_finish_task(pme, wcycle, &forces, vir_q, &energy_q);
+            pme_gpu_reinit_computation(pme, wcycle);
         }
         else
         {
index 073f2881232053e475166e91a2ad83794eb63315..168baaf0369932a279872e9851162db4d3fbcde9 100644 (file)
@@ -379,5 +379,20 @@ CUDA_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t
                                                  real                           *CUDA_FUNC_ARGUMENT(energy),
                                                  GpuTaskCompletion               CUDA_FUNC_ARGUMENT(completionKind)) CUDA_FUNC_TERM_WITH_RETURN(false)
 
+/*! \brief
+ * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
+ *
+ * Clears the internal grid and energy/virial buffers; it is not safe to start
+ * the PME computation without calling this.
+ * Note that unlike in the nbnxn module, the force buffer does not need clearing.
+ *
+ * \todo Rename this function to *clear* -- it only does output resetting,
+ * and the name should be clear about what the function does.
+ *
+ * \param[in] pme            The PME data structure.
+ * \param[in] wcycle         The wallclock counter.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+                                                    gmx_wallcycle   *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
 
 #endif
index 137e5028c84ff22af597968e1dac9b41ba0f968b..366b265a38a649b38de31c907080c8d74781e0d6 100644 (file)
@@ -1641,6 +1641,11 @@ static void do_force_cutsVERLET(FILE *fplog, const t_commrec *cr,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
+    if (useGpuPme)
+    {
+        pme_gpu_reinit_computation(fr->pmedata, wcycle);
+    }
+
     if (bUseGPU)
     {
         /* now clear the GPU outputs while we finish the step on the CPU */
@@ -1655,8 +1660,6 @@ static void do_force_cutsVERLET(FILE *fplog, const t_commrec *cr,
         }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-
-        // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
     }
 
     /* Do the nonbonded GPU (or emulation) force buffer reduction