Move the PME GPU clearing to the end of the step
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 18 Apr 2018 15:59:39 +0000 (17:59 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Thu, 19 Apr 2018 09:37:51 +0000 (11:37 +0200)
This places the call at the end of the force computation, next to the
nbnxn output clearing. It also fixes the cycle counting, which
previously incorrectly assigned the cost of launching the clearing to
the wait-for-gather cycle counter.
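
A minimal sketch of the resulting end-of-step ordering in
do_force_cutsVERLET() (the surrounding force-schedule logic is elided;
useGpuPme, bUseGPU, fr and wcycle are the names used in this patch):

    if (useGpuPme)
    {
        // Launches the PME grid and energy/virial buffer clearing,
        // now timed under the ewcLAUNCH_GPU/ewcsLAUNCH_GPU_PME
        // counters instead of ewcWAIT_GPU_PME_GATHER.
        pme_gpu_reinit_computation(fr->pmedata, wcycle);
    }

    if (bUseGPU)
    {
        // nbnxn output clearing, bracketed by the same ewcLAUNCH_GPU
        // counter (see the sim_util.cpp hunk below).
    }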

Change-Id: I63b27a91d88a54d701831edda592b5d6277e3fb9

src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu.cpp
src/gromacs/ewald/pme-only.cpp
src/gromacs/ewald/pme.h
src/gromacs/mdlib/sim_util.cpp

index 2a6fa22f0d12e72b8a0fbfb0f21339a7c97d786d..9a285f20e6dffb55e4dcfef954cc72dd9e6b869e 100644 (file)
@@ -126,17 +126,6 @@ void pme_gpu_update_input_box(PmeGpu gmx_unused       *pmeGpu,
 #endif
 }
 
-/*! \brief \libinternal
- * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
- *
- * \param[in] pmeGpu            The PME GPU structure.
- */
-void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
-{
-    pme_gpu_clear_grids(pmeGpu);
-    pme_gpu_clear_energy_virial(pmeGpu);
-}
-
 /*! \brief \libinternal
  * (Re-)initializes all the PME GPU data related to the grid size and cut-off.
  *
@@ -348,7 +337,9 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
     pme_gpu_reinit_timings(pme->gpu);
 
     pme_gpu_reinit_grids(pme->gpu);
-    pme_gpu_reinit_computation(pme->gpu);
+    // Note: if timing the reinit launch overhead becomes more relevant
+    // (e.g. with regular PP-PME re-balancing), we should pass wcycle here.
+    pme_gpu_reinit_computation(pme, nullptr);
     /* Clear the previous box - doesn't hurt, and forces the PME CPU recipbox
      * update for mixed mode on grid switch. TODO: use shared recipbox field.
      */
index 4f6a4dee8f7f3734d5e91591c72eedf1e3fafd0e..29c357d3ca821e6e255d55279ad795e4ff4f52d1 100644 (file)
@@ -163,8 +163,9 @@ void pme_gpu_launch_spread(gmx_pme_t            *pme,
 
     PmeGpu *pmeGpu = pme->gpu;
 
-    // The only spot of PME GPU where LAUNCH_GPU (sub)counter increases call-count
+    // The only spot in the PME GPU code where the LAUNCH_GPU counter's call count increases
     wallcycle_start(wcycle, ewcLAUNCH_GPU);
+    // The only spot in the PME GPU code where the ewcsLAUNCH_GPU_PME subcounter's call count increases
     wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
     pme_gpu_copy_input_coordinates(pmeGpu, x);
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
@@ -216,7 +217,7 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
             {
                 const auto gridOrdering = pme_gpu_uses_dd(pmeGpu) ? GridOrdering::YZX : GridOrdering::XYZ;
                 wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-                wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); //FIXME nocount
+                wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
                 pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
                 wallcycle_stop(wcycle, ewcLAUNCH_GPU);
@@ -320,15 +321,8 @@ bool pme_gpu_try_finish_task(const gmx_pme_t                *pme,
     // Time the final staged data handling separately with a counting call to get
     // the call count right.
     wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
-
-    // The computation has completed, do timing accounting and resetting buffers
     pme_gpu_update_timings(pme->gpu);
-    // TODO: move this later and launch it together with the other
-    // non-bonded tasks at the end of the step
-    pme_gpu_reinit_computation(pme->gpu);
-
     pme_gpu_get_staged_results(pme, forces, virial, energy);
-
     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 
     return true;
@@ -342,3 +336,18 @@ void pme_gpu_wait_finish_task(const gmx_pme_t                *pme,
 {
     pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);
 }
+
+void pme_gpu_reinit_computation(const gmx_pme_t *pme,
+                                gmx_wallcycle   *wcycle)
+{
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+
+    pme_gpu_clear_grids(pme->gpu);
+    pme_gpu_clear_energy_virial(pme->gpu);
+
+    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+}
index fc25c547b15dc43dd8a80a04cff1cd34f46dd599..136659bf7692575914c70c5e69bd61fc5eff6de4 100644 (file)
@@ -644,6 +644,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             pme_gpu_launch_complex_transforms(pme, wcycle);
             pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
             pme_gpu_wait_finish_task(pme, wcycle, &forces, vir_q, &energy_q);
+            pme_gpu_reinit_computation(pme, wcycle);
         }
         else
         {
index 073f2881232053e475166e91a2ad83794eb63315..168baaf0369932a279872e9851162db4d3fbcde9 100644 (file)
@@ -379,5 +379,20 @@ CUDA_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t
                                                  real                           *CUDA_FUNC_ARGUMENT(energy),
                                                  GpuTaskCompletion               CUDA_FUNC_ARGUMENT(completionKind)) CUDA_FUNC_TERM_WITH_RETURN(false)
 
+/*! \brief
+ * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
+ *
+ * Clears the internal grid and energy/virial buffers; it is not safe to start
+ * the PME computation without calling this.
+ * Note that unlike in the nbnxn module, the force buffer does not need clearing.
+ *
+ * \todo Rename this function to *clear* -- it only does output resetting,
+ * and the name should be clear about what the function does.
+ *
+ * \param[in] pme            The PME data structure.
+ * \param[in] wcycle         The wallclock counter.
+ */
+CUDA_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
+                                                    gmx_wallcycle   *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
 
 #endif
index 137e5028c84ff22af597968e1dac9b41ba0f968b..366b265a38a649b38de31c907080c8d74781e0d6 100644 (file)
@@ -1641,6 +1641,11 @@ static void do_force_cutsVERLET(FILE *fplog, const t_commrec *cr,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
+    if (useGpuPme)
+    {
+        pme_gpu_reinit_computation(fr->pmedata, wcycle);
+    }
+
     if (bUseGPU)
     {
         /* now clear the GPU outputs while we finish the step on the CPU */
@@ -1655,8 +1660,6 @@ static void do_force_cutsVERLET(FILE *fplog, const t_commrec *cr,
         }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-
-        // TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
     }
 
     /* Do the nonbonded GPU (or emulation) force buffer reduction