Simplify PME GPU synchronization code

author Aleksei Iupinov <a.yupinov@gmail.com>

Fri, 27 Oct 2017 13:37:47 +0000 (15:37 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 1 Nov 2017 09:56:48 +0000 (10:56 +0100)
author Aleksei Iupinov <a.yupinov@gmail.com>
Fri, 27 Oct 2017 13:37:47 +0000 (15:37 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 1 Nov 2017 09:56:48 +0000 (10:56 +0100)
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp

index ab7cec033cb10d1c2a6d04c88bc027045f5ec5c6..05b24bc33e9b7dec2c2bc84dc720b598035712b6 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -79,6 +79,11 @@ static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *
  
  void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial)
  {
+    for (int j = 0; j < c_virialAndEnergyCount; j++)
+    {
+        GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
+    }
+
      GMX_ASSERT(energy, "Invalid energy output pointer in PME GPU");
      unsigned int j = 0;
      virial[XX][XX] = 0.25f * pmeGPU->staging.h_virialAndEnergy[j++];
@@ -128,16 +133,15 @@ static void pme_gpu_reinit_step(const PmeGpu *pmeGPU)
      pme_gpu_clear_energy_virial(pmeGPU);
  }
  
-void pme_gpu_finish_step(const PmeGpu *pmeGPU, const bool bCalcF, const bool bCalcEnerVir)
+void pme_gpu_finish_step(const PmeGpu *pmeGPU)
  {
-    if (bCalcF && pme_gpu_performs_gather(pmeGPU))
-    {
-        pme_gpu_sync_output_forces(pmeGPU);
-    }
-    if (bCalcEnerVir && pme_gpu_performs_solve(pmeGPU))
-    {
-        pme_gpu_sync_output_energy_virial(pmeGPU);
-    }
+    // Synchronize the whole PME stream at once, including D2H result transfers.
+    // If forces were computed, they will have arrived at the external host buffer provided to gather.
+    // If virial/energy were computed, they will have arrived into the internal staging buffer
+    // (even though that should have already happened before even launching the gather).
+    // Finally, cudaEvent_t based GPU timers need stream synchronization for correctness.
+    pme_gpu_synchronize(pmeGPU);
+
      pme_gpu_update_timings(pmeGPU);
      pme_gpu_reinit_step(pmeGPU);
  }
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h

index 91ab374d978fee7fc18c2981a7b3bb50a03d2f78..52bc72df2cde5dfc8931b9ecc7bf1a4d8fac9c7a 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -201,13 +201,6 @@ CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const PmeGpu    *CUDA_FUNC_AR
  CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
                                                      float           *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
  
-/*! \libinternal \brief
- * Waits for the PME GPU output forces copying to the CPU buffer to finish.
- *
- * \param[in] pmeGPU            The PME GPU structure.
- */
-CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
-
  /*! \libinternal \brief
   * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
   *
@@ -319,13 +312,6 @@ CUDA_FUNC_QUALIFIER void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu *CUDA_FUNC
   */
  CUDA_FUNC_QUALIFIER void pme_gpu_free_fract_shifts(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
  
-/*! \libinternal \brief
- * Waits for the output virial/energy copying to the intermediate CPU buffer to finish.
- *
- * \param[in] pmeGPU  The PME GPU structure.
- */
-CUDA_FUNC_QUALIFIER void pme_gpu_sync_output_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
-
  /*! \libinternal \brief
   * Copies the input real-space grid from the host to the GPU.
   *
@@ -617,11 +603,8 @@ void pme_gpu_update_input_box(PmeGpu *pmeGPU, const matrix box);
   * Finishes the PME GPU step, waiting for the output forces and/or energy/virial to be copied to the host.
   *
   * \param[in] pmeGPU         The PME GPU structure.
- * \param[in] bCalcForces    The left-over flag from the CPU code which tells the function to copy the forces to the CPU side. Should be passed to the launch call instead. FIXME
- * \param[in] bCalcEnerVir   The left-over flag from the CPU code which tells the function to copy the energy/virial to the CPU side. Should be passed to the launch call instead.
   */
-void pme_gpu_finish_step(const PmeGpu *pmeGPU,  const bool       bCalcForces,
-                         const bool       bCalcEnerVir);
+void pme_gpu_finish_step(const PmeGpu *pmeGPU);
  
  //! A binary enum for spline data layout transformation
  enum class PmeLayoutTransform
diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp

index 0ba9603b11cf99f59948ea5cee2b66efdc4f68db..7d70736a979a121146deb122d8940ab5c88ecbc0 100644 (file)
--- a/src/gromacs/ewald/pme-gpu.cpp
+++ b/src/gromacs/ewald/pme-gpu.cpp
@@ -325,10 +325,9 @@ void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
      GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
  
      const bool haveComputedEnergyAndVirial = pme->gpu->settings.stepFlags & GMX_PME_CALC_ENER_VIR;
-    const bool haveComputedForces          = pme->gpu->settings.stepFlags & GMX_PME_CALC_F;
  
      wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
-    pme_gpu_finish_step(pme->gpu, haveComputedForces, haveComputedEnergyAndVirial);
+    pme_gpu_finish_step(pme->gpu);
      wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
  
      if (haveComputedEnergyAndVirial)
diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu

index 6f68453a5afabebb9c90119a8a65a965186af8af..ce00728279ccb2e56239345bd857fb95e1e88a42 100644 (file)
--- a/src/gromacs/ewald/pme-solve.cu
+++ b/src/gromacs/ewald/pme-solve.cu
@@ -491,14 +491,10 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid,
      {
          cu_copy_D2H_async(pmeGpu->staging.h_virialAndEnergy, kernelParamsPtr->constants.d_virialAndEnergy,
                            c_virialAndEnergyCount * sizeof(float), stream);
-        cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncEnerVirD2H, stream);
-        CU_RET_ERR(stat, "PME solve energy/virial event record failure");
      }
  
      if (copyInputAndOutputGrid)
      {
          cu_copy_D2H_async(h_grid, kernelParamsPtr->grid.d_fourierGrid, pmeGpu->archSpecific->complexGridSize * sizeof(float), stream);
-        cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSolveGridD2H, stream);
-        CU_RET_ERR(stat, "PME solve grid sync event record failure");
      }
  }
diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu

index ee47c27ec715ac4c8599a018ee8ccfddc3c6a47a..2dfb4829805275251d12fae7c4b7dd3022889e51 100644 (file)
--- a/src/gromacs/ewald/pme-timings.cu
+++ b/src/gromacs/ewald/pme-timings.cu
@@ -93,8 +93,6 @@ void pme_gpu_update_timings(const PmeGpu *pmeGPU)
  {
      if (pme_gpu_timings_enabled(pmeGPU))
      {
-        pme_gpu_synchronize(pmeGPU);
-
          for (const size_t &activeTimer : pmeGPU->archSpecific->activeTimers)
          {
              pmeGPU->archSpecific->timingEvents[activeTimer].getLastRangeTime();
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu

index 1a59b01e4d17104e142f8cc55c8ac812262cbc2e..ca18afc6c475889449e9dc17082309cca64d0224 100644 (file)
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -164,14 +164,6 @@ void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
      const size_t forcesSize   = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
      GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
      cu_copy_D2H_async(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->archSpecific->pmeStream);
-    cudaError_t stat = cudaEventRecord(pmeGPU->archSpecific->syncForcesD2H, pmeGPU->archSpecific->pmeStream);
-    CU_RET_ERR(stat, "PME gather forces synchronization failure");
-}
-
-void pme_gpu_sync_output_forces(const PmeGpu *pmeGPU)
-{
-    cudaError_t  stat = cudaEventSynchronize(pmeGPU->archSpecific->syncForcesD2H);
-    CU_RET_ERR(stat, "Error while waiting for the PME GPU forces");
  }
  
  void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
@@ -381,17 +373,6 @@ void pme_gpu_free_fract_shifts(const PmeGpu *pmeGPU)
                              pmeGPU->deviceInfo);
  }
  
-void pme_gpu_sync_output_energy_virial(const PmeGpu *pmeGPU)
-{
-    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncEnerVirD2H);
-    CU_RET_ERR(stat, "Error while waiting for PME solve output");
-
-    for (int j = 0; j < c_virialAndEnergyCount; j++)
-    {
-        GMX_ASSERT(std::isfinite(pmeGPU->staging.h_virialAndEnergy[j]), "PME GPU produces incorrect energy/virial.");
-    }
-}
-
  void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
  {
      const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
@@ -416,8 +397,6 @@ void pme_gpu_copy_output_spread_atom_data(const PmeGpu *pmeGpu)
      cu_copy_D2H_async(pmeGpu->staging.h_theta, kernelParamsPtr->atoms.d_theta, splinesSize, pmeGpu->archSpecific->pmeStream);
      cu_copy_D2H_async(pmeGpu->staging.h_gridlineIndices, kernelParamsPtr->atoms.d_gridlineIndices,
                        kernelParamsPtr->atoms.nAtoms * DIM * sizeof(int), pmeGpu->archSpecific->pmeStream);
-    cudaError_t stat = cudaEventRecord(pmeGpu->archSpecific->syncSplineAtomDataD2H, pmeGpu->archSpecific->pmeStream);
-    CU_RET_ERR(stat, "PME spread atom data sync event record failure");
  }
  
  void pme_gpu_copy_input_gather_atom_data(const PmeGpu *pmeGpu)
@@ -450,19 +429,6 @@ void pme_gpu_sync_spread_grid(const PmeGpu *pmeGPU)
      CU_RET_ERR(stat, "Error while waiting for the PME GPU spread grid to be copied to the host");
  }
  
-void pme_gpu_sync_spline_atom_data(const PmeGpu *pmeGPU)
-{
-    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSplineAtomDataD2H);
-    CU_RET_ERR(stat, "Error while waiting for the PME GPU atom data to be copied to the host");
-}
-
-void pme_gpu_sync_solve_grid(const PmeGpu *pmeGPU)
-{
-    cudaError_t stat = cudaEventSynchronize(pmeGPU->archSpecific->syncSolveGridD2H);
-    CU_RET_ERR(stat, "Error while waiting for the PME GPU solve grid to be copied to the host");
-    //should check for pme_gpu_performs_solve(pmeGPU)
-}
-
  void pme_gpu_init_internal(PmeGpu *pmeGPU)
  {
      /* Allocate the target-specific structures */
@@ -501,33 +467,13 @@ void pme_gpu_destroy_specific(const PmeGpu *pmeGPU)
  
  void pme_gpu_init_sync_events(const PmeGpu *pmeGPU)
  {
-    cudaError_t stat;
      const auto  eventFlags = cudaEventDisableTiming;
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncEnerVirD2H, eventFlags);
-    CU_RET_ERR(stat, "cudaEventCreate on syncEnerVirD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncForcesD2H, eventFlags);
-    CU_RET_ERR(stat, "cudaEventCreate on syncForcesD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags);
-    CU_RET_ERR(stat, "cudaEventCreate on syncSpreadGridD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSplineAtomDataD2H, eventFlags);
-    CU_RET_ERR(stat, "cudaEventCreate on syncSplineAtomDataD2H failed");
-    stat = cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSolveGridD2H, eventFlags);
-    CU_RET_ERR(stat, "cudaEventCreate on syncSolveGridD2H failed");
+    CU_RET_ERR(cudaEventCreateWithFlags(&pmeGPU->archSpecific->syncSpreadGridD2H, eventFlags), "cudaEventCreate on syncSpreadGridD2H failed");
  }
  
  void pme_gpu_destroy_sync_events(const PmeGpu *pmeGPU)
  {
-    cudaError_t stat;
-    stat = cudaEventDestroy(pmeGPU->archSpecific->syncEnerVirD2H);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on syncEnerVirD2H");
-    stat = cudaEventDestroy(pmeGPU->archSpecific->syncForcesD2H);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on syncForcesD2H");
-    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSpreadGridD2H");
-    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSplineAtomDataD2H);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSplineAtomDataD2H");
-    stat = cudaEventDestroy(pmeGPU->archSpecific->syncSolveGridD2H);
-    CU_RET_ERR(stat, "cudaEventDestroy failed on syncSolveGridD2H");
+    CU_RET_ERR(cudaEventDestroy(pmeGPU->archSpecific->syncSpreadGridD2H), "cudaEventDestroy failed on syncSpreadGridD2H");
  }
  
  void pme_gpu_reinit_3dfft(const PmeGpu *pmeGPU)
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh

index 001caf7686c158ac29e6084e6ccca6b86a1384a7..a18792a7810447e0941e9cbd38444a2f555685f5 100644 (file)
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -146,16 +146,8 @@ struct PmeGpuCuda
      cudaStream_t pmeStream;
  
      /* Synchronization events */
-    /*! \brief Triggered after the energy/virial have been copied to the host (after the solving stage). */
-    cudaEvent_t syncEnerVirD2H;
-    /*! \brief Triggered after the output forces have been copied to the host (after the gathering stage). */
-    cudaEvent_t syncForcesD2H;
      /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
      cudaEvent_t syncSpreadGridD2H;
-    /*! \brief Triggered after the atom spline data has been copied to the host (after the spline computation). */
-    cudaEvent_t syncSplineAtomDataD2H;
-    /*! \brief Triggered after the grid hes been copied to the host (after the solving stage) */
-    cudaEvent_t syncSolveGridD2H;
  
      // TODO: consider moving some things below into the non-CUDA struct.
author	Aleksei Iupinov <a.yupinov@gmail.com>
	Fri, 27 Oct 2017 13:37:47 +0000 (15:37 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 1 Nov 2017 09:56:48 +0000 (10:56 +0100)
src/gromacs/ewald/pme-gpu-internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme-gpu-internal.h		patch \| blob \| history
src/gromacs/ewald/pme-gpu.cpp		patch \| blob \| history
src/gromacs/ewald/pme-solve.cu		patch \| blob \| history
src/gromacs/ewald/pme-timings.cu		patch \| blob \| history
src/gromacs/ewald/pme.cu		patch \| blob \| history
src/gromacs/ewald/pme.cuh		patch \| blob \| history