From 8a0d4d971981a7c9a36e71873284cf8d2cb05129 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Thu, 10 Oct 2019 22:20:40 +0200
Subject: [PATCH] Enable StatePropagatorGpuData for force transfers

Force transfers have been switched to use StatePropagatorGpuData already
before. This change updates the synchronization mechanisms as:
- replaces the previous stream sync after GPU buffer/ops reduction with
  a waitForcesReadyOnHost call;
- removes the barriers in copyForces[From|To]Gpu() as dependencies
  are now satisfied: most dependencies are intra-stream and therefore
  implicit, the exception being the halo exchange that uses its own
  mechanism to sync H2D in the local stream with the nonlocal stream
  (which is yet to be replaces Refs #3093).

Refs. #3126.

Change-Id: I8bfd39f79c87f20492c4ae287d6f19261724f806
---
 src/gromacs/mdlib/sim_util.cpp                      |  5 ++---
 .../mdtypes/state_propagator_data_gpu_impl_gpu.cpp  |  4 ----
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu                | 13 -------------
 src/gromacs/nbnxm/nbnxm.cpp                         |  5 -----
 src/gromacs/nbnxm/nbnxm.h                           |  6 ------
 src/gromacs/nbnxm/nbnxm_gpu.h                       |  5 -----
 6 files changed, 2 insertions(+), 36 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 32c30aab8f..4c8323444d 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1573,7 +1573,7 @@ void do_force(FILE                                     *fplog,
             {
                 if (useGpuFBufOps == BufferOpsUseGpu::True)
                 {
-                    nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+                    stateGpu->waitForcesReadyOnHost(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                 }
                 dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
             }
@@ -1691,9 +1691,8 @@ void do_force(FILE                                     *fplog,
                                               pme_gpu_get_device_f(fr->pmedata),
                                               dependencyList,
                                               stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
-            // This function call synchronizes the local stream
-            nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
             stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
+            stateGpu->waitForcesReadyOnHost(gmx::StatePropagatorDataGpu::AtomLocality::Local);
         }
         else
         {
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 2f66ea4d3e..5aa64f6e25 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -371,8 +371,6 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality.");
 
     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
-    // TODO: Remove When event-based synchronization is introduced
-    gpuStreamSynchronize(commandStream);
     fReadyOnDevice_[atomLocality].markEvent(commandStream);
 }
 
@@ -402,8 +400,6 @@ void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h
     GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality.");
 
     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
-    // TODO: Remove When event-based synchronization is introduced
-    gpuStreamSynchronize(commandStream);
     fReadyOnHost_[atomLocality].markEvent(commandStream);
 }
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 23502b3fbe..e6cd8253b7 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -876,19 +876,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                          atomL
 
 }
 
-void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
-                                        gmx_nbnxn_gpu_t                   *nb)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    cudaStreamSynchronize(stream);
-
-}
-
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
 {
     return static_cast<void*> (nb->xAvailableOnDevice);
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 0ec882f3b7..4fb8cbacda 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -260,11 +260,6 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
-void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
-{
-    nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
-}
-
 void* nonbonded_verlet_t::get_x_on_device_event()
 {
     return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 2f25f701cb..a5ef5baff3 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -360,12 +360,6 @@ struct nonbonded_verlet_t
         /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
         void atomdata_init_add_nbat_f_to_f_gpu();
 
-        /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
-         *
-         * FIXME: need more details: when should be called / after which operation, etc.
-         */
-        void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
-
         /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
         void* get_x_on_device_event();
 
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index ccb2aa5bde..afdafb80ff 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -319,11 +319,6 @@ void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                               gmx_un
                                bool                                       gmx_unused  useGpuFPmeReduction,
                                bool                                       gmx_unused  accumulateForce) CUDA_FUNC_TERM;
 
-/*! \brief Wait for GPU stream to complete */
-CUDA_FUNC_QUALIFIER
-void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
-                                        gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM;
-
 /*! \brief sync CPU thread on coordinate copy to device
  * \param[in] nb                   The nonbonded data GPU structure
  */
-- 
2.22.0