Enable StatePropagatorGpuData for force transfers
authorSzilárd Páll <pall.szilard@gmail.com>
Thu, 10 Oct 2019 20:20:40 +0000 (22:20 +0200)
committerArtem Zhmurov <zhmurov@gmail.com>
Sun, 13 Oct 2019 11:27:40 +0000 (13:27 +0200)
Force transfers have been switched to use StatePropagatorGpuData already
before. This change updates the synchronization mechanisms as:
- replaces the previous stream sync after GPU buffer/ops reduction with
  a waitForcesReadyOnHost call;
- removes the barriers in copyForces[From|To]Gpu() as dependencies
  are now satisfied: most dependencies are intra-stream and therefore
  implicit, the exception being the halo exchange that uses its own
  mechanism to sync H2D in the local stream with the nonlocal stream
  (which is yet to be replaces Refs #3093).

Refs. #3126.

Change-Id: I8bfd39f79c87f20492c4ae287d6f19261724f806

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index 32c30aab8f86de8c2221f5ab43f3e31ed3637b2e..4c8323444d1d37180d6c61af1fed89376668edb7 100644 (file)
@@ -1573,7 +1573,7 @@ void do_force(FILE                                     *fplog,
             {
                 if (useGpuFBufOps == BufferOpsUseGpu::True)
                 {
-                    nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
+                    stateGpu->waitForcesReadyOnHost(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                 }
                 dd_move_f(cr->dd, &forceOut.forceWithShiftForces(), wcycle);
             }
@@ -1691,9 +1691,8 @@ void do_force(FILE                                     *fplog,
                                               pme_gpu_get_device_f(fr->pmedata),
                                               dependencyList,
                                               stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
-            // This function call synchronizes the local stream
-            nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
             stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
+            stateGpu->waitForcesReadyOnHost(gmx::StatePropagatorDataGpu::AtomLocality::Local);
         }
         else
         {
index 2f66ea4d3e034928333b48eee78467e1ac9e66d0..5aa64f6e2551482d8852be7a30b022f3d4bc12a6 100644 (file)
@@ -371,8 +371,6 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality.");
 
     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
-    // TODO: Remove When event-based synchronization is introduced
-    gpuStreamSynchronize(commandStream);
     fReadyOnDevice_[atomLocality].markEvent(commandStream);
 }
 
@@ -402,8 +400,6 @@ void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h
     GMX_ASSERT(commandStream != nullptr, "No stream is valid for copying forces with given atom locality.");
 
     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
-    // TODO: Remove When event-based synchronization is introduced
-    gpuStreamSynchronize(commandStream);
     fReadyOnHost_[atomLocality].markEvent(commandStream);
 }
 
index 23502b3fbe8bac6d88c26a5c7910c65d5790e2b2..e6cd8253b752af1a6faad70e83dd79139e91bae4 100644 (file)
@@ -876,19 +876,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                          atomL
 
 }
 
-void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
-                                        gmx_nbnxn_gpu_t                   *nb)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    cudaStreamSynchronize(stream);
-
-}
-
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
 {
     return static_cast<void*> (nb->xAvailableOnDevice);
index 0ec882f3b7f50a9652d1b8e9d39a72af215a54ed..4fb8cbacdafaf2028d967349034c2f40ec9bc138 100644 (file)
@@ -260,11 +260,6 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
-void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
-{
-    nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
-}
-
 void* nonbonded_verlet_t::get_x_on_device_event()
 {
     return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
index 2f25f701cb5bd72145a3c5d5d599253509dc576e..a5ef5baff3f74d6cf2558c369fe0bf587b4e01bf 100644 (file)
@@ -360,12 +360,6 @@ struct nonbonded_verlet_t
         /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
         void atomdata_init_add_nbat_f_to_f_gpu();
 
-        /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
-         *
-         * FIXME: need more details: when should be called / after which operation, etc.
-         */
-        void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
-
         /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
         void* get_x_on_device_event();
 
index ccb2aa5bdea7b8f4b3f64910d6b0582dcbc8bec1..afdafb80ff8fa343726676861f0ccb19d4be9c2e 100644 (file)
@@ -319,11 +319,6 @@ void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                               gmx_un
                                bool                                       gmx_unused  useGpuFPmeReduction,
                                bool                                       gmx_unused  accumulateForce) CUDA_FUNC_TERM;
 
-/*! \brief Wait for GPU stream to complete */
-CUDA_FUNC_QUALIFIER
-void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
-                                        gmx_nbnxn_gpu_t         gmx_unused *nb) CUDA_FUNC_TERM;
-
 /*! \brief sync CPU thread on coordinate copy to device
  * \param[in] nb                   The nonbonded data GPU structure
  */