From 7b77ac03f6364c62b746c24983611d9dbef23b98 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Thu, 10 Oct 2019 18:03:11 +0200
Subject: [PATCH] Centralize management of forces ready on GPU event

This change adds a GpuEventSynchronizer for the "forces reduced on GPU" event
to the StatePropagatorDataGpu. If the buffer ops are offloaded, this event
should be marked when the force reduction is done. The consumers of the forces
on the GPU will get either this event or the event marking that the H2D copy
is done, depending on the current step workload and offload scenario.

Change-Id: Ib559dbed5ad777eac3a906e4ee0ebaa07caf0ac1
---
 src/gromacs/mdlib/sim_util.cpp                |  6 +++--
 .../mdtypes/state_propagator_data_gpu.h       | 21 ++++++++++++---
 .../state_propagator_data_gpu_impl.cpp        | 11 ++++++--
 .../mdtypes/state_propagator_data_gpu_impl.h  | 23 +++++++++++++---
 .../state_propagator_data_gpu_impl_gpu.cpp    | 26 ++++++++++++++++---
 5 files changed, 73 insertions(+), 14 deletions(-)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 57ff728da0..8386e34139 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1506,7 +1506,8 @@ void do_force(FILE                                     *fplog,
                 if (haveNonLocalForceContribInCpuBuffer)
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
-                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal));
+                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+                                                                                   useGpuFBufOps == BufferOpsUseGpu::True));
                 }
 
                 nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
@@ -1660,7 +1661,8 @@ void do_force(FILE                                     *fplog,
             if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
             {
                 stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
-                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local));
+                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+                                                                               useGpuFBufOps == BufferOpsUseGpu::True));
             }
             if (useGpuForcesHaloExchange)
             {
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index 13d00ab245..086c43fd4e 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -266,13 +266,28 @@ class StatePropagatorDataGpu
         void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
                              AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer for the H2D forces copy.
+        /*! \brief Get the event synchronizer for the forces ready on device.
          *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *  Returns either of the event synchronizers, depending on the offload scenario
+         *  for the current simulation timestep:
+         *  1. The forces are copied to the device (when GPU buffer ops are off)
+         *  2. The forces are reduced on the device (GPU buffer ops are on)
+         *
+         *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
+         *
+         *  \param[in] atomLocality      Locality of the particles to wait for.
+         *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
          *
          *  \returns  The event to synchronize the stream that consumes forces on device.
          */
-        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                          bool          useGpuFBufferOps);
+
+        /*! \brief Getter for the event synchronizer marked when the forces are reduced on the GPU.
+         *
+         *  \returns  The event to mark when forces are reduced on the GPU.
+         */
+        GpuEventSynchronizer* fReducedOnDevice();
 
         /*! \brief Copy forces from the GPU memory.
          *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index 1f6da3a5d7..68d16ccd3b 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -162,9 +162,16 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  /* atomLocality     */,
+                                                                          bool          /* useGpuFBufferOps */)
 {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr;
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
     return nullptr;
 }
 
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index b5fcffeb19..bd8c770d04 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -252,13 +252,28 @@ class StatePropagatorDataGpu::Impl
         void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
                              AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer on the H2D forces copy.
+        /*! \brief Get the event synchronizer for the forces ready on device.
          *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *  Returns either of the event synchronizers, depending on the offload scenario
+         *  for the current simulation timestep:
+         *  1. The forces are copied to the device (when GPU buffer ops are off)
+         *  2. The forces are reduced on the device (GPU buffer ops are on)
+         *
+         *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
+         *
+         *  \param[in] atomLocality      Locality of the particles to wait for.
+         *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
          *
          *  \returns  The event to synchronize the stream that consumes forces on device.
          */
-        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                          bool          useGpuFBufferOps);
+
+        /*! \brief Getter for the event synchronizer marked when the forces are reduced on the GPU.
+         *
+         *  \returns  The event to mark when forces are reduced on the GPU.
+         */
+        GpuEventSynchronizer* fReducedOnDevice();
 
         /*! \brief Copy forces from the GPU memory.
          *
@@ -329,6 +344,8 @@ class StatePropagatorDataGpu::Impl
 
         //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
+        //! An event that indicates the forces were reduced on the GPU
+        GpuEventSynchronizer                                 fReducedOnDevice_;
         //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
 
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 88bb6e5fed..2f66ea4d3e 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -376,11 +376,23 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     fReadyOnDevice_[atomLocality].markEvent(commandStream);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                                                bool          useGpuFBufferOps)
 {
-    return &fReadyOnDevice_[atomLocality];
+    if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
+    {
+        return &fReducedOnDevice_;
+    }
+    else
+    {
+        return &fReadyOnDevice_[atomLocality];
+    }
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
+{
+    return &fReducedOnDevice_;
+}
 
 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
                                                      AtomLocality              atomLocality)
@@ -524,9 +536,15 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     return impl_->copyForcesToGpu(h_f, atomLocality);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                                          bool          useGpuFBufferOps)
+{
+    return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
 {
-    return impl_->getForcesReadyOnDeviceEvent(atomLocality);
+    return impl_->fReducedOnDevice();
 }
 
 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec>  h_f,
-- 
2.22.0