Centralize management of forces ready on GPU event
authorArtem Zhmurov <zhmurov@gmail.com>
Thu, 10 Oct 2019 16:03:11 +0000 (18:03 +0200)
committerArtem Zhmurov <zhmurov@gmail.com>
Sat, 12 Oct 2019 14:49:20 +0000 (16:49 +0200)
This change adds the GpuEventSynchronizer for the forces reduced on GPU event
to the StatePropagatorDataGpu. This event should be marked if the buffer ops
are offloaded when the force reduction is done. The consumers of of the forces
on the GPU will get this event or the event on the H2D copy is done,
depending on the current step workload and offload scenario.

Change-Id: Ib559dbed5ad777eac3a906e4ee0ebaa07caf0ac1

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 57ff728da03663cf55e723227cf47cc9f22233ba..8386e341393cb5b6050b380fd9836c92636bc4b9 100644 (file)
@@ -1506,7 +1506,8 @@ void do_force(FILE                                     *fplog,
                 if (haveNonLocalForceContribInCpuBuffer)
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
-                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal));
+                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal,
+                                                                                   useGpuFBufOps == BufferOpsUseGpu::True));
                 }
 
                 nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
@@ -1660,7 +1661,8 @@ void do_force(FILE                                     *fplog,
             if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
             {
                 stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
-                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local));
+                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local,
+                                                                               useGpuFBufOps == BufferOpsUseGpu::True));
             }
             if (useGpuForcesHaloExchange)
             {
index 13d00ab245ba4ba7a59820250a8400d798c107ae..086c43fd4e0023983252032b93ee4a23fa8760c6 100644 (file)
@@ -266,13 +266,28 @@ class StatePropagatorDataGpu
         void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
                              AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer for the H2D forces copy.
+        /*! \brief Get the event synchronizer for the forces ready on device.
          *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *  Returns either of the event synchronizers, depending on the offload scenario
+         *  for the current simulation timestep:
+         *  1. The forces are copied to the device (when GPU buffer ops are off)
+         *  2. The forces are reduced on the device (GPU buffer ops are on)
+         *
+         *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
+         *
+         *  \param[in] atomLocality      Locality of the particles to wait for.
+         *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
          *
          *  \returns  The event to synchronize the stream that consumes forces on device.
          */
-        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                          bool          useGpuFBufferOps);
+
+        /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
+         *
+         *  \returns  The event to mark when forces are reduced on the GPU.
+         */
+        GpuEventSynchronizer* fReducedOnDevice();
 
         /*! \brief Copy forces from the GPU memory.
          *
index 1f6da3a5d7b0ad67f45130923bd05fbe81f228ff..68d16ccd3b3622173fe2f618c69c94634f6348dc 100644 (file)
@@ -162,9 +162,16 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  /* atomLocality */)
+GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  /* atomLocality     */,
+                                                                          bool          /* useGpuFBufferOps */)
 {
-    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
+    return nullptr;
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation.");
     return nullptr;
 }
 
index b5fcffeb19ffcc429a99a5057fce781c14786747..bd8c770d0423088c3623bc49ba34790bc0e89c84 100644 (file)
@@ -252,13 +252,28 @@ class StatePropagatorDataGpu::Impl
         void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
                              AtomLocality                    atomLocality);
 
-        /*! \brief Get the event synchronizer on the H2D forces copy.
+        /*! \brief Get the event synchronizer for the forces ready on device.
          *
-         *  \param[in] atomLocality  Locality of the particles to wait for.
+         *  Returns either of the event synchronizers, depending on the offload scenario
+         *  for the current simulation timestep:
+         *  1. The forces are copied to the device (when GPU buffer ops are off)
+         *  2. The forces are reduced on the device (GPU buffer ops are on)
+         *
+         *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
+         *
+         *  \param[in] atomLocality      Locality of the particles to wait for.
+         *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
          *
          *  \returns  The event to synchronize the stream that consumes forces on device.
          */
-        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality);
+        GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                          bool          useGpuFBufferOps);
+
+        /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
+         *
+         *  \returns  The event to mark when forces are reduced on the GPU.
+         */
+        GpuEventSynchronizer* fReducedOnDevice();
 
         /*! \brief Copy forces from the GPU memory.
          *
@@ -329,6 +344,8 @@ class StatePropagatorDataGpu::Impl
 
         //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
+        //! An event that the forces were reduced on the GPU
+        GpuEventSynchronizer                                 fReducedOnDevice_;
         //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
         EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
 
index 88bb6e5fedd84b99458c36f45342e6562b90bd6c..2f66ea4d3e034928333b48eee78467e1ac9e66d0 100644 (file)
@@ -376,11 +376,23 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     fReadyOnDevice_[atomLocality].markEvent(commandStream);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                                                bool          useGpuFBufferOps)
 {
-    return &fReadyOnDevice_[atomLocality];
+    if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
+    {
+        return &fReducedOnDevice_;
+    }
+    else
+    {
+        return &fReadyOnDevice_[atomLocality];
+    }
 }
 
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
+{
+    return &fReducedOnDevice_;
+}
 
 void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
                                                      AtomLocality              atomLocality)
@@ -524,9 +536,15 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     return impl_->copyForcesToGpu(h_f, atomLocality);
 }
 
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality)
+GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality  atomLocality,
+                                                                          bool          useGpuFBufferOps)
+{
+    return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
 {
-    return impl_->getForcesReadyOnDeviceEvent(atomLocality);
+    return impl_->fReducedOnDevice();
 }
 
 void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec>  h_f,