Link GPU force producer and consumer tasks
author Artem Zhmurov <zhmurov@gmail.com>
Mon, 14 Oct 2019 20:58:08 +0000 (22:58 +0200)
committer Artem Zhmurov <zhmurov@gmail.com>
Wed, 16 Oct 2019 08:21:40 +0000 (10:21 +0200)
The GPU event synchronizer that indicates that forces are ready
for consumption is now passed to the GPU update-constraints.
The update-constraints enqueues a wait on the event in the update
stream before performing numerical integration and constraining.
Note that the event is conditionally returned by the
StatePropagatorDataGpu and indicates that either the reduction of
forces on the GPU or the H2D copy is done, depending on the offload
scenario on the current timestep.

Refs. #2816, #2888, #3126.

Change-Id: Ic12b0c55b75ec5f0c31ce500a2760fb4d5cf3b91

src/gromacs/mdlib/update_constrain_cuda.h
src/gromacs/mdlib/update_constrain_cuda_impl.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdlib/update_constrain_cuda_impl.h
src/gromacs/mdrun/md.cpp

index 3c4fc103f00746f5afa12a5b8bf1b8b1eade7572..edad9c6c0a9739b2419a215f021aaa6301d2b56d 100644 (file)
@@ -93,6 +93,7 @@ class UpdateConstrainCuda
          * This will extract temperature scaling factors from tcstat, transform them into the plain
          * array and call the normal integrate method.
          *
+         * \param[in]  fReadyOnDevice         Event synchronizer indicating that the forces are ready in the device memory.
          * \param[in]  dt                     Timestep.
          * \param[in]  updateVelocities       If the velocities should be constrained.
          * \param[in]  computeVirial          If virial should be updated.
@@ -103,7 +104,8 @@ class UpdateConstrainCuda
          * \param[in]  dtPressureCouple       Period between pressure coupling steps
          * \param[in]  velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
          */
-        void integrate(real                              dt,
+        void integrate(GpuEventSynchronizer             *fReadyOnDevice,
+                       real                              dt,
                        bool                              updateVelocities,
                        bool                              computeVirial,
                        tensor                            virial,
index 647493ec163348ad6041cc4ef1deba8612dae769..adc765c8754ea4e6eddcc6c892d7cce45fbe5df1 100644 (file)
@@ -66,7 +66,8 @@ UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec     &ir,
 
 UpdateConstrainCuda::~UpdateConstrainCuda() = default;
 
-void UpdateConstrainCuda::integrate(gmx_unused const real                        dt,
+void UpdateConstrainCuda::integrate(gmx_unused GpuEventSynchronizer             *fReadyOnDevice,
+                                    gmx_unused const real                        dt,
                                     gmx_unused const bool                        updateVelocities,
                                     gmx_unused const bool                        computeVirial,
                                     gmx_unused tensor                            virialScaled,
index b146c83ca8b4b850d24b389edce6f7c5b535ead8..2052bccb7c801b96b4b4916349cbcb0338cb76bd 100644 (file)
@@ -68,7 +68,8 @@
 namespace gmx
 {
 
-void UpdateConstrainCuda::Impl::integrate(const real                        dt,
+void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer             *fReadyOnDevice,
+                                          const real                        dt,
                                           const bool                        updateVelocities,
                                           const bool                        computeVirial,
                                           tensor                            virial,
@@ -82,6 +83,9 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
     // TODO There is no point in having separate virial matrix for constraints
     clear_mat(virial);
 
+    // Make sure that the forces are ready on device before proceeding with the update.
+    fReadyOnDevice->enqueueWaitEvent(commandStream_);
+
     // The integrate should save a copy of the current coordinates in d_xp_ and write updated once into d_x_.
     // The d_xp_ is only needed by constraints.
     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt,
@@ -188,7 +192,8 @@ UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec     &ir,
 
 UpdateConstrainCuda::~UpdateConstrainCuda() = default;
 
-void UpdateConstrainCuda::integrate(const real                        dt,
+void UpdateConstrainCuda::integrate(GpuEventSynchronizer             *fReadyOnDevice,
+                                    const real                        dt,
                                     const bool                        updateVelocities,
                                     const bool                        computeVirial,
                                     tensor                            virialScaled,
@@ -198,7 +203,8 @@ void UpdateConstrainCuda::integrate(const real                        dt,
                                     const float                       dtPressureCouple,
                                     const matrix                      velocityScalingMatrix)
 {
-    impl_->integrate(dt, updateVelocities, computeVirial, virialScaled,
+    impl_->integrate(fReadyOnDevice,
+                     dt, updateVelocities, computeVirial, virialScaled,
                      doTempCouple, tcstat,
                      doPressureCouple, dtPressureCouple, velocityScalingMatrix);
 }
index 326578d12fc7519b8942f34313b6e081f9c64a32..ab6e38431eb9671712104c6f13f8d812e9a275fc 100644 (file)
@@ -89,13 +89,13 @@ class UpdateConstrainCuda::Impl
          *
          * Integrates the equation of motion using Leap-Frog algorithm and applies
          * LINCS and SETTLE constraints.
-         * Updates d_xp_ and d_v_ fields of this object.
          * If computeVirial is true, constraints virial is written at the provided pointer.
          * doTempCouple should be true if:
          *   1. The temperature coupling is enabled.
          *   2. This is the temperature coupling step.
          * Parameters virial/lambdas can be nullptr if computeVirial/doTempCouple are false.
          *
+         * \param[in]  fReadyOnDevice         Event synchronizer indicating that the forces are ready in the device memory.
          * \param[in]  dt                     Timestep.
          * \param[in]  updateVelocities       If the velocities should be constrained.
          * \param[in]  computeVirial          If virial should be updated.
@@ -106,7 +106,8 @@ class UpdateConstrainCuda::Impl
          * \param[in]  dtPressureCouple       Period between pressure coupling steps
          * \param[in]  velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
          */
-        void integrate(real                              dt,
+        void integrate(GpuEventSynchronizer             *fReadyOnDevice,
+                       real                              dt,
                        bool                              updateVelocities,
                        bool                              computeVirial,
                        tensor                            virial,
index b0fccfa1a7c588968b0c48a8e0608471155ed130..014cb78ab4ec0ae29b08a44ad05cbf15125abf29 100644 (file)
@@ -1238,8 +1238,12 @@ void gmx::LegacySimulator::do_md()
             bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
             bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
 
+            // TODO: Use StepWorkload fields.
+            bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
+
             // This applies Leap-Frog, LINCS and SETTLE in succession
-            integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir,
+            integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(StatePropagatorDataGpu::AtomLocality::Local, useGpuFBufferOps),
+                                  ir->delta_t, true, bCalcVir, shake_vir,
                                   doTempCouple, ekind->tcstat,
                                   doPressureCouple, ir->nstpcouple*ir->delta_t, M);
             stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);