From 62b8797f78762ac66ac72e31dcb0f6b511537bb1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Tue, 8 Oct 2019 03:47:46 +0200
Subject: [PATCH] Pass list of EventSynchronizers to GPU reduction

Instead of passing events one-by-one or doing implicit synchronization
hidden in a force tasks' internals, a list of synchronizer objects that
signal the availability of forces from the source tasks is assembled and
passed to the GPU buffer ops/reduction.

The new dependency added here is the H2D transfer for CPU-side force
contribution, PME-related event is kept, halo exchange dependency needs
to be added.

Change-Id: I5ca5ef919b49508790e813e0469aaeef4f6484c0
---
 src/gromacs/mdlib/sim_util.cpp       | 28 +++++++++++++++++++++++++--
 src/gromacs/nbnxm/atomdata.cpp       | 18 ++++++++---------
 src/gromacs/nbnxm/atomdata.h         | 18 ++++++++---------
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 29 ++++++++++++++++------------
 src/gromacs/nbnxm/nbnxm.cpp          | 14 +++++++-------
 src/gromacs/nbnxm/nbnxm.h            | 14 +++++++-------
 src/gromacs/nbnxm/nbnxm_gpu.h        | 20 +++++++++----------
 7 files changed, 85 insertions(+), 56 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 0ea11d9146..331c48a640 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1493,6 +1493,14 @@ void do_force(FILE                                     *fplog,
 
             if (useGpuFBufOps == BufferOpsUseGpu::True)
             {
+                std::vector<GpuEventSynchronizer*> dependencyList;
+                dependencyList.reserve(2);
+
+                if (useGpuPmeFReduction)
+                {
+                    dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
+                }
+
                 // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
                 // The bonded and free energy CPU tasks can have non-local force contributions
                 // which are a dependency for the GPU force reduction.
@@ -1501,11 +1509,16 @@ void do_force(FILE                                     *fplog,
                 if (haveNonLocalForceContribInCpuBuffer)
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
+                    dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal));
                 }
+
+                //
+                // FIXME: are we adding a PME->nonlocal dep here?
+                //
                 nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
                                                   stateGpu->getForces(),
                                                   pme_gpu_get_device_f(fr->pmedata),
-                                                  pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                                  dependencyList,
                                                   useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
                 stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
             }
@@ -1626,6 +1639,14 @@ void do_force(FILE                                     *fplog,
      * on the non-alternating path. */
     if (bUseOrEmulGPU && !alternateGpuWait)
     {
+        std::vector<GpuEventSynchronizer*> dependencyList;
+        dependencyList.reserve(2);
+
+        if (useGpuPmeFReduction)
+        {
+            dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
+        }
+
         gmx::ArrayRef<gmx::RVec>  forceWithShift = forceOut.forceWithShiftForces().force();
 
         if (useGpuFBufOps == BufferOpsUseGpu::True)
@@ -1646,6 +1667,7 @@ void do_force(FILE                                     *fplog,
             if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
             {
                 stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
+                dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local));
             }
             if (useGpuForcesHaloExchange)
             {
@@ -1653,13 +1675,15 @@ void do_force(FILE                                     *fplog,
                 // for the local buffer ops on the result of GPU halo
                 // exchange, which operates in the non-local stream and
                 // writes to to local parf og the force buffer.
+                //
                 // TODO improve this through use of an event - see Redmine #3093
+                //      push the event into the dependencyList
                 nbv->stream_local_wait_for_nonlocal();
             }
             nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                               stateGpu->getForces(),
                                               pme_gpu_get_device_f(fr->pmedata),
-                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              dependencyList,
                                               useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
             // This function call synchronizes the local stream
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index a4e7b0285d..4a22cebe83 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1492,14 +1492,14 @@ void reduceForces(nbnxn_atomdata_t                *nbat,
 }
 
 /* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
-                     DeviceBuffer<float>              totalForcesDevice,
-                     const Nbnxm::GridSet            &gridSet,
-                     void                            *pmeForcesDevice,
-                     GpuEventSynchronizer            *pmeForcesReady,
-                     gmx_nbnxn_gpu_t                 *gpu_nbv,
-                     bool                             useGpuFPmeReduction,
-                     bool                             accumulateForce)
+void reduceForcesGpu(const Nbnxm::AtomLocality                   locality,
+                     DeviceBuffer<float>                         totalForcesDevice,
+                     const Nbnxm::GridSet                       &gridSet,
+                     void                                       *pmeForcesDevice,
+                     gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                     gmx_nbnxn_gpu_t                            *gpu_nbv,
+                     bool                                        useGpuFPmeReduction,
+                     bool                                        accumulateForce)
 {
     int atomsStart = 0;
     int numAtoms   = 0;
@@ -1516,7 +1516,7 @@ void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
                                      totalForcesDevice,
                                      gpu_nbv,
                                      pmeForcesDevice,
-                                     pmeForcesReady,
+                                     dependencyList,
                                      atomsStart, numAtoms,
                                      useGpuFPmeReduction,
                                      accumulateForce);
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 412d328bce..35d711beff 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -359,19 +359,19 @@ void reduceForces(nbnxn_atomdata_t                   *nbat,
  * \param[out] totalForcesDevice    Device buffer to accumulate resulting force.
  * \param[in]  gridSet              The grids data.
  * \param[in]  pmeForcesDevice      Device buffer with PME forces.
- * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  dependencyList       List of synchronizers that represent the dependencies the reduction task needs to sync on.
  * \param[in]  gpu_nbv              The NBNXM GPU data structure.
  * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
  * \param[in]  accumulateForce      Whether there are usefull data already in the total force buffer.
  */
-void reduceForcesGpu(Nbnxm::AtomLocality                 locality,
-                     DeviceBuffer<float>                 totalForcesDevice,
-                     const Nbnxm::GridSet               &gridSet,
-                     void                               *pmeForcesDevice,
-                     GpuEventSynchronizer               *pmeForcesReady,
-                     gmx_nbnxn_gpu_t                    *gpu_nbv,
-                     bool                                useGpuFPmeReduction,
-                     bool                                accumulateForce);
+void reduceForcesGpu(Nbnxm::AtomLocality                         locality,
+                     DeviceBuffer<float>                         totalForcesDevice,
+                     const Nbnxm::GridSet                       &gridSet,
+                     void                                       *pmeForcesDevice,
+                     gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                     gmx_nbnxn_gpu_t                            *gpu_nbv,
+                     bool                                        useGpuFPmeReduction,
+                     bool                                        accumulateForce);
 
 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index ca88b5e94f..23502b3fbe 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -806,15 +806,15 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
 }
 
 /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
-void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
-                               DeviceBuffer<float>              totalForcesDevice,
-                               gmx_nbnxn_gpu_t                 *nb,
-                               void                            *pmeForcesDevice,
-                               GpuEventSynchronizer            *pmeForcesReady,
-                               int                              atomStart,
-                               int                              numAtoms,
-                               bool                             useGpuFPmeReduction,
-                               bool                             accumulateForce)
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                          atomLocality,
+                               DeviceBuffer<float>                         totalForcesDevice,
+                               gmx_nbnxn_gpu_t                            *nb,
+                               void                                       *pmeForcesDevice,
+                               gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                               int                                         atomStart,
+                               int                                         numAtoms,
+                               bool                                        useGpuFPmeReduction,
+                               bool                                        accumulateForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
     GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
@@ -824,10 +824,15 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
     cudaStream_t              stream        = nb->stream[iLocality];
     cu_atomdata_t            *adat          = nb->atdat;
 
-    if (useGpuFPmeReduction)
+    size_t gmx_used_in_debug  numDependency =
+        static_cast<size_t>((useGpuFPmeReduction == true)) +
+        static_cast<size_t>((accumulateForce == true));
+    GMX_ASSERT(numDependency >= dependencyList.size(), "Mismatching number of dependencies and call signature");
+
+    // Enqueue wait on all dependencies passed
+    for (auto const synchronizer : dependencyList)
     {
-        //Stream must wait for PME force completion
-        pmeForcesReady->enqueueWaitEvent(stream);
+        synchronizer->enqueueWaitEvent(stream);
     }
 
     /* launch kernel */
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 88e6169b71..0ec882f3b7 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -189,12 +189,12 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
 }
 
 void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality           locality,
-                                                 DeviceBuffer<float>                 totalForcesDevice,
-                                                 void                               *forcesPmeDevice,
-                                                 GpuEventSynchronizer               *pmeForcesReady,
-                                                 bool                                useGpuFPmeReduction,
-                                                 bool                                accumulateForce)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality                   locality,
+                                                 DeviceBuffer<float>                         totalForcesDevice,
+                                                 void                                       *forcesPmeDevice,
+                                                 gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                                                 bool                                        useGpuFPmeReduction,
+                                                 bool                                        accumulateForce)
 {
 
     GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
@@ -210,7 +210,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
 
-    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
+    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 3be931f7d9..2f25f701cb 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -346,16 +346,16 @@ struct nonbonded_verlet_t
          * \param [in]     locality             Local or non-local
          * \param [in,out] totalForcesDevice    Force to be added to
          * \param [in]     forcesPmeDevice      Device buffer with PME forces
-         * \param [in]     pmeForcesReady       Event triggered when PME force calculation has completed
+         * \param[in]      dependencyList       List of synchronizers that represent the dependencies the reduction task needs to sync on.
          * \param [in]     useGpuFPmeReduction  Whether PME forces should be added
          * \param [in]     accumulateForce      If the total force buffer already contains data
          */
-        void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality                 locality,
-                                          DeviceBuffer<float>                 totalForcesDevice,
-                                          void                               *forcesPmeDevice,
-                                          GpuEventSynchronizer               *pmeForcesReady,
-                                          bool                                useGpuFPmeReduction,
-                                          bool                                accumulateForce);
+        void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality                         locality,
+                                          DeviceBuffer<float>                         totalForcesDevice,
+                                          void                                       *forcesPmeDevice,
+                                          gmx::ArrayRef<GpuEventSynchronizer* const>  dependencyList,
+                                          bool                                        useGpuFPmeReduction,
+                                          bool                                        accumulateForce);
 
         /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
         void atomdata_init_add_nbat_f_to_f_gpu();
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index ae396e9089..ccb2aa5bde 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -301,7 +301,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
  * \param[in]   totalForcesDevice    Device buffer to accumulate resulting force.
  * \param[in]   gpu_nbv              The NBNXM GPU data structure.
  * \param[in]   pmeForcesDevice      Device buffer with PME forces.
- * \param[in]   pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]   dependencyList       List of synchronizers that represent the dependencies the reduction task needs to sync on.
  * \param[in]   atomStart            Index of the first atom to reduce forces for.
  * \param[in]   numAtoms             Number of atoms to reduce forces for.
  * \param[in]   useGpuFPmeReduction  Whether PME forces should be added.
@@ -309,15 +309,15 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
  *
  */
 CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                 gmx_unused  atomLocality,
-                               DeviceBuffer<float>          gmx_unused  totalForcesDevice,
-                               gmx_nbnxn_gpu_t              gmx_unused *gpu_nbv,
-                               void                         gmx_unused *pmeForcesDevice,
-                               GpuEventSynchronizer         gmx_unused *pmeForcesReady,
-                               int                          gmx_unused  atomStart,
-                               int                          gmx_unused  numAtoms,
-                               bool                         gmx_unused  useGpuFPmeReduction,
-                               bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
+void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                               gmx_unused  atomLocality,
+                               DeviceBuffer<float>                        gmx_unused  totalForcesDevice,
+                               gmx_nbnxn_gpu_t                            gmx_unused *gpu_nbv,
+                               void                                       gmx_unused *pmeForcesDevice,
+                               gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused  dependencyList,
+                               int                                        gmx_unused  atomStart,
+                               int                                        gmx_unused  numAtoms,
+                               bool                                       gmx_unused  useGpuFPmeReduction,
+                               bool                                       gmx_unused  accumulateForce) CUDA_FUNC_TERM;
 
 /*! \brief Wait for GPU stream to complete */
 CUDA_FUNC_QUALIFIER
-- 
2.22.0