From f310be38d375b49f4312c6ee0bd6cd62729174cf Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?=
Date: Mon, 14 Oct 2019 19:19:03 +0200
Subject: [PATCH] Trigger synchronizer when local forces are ready

The synchronizer is created and managed in StatePropagatorDataGpu
and is passed to the nonbonded module at the f buffer ops init.

Refs #2888 #3126

Change-Id: Ie9bf0b6cd8511fe282e377e48f3940e591db214c
---
 src/gromacs/mdlib/sim_util.cpp                 |  4 +++-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu           |  5 +++++
 src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 10 +++++++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h      |  6 ++++++
 src/gromacs/nbnxm/nbnxm.cpp                    |  5 +++--
 src/gromacs/nbnxm/nbnxm.h                      |  7 +++++--
 src/gromacs/nbnxm/nbnxm_gpu.h                  |  7 ++++---
 7 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 2f2cade7db..5c40ff7f16 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1155,7 +1155,8 @@ void do_force(FILE *fplog,
         // NS step is also a virial step (on which f buf ops are deactivated).
         if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
         {
-            nbv->atomdata_init_add_nbat_f_to_f_gpu();
+            GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+            nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
         }
     }
     else if (!EI_TPI(inputrec->eI))
@@ -1544,6 +1545,7 @@ void do_force(FILE *fplog,
                                               pme_gpu_get_device_f(fr->pmedata),
                                               dependencyList,
                                               false, haveNonLocalForceContribInCpuBuffer);
+            // TODO: this should be conditional on whether GPU direct comm is used?
            stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
         }
         else
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 7b1241e5e2..201a97055f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -879,6 +879,11 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomL
 
     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 
+    if (atomLocality == AtomLocality::Local)
+    {
+        GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer");
+        nb->localFReductionDone->markEvent(stream);
+    }
 }
 
 void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 71e25d23b1..67e7b581e7 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -963,13 +963,17 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 }
 
 /* Initialization for F buffer operations on GPU.
  */
-void nbnxn_gpu_init_add_nbat_f_to_f(const int             *cell,
-                                    gmx_nbnxn_gpu_t       *gpu_nbv,
-                                    int                    natoms_total)
+void nbnxn_gpu_init_add_nbat_f_to_f(const int                   *cell,
+                                    gmx_nbnxn_gpu_t             *gpu_nbv,
+                                    int                          natoms_total,
+                                    GpuEventSynchronizer* const  localReductionDone)
 {
     cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
 
+    GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
+    gpu_nbv->localFReductionDone = localReductionDone;
+
     if (natoms_total > 0)
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index f3fc0e8852..4667d63c42 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -274,6 +274,12 @@ struct gmx_nbnxn_cuda_t
                              any dependent task (e.g. transfer of coordinates to the PME rank's GPU) can proceed. */
 
+    /*! \brief Pointer to event synchronizer triggered when the local GPU buffer ops / reduction is complete
+     *
+     * \note The synchronizer is managed outside of this module in StatePropagatorDataGpu.
+     */
+    GpuEventSynchronizer *localFReductionDone;
+
     GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when non-local coordinate buffer has been copied from device to host*/
 
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 3a796b504e..52d2fecb13 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -219,7 +219,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality
 }
 
 void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
 {
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
 
@@ -229,7 +229,8 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
 
     Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
                                           gpu_nbv,
-                                          gridSet.numRealAtomsTotal());
+                                          gridSet.numRealAtomsTotal(),
+                                          localReductionDone);
 
     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 4378663b93..9e58003db2 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -359,8 +359,11 @@ struct nonbonded_verlet_t
                                            bool                           useGpuFPmeReduction,
                                            bool                           accumulateForce);
 
-        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
-        void atomdata_init_add_nbat_f_to_f_gpu();
+        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU.
+         *
+         * \param localReductionDone  Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel.
+         */
+        void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
 
         /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
         void* get_x_on_device_event();
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 1061292b2d..b0941b46b0 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -290,9 +290,10 @@ bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb,
 /*!
  * \brief Initialization for F buffer operations on GPU */
 CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused         *cell,
-                                    gmx_nbnxn_gpu_t gmx_unused   *gpu_nbv,
-                                    int gmx_unused                natoms_total) CUDA_FUNC_TERM;
+void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused             *cell,
+                                    gmx_nbnxn_gpu_t gmx_unused       *gpu_nbv,
+                                    int gmx_unused                    natoms_total,
+                                    GpuEventSynchronizer gmx_unused  *localReductionDone) CUDA_FUNC_TERM;
 
 /*! \brief Force buffer operations on GPU.
  *
-- 
2.22.0
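
The change relies on the standard CUDA record/wait idiom: the stream that runs the local F buffer-ops/reduction kernel records an event right after the kernel launch (the markEvent() call added above), and any stream that later consumes the reduced forces enqueues a wait on that event instead of blocking the host. Below is a minimal standalone sketch of that idiom using raw CUDA events rather than GROMACS' GpuEventSynchronizer; the class and function names (LocalReductionDoneSync, exampleUsage) and the two streams are illustrative assumptions, not part of this patch.

#include <cuda_runtime.h>

// Simplified stand-in for the event synchronizer: one stream marks the event,
// another stream (or the host) waits on it. Not GROMACS code.
class LocalReductionDoneSync
{
    public:
        LocalReductionDoneSync() { cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); }
        ~LocalReductionDoneSync() { cudaEventDestroy(event_); }

        //! Producer side: record the event in the stream that runs the local reduction kernel.
        void markEvent(cudaStream_t localStream) { cudaEventRecord(event_, localStream); }

        //! Consumer side: make another stream wait until the local reduction has completed.
        void enqueueWaitEvent(cudaStream_t consumerStream) { cudaStreamWaitEvent(consumerStream, event_, 0); }

    private:
        cudaEvent_t event_;
};

// Usage sketch: the owner (StatePropagatorDataGpu in this change) creates the
// synchronizer and passes a pointer to the nonbonded module once, at F buffer
// ops init; the module marks it after the local reduction, and dependent work
// (e.g. a transfer or halo-exchange stream) waits on it.
void exampleUsage(cudaStream_t localStream, cudaStream_t consumerStream)
{
    LocalReductionDoneSync localFReductionDone;

    // ... launch the local F buffer-ops / reduction kernel in localStream ...

    localFReductionDone.markEvent(localStream);            // producer: local forces are ready

    localFReductionDone.enqueueWaitEvent(consumerStream);  // consumer: safe to read reduced forces
    // ... enqueue work in consumerStream that reads the reduced force buffer ...
}

Keeping the dependency on the GPU timeline means the host never has to synchronize; the consumer only needs the synchronizer pointer, which is why it is handed over once at f buffer ops init rather than every step.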