Trigger synchronizer when local forces are ready

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 14 Oct 2019 17:19:03 +0000 (19:19 +0200)

committer Szilárd Páll <pall.szilard@gmail.com>

Tue, 15 Oct 2019 23:54:41 +0000 (01:54 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 14 Oct 2019 17:19:03 +0000 (19:19 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Tue, 15 Oct 2019 23:54:41 +0000 (01:54 +0200)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 2f2cade7db89105f95e6ebdfb1f2c6a00d43e450..5c40ff7f165c710376db8a40c833b6551d7b1280 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1155,7 +1155,8 @@ void do_force(FILE                                     *fplog,
          // NS step is also a virial step (on which f buf ops are deactivated).
          if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
          {
-            nbv->atomdata_init_add_nbat_f_to_f_gpu();
+            GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+            nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
          }
      }
      else if (!EI_TPI(inputrec->eI))
@@ -1544,6 +1545,7 @@ void do_force(FILE                                     *fplog,
                                                    pme_gpu_get_device_f(fr->pmedata),
                                                    dependencyList,
                                                    false, haveNonLocalForceContribInCpuBuffer);
+                // TODO: should this be conditional on whether GPU direct communication is used?
                  stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
              }
              else
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 7b1241e5e2ca56f1bdfee93a950964d11692d3e0..201a97055fdce93cb7df1b0b8de171590299cb44 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -879,6 +879,11 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                          atomL
  
      launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
  
+    if (atomLocality == AtomLocality::Local)
+    {
+        GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer");
+        nb->localFReductionDone->markEvent(stream);
+    }
  }
  
  void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index 71e25d23b110b905aa579168ccde66f0fe3f1056..67e7b581e7830c7fc1738e5bf4cafd45a3dd86d5 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -963,13 +963,17 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
  }
  
  /* Initialization for F buffer operations on GPU. */
-void nbnxn_gpu_init_add_nbat_f_to_f(const int                *cell,
-                                    gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    int                       natoms_total)
+void nbnxn_gpu_init_add_nbat_f_to_f(const int                  *cell,
+                                    gmx_nbnxn_gpu_t            *gpu_nbv,
+                                    int                         natoms_total,
+                                    GpuEventSynchronizer* const localReductionDone)
  {
  
      cudaStream_t         stream  = gpu_nbv->stream[InteractionLocality::Local];
  
+    GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
+    gpu_nbv->localFReductionDone = localReductionDone;
+
      if (natoms_total > 0)
      {
          reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index f3fc0e88527c6626c0a4021e9b03898f47da8d61..4667d63c4271559f7a2f2191289fa62b40ea4886 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -274,6 +274,12 @@ struct gmx_nbnxn_cuda_t
                                                     any dependent task (e.g. transfer of coordinates
                                                     to the PME rank's GPU) can proceed. */
  
+    /*! \brief Pointer to event synchronizer triggered when the local GPU buffer ops / reduction is complete
+     *
+     * \note The synchronizer is managed outside of this module, in StatePropagatorDataGpu.
+     */
+    GpuEventSynchronizer *localFReductionDone;
+
      GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
                                                     non-local coordinate buffer has been
                                                     copied from device to host*/
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 3a796b504e67ada819190c51ffcd665e649e591e..52d2fecb1305d7c63aabb81608c7416756aa664f 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -219,7 +219,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality
  }
  
  void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
  {
  
      wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
@@ -229,7 +229,8 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
  
      Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
                                            gpu_nbv,
-                                          gridSet.numRealAtomsTotal());
+                                          gridSet.numRealAtomsTotal(),
+                                          localReductionDone);
  
      wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
      wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index 4378663b933cea0b1139e7596a12ce59cb7d2865..9e58003db287e4a7cfbc4ed3a521ce01b9491dad 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -359,8 +359,11 @@ struct nonbonded_verlet_t
                                            bool                                        useGpuFPmeReduction,
                                            bool                                        accumulateForce);
  
-        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
-        void atomdata_init_add_nbat_f_to_f_gpu();
+        /*! \brief Outer body of function to perform initialization for F buffer operations on GPU.
+         *
+         * \param localReductionDone     Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel.
+         */
+        void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
  
          /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
          void* get_x_on_device_event();
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 1061292b2da57a9dd50eb8db0332a2736eeaecbd..b0941b46b0b1ff8f98a9b82d8ae307f9e0d3cc03 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -290,9 +290,10 @@ bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     gmx_unused *nb,
  
  /*! \brief Initialization for F buffer operations on GPU */
  CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_init_add_nbat_f_to_f(const int               gmx_unused *cell,
-                                    gmx_nbnxn_gpu_t         gmx_unused *gpu_nbv,
-                                    int                     gmx_unused  natoms_total) CUDA_FUNC_TERM;
+void nbnxn_gpu_init_add_nbat_f_to_f(const int                   gmx_unused *cell,
+                                    gmx_nbnxn_gpu_t             gmx_unused *gpu_nbv,
+                                    int                         gmx_unused  natoms_total,
+                                    GpuEventSynchronizer        gmx_unused *localReductionDone) CUDA_FUNC_TERM;
  
  /*! \brief Force buffer operations on GPU.
   *
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 14 Oct 2019 17:19:03 +0000 (19:19 +0200)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Tue, 15 Oct 2019 23:54:41 +0000 (01:54 +0200)
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history