Event-based Dependency for GPU Force Halo Exchange

author Alan Gray <alangraygerrit@gmail.com>

Tue, 22 Oct 2019 14:23:11 +0000 (07:23 -0700)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 27 Nov 2019 05:48:02 +0000 (06:48 +0100)
author Alan Gray <alangraygerrit@gmail.com>
Tue, 22 Oct 2019 14:23:11 +0000 (07:23 -0700)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 27 Nov 2019 05:48:02 +0000 (06:48 +0100)
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

index f577124adadae6fab7ac5a3c7823f87eba1cfbf5..b7e6ff54fa70c0aa273aa3300ac177e137384a26 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -110,6 +110,10 @@ public:
       */
      void communicateHaloForces(bool accumulateForces);
  
+    /*! \brief Get the event synchronizer for the forces ready on device.
+     *  \returns  The event to synchronize the stream that consumes forces on device.
+     */
+    GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
  
  private:
      class Impl;
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

index 7ea94ac6a4a068a781bc55a213e03d84493450f0..2511673218b3eac56b681e808c80da36989b17c6 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -96,6 +96,14 @@ void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
                 "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
  }
  
+/*!\brief get forces ready on device event stub. */
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+    GMX_ASSERT(false,
+               "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+    return nullptr;
+}
+
  } // namespace gmx
  
  #endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

index 09061493b38afd1d11ca26b5fc83c3da6b73bd39..b1e743db28e09d74af0a5e327f93142cf5cc8329 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -289,6 +289,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
  
          launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
      }
+    fReadyOnDevice_.markEvent(nonLocalStream_);
  }
  
  
@@ -378,6 +379,11 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
  #endif
  }
  
+GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
+{
+    return &fReadyOnDevice_;
+}
+
  /*! \brief Create Domdec GPU object */
  GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
      dd_(dd),
@@ -443,4 +449,8 @@ void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
      impl_->communicateHaloForces(accumulateForces);
  }
  
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+    return impl_->getForcesReadyOnDeviceEvent();
+}
  } // namespace gmx
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh

index 9f4bf8cdd84744304479356da2d7fda46479ece0..017cb191869e40bc9d5b46cb1c9f164a0767e6a5 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -96,6 +96,11 @@ public:
       */
      void communicateHaloForces(bool accumulateForces);
  
+    /*! \brief Get the event synchronizer for the forces ready on device.
+     *  \returns  The event to synchronize the stream that consumes forces on device.
+     */
+    GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
+
  private:
      /*! \brief Data transfer wrapper for GPU halo exchange
       * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
@@ -177,6 +182,8 @@ private:
      float3* d_x_ = nullptr;
      //! full forces buffer in GPU memory
      float3* d_f_ = nullptr;
+    //! An event recorded once the exchanged forces are ready on the GPU
+    GpuEventSynchronizer fReadyOnDevice_;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 6009b1a5a653526d1d19d316e258b5073aa27a54..b52ec1b416584c3a44d9deea379f988f69314895 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1670,7 +1670,7 @@ void do_force(FILE*                               fplog,
                                             fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
                          : nullptr; // PME reduction not active on GPU
  
-        gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+        gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
  
          if (stepWork.useGpuPmeFReduction)
          {
@@ -1703,14 +1703,7 @@ void do_force(FILE*                               fplog,
              }
              if (useGpuForcesHaloExchange)
              {
-                // Add a stream synchronization to satisfy a dependency
-                // for the local buffer ops on the result of GPU halo
-                // exchange, which operates in the non-local stream and
-                // writes to to local parf og the force buffer.
-                //
-                // TODO improve this through use of an event - see Redmine #3093
-                //      push the event into the dependencyList
-                nbv->stream_local_wait_for_nonlocal();
+                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                                dependencyList, stepWork.useGpuPmeFReduction,
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 4f2c54132fdac61d9858a77be88a07329acbe56f..45963daba6f4fca5121086d82501cdc7a93aaa42 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -945,14 +945,4 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
      }
  }
  
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t* nb)
-{
-    cudaStream_t localStream    = nb->stream[InteractionLocality::Local];
-    cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
-
-    GpuEventSynchronizer event;
-    event.markEvent(nonLocalStream);
-    event.enqueueWaitEvent(localStream);
-}
-
  } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 089c7f277f1f861003f349d4a291aed9c7b01513..1531f02816f73744b0cf3d687040079a296ab983 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -245,9 +245,4 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocal
      Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
  }
  
-void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
-{
-    Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
-}
-
  /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index fd11ec9de527f203e975d4c99c0b1e77f8306712..1548f3704ff9c74b83d01b82d29a29ce1e2a1cd4 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -353,9 +353,6 @@ public:
      /*! \brief return GPU pointer to f in rvec format */
      void* get_gpu_frvec();
  
-    /*! \brief Ensure local stream waits for non-local stream */
-    void stream_local_wait_for_nonlocal();
-
      //! Return the kernel setup
      const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
  
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 423ec5b9254cdc9a7f9b7a3cc86bb739e2960aa9..7a4a94b6a52f5888932418bcce202b1391e97808 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -328,11 +328,5 @@ void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
  CUDA_FUNC_QUALIFIER
  void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
  
-/*! \brief Ensure local stream waits for non-local stream
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-
  } // namespace Nbnxm
  #endif
author	Alan Gray <alangraygerrit@gmail.com>
	Tue, 22 Oct 2019 14:23:11 +0000 (07:23 -0700)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 27 Nov 2019 05:48:02 +0000 (06:48 +0100)
src/gromacs/domdec/gpuhaloexchange.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cpp		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cu		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cuh		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history