Event-based Dependency for GPU Force Halo Exchange
author Alan Gray <alangraygerrit@gmail.com>
Tue, 22 Oct 2019 14:23:11 +0000 (07:23 -0700)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 27 Nov 2019 05:48:02 +0000 (06:48 +0100)
Introduces a new event, recorded when the exchanged forces are ready on the GPU,
and passes it into the force buffer ops via dependencyList. Removes the previous
mechanism of forcing the local stream to wait on the non-local stream.

Addresses part of #3093
Refs #3194

Change-Id: I768898839e5c6a653894d5eb80354f0e423e06ed

src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h

index f577124adadae6fab7ac5a3c7823f87eba1cfbf5..b7e6ff54fa70c0aa273aa3300ac177e137384a26 100644 (file)
@@ -110,6 +110,10 @@ public:
      */
     void communicateHaloForces(bool accumulateForces);
 
+    /*! \brief Get the event synchronizer for the forces ready on device.
+     *  \returns  The event to synchronize the stream that consumes forces on device.
+     */
+    GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
 
 private:
     class Impl;
index 7ea94ac6a4a068a781bc55a213e03d84493450f0..2511673218b3eac56b681e808c80da36989b17c6 100644 (file)
@@ -96,6 +96,14 @@ void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
                "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
 }
 
+/*!\brief CPU stub of getForcesReadyOnDeviceEvent() for non-CUDA builds: trips GMX_ASSERT and returns nullptr. */
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+    GMX_ASSERT(false,
+               "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+    return nullptr;
+}
+
 } // namespace gmx
 
 #endif /* GMX_GPU != GMX_GPU_CUDA */
index 09061493b38afd1d11ca26b5fc83c3da6b73bd39..b1e743db28e09d74af0a5e327f93142cf5cc8329 100644 (file)
@@ -289,6 +289,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 
         launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
     }
+    fReadyOnDevice_.markEvent(nonLocalStream_);
 }
 
 
@@ -378,6 +379,11 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
 #endif
 }
 
+GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
+{
+    return &fReadyOnDevice_; // address of the member event that communicateHaloForces() marks in nonLocalStream_
+}
+
 /*! \brief Create Domdec GPU object */
 GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
     dd_(dd),
@@ -443,4 +449,8 @@ void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
     impl_->communicateHaloForces(accumulateForces);
 }
 
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+    return impl_->getForcesReadyOnDeviceEvent(); // forwards to the implementation object
+}
 } // namespace gmx
index 9f4bf8cdd84744304479356da2d7fda46479ece0..017cb191869e40bc9d5b46cb1c9f164a0767e6a5 100644 (file)
@@ -96,6 +96,11 @@ public:
      */
     void communicateHaloForces(bool accumulateForces);
 
+    /*! \brief Get the event synchronizer for the forces ready on device.
+     *  \returns  The event to synchronize the stream that consumes forces on device.
+     */
+    GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
+
 private:
     /*! \brief Data transfer wrapper for GPU halo exchange
      * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
@@ -177,6 +182,8 @@ private:
     float3* d_x_ = nullptr;
     //! full forces buffer in GPU memory
     float3* d_f_ = nullptr;
+    //! An event recorded once the exchanged forces are ready on the GPU
+    GpuEventSynchronizer fReadyOnDevice_;
 };
 
 } // namespace gmx
index 6009b1a5a653526d1d19d316e258b5073aa27a54..b52ec1b416584c3a44d9deea379f988f69314895 100644 (file)
@@ -1670,7 +1670,7 @@ void do_force(FILE*                               fplog,
                                            fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
                         : nullptr; // PME reduction not active on GPU
 
-        gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+        gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
 
         if (stepWork.useGpuPmeFReduction)
         {
@@ -1703,14 +1703,7 @@ void do_force(FILE*                               fplog,
             }
             if (useGpuForcesHaloExchange)
             {
-                // Add a stream synchronization to satisfy a dependency
-                // for the local buffer ops on the result of GPU halo
-                // exchange, which operates in the non-local stream and
-                // writes to to local parf og the force buffer.
-                //
-                // TODO improve this through use of an event - see Redmine #3093
-                //      push the event into the dependencyList
-                nbv->stream_local_wait_for_nonlocal();
+                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
             }
             nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                               dependencyList, stepWork.useGpuPmeFReduction,
index 4f2c54132fdac61d9858a77be88a07329acbe56f..45963daba6f4fca5121086d82501cdc7a93aaa42 100644 (file)
@@ -945,14 +945,4 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     }
 }
 
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t* nb)
-{
-    cudaStream_t localStream    = nb->stream[InteractionLocality::Local];
-    cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
-
-    GpuEventSynchronizer event;
-    event.markEvent(nonLocalStream);
-    event.enqueueWaitEvent(localStream);
-}
-
 } // namespace Nbnxm
index 089c7f277f1f861003f349d4a291aed9c7b01513..1531f02816f73744b0cf3d687040079a296ab983 100644 (file)
@@ -245,9 +245,4 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocal
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
-void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
-{
-    Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
-}
-
 /*! \endcond */
index fd11ec9de527f203e975d4c99c0b1e77f8306712..1548f3704ff9c74b83d01b82d29a29ce1e2a1cd4 100644 (file)
@@ -353,9 +353,6 @@ public:
     /*! \brief return GPU pointer to f in rvec format */
     void* get_gpu_frvec();
 
-    /*! \brief Ensure local stream waits for non-local stream */
-    void stream_local_wait_for_nonlocal();
-
     //! Return the kernel setup
     const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
 
index 423ec5b9254cdc9a7f9b7a3cc86bb739e2960aa9..7a4a94b6a52f5888932418bcce202b1391e97808 100644 (file)
@@ -328,11 +328,5 @@ void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
 
-/*! \brief Ensure local stream waits for non-local stream
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-
 } // namespace Nbnxm
 #endif