From: Szilárd Páll <pall.szilard@gmail.com>
Date: Tue, 26 Oct 2021 21:52:34 +0000 (+0200)
Subject: Unify coordinate copy handling across GPU platforms
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=0d86b360f5da797960a7d5dc3520690439bbbc1f

Unify coordinate copy handling across GPU platforms

Instead of handling the OpenCL host to device copy as a special case
where this operation is issued in the PME stream, as the "Update" stream
is anyway created unconditionally it makes sense to simplify codepaths
at the negligible cost of an extra GPU-side synchronization.

This allows unifying the coordinate handling in stateGpu as well as in
PME spread (with combined PP+PME) which removes some complexity and makes
it easier to reason about event consumption, reducing the need for balancing
workarounds.

The change also introduces a data member in StatePropagatorDataGpu to
indicate whether it is use on a separate PME rank, case in which marking
of the coordinate H2D event can be skipped as no event dependency is
required.

Refs #3988
---

diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 680fcec0fc..3f8e00f995 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -198,8 +198,7 @@ void pme_gpu_launch_spread(gmx_pme_t*                     pme,
                            gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
-    GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
-               "Need a valid xReadyOnDevice on PP+PME ranks with CUDA.");
+    GMX_ASSERT(xReadyOnDevice || !pme->bPPnode, "Need a valid xReadyOnDevice on PP+PME ranks.");
     GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
 
     PmeGpu* pmeGpu = pme->gpu;
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 35fff48b6a..5e5bfd8ebe 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1324,9 +1324,9 @@ void pme_gpu_spread(const PmeGpu*                  pmeGpu,
                "inconsistent atom data padding vs. spreading block size");
 
     // Ensure that coordinates are ready on the device before launching spread;
-    // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
-    // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
-    GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
+    // only needed on PP+PME ranks, not on separate PME ranks, in unit tests
+    // as these cases use a single stream (hence xReadyOnDevice == nullptr).
+    GMX_ASSERT(xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
                        || pme_gpu_settings(pmeGpu).copyAllOutputs,
                "Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
 
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index d72bc5a0f2..ec73a14d7a 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -419,6 +419,9 @@ private:
     //! \brief Pointer to wallcycle structure.
     gmx_wallcycle* wcycle_;
 
+    //! Whether this instance of the class is used on a PME-only rank
+    bool isPmeOnly_ = false;
+
     /*! \brief Performs the copy of data from host to device buffer.
      *
      * \todo Template on locality.
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 9bac56b216..4e3126efba 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -77,9 +77,7 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManage
     pmeStream_      = &deviceStreamManager.stream(DeviceStreamType::Pme);
     localStream_    = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
     nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
-    // PME stream is used in OpenCL for H2D coordinate transfer
-    updateStream_ = &deviceStreamManager.stream(
-            GMX_GPU_OPENCL ? DeviceStreamType::Pme : DeviceStreamType::UpdateAndConstraints);
+    updateStream_   = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
 
     // Map the atom locality to the stream that will be used for coordinates,
     // velocities and forces transfers. Same streams are used for H2D and D2H copies.
@@ -120,6 +118,7 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
     nonLocalStream_ = nullptr;
     updateStream_   = nullptr;
 
+    isPmeOnly_ = true;
 
     // Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
     // This it temporary measure to make it safe to use this class in those cases.
@@ -330,11 +329,8 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
 
     copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
 
-    // markEvent is skipped in OpenCL as:
-    //   - it's not needed, copy is done in the same stream as the only consumer task (PME)
-    //   - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
-    // TODO: remove this by adding an event-mark free flavor of this function
-    if (bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL))
+    // marking is skipped on the PME-rank mode as everything is on the same stream
+    if (!isPmeOnly_)
     {
         xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
     }
@@ -353,12 +349,7 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE
     // and this is not a neighbor search step, then the consumer needs to wait for the update
     // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
     // the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
-    //
-    // In OpenCL no events are used as coordinate sync is not necessary
-    if (GMX_GPU_OPENCL)
-    {
-        return nullptr;
-    }
+
     if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
     {
         GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,