Instead of handling the OpenCL host to device copy as a special case
where this operation is issued in the PME stream, as the "Update" stream
is anyway created unconditionally it makes sense to simplify codepaths
at the negligible cost of an extra GPU-side synchronization.
This allows unifying the coordinate handling in stateGpu as well as in
PME spread (with combined PP+PME) which removes some complexity and makes
it easier to reason about event consumption, reducing the need for balancing
workarounds.
The change also introduces a data member in StatePropagatorDataGpu to
indicate whether it is used on a separate PME rank, in which case marking
of the coordinate H2D event can be skipped as no event dependency is
required.
Refs #3988
gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
- GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
- "Need a valid xReadyOnDevice on PP+PME ranks with CUDA.");
+ GMX_ASSERT(xReadyOnDevice || !pme->bPPnode, "Need a valid xReadyOnDevice on PP+PME ranks.");
GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
PmeGpu* pmeGpu = pme->gpu;
GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
PmeGpu* pmeGpu = pme->gpu;
"inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
"inconsistent atom data padding vs. spreading block size");
// Ensure that coordinates are ready on the device before launching spread;
- // only needed with CUDA on PP+PME ranks, not on separate PME ranks, in unit tests
- // nor in OpenCL as these cases use a single stream (hence xReadyOnDevice == nullptr).
- GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
+ // only needed on PP+PME ranks, not on separate PME ranks, in unit tests
+ // as these cases use a single stream (hence xReadyOnDevice == nullptr).
+ GMX_ASSERT(xReadyOnDevice != nullptr || pmeGpu->common->isRankPmeOnly
|| pme_gpu_settings(pmeGpu).copyAllOutputs,
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
|| pme_gpu_settings(pmeGpu).copyAllOutputs,
"Need a valid coordinate synchronizer on PP+PME ranks with CUDA.");
//! \brief Pointer to wallcycle structure.
gmx_wallcycle* wcycle_;
//! \brief Pointer to wallcycle structure.
gmx_wallcycle* wcycle_;
+ //! Whether this instance of the class is used on a PME-only rank
+ bool isPmeOnly_ = false;
+
/*! \brief Performs the copy of data from host to device buffer.
*
* \todo Template on locality.
/*! \brief Performs the copy of data from host to device buffer.
*
* \todo Template on locality.
pmeStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
localStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
pmeStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
localStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
- // PME stream is used in OpenCL for H2D coordinate transfer
- updateStream_ = &deviceStreamManager.stream(
- GMX_GPU_OPENCL ? DeviceStreamType::Pme : DeviceStreamType::UpdateAndConstraints);
+ updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
// Map the atom locality to the stream that will be used for coordinates,
// velocities and forces transfers. Same streams are used for H2D and D2H copies.
// Map the atom locality to the stream that will be used for coordinates,
// velocities and forces transfers. Same streams are used for H2D and D2H copies.
nonLocalStream_ = nullptr;
updateStream_ = nullptr;
nonLocalStream_ = nullptr;
updateStream_ = nullptr;
// Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
// This is a temporary measure to make it safe to use this class in those cases.
// Only local/all coordinates are allowed to be copied in PME-only rank/ PME tests.
// This is a temporary measure to make it safe to use this class in those cases.
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
- // markEvent is skipped in OpenCL as:
- // - it's not needed, copy is done in the same stream as the only consumer task (PME)
- // - we don't consume the events in OpenCL which is not allowed by GpuEventSynchronizer (would leak memory).
- // TODO: remove this by adding an event-mark free flavor of this function
- if (bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL))
+ // marking is skipped in PME-only mode as everything is on the same stream
+ if (!isPmeOnly_)
{
xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
}
{
xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
}
// and this is not a neighbor search step, then the consumer needs to wait for the update
// to complete. Otherwise, the coordinates are copied from the host and we need to wait for
// the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
// and this is not a neighbor search step, then the consumer needs to wait for the update
// to complete. Otherwise, the coordinates are copied from the host and we need to wait for
// the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
- //
- // In OpenCL no events are used as coordinate sync is not necessary
- if (GMX_GPU_OPENCL)
- {
- return nullptr;
- }
if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
{
GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,
if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
{
GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,