Move responsibility for GPU force clearing to state propagator
authorAlan Gray <alangray3@gmail.com>
Fri, 12 Mar 2021 06:52:02 +0000 (06:52 +0000)
committerArtem Zhmurov <zhmurov@gmail.com>
Fri, 12 Mar 2021 06:52:02 +0000 (06:52 +0000)
The force buffer was previously cleared within the GPU force halo
exchange, which was quite opaque. The clearing is now triggered from
do_force() and performed by the state propagator, which makes the
schedule clearer.

Fixes #3971 and partly addresses #3970
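
In outline, both the decision and the clearing move out of the
halo-exchange internals and into the caller. A hedged before/after
sketch (simplified from the hunks below; wallcycle counters and error
handling omitted):

    // Before: communicateHaloForces() could memset d_f internally,
    // hiding a local-stream operation inside the exchange.
    //
    // After: do_force() makes the schedule explicit.
    if (!accumulateForces)
    {
        stateGpu->clearForcesOnGpu(AtomLocality::Local); // zero home-atom forces first
    }
    communicateGpuHaloForces(*cr, accumulateForces); // the exchange itself never clears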

src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 0f6089df2f4f4605370ce8befd911cf70531523e..adc8d667127fe4f0c14361d8fd4f4c529e052ce1 100644 (file)
@@ -74,8 +74,7 @@ public:
      * after the local coordinates buffer operations (where the
      * coordinates are copied to the device and hence the \c
      * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
-     * will be performed in \c streamNonLocal (also potentally
-     * with buffer clearing in \c streamLocal)and the \c
+     * will be performed in \c streamNonLocal and the \c
      * communicateHaloForces method must be called after the
      * non-local buffer operations, after the local force buffer
      * has been copied to the GPU (if CPU forces are present), and
index 2189f39fe41a2a8163d257cf22e25e065b73b5fa..ebc98e784feca58b79c89a32edd86b416d27ccd8 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -335,16 +335,9 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     // activities
     if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
     {
-        if (!accumulateForces)
-        {
-            // Clear local portion of force array (in local stream)
-            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream());
-        }
-
         // ensure non-local stream waits for local stream, due to dependence on
         // the previous H2D copy of CPU forces (if accumulateForces is true)
-        // or the above clearing.
-        // TODO remove this dependency on localStream - edmine Issue #3093
+        // or the local force clearing.
         GpuEventSynchronizer eventLocal;
         eventLocal.markEvent(localStream_);
         eventLocal.enqueueWaitEvent(nonLocalStream_);
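
The GpuEventSynchronizer pair above expresses a cross-stream dependency
without blocking the host. A minimal standalone illustration of the same
idiom in the plain CUDA runtime API (hypothetical helper, not GROMACS
code):

    #include <cuda_runtime.h>

    // Make nonLocalStream wait for all work currently enqueued in
    // localStream (here: the H2D copy of CPU forces, or the local force
    // clearing), without synchronizing the host.
    static void makeStreamWaitOnStream(cudaStream_t nonLocalStream, cudaStream_t localStream)
    {
        cudaEvent_t event;
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming); // no timing needed
        cudaEventRecord(event, localStream);           // mark the current tail of localStream
        cudaStreamWaitEvent(nonLocalStream, event, 0); // enqueue the wait in nonLocalStream
        cudaEventDestroy(event); // deferred destruction; the enqueued wait stays valid
    }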
index 80190c6fa833da33d9c76e441485df290658d8c2..1db242af88b3374ced1cfadf2cc88826764b1df7 100644 (file)
@@ -2039,7 +2039,15 @@ void do_force(FILE*                               fplog,
 
             if (stepWork.useGpuFHalo)
             {
-                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
+                // If CPU forces are present, the halo-exchange data should accumulate into them
+                bool accumulateForces = domainWork.haveCpuLocalForceWork;
+                if (!accumulateForces)
+                {
+                    // The force halo exchange will set only a subset of local atoms with remote
+                    // non-local data. First clear the local portion of the force array, so that
+                    // untouched atoms are zero.
+                    stateGpu->clearForcesOnGpu(AtomLocality::Local);
+                }
+                communicateGpuHaloForces(*cr, accumulateForces);
             }
             else
             {
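
For context on the accumulateForces flag: downstream, the halo-exchange
unpack either adds the received contributions into d_f (when CPU forces
were already copied to the device) or writes them directly. A
hypothetical kernel sketch of the two modes (illustrative only, not the
actual GROMACS unpack kernel):

    template<bool accumulate>
    __global__ void unpackHaloForces(float3* d_f, const float3* recvBuf, const int* map, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            const float3 v    = recvBuf[i];
            float3*      dest = &d_f[map[i]];
            if (accumulate)
            {
                // CPU forces are already in d_f: add the halo contribution.
                dest->x += v.x;
                dest->y += v.y;
                dest->z += v.z;
            }
            else
            {
                // A plain write touches only the mapped subset of local atoms,
                // which is why the caller must clear the rest to zero first.
                *dest = v;
            }
        }
    }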
index 82b3378551a0c24b145122a717c7fa3b586994ca..3f66196509f742859e0dcd670cc5f0189a62fad4 100644 (file)
@@ -270,6 +270,12 @@ public:
      */
     void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
 
+    /*! \brief Clear forces in the GPU memory.
+     *
+     *  \param[in] atomLocality  Locality of the particles to clear.
+     */
+    void clearForcesOnGpu(AtomLocality atomLocality);
+
     /*! \brief Get the event synchronizer for the forces ready on device.
      *
      *  Returns either of the event synchronizers, depending on the offload scenario
index 3274d3dd02bf8892e3de11f951669fcdb578e1f7..f112d1b1cb08180971938eda99eb4632f9e6e6fc 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -206,6 +206,13 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
                "GPU implementation.");
 }
 
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality /* atomLocality */,
                                                                           bool /* useGpuFBufferOps */)
 {
index 84035e4f9176f8580d373ec01efcc25c51b28440..cc9f92bcfe9883351caa1f73bc42740ed57ed55e 100644 (file)
@@ -265,6 +265,12 @@ public:
      */
     void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
 
+    /*! \brief Clear forces in the GPU memory.
+     *
+     *  \param[in] atomLocality  Locality of the particles to clear.
+     */
+    void clearForcesOnGpu(AtomLocality atomLocality);
+
     /*! \brief Get the event synchronizer for the forces ready on device.
      *
      *  Returns either of the event synchronizers, depending on the offload scenario
@@ -424,6 +430,20 @@ private:
                         int                      dataSize,
                         AtomLocality             atomLocality,
                         const DeviceStream&      deviceStream);
+
+    /*! \brief Performs the clearing of data in the device buffer.
+     *
+     * \todo Template on locality.
+     *
+     *  \param[out] d_data         Device-side buffer.
+     *  \param[in]  dataSize       Device-side data allocation size.
+     *  \param[in]  atomLocality   Whether the all, local, or non-local range should be cleared.
+     *  \param[in]  deviceStream   GPU stream to execute the clearing in.
+     */
+    void clearOnDevice(DeviceBuffer<RVec>  d_data,
+                       int                 dataSize,
+                       AtomLocality        atomLocality,
+                       const DeviceStream& deviceStream);
 };
 
 } // namespace gmx
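
The locality argument is resolved into an index range by
getAtomRangesFromAtomLocality(). A hedged sketch of that mapping,
assuming the usual local-then-nonlocal packing of the buffer (standalone
illustration, not the GROMACS implementation):

    #include <tuple>

    enum class AtomLocality { Local, NonLocal, All, Count };

    // Map a locality onto (start, count) over a buffer that holds numLocal
    // home atoms followed by (numAll - numLocal) halo atoms.
    static std::tuple<int, int> atomRange(AtomLocality locality, int numLocal, int numAll)
    {
        switch (locality)
        {
            case AtomLocality::Local: return { 0, numLocal };
            case AtomLocality::NonLocal: return { numLocal, numAll - numLocal };
            case AtomLocality::All: return { 0, numAll };
            default: return { 0, 0 };
        }
    }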
index 37d00888d502e9a69f01f72b355cbb8f4528b28d..16fbe131f86ad1c733b25a05b3f96079ff835cea 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -278,6 +278,31 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
     }
 }
 
+void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec>  d_data,
+                                                 int                 dataSize,
+                                                 AtomLocality        atomLocality,
+                                                 const DeviceStream& deviceStream)
+{
+    GMX_UNUSED_VALUE(dataSize);
+
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+
+    GMX_ASSERT(dataSize >= 0, "Trying to clear the device buffer before it was allocated.");
+
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for clearing with the given atom locality.");
+
+    int atomsStartAt, numAtomsToClear;
+    std::tie(atomsStartAt, numAtomsToClear) = getAtomRangesFromAtomLocality(atomLocality);
+
+    if (numAtomsToClear != 0)
+    {
+        GMX_ASSERT(atomsStartAt + numAtomsToClear <= dataSize,
+                   "The device allocation is smaller than requested clear range.");
+
+        clearDeviceBufferAsync(&d_data, atomsStartAt, numAtomsToClear, deviceStream);
+    }
+}
+
 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
 {
     return d_x_;
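
In the CUDA backend, the clearDeviceBufferAsync() call inside
clearOnDevice() above amounts to an asynchronous byte-wise memset over
the selected sub-range. A minimal standalone equivalent (assuming the
elements are POD triples of reals; names hypothetical):

    #include <cuda_runtime.h>

    // Zero the force sub-range [start, start + n) in the given stream.
    // Byte-wise zeroing is valid because the elements are plain-old-data.
    static void clearForceRangeAsync(float3* d_f, int start, int n, cudaStream_t stream)
    {
        cudaMemsetAsync(d_f + start, 0, static_cast<size_t>(n) * sizeof(float3), stream);
    }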
@@ -452,6 +477,22 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
+void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
+               "No stream is valid for clearing forces with given atom locality.");
+
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
+    clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                                 bool useGpuFBufferOps)
 {
@@ -622,6 +663,11 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     return impl_->copyForcesToGpu(h_f, atomLocality);
 }
 
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
+{
+    return impl_->clearForcesOnGpu(atomLocality);
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                           bool useGpuFBufferOps)
 {