Move responsibility for GPU force clearing to state propagator
authorAlan Gray <alangray3@gmail.com>
Fri, 12 Mar 2021 06:52:02 +0000 (06:52 +0000)
committerArtem Zhmurov <zhmurov@gmail.com>
Fri, 12 Mar 2021 06:52:02 +0000 (06:52 +0000)
The force buffer was previously cleared within the GPU force halo
exchange, which was quite opaque. The clearing is now triggered from
do_force() and performed by the state propagator, which makes the
schedule clearer.

Fixes #3971 and partly addresses #3970
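
In outline, both the decision and the clearing move out of the
halo-exchange internals and into the caller. A hedged before/after
sketch (simplified from the hunks below; wallcycle counters and error
handling omitted):

    // Before: communicateHaloForces() could memset d_f internally,
    // hiding a local-stream operation inside the exchange.
    //
    // After: do_force() makes the schedule explicit.
    if (!accumulateForces)
    {
        stateGpu->clearForcesOnGpu(AtomLocality::Local); // zero home-atom forces first
    }
    communicateGpuHaloForces(*cr, accumulateForces); // the exchange itself never clears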

src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 0f6089df2f4f4605370ce8befd911cf70531523e..adc8d667127fe4f0c14361d8fd4f4c529e052ce1 100644 (file)
@@ -74,8 +74,7 @@ public:
      * after the local coordinates buffer operations (where the
      * coordinates are copied to the device and hence the \c
      * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
-     * will be performed in \c streamNonLocal (also potentally
-     * with buffer clearing in \c streamLocal)and the \c
+     * will be performed in \c streamNonLocal and the \c
      * communicateHaloForces method must be called after the
      * non-local buffer operations, after the local force buffer
      * has been copied to the GPU (if CPU forces are present), and
index 2189f39fe41a2a8163d257cf22e25e065b73b5fa..ebc98e784feca58b79c89a32edd86b416d27ccd8 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -335,16 +335,9 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     // activities
     if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
     {
-        if (!accumulateForces)
-        {
-            // Clear local portion of force array (in local stream)
-            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream());
-        }
-
         // ensure non-local stream waits for local stream, due to dependence on
         // the previous H2D copy of CPU forces (if accumulateForces is true)
-        // or the above clearing.
-        // TODO remove this dependency on localStream - edmine Issue #3093
+        // or the local force clearing.
         GpuEventSynchronizer eventLocal;
         eventLocal.markEvent(localStream_);
         eventLocal.enqueueWaitEvent(nonLocalStream_);
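
The GpuEventSynchronizer pair above expresses a cross-stream dependency
without blocking the host. A minimal standalone illustration of the same
idiom in the plain CUDA runtime API (hypothetical helper, not GROMACS
code):

    #include <cuda_runtime.h>

    // Make nonLocalStream wait for all work currently enqueued in
    // localStream (here: the H2D copy of CPU forces, or the local force
    // clearing), without synchronizing the host.
    static void makeStreamWaitOnStream(cudaStream_t nonLocalStream, cudaStream_t localStream)
    {
        cudaEvent_t event;
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming); // no timing needed
        cudaEventRecord(event, localStream);           // mark the current tail of localStream
        cudaStreamWaitEvent(nonLocalStream, event, 0); // enqueue the wait in nonLocalStream
        cudaEventDestroy(event); // deferred destruction; the enqueued wait stays valid
    }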
index 80190c6fa833da33d9c76e441485df290658d8c2..1db242af88b3374ced1cfadf2cc88826764b1df7 100644 (file)
@@ -2039,7 +2039,15 @@ void do_force(FILE*                               fplog,
 
             if (stepWork.useGpuFHalo)
             {
-                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
+                // If CPU forces are present, the halo-exchange data should accumulate into them
+                bool accumulateForces = domainWork.haveCpuLocalForceWork;
+                if (!accumulateForces)
+                {
+                    // The force halo exchange will set only a subset of local atoms with remote
+                    // non-local data. First clear the local portion of the force array, so that
+                    // untouched atoms are zero.
+                    stateGpu->clearForcesOnGpu(AtomLocality::Local);
+                }
+                communicateGpuHaloForces(*cr, accumulateForces);
             }
             else
             {
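
For context on the accumulateForces flag: downstream, the halo-exchange
unpack either adds the received contributions into d_f (when CPU forces
were already copied to the device) or writes them directly. A
hypothetical kernel sketch of the two modes (illustrative only, not the
actual GROMACS unpack kernel):

    template<bool accumulate>
    __global__ void unpackHaloForces(float3* d_f, const float3* recvBuf, const int* map, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
        {
            const float3 v    = recvBuf[i];
            float3*      dest = &d_f[map[i]];
            if (accumulate)
            {
                // CPU forces are already in d_f: add the halo contribution.
                dest->x += v.x;
                dest->y += v.y;
                dest->z += v.z;
            }
            else
            {
                // A plain write touches only the mapped subset of local atoms,
                // which is why the caller must clear the rest to zero first.
                *dest = v;
            }
        }
    }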
index 82b3378551a0c24b145122a717c7fa3b586994ca..3f66196509f742859e0dcd670cc5f0189a62fad4 100644 (file)
@@ -270,6 +270,12 @@ public:
      */
     void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
 
+    /*! \brief Clear forces in the GPU memory.
+     *
+     *  \param[in] atomLocality  Locality of the particles to clear.
+     */
+    void clearForcesOnGpu(AtomLocality atomLocality);
+
     /*! \brief Get the event synchronizer for the forces ready on device.
      *
      *  Returns either of the event synchronizers, depending on the offload scenario
index 3274d3dd02bf8892e3de11f951669fcdb578e1f7..f112d1b1cb08180971938eda99eb4632f9e6e6fc 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -206,6 +206,13 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
                "GPU implementation.");
 }
 
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality /* atomLocality */,
                                                                           bool /* useGpuFBufferOps */)
 {
index 84035e4f9176f8580d373ec01efcc25c51b28440..cc9f92bcfe9883351caa1f73bc42740ed57ed55e 100644 (file)
@@ -265,6 +265,12 @@ public:
      */
     void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
 
+    /*! \brief Clear forces in the GPU memory.
+     *
+     *  \param[in] atomLocality  Locality of the particles to clear.
+     */
+    void clearForcesOnGpu(AtomLocality atomLocality);
+
     /*! \brief Get the event synchronizer for the forces ready on device.
      *
      *  Returns either of the event synchronizers, depending on the offload scenario
@@ -424,6 +430,20 @@ private:
                         int                      dataSize,
                         AtomLocality             atomLocality,
                         const DeviceStream&      deviceStream);
+
+    /*! \brief Performs the clearing of data in the device buffer.
+     *
+     * \todo Template on locality.
+     *
+     *  \param[out] d_data         Device-side buffer.
+     *  \param[in]  dataSize       Device-side data allocation size.
+     *  \param[in]  atomLocality   Whether the all, local, or non-local range should be cleared.
+     *  \param[in]  deviceStream   GPU stream to execute the clearing in.
+     */
+    void clearOnDevice(DeviceBuffer<RVec>  d_data,
+                       int                 dataSize,
+                       AtomLocality        atomLocality,
+                       const DeviceStream& deviceStream);
 };
 
 } // namespace gmx
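
The locality argument is resolved into an index range by
getAtomRangesFromAtomLocality(). A hedged sketch of that mapping,
assuming the usual local-then-nonlocal packing of the buffer (standalone
illustration, not the GROMACS implementation):

    #include <tuple>

    enum class AtomLocality { Local, NonLocal, All, Count };

    // Map a locality onto (start, count) over a buffer that holds numLocal
    // home atoms followed by (numAll - numLocal) halo atoms.
    static std::tuple<int, int> atomRange(AtomLocality locality, int numLocal, int numAll)
    {
        switch (locality)
        {
            case AtomLocality::Local: return { 0, numLocal };
            case AtomLocality::NonLocal: return { numLocal, numAll - numLocal };
            case AtomLocality::All: return { 0, numAll };
            default: return { 0, 0 };
        }
    }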
index 37d00888d502e9a69f01f72b355cbb8f4528b28d..16fbe131f86ad1c733b25a05b3f96079ff835cea 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -278,6 +278,31 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
     }
 }
 
+void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec>  d_data,
+                                                 int                 dataSize,
+                                                 AtomLocality        atomLocality,
+                                                 const DeviceStream& deviceStream)
+{
+    GMX_UNUSED_VALUE(dataSize);
+
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+
+    GMX_ASSERT(dataSize >= 0, "Trying to clear the device buffer before it was allocated.");
+
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for clearing with the given atom locality.");
+
+    int atomsStartAt, numAtomsToClear;
+    std::tie(atomsStartAt, numAtomsToClear) = getAtomRangesFromAtomLocality(atomLocality);
+
+    if (numAtomsToClear != 0)
+    {
+        GMX_ASSERT(atomsStartAt + numAtomsToClear <= dataSize,
+                   "The device allocation is smaller than requested clear range.");
+
+        clearDeviceBufferAsync(&d_data, atomsStartAt, numAtomsToClear, deviceStream);
+    }
+}
+
 DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
 {
     return d_x_;
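
In the CUDA backend, the clearDeviceBufferAsync() call inside
clearOnDevice() above amounts to an asynchronous byte-wise memset over
the selected sub-range. A minimal standalone equivalent (assuming the
elements are POD triples of reals; names hypothetical):

    #include <cuda_runtime.h>

    // Zero the force sub-range [start, start + n) in the given stream.
    // Byte-wise zeroing is valid because the elements are plain-old-data.
    static void clearForceRangeAsync(float3* d_f, int start, int n, cudaStream_t stream)
    {
        cudaMemsetAsync(d_f + start, 0, static_cast<size_t>(n) * sizeof(float3), stream);
    }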
@@ -452,6 +477,22 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
+void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    GMX_ASSERT(deviceStream != nullptr,
+               "No stream is valid for clearing forces with given atom locality.");
+
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
+    clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                                 bool useGpuFBufferOps)
 {
@@ -622,6 +663,11 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
     return impl_->copyForcesToGpu(h_f, atomLocality);
 }
 
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
+{
+    return impl_->clearForcesOnGpu(atomLocality);
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
                                                                           bool useGpuFBufferOps)
 {