From c97789fa23c05eabde93a6bacd07fe7d86605878 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Thu, 28 Nov 2019 17:41:43 +0100
Subject: [PATCH] Clear device force buffer if it was reallocated

The force buffer on the device is assumed to be clear at the start of
the step. This was not ensured if the buffer was reallocated after e.g.
domain decomposition.

Change-Id: I4677f71edc9479d972a6d9471b8cbe2c377f8827
---
 src/gromacs/mdtypes/state_propagator_data_gpu.h           | 6 ++++++
 src/gromacs/mdtypes/state_propagator_data_gpu_impl.h      | 6 ++++++
 .../mdtypes/state_propagator_data_gpu_impl_gpu.cpp        | 8 ++++++++
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu                      | 7 ++++++-
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h
index a0d9309f67..c6ae19c589 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -144,6 +144,8 @@ public:
     ~StatePropagatorDataGpu();
 
     /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+     *
+     * Reallocates coordinate, velocity and force buffers on the device.
      *
      * \note
      * The coordinates buffer is (re)allocated, when required by PME, with a padding,
@@ -151,6 +153,10 @@ public:
      * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
      * task uses this padding area.
      *
+     * \note
+     * The force buffer is cleared if its size increases, so that previously unused
+     * memory is cleared before forces are accumulated.
+     *
      *  \param[in] numAtomsLocal  Number of atoms in local domain.
      *  \param[in] numAtomsAll    Total number of atoms to handle.
      */
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
index 70365ac62e..af073b6284 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -137,6 +137,8 @@ public:
 
 
     /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+     *
+     * Reallocates coordinate, velocity and force buffers on the device.
      *
      * \note
      * The coordinates buffer is (re)allocated, when required by PME, with a padding,
@@ -144,6 +146,10 @@ public:
      * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
      * task uses this padding area.
      *
+     * \note
+     * The force buffer is cleared if its size increases, so that previously unused
+     * memory is cleared before forces are accumulated.
+     *
      *  \param[in] numAtomsLocal  Number of atoms in local domain.
      *  \param[in] numAtomsAll    Total number of atoms to handle.
      */
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 2a8a394bda..f42ad7230e 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -194,7 +194,15 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     }
 
     reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+    const int d_fOldCapacity = d_fCapacity_;
     reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+    // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
+    // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+    // since the force buffer ops are not implemented in OpenCL.
+    if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+    {
+        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+    }
 }
 
 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 45963daba6..04d0dfd383 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -875,7 +875,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
+ *
+ * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
+ *       Local stream. Hence, if accumulateForce is true, NonLocal stream should start accumulating
+ *       forces only after Local stream has already done so.
+ */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
                                DeviceBuffer<float>                        totalForcesDevice,
                                gmx_nbnxn_gpu_t*                           nb,
-- 
2.22.0