Clear device force buffer if it was reallocated

author Artem Zhmurov <zhmurov@gmail.com>

Thu, 28 Nov 2019 16:41:43 +0000 (17:41 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 28 Nov 2019 20:48:15 +0000 (21:48 +0100)
author Artem Zhmurov <zhmurov@gmail.com>
Thu, 28 Nov 2019 16:41:43 +0000 (17:41 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 28 Nov 2019 20:48:15 +0000 (21:48 +0100)
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index a0d9309f6714ec14ad949b613c54441e3a8e98e2..c6ae19c589c0101759ddd7b135f2161f282872e8 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -144,6 +144,8 @@ public:
      ~StatePropagatorDataGpu();
  
      /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+     *
+     * Reallocates the coordinate, velocity and force buffers on the device.
       *
       * \note
       * The coordinates buffer is (re)allocated, when required by PME, with a padding,
@@ -151,6 +153,10 @@ public:
       * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
       * task uses this padding area.
       *
+     * \note
+     * The force buffer is cleared if its size increases, so that previously unused
+     * memory is cleared before forces are accumulated.
+     *
       *  \param[in] numAtomsLocal  Number of atoms in local domain.
       *  \param[in] numAtomsAll    Total number of atoms to handle.
       */
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index 70365ac62eba349060f56b5f89c8970cec324f5b..af073b6284dde35d568465192004b375be8b2dd0 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -137,6 +137,8 @@ public:
  
  
      /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+     *
+     * Reallocates the coordinate, velocity and force buffers on the device.
       *
       * \note
       * The coordinates buffer is (re)allocated, when required by PME, with a padding,
@@ -144,6 +146,10 @@ public:
       * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
       * task uses this padding area.
       *
+     * \note
+     * The force buffer is cleared if its size increases, so that previously unused
+     * memory is cleared before forces are accumulated.
+     *
       *  \param[in] numAtomsLocal  Number of atoms in local domain.
       *  \param[in] numAtomsAll    Total number of atoms to handle.
       */
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 2a8a394bdaf57232db08f773f39fb5e996ea4ff3..f42ad7230e9b220eaca5be3cae2ebb401fa8390f 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -194,7 +194,15 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
      }
  
      reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+    const int d_fOldCapacity = d_fCapacity_;
      reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+    // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
+    // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+    // since the force buffer ops are not implemented in OpenCL.
+    if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+    {
+        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+    }
  }
  
  std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 45963daba6f4fca5121086d82501cdc7a93aaa42..04d0dfd3838fa29fd09171261ea4f42c5cd0b08d 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -875,7 +875,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
      nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
  }
  
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
+ *
+ * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
+ *       the Local stream. Hence, if accumulateForce is true, the NonLocal stream should start
+ *       accumulating forces only after the Local stream has already done so.
+ */
  void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
                                 DeviceBuffer<float>                        totalForcesDevice,
                                 gmx_nbnxn_gpu_t*                           nb,
author	Artem Zhmurov <zhmurov@gmail.com>
	Thu, 28 Nov 2019 16:41:43 +0000 (17:41 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 28 Nov 2019 20:48:15 +0000 (21:48 +0100)
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history