~StatePropagatorDataGpu();
/*! \brief Set the ranges for local and non-local atoms and reallocate buffers.
+ *
+ * Reallocates coordinate, velocity and force buffers on the device.
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding.
* The clearing of this padding region is scheduled in the \p pmeStream_ (unlike
* the coordinates H2D) as only the PME task uses this padding area.
*
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
/*! \brief Set the ranges for local and non-local atoms and reallocate buffers.
+ *
+ * Reallocates coordinate, velocity and force buffers on the device.
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding.
* The clearing of this padding region is scheduled in the \p pmeStream_ (unlike
* the coordinates H2D) as only the PME task uses this padding area.
*
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
}
reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+ const int d_fOldCapacity = d_fCapacity_;
reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+ // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
+ // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+ // since the force buffer ops are not implemented in OpenCL.
+ if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+ {
+ clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+ }
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
+ *
+ * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
+ * the Local stream. Hence, if accumulateForce is true, the NonLocal stream should start accumulating
+ * forces only after the Local stream has already done so.
+ */
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
DeviceBuffer<float> totalForcesDevice,
gmx_nbnxn_gpu_t* nb,