The force buffer on the device is assumed to be clear at the start of
the step. This was not ensured if the buffer was reallocated, e.g. after
domain decomposition.
Change-Id: I4677f71edc9479d972a6d9471b8cbe2c377f8827
~StatePropagatorDataGpu();
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
~StatePropagatorDataGpu();
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+ * Reallocates coordinate, velocities and force buffers on the device.
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding,
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding,
* is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
* task uses this padding area.
*
* is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
* task uses this padding area.
*
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+ * Reallocates coordinate, velocities and force buffers on the device.
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding,
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding,
* is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
* task uses this padding area.
*
* is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
* task uses this padding area.
*
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
}
reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
}
reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+ const int d_fOldCapacity = d_fCapacity_;
reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+ // Clearing of the forces can be done in the local stream since the nonlocal stream cannot
+ // reach the force accumulation stage before syncing with the local stream. Only done in CUDA,
+ // since the force buffer ops are not implemented in OpenCL.
+ if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+ {
+ clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+ }
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
+ *
+ * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
+ * the Local stream. Hence, if accumulateForce is true, the NonLocal stream should start
+ * accumulating forces only after the Local stream has already done so.
+ */
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
DeviceBuffer<float> totalForcesDevice,
gmx_nbnxn_gpu_t* nb,
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
DeviceBuffer<float> totalForcesDevice,
gmx_nbnxn_gpu_t* nb,