* after the local coordinates buffer operations (where the
* coordinates are copied to the device and hence the \c
* coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
- * will be performed in \c streamNonLocal (also potentally
- * with buffer clearing in \c streamLocal)and the \c
+ * will be performed in \c streamNonLocal and the \c
* communicateHaloForces method must be called after the
* non-local buffer operations, after the local force buffer
* has been copied to the GPU (if CPU forces are present), and
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
// activities
if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
{
- if (!accumulateForces)
- {
- // Clear local portion of force array (in local stream)
- cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream());
- }
-
// ensure non-local stream waits for local stream, due to dependence on
// the previous H2D copy of CPU forces (if accumulateForces is true)
- // or the above clearing.
- // TODO remove this dependency on localStream - edmine Issue #3093
+ // or local force clearing.
GpuEventSynchronizer eventLocal;
eventLocal.markEvent(localStream_);
eventLocal.enqueueWaitEvent(nonLocalStream_);
if (stepWork.useGpuFHalo)
{
- communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
+ // If there exist CPU forces, data from halo exchange should accumulate into these
+ bool accumulateForces = domainWork.haveCpuLocalForceWork;
+ if (!accumulateForces)
+ {
+ // Force halo exchange will set a subset of local atoms with remote non-local data
+ // First clear local portion of force array, so that untouched atoms are zero
+ stateGpu->clearForcesOnGpu(AtomLocality::Local);
+ }
+ communicateGpuHaloForces(*cr, accumulateForces);
}
else
{
*/
void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
+ /*! \brief Clear forces in the GPU memory.
+ *
+ * \param[in] atomLocality Locality of the particles to clear.
+ */
+ void clearForcesOnGpu(AtomLocality atomLocality);
+
/*! \brief Get the event synchronizer for the forces ready on device.
*
* Returns either of the event synchronizers, depending on the offload scenario
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
"GPU implementation.");
}
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub method from GPU state propagator data was called instead of one from "
+ "GPU implementation.");
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality /* atomLocality */,
bool /* useGpuFBufferOps */)
{
*/
void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec> h_f, AtomLocality atomLocality);
+ /*! \brief Clear forces in the GPU memory.
+ *
+ * \param[in] atomLocality Locality of the particles to clear.
+ */
+ void clearForcesOnGpu(AtomLocality atomLocality);
+
/*! \brief Get the event synchronizer for the forces ready on device.
*
* Returns either of the event synchronizers, depending on the offload scenario
int dataSize,
AtomLocality atomLocality,
const DeviceStream& deviceStream);
+
+ /*! \brief Performs the clearing of data in device buffer.
+ *
+ * \todo Template on locality.
+ *
+ * \param[out] d_data Device-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be cleared.
+     * \param[in]  deviceStream GPU stream to execute the clearing in.
+ */
+ void clearOnDevice(DeviceBuffer<RVec> d_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream);
};
} // namespace gmx
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
}
}
+void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer<RVec> d_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream)
+{
+ GMX_UNUSED_VALUE(dataSize);
+
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+
+    GMX_ASSERT(dataSize >= 0, "Trying to clear device buffer before it was allocated.");
+
+ GMX_ASSERT(deviceStream.isValid(), "No stream is valid for clearing with given atom locality.");
+
+ int atomsStartAt, numAtomsToClear;
+ std::tie(atomsStartAt, numAtomsToClear) = getAtomRangesFromAtomLocality(atomLocality);
+
+ if (numAtomsToClear != 0)
+ {
+ GMX_ASSERT(atomsStartAt + numAtomsToClear <= dataSize,
+ "The device allocation is smaller than requested clear range.");
+
+ clearDeviceBufferAsync(&d_data, atomsStartAt, numAtomsToClear, deviceStream);
+ }
+}
+
DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
{
return d_x_;
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
+void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
+{
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+ const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+ GMX_ASSERT(deviceStream != nullptr,
+ "No stream is valid for clearing forces with given atom locality.");
+
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
+ clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
bool useGpuFBufferOps)
{
return impl_->copyForcesToGpu(h_f, atomLocality);
}
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
+{
+ return impl_->clearForcesOnGpu(atomLocality);
+}
+
GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
bool useGpuFBufferOps)
{