From: Alan Gray Date: Fri, 12 Mar 2021 06:52:02 +0000 (+0000) Subject: Move responsibility for GPU force clearing to state propagator X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=c0d936a0458e9e8082a546af3b390b53f9b1ee58;p=alexxy%2Fgromacs.git Move responsibility for GPU force clearing to state propagator Force was previously cleared within GPU force halo exchange which was quite opaque. Now moved into do_force() and done by state propagator, to make schedule clearer. Fixes #3971 and partly addresses #3970 --- diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index 0f6089df2f..adc8d66712 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -74,8 +74,7 @@ public: * after the local coordinates buffer operations (where the * coordinates are copied to the device and hence the \c * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange - * will be performed in \c streamNonLocal (also potentally - * with buffer clearing in \c streamLocal)and the \c + * will be performed in \c streamNonLocal and the \c * communicateHaloForces method must be called after the * non-local buffer operations, after the local force buffer * has been copied to the GPU (if CPU forces are present), and diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 2189f39fe4..ebc98e784f 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -335,16 +335,9 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) // activities if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1))) { - if (!accumulateForces) - { - // Clear local portion of force array (in local stream) - cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_.stream()); - } - // ensure non-local stream waits for local stream, due to dependence on // the previous H2D copy of CPU forces (if accumulateForces is true) - // or the above clearing. - // TODO remove this dependency on localStream - edmine Issue #3093 + // or local force clearing. GpuEventSynchronizer eventLocal; eventLocal.markEvent(localStream_); eventLocal.enqueueWaitEvent(nonLocalStream_); diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 80190c6fa8..1db242af88 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -2039,7 +2039,15 @@ void do_force(FILE* fplog, if (stepWork.useGpuFHalo) { - communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork); + // If there exist CPU forces, data from halo exchange should accumulate into these + bool accumulateForces = domainWork.haveCpuLocalForceWork; + if (!accumulateForces) + { + // Force halo exchange will set a subset of local atoms with remote non-local data + // First clear local portion of force array, so that untouched atoms are zero + stateGpu->clearForcesOnGpu(AtomLocality::Local); + } + communicateGpuHaloForces(*cr, accumulateForces); } else { diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 82b3378551..3f66196509 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -270,6 +270,12 @@ public: */ void copyForcesToGpu(gmx::ArrayRef h_f, AtomLocality atomLocality); + /*! \brief Clear forces in the GPU memory. 
+ *
+ * \param[in] atomLocality Locality of the particles to clear.
+ */
+ void clearForcesOnGpu(AtomLocality atomLocality);
+
 /*! \brief Get the event synchronizer for the forces ready on device.
 *
 * Returns either of the event synchronizers, depending on the offload scenario
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
index 3274d3dd02..f112d1b1cb 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -1,7 +1,7 @@
 /*
 * This file is part of the GROMACS molecular simulation package.
 *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -206,6 +206,13 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality);
+ /*! \brief Clear forces in the GPU memory.
+ *
+ * \param[in] atomLocality Locality of the particles to clear.
+ */
+ void clearForcesOnGpu(AtomLocality atomLocality);
+
 /*! \brief Get the event synchronizer for the forces ready on device.
 *
 * Returns either of the event synchronizers, depending on the offload scenario
@@ -424,6 +430,20 @@ private: int dataSize, AtomLocality atomLocality, const DeviceStream& deviceStream);
+
+ /*! \brief Performs the clearing of data in device buffer.
+ *
+ * \todo Template on locality.
+ *
+ * \param[out] d_data Device-side buffer.
+ * \param[in] dataSize Device-side data allocation size.
+ * \param[in] atomLocality If all, local or non-local ranges should be cleared.
+ * \param[in] deviceStream GPU stream to execute clearing in. 
+ */
+ void clearOnDevice(DeviceBuffer d_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream);
};
} // namespace gmx
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 37d00888d5..16fbe131f8 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -1,7 +1,7 @@
 /*
 * This file is part of the GROMACS molecular simulation package.
 *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
@@ -278,6 +278,31 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat } }
+void StatePropagatorDataGpu::Impl::clearOnDevice(DeviceBuffer d_data,
+ int dataSize,
+ AtomLocality atomLocality,
+ const DeviceStream& deviceStream)
+{
+ GMX_UNUSED_VALUE(dataSize);
+
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+
+ GMX_ASSERT(dataSize >= 0, "Trying to clear device buffer before it was allocated.");
+
+ GMX_ASSERT(deviceStream.isValid(), "No stream is valid for clearing with given atom locality.");
+
+ int atomsStartAt, numAtomsToClear;
+ std::tie(atomsStartAt, numAtomsToClear) = getAtomRangesFromAtomLocality(atomLocality);
+
+ if (numAtomsToClear != 0)
+ {
+ GMX_ASSERT(atomsStartAt + numAtomsToClear <= dataSize,
+ "The device allocation is smaller than requested clear range.");
+
+ clearDeviceBufferAsync(&d_data, atomsStartAt, numAtomsToClear, deviceStream);
+ }
+}
+ DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() { return d_x_; @@ -452,6 +477,22 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRefcopyForcesToGpu(h_f, atomLocality); 
} +void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality) +{ + return impl_->clearForcesOnGpu(atomLocality); +} + GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps) {