From: Andrey Alekseenko Date: Tue, 19 Oct 2021 16:34:43 +0000 (+0200) Subject: Balance event consumption for GPU update code path X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=0504f4b552b9713671975a3ab0864755657bd8b3 Balance event consumption for GPU update code path - Explicit consumption of events when needed. - In some cases, reset the event in case the conditionals for the consumption are too complex. - Added helper functions to StatePropagatorDataGpu. - A special handling for force reduction when it has no atoms. Refs #3988, #4106, #4227. --- diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index 2a3e48c4ad..3de3002b3a 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -139,14 +139,27 @@ void GpuForceReduction::Impl::execute() baseForce_, cellInfo_.d_cell, deviceStream_); - - // Mark that kernel has been launched - if (completionMarker_ != nullptr) + } + else + { + /* In case we have nothing to do, but still have dependencies, we need + * to consume them and mark our own event. + * Happens sometimes in MdrunVsitesTest. + * Issue #3988, #4227. */ + for (auto* synchronizer : dependencyList_) { - completionMarker_->markEvent(deviceStream_); + synchronizer->consume(); } } + /* Mark that kernel has been launched. + * Even if we have no work to do and have not launched the kernel, we still mark the event + * in order to ensure proper marking/consumption balance, see Issue #3988, #4227. */ + if (completionMarker_ != nullptr) + { + completionMarker_->markEvent(deviceStream_); + } + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps); wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index a2ccaed47b..e5c3d9bad4 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1375,6 +1375,21 @@ void do_force(FILE* fplog, { GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null"); stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local); + if (stepWork.doNeighborSearch) + { + /* On NS steps, we skip X buffer ops. So, unless we use PME or direct GPU + * communications, we don't wait for the coordinates on the device, + * and we must consume the event here. + * Issue #3988. */ + const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank; + const bool eventWillBeConsumedByGpuPmePPComm = + (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces) + && pmeSendCoordinatesFromGpu; + if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm) + { + stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local); + } + } } } @@ -2064,6 +2079,10 @@ void do_force(FILE* fplog, if (!stepWork.useGpuFHalo) { + /* We don't explicitly wait for the forces to be reduced on device, + * but wait for them to finish copying to CPU instead. + * So, we manually consume the event, see Issue #3988. */ + stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal); // copy from GPU input for dd_move_f() stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(), AtomLocality::NonLocal); @@ -2271,6 +2290,13 @@ void do_force(FILE* fplog, || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication) || vsite) { + if (stepWork.computeNonbondedForces) + { + /* We have previously issued force reduction on the GPU, but we will + * not use this event, instead relying on the stream being in-order. + * Issue #3988. */ + stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local); + } stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local); stateGpu->waitForcesReadyOnHost(AtomLocality::Local); } diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 0a59a4f253..3898e07d3a 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -1707,6 +1707,8 @@ void gmx::LegacySimulator::do_md() // TODO: The special case of removing CM motion should be dealt more gracefully if (useGpuForUpdate) { + // Issue #3988, #4106. + stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local); stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local); // Here we block until the H2D copy completes because event sync with the // force kernels that use the coordinates on the next steps is not implemented diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index ad07df2c13..4cf4c7998d 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -208,6 +208,22 @@ public: */ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality); + /*! \brief Consume the event for copying coordinates to the device. + * + * Used for manual event consumption. Does nothing except changing the internal event counter. + * + * \param[in] atomLocality Locality of the particles. + */ + void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality); + + /*! \brief Reset the event for copying coordinates to the device. + * + * Used for manual event consumption. Does nothing except resetting the event. + * + * \param[in] atomLocality Locality of the particles. + */ + void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality); + /*! \brief Setter for the event synchronizer for the update is done on th GPU * * \param[in] xUpdatedOnDeviceEvent The event to synchronize the stream coordinates wre updated on device. @@ -302,6 +318,12 @@ public: */ GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality); + /*! \brief Consume the event for when the forces are reduced on the GPU. + * + * \param[in] atomLocality Locality of the particles to wait for. + */ + void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality); + /*! \brief Getter for the event synchronizer for the forces are ready on the GPU. * * \param[in] atomLocality Locality of the particles to wait for. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 618856d0ce..99c456277b 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -119,6 +119,20 @@ void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality /* atomL "GPU implementation."); } +void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */) +{ + GMX_ASSERT(!impl_, + "A CPU stub method from GPU state propagator data was called instead of one from " + "GPU implementation."); +} + +void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */) +{ + GMX_ASSERT(!impl_, + "A CPU stub method from GPU state propagator data was called instead of one from " + "GPU implementation."); +} + void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* /* xUpdatedOnDeviceEvent */) { GMX_ASSERT(!impl_, @@ -223,6 +237,13 @@ GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality /*at return nullptr; } +void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality /*atomLocality*/) +{ + GMX_ASSERT(!impl_, + "A CPU stub method from GPU state propagator data was called instead of one from " + "GPU implementation."); +} + GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /*atomLocality*/) { GMX_ASSERT(!impl_, diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index dbe435d508..d72bc5a0f2 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -192,6 +192,22 @@ public: */ void waitCoordinatesCopiedToDevice(AtomLocality atomLocality); + /*! \brief Consume the event for copying coordinates to the device. + * + * Used for manual event consumption. Does nothing except changing the internal event counter. + * + * \param[in] atomLocality Locality of the particles. + */ + void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality); + + /*! \brief Reset the event for copying coordinates to the device. + * + * Used for manual event consumption. Does nothing except resetting the event. + * + * \param[in] atomLocality Locality of the particles. + */ + void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality); + /*! \brief Setter for the event synchronizer for the update is done on th GPU * * \param[in] xUpdatedOnDeviceEvent The event to synchronize the stream coordinates wre updated on device. @@ -284,6 +300,9 @@ public: */ GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality); + //! \brief Consume the event for when the forces are reduced on device. + void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality); + /*! \brief Getter for the event synchronizer for the forces are ready for GPU update. * * \param[in] atomLocality Locality of the particles to wait for. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 30478ef046..e6db89f1a9 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -372,6 +372,16 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE } else { + if (stepWork.doNeighborSearch && xUpdatedOnDeviceEvent_) + { + /* On search steps, we do not consume the result of the GPU update + * but rather that of a H2D transfer. So, we reset the event triggered after + * update to avoid leaving it unconsumed. + * Unfortunately, we don't always have the event marked either (e.g., on the + * first step) so we just reset it here. + * See Issue #3988. */ + xUpdatedOnDeviceEvent_->reset(); + } return &xReadyOnDevice_[atomLocality]; } } @@ -384,6 +394,18 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality at wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData); } +void StatePropagatorDataGpu::Impl::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality) +{ + GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); + xReadyOnDevice_[atomLocality].consume(); +} + +void StatePropagatorDataGpu::Impl::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality) +{ + GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); + xReadyOnDevice_[atomLocality].reset(); +} + void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent) { GMX_ASSERT(xUpdatedOnDeviceEvent != nullptr, "The event synchronizer can not be nullptr."); @@ -551,6 +573,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocalit return &fReducedOnDevice_[atomLocality]; } +void StatePropagatorDataGpu::Impl::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality) +{ + fReducedOnDevice_[atomLocality].consume(); +} + GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality) { return &fReadyOnDevice_[atomLocality]; @@ -657,6 +684,16 @@ void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLoca return impl_->waitCoordinatesCopiedToDevice(atomLocality); } +void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality) +{ + return impl_->consumeCoordinatesCopiedToDeviceEvent(atomLocality); +} + +void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality) +{ + return impl_->resetCoordinatesCopiedToDeviceEvent(atomLocality); +} + void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent) { impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent); @@ -723,6 +760,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atom return impl_->fReducedOnDevice(atomLocality); } +void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality) +{ + impl_->consumeForcesReducedOnDeviceEvent(atomLocality); +} + GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality) { return impl_->fReadyOnDevice(atomLocality);