From a73c3ec2dd9dd64f0c728b7b1d90ac5bcfb246cc Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Wed, 16 Oct 2019 16:33:31 +0200 Subject: [PATCH] Remove excessive H2D and D2H coordinates copies when update is offloaded The H2D copies are only needed: 1. When update is not ofloaded. 2. At the search steps, after device buffers were reinitialized. The D2H copies are only needed: 1. On the search steps, since the device buffers are reinitialized. 2. If there are CPU consumers, e.g. CPU bondeds. 3. When the energy is computed. 4. When coordinates are needed for output. There are two special cases, when coordinates are needed on host, that dealt with separately: 1. When the PME it tuned. 2. When center of mass motion is removed. The locality of copied atoms when update is offloaded is changed from All to Local in preparation for multi-GPU case. The blocking sync on H2D copy event is moved from UpdateConstraints to StatePropagatorDataGpu. Change-Id: I971a6273b39fa7da07600312c085ce343b5d25ee --- src/gromacs/mdlib/sim_util.cpp | 11 ++++- src/gromacs/mdlib/update_constrain_cuda.h | 7 ---- .../mdlib/update_constrain_cuda_impl.cpp | 5 --- .../mdlib/update_constrain_cuda_impl.cu | 10 ----- .../mdlib/update_constrain_cuda_impl.h | 7 ---- src/gromacs/mdrun/md.cpp | 40 +++++++++++++++---- .../mdtypes/state_propagator_data_gpu.h | 8 ++++ .../state_propagator_data_gpu_impl.cpp | 5 +++ .../mdtypes/state_propagator_data_gpu_impl.h | 8 ++++ .../state_propagator_data_gpu_impl_gpu.cpp | 11 +++++ 10 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index adf6da0e5b..eb2185a6fa 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1037,7 +1037,13 @@ void do_force(FILE *fplog, pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates()); } } - stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local); + // We need to copy coordinates when: + // 1. Update is not offloaded + // 2. The buffers were reinitialized on search step + if (!simulationWork.useGpuUpdate || stepWork.doNeighborSearch) + { + stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local); + } } const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local, @@ -1155,7 +1161,7 @@ void do_force(FILE *fplog, // NS step is also a virial step (on which f buf ops are deactivated). if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) { - GMX_ASSERT(stateGpu, "stateGpu should be valid here"); + GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded"); nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice()); } } @@ -1163,6 +1169,7 @@ void do_force(FILE *fplog, { if (useGpuXBufOps == BufferOpsUseGpu::True) { + GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded"); nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false, stateGpu->getCoordinates(), localXReadyOnDevice); diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h index edad9c6c0a..07add03456 100644 --- a/src/gromacs/mdlib/update_constrain_cuda.h +++ b/src/gromacs/mdlib/update_constrain_cuda.h @@ -140,13 +140,6 @@ class UpdateConstrainCuda */ void setPbc(const t_pbc *pbc); - /*! \brief Blocking wait on the update of coordinates being ready. - * - * \todo Remove when the "stitching" is done. - */ - void waitCoordinatesReadyOnDevice(); - - /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device. */ GpuEventSynchronizer* getCoordinatesReadySync(); diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp index adc765c875..80dadb153c 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp @@ -95,11 +95,6 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc) GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation."); } -void UpdateConstrainCuda::waitCoordinatesReadyOnDevice() -{ - GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation."); -} - GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync() { GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called instead of the correct implementation."); diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu index 2052bccb7c..e06aa7a5eb 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu @@ -172,11 +172,6 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc) settleCuda_->setPbc(pbc); } -void UpdateConstrainCuda::Impl::waitCoordinatesReadyOnDevice() -{ - coordinatesReady_->waitForEvent(); -} - GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync() { return coordinatesReady_; @@ -224,11 +219,6 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc) impl_->setPbc(pbc); } -void UpdateConstrainCuda::waitCoordinatesReadyOnDevice() -{ - impl_->waitCoordinatesReadyOnDevice(); -} - GpuEventSynchronizer* UpdateConstrainCuda::getCoordinatesReadySync() { return impl_->getCoordinatesReadySync(); diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h index ab6e38431e..b52cfce2c7 100644 --- a/src/gromacs/mdlib/update_constrain_cuda_impl.h +++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h @@ -142,13 +142,6 @@ class UpdateConstrainCuda::Impl */ void setPbc(const t_pbc *pbc); - /*! \brief Blocking wait on the update of coordinates being ready. - * - * \todo Remove when the "stitching" is done. - */ - void waitCoordinatesReadyOnDevice(); - - /*! \brief Return the synchronizer associated with the event indicated that the coordinates are ready on the device. */ GpuEventSynchronizer* getCoordinatesReadySync(); diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index b30edb8473..d62fafa249 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -738,6 +738,13 @@ void gmx::LegacySimulator::do_md() if (bPMETune && bNStList) { + // This has to be here because PME load balancing is called so early. + // TODO: Move to after all booleans are defined. + if (useGpuForUpdate && !bFirstStep) + { + stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); + } /* PME grid + cut-off optimization with GPUs or PME nodes */ pme_loadbal_do(pme_loadbal, cr, (mdrunOptions.verbose && MASTER(cr)) ? stderr : nullptr, @@ -795,18 +802,29 @@ void gmx::LegacySimulator::do_md() do_verbose = mdrunOptions.verbose && (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep); - // Copy velocities from the GPU when needed: - // - On search steps to keep copy on host (device buffers are reinitialized). - // - When needed for the output. if (useGpuForUpdate && !bFirstStep) { + // Copy velocities from the GPU when needed: + // - On search steps to keep copy on host (device buffers are reinitialized). + // - When needed for the output. if (bNS || do_per_step(step, ir->nstvout)) { stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local); stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); } - } + // Copy coordinate from the GPU when needed: + // - On search steps to keep copy on host (device buffers are reinitialized). + // - There are CPU bonded forces that need current coordinates + // - When needed for the output. + if (bNS || + (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) || + do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)) + { + stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); + } + } if (bNS && !(bFirstStep && ir->bContinuation)) { @@ -1254,9 +1272,9 @@ void gmx::LegacySimulator::do_md() // Copy data to the GPU after buffers might have being reinitialized stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->copyCoordinatesToGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::Local); } - stateGpu->copyCoordinatesToGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); stateGpu->copyForcesToGpu(ArrayRef(f), StatePropagatorDataGpu::AtomLocality::All); // TODO: Use StepWorkload fields. @@ -1270,7 +1288,6 @@ void gmx::LegacySimulator::do_md() ir->delta_t, true, bCalcVir, shake_vir, doTempCouple, ekind->tcstat, doParrinelloRahman, ir->nstpcouple*ir->delta_t, M); - stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::All); // Copy velocities D2H after update if: // - Globals are computed this step (includes the energy output steps). @@ -1279,10 +1296,10 @@ void gmx::LegacySimulator::do_md() { stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::Local); stateGpu->waitVelocitiesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->copyCoordinatesFromGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitCoordinatesReadyOnHost(StatePropagatorDataGpu::AtomLocality::Local); } - // TODO: replace with stateGpu->waitForCopyCoordinatesFromGpu(...) - integrator->waitCoordinatesReadyOnDevice(); } else { @@ -1423,6 +1440,13 @@ void gmx::LegacySimulator::do_md() { process_and_stopcm_grp(fplog, &vcm, *mdatoms, state->x.rvec_array(), state->v.rvec_array()); inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr); + + // TODO: The special case of removing CM motion should be dealt more gracefully + if (useGpuForUpdate) + { + stateGpu->copyCoordinatesToGpu(ArrayRef(state->x), StatePropagatorDataGpu::AtomLocality::Local); + stateGpu->waitCoordinatesCopiedToDevice(StatePropagatorDataGpu::AtomLocality::Local); + } } } } diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 96066d0ac3..b5bc2699c0 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -215,6 +215,14 @@ class StatePropagatorDataGpu const SimulationWorkload &simulationWork, const StepWorkload &stepWork); + /*! \brief Blocking wait until coordinates are copied to the device. + * + * Synchronizes the stream in which the copy was executed. + * + * \param[in] atomLocality Locality of the particles to wait for. + */ + void waitCoordinatesCopiedToDevice(AtomLocality atomLocality); + /*! \brief Getter for the event synchronizer for the update is done on th GPU * * \returns The event to synchronize the stream coordinates wre updated on device. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index 417d488a80..43d707dc3c 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -104,6 +104,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(A return nullptr; } +void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality /* atomLocality */) +{ + GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); +} + GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from GPU implementation."); diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index 1b687c022e..2b1db0d791 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -200,6 +200,14 @@ class StatePropagatorDataGpu::Impl const SimulationWorkload &simulationWork, const StepWorkload &stepWork); + /*! \brief Blocking wait until coordinates are copied to the device. + * + * Synchronizes the stream in which the copy was executed. + * + * \param[in] atomLocality Locality of the particles to wait for. + */ + void waitCoordinatesCopiedToDevice(AtomLocality atomLocality); + /*! \brief Getter for the event synchronizer for the update is done on th GPU * * \returns The event to synchronize the stream coordinates wre updated on device. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 50294a921c..cca10920d1 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -330,6 +330,12 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE } } +void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality) +{ + GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); + xReadyOnDevice_[atomLocality].waitForEvent(); +} + GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice() { return &xUpdatedOnDevice_; @@ -527,6 +533,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(A return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork); } +void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality) +{ + return impl_->waitCoordinatesCopiedToDevice(atomLocality); +} + GpuEventSynchronizer* StatePropagatorDataGpu::xUpdatedOnDevice() { return impl_->xUpdatedOnDevice(); -- 2.22.0