From 3b46d7d65c19a7a156f66f9eb5d0b85bf10c134b Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Wed, 4 Nov 2020 18:56:03 +0000
Subject: [PATCH] Allow GPU update without GPU DD

---
 admin/gitlab-ci/gromacs.gitlab-ci.yml         | 23 ++++++
 docs/release-notes/2021/major/performance.rst |  7 ++
 src/gromacs/mdlib/sim_util.cpp                | 74 +++++++++----------
 src/gromacs/mdrun/md.cpp                      | 11 ++-
 src/gromacs/taskassignment/decidegpuusage.cpp | 32 +++-----
 5 files changed, 85 insertions(+), 62 deletions(-)

diff --git a/admin/gitlab-ci/gromacs.gitlab-ci.yml b/admin/gitlab-ci/gromacs.gitlab-ci.yml
index b38de61e53..a4deaa01e7 100644
--- a/admin/gitlab-ci/gromacs.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.gitlab-ci.yml
@@ -1015,6 +1015,29 @@ gromacs:gcc-8-cuda-11.0:regressiontest-gpucommupd-tMPI:
       when: always
       expire_in: 1 week
 
+gromacs:gcc-8-cuda-11.0:regressiontest-upd-tMPI:
+  extends:
+    - .gromacs:base:regressiontest
+    - .rules:post-merge-acceptance
+  image: gromacs/cmake-3.15.7-gcc-8-cuda-11.0-nvidiaopencl-clfft-openmpi-master
+  variables:
+    KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+    KUBERNETES_EXTENDED_RESOURCE_LIMIT: 2
+    REGRESSIONTEST_PME_RANK_NUMBER: 0
+    REGRESSIONTEST_TOTAL_RANK_NUMBER: 4
+    REGRESSIONTEST_OMP_RANK_NUMBER: 1
+    GMX_FORCE_UPDATE_DEFAULT_GPU: 1
+  tags:
+    - k8s-scilifelab
+  needs:
+    - job: gromacs:gcc-8-cuda-11.0:build
+    - job: regressiontests:prepare
+  artifacts:
+    paths:
+      - regressiontests
+    when: always
+    expire_in: 1 week
+
 gromacs:gcc-8-cuda-11.0:regressiontest-gpucommupd-MPI:
   extends:
     - .gromacs:base:regressiontest
diff --git a/docs/release-notes/2021/major/performance.rst b/docs/release-notes/2021/major/performance.rst
index b94c016d48..85b08653c8 100644
--- a/docs/release-notes/2021/major/performance.rst
+++ b/docs/release-notes/2021/major/performance.rst
@@ -32,3 +32,10 @@ CPU SIMD accelerated implementation of harmonic bonds
 SIMD acceleration for bonds slightly improves performance for systems with
 H-bonds only constrained or no constraints. This gives a significant improvement
 with multiple time stepping.
+
+Allow offloading GPU update and constraints without direct GPU communication
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Domain-decomposition and separate-PME-rank parallel runs can now offload
+update and constraints to a GPU with CUDA without also requiring the
+(experimental) direct GPU communication features to be enabled.
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index e788c6b66b..18606f120f 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1213,6 +1213,30 @@ void do_force(FILE* fplog,
                       AtomLocality::Local, simulationWork, stepWork)
                     : nullptr;
 
+    // Copy coordinates from the GPU if update is on the GPU and there
+    // are forces to be computed on the CPU, or for the computation of
+    // virial, or if host-side data will be transferred from this task
+    // to a remote task for halo exchange or PME-PP communication. At
+    // search steps the current coordinates are already on the host,
+    // hence the copy is not needed.
+    const bool haveHostPmePpComms =
+            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
+
+    GMX_ASSERT(simulationWork.useGpuHaloExchange
+                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
+               "The GPU halo exchange is active, but it has not been constructed.");
+    const bool haveHostHaloExchangeComms =
+            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
+    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
+            || haveHostPmePpComms || haveHostHaloExchangeComms))
+    {
+        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+        haveCopiedXFromGpu = true;
+    }
+
     // If coordinates are to be sent to PME task from CPU memory, perform that send here.
     // Otherwise the send will occur after H2D coordinate transfer.
     if (GMX_MPI && !thisRankHasDuty(cr, DUTY_PME) && !pmeSendCoordinatesFromGpu && stepWork.computeSlowForces)
@@ -1220,11 +1244,7 @@ void do_force(FILE* fplog,
         /* Send particle coordinates to the pme nodes */
         if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate)
         {
-            GMX_RELEASE_ASSERT(false,
-                               "GPU update and separate PME ranks are only supported with GPU "
-                               "direct communication!");
-            // TODO: when this code-path becomes supported add:
-            // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
         }
 
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
@@ -1260,31 +1280,6 @@ void do_force(FILE* fplog,
         }
     }
 
-    // Copy coordinate from the GPU if update is on the GPU and there
-    // are forces to be computed on the CPU, or for the computation of
-    // virial, or if host-side data will be transferred from this task
-    // to a remote task for halo exchange or PME-PP communication. At
-    // search steps the current coordinates are already on the host,
-    // hence copy is not needed.
-    const bool haveHostPmePpComms =
-            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
-
-    GMX_ASSERT(simulationWork.useGpuHaloExchange
-                       == ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())),
-               "The GPU halo exchange is active, but it has not been constructed.");
-    const bool haveHostHaloExchangeComms =
-            havePPDomainDecomposition(cr) && !simulationWork.useGpuHaloExchange;
-
-    bool gmx_used_in_debug haveCopiedXFromGpu = false;
-    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
-            || haveHostPmePpComms || haveHostHaloExchangeComms))
-    {
-        GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
-        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        haveCopiedXFromGpu = true;
-    }
-
     // If coordinates are to be sent to PME task from GPU memory, perform that send here.
     // Otherwise the send will occur before the H2D coordinate transfer.
     if (!thisRankHasDuty(cr, DUTY_PME) && pmeSendCoordinatesFromGpu)
@@ -1495,10 +1490,12 @@ void do_force(FILE* fplog,
         }
         else
         {
-            // Note: GPU update + DD without direct communication is not supported,
-            // a waitCoordinatesReadyOnHost() should be issued if it will be.
-            GMX_ASSERT(!simulationWork.useGpuUpdate,
-                       "GPU update is not supported with CPU halo exchange");
+            if (simulationWork.useGpuUpdate)
+            {
+                GMX_ASSERT(haveCopiedXFromGpu,
+                           "a wait should only be triggered if copy has been scheduled");
+                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            }
             dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
         }
@@ -1978,10 +1975,10 @@ void do_force(FILE* fplog,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
-    // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops
+    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with below default-path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && stepWork.computeSlowForces
-        && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
+        && simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -2039,7 +2036,8 @@ void do_force(FILE* fplog,
             // copy call done in sim_utils(...) for the output.
            // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
            // they should not be copied in do_md(...) for the output.
-            if (!simulationWork.useGpuUpdate || vsite)
+            if (!simulationWork.useGpuUpdate
+                || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && haveHostPmePpComms) || vsite)
             {
                 stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
@@ -2076,7 +2074,7 @@ void do_force(FILE* fplog,
 
     // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
     if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
-        && !simulationWork.useGpuUpdate && stepWork.computeSlowForces)
+        && stepWork.computeSlowForces)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index c20574e37f..f5c0fd393e 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -1267,9 +1267,16 @@ void gmx::LegacySimulator::do_md()
                     stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                 }
 
-                // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
-                if (!runScheduleWork->stepWork.useGpuFBufferOps)
+                if (simulationWork.useGpuPme && !runScheduleWork->simulationWork.useGpuPmePpCommunication
+                    && !thisRankHasDuty(cr, DUTY_PME))
                 {
+                    // The PME forces were received on the host, so they have to be copied
+                    stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::All);
+                }
+                else if (!runScheduleWork->stepWork.useGpuFBufferOps)
+                {
+                    // The buffer ops were not offloaded this step, so the forces are on the
+                    // host and have to be copied
                     stateGpu->copyForcesToGpu(f.view().force(), AtomLocality::Local);
                 }
 
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index 91f0eb4720..e22b51a29c 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -561,26 +561,19 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecom
 
     if (isDomainDecomposition)
     {
-        if (!devFlags.enableGpuHaloExchange)
+        if (hasAnyConstraints && !useUpdateGroups)
         {
-            errorMessage += "Domain decomposition without GPU halo exchange is not supported.\n ";
+            errorMessage +=
+                    "Domain decomposition is only supported with constraints when update "
+                    "groups "
+                    "are used. This means constraining all bonds is not supported, except for "
+                    "small molecules, and box sizes close to half the pair-list cutoff are not "
+                    "supported.\n ";
         }
-        else
-        {
-            if (hasAnyConstraints && !useUpdateGroups)
-            {
-                errorMessage +=
-                        "Domain decomposition is only supported with constraints when update "
-                        "groups "
-                        "are used. This means constraining all bonds is not supported, except for "
-                        "small molecules, and box sizes close to half the pair-list cutoff are not "
-                        "supported.\n ";
-            }
 
-            if (pmeUsesCpu)
-            {
-                errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
-            }
+        if (pmeUsesCpu)
+        {
+            errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
         }
     }
 
@@ -590,11 +583,6 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecom
         {
             errorMessage += "With separate PME rank(s), PME must run fully on the GPU.\n";
         }
-
-        if (!devFlags.enableGpuPmePPComm)
-        {
-            errorMessage += "With separate PME rank(s), PME must use direct communication.\n";
-        }
     }
 
     if (inputrec.useMts)
-- 
2.22.0
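
The sim_util.cpp hunks above are the heart of the change: with GPU update enabled, the
local coordinates are now copied back to the host whenever any host-side consumer needs
them, instead of the DD and separate-PME-rank paths being rejected outright. Below is a
minimal standalone sketch of that decision logic; plain bools stand in for the GROMACS
simulationWork/stepWork/domainWork flags, and none of the names here are the real
GROMACS API.

#include <cstdio>

// Simplified stand-ins for the flags consulted in do_force().
struct Flags
{
    bool useGpuUpdate;              // update/constraints run on the GPU
    bool doNeighborSearch;          // search step: coordinates already on the host
    bool haveCpuLocalForceWork;     // some local force work still runs on the CPU
    bool computeVirial;             // virial computation needs host coordinates
    bool haveHostPmePpComms;        // PME-PP transfer staged through host memory
    bool haveHostHaloExchangeComms; // halo exchange staged through host memory
};

// Mirrors the condition guarding stateGpu->copyCoordinatesFromGpu() in the patch.
static bool needCoordinatesOnHost(const Flags& f)
{
    return f.useGpuUpdate && !f.doNeighborSearch
           && (f.haveCpuLocalForceWork || f.computeVirial || f.haveHostPmePpComms
               || f.haveHostHaloExchangeComms);
}

int main()
{
    // DD run with CPU halo exchange and GPU update: previously asserted out,
    // now simply triggers the device-to-host copy.
    const Flags ddCpuHalo = { true, false, false, false, false, true };
    // Fully GPU-resident step: no host-side consumer, so no copy is issued.
    const Flags gpuResident = { true, false, false, false, false, false };

    std::printf("DD + CPU halo exchange: copy = %d\n", needCoordinatesOnHost(ddCpuHalo));
    std::printf("GPU-resident step:      copy = %d\n", needCoordinatesOnHost(gpuResident));
    return 0;
}

The same flags then gate the matching waitCoordinatesReadyOnHost() calls before
dd_move_x() and gmx_pme_send_coordinates(), so the D2H copy is scheduled once and
waited on only by the consumers that actually read host coordinates.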