From f8a8252cd2a4b7760167cbbce3dc3474ae1534e1 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 5 Dec 2019 14:49:25 +0100 Subject: [PATCH] Allow using GPU update with DD and update groups The GPU update can now be enabled for the supported DD cases with the GMX_FORCE_UPDATE_DEFAULT_GPU environment variable. Also added the checks on whether the SHAKE algorithm was requested, since SHAKE is not supported by the GPU update. Refs. #3226, #3163. Change-Id: I57e3ad3b8a571ec244989e888afd5cfcbaf9b75e --- admin/builds/gpuupdate-matrix.txt | 21 +++----- src/gromacs/domdec/domdec.cpp | 5 ++ src/gromacs/domdec/domdec.h | 3 ++ src/gromacs/mdlib/sim_util.cpp | 53 +++++++++++-------- src/gromacs/mdrun/md.cpp | 9 +++- src/gromacs/mdrun/runner.cpp | 42 ++++++++++----- src/gromacs/taskassignment/decidegpuusage.cpp | 22 ++++++-- src/gromacs/taskassignment/decidegpuusage.h | 28 +++++----- 8 files changed, 117 insertions(+), 66 deletions(-) diff --git a/admin/builds/gpuupdate-matrix.txt b/admin/builds/gpuupdate-matrix.txt index 5fdbc2ed0f..cd0c31b09e 100644 --- a/admin/builds/gpuupdate-matrix.txt +++ b/admin/builds/gpuupdate-matrix.txt @@ -1,26 +1,19 @@ # This matrix is intended to permit Jenkins on-demand testing -# of code hidden behind the GMX_USE_BUFFER_OPS feature flag -# during development. When the feature flag is removed, the -# normal test matrices will be adapted to cover this code path. +# of GPU update code path during development. # # Comment line(s) preceding each configuration document the main # intent behind that configuration, so that we can correctly judge # whether to preserve that during maintenance decisions. -# -# Both configurations currently target bs_nix1204, for better load -# balance with pre-submit matrix, which makes heavier use of -# bs_nix1310 agent.
-# Test newest gcc supported by newest CUDA at time of release -# Test thread-MPI with CUDA -# Test GPU update-constraints features in the above combination +# Test GPU update-constraints features on a single PP+PME rank gcc-8 gpuhw=nvidia nranks=1 gpu_id=1 cuda-10.1 thread-mpi openmp cmake-3.10.0 release-with-assert simd=avx2_256 hwloc libhwloc-2.0.4 gpuupdate -# Test CUDA build on a agent with no CUDA devices -# Test without TNG support -# Test GPU update-constraints features in the above combination +# Test GPU update-constraints features in a CUDA build without CUDA devices gcc-7 gpuhw=none cuda-10.0 openmp no-tng release-with-assert gpuupdate -# Test OpenCL build with gpudev features # Test GPU update-constraints on the OpenCL path where it is unsupported clang-8 openmp gpuhw=amd opencl-1.2 clFFT-2.14 simd=None gpuupdate + +# Test GPU update-constraints features with multiple PP ranks and one PME rank +# Note: this should fall back correctly to the CPU codepath +gcc-5 gpuhw=nvidia cuda-9.0 cmake-3.9.6 thread-mpi npme=1 nranks=3 release-with-assert gpuupdate diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 2f931240e4..de3fc3c4b3 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -1032,6 +1032,11 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd) return dd.comm->systemInfo.haveSplitConstraints; } +bool ddUsesUpdateGroups(const gmx_domdec_t& dd) +{ + return dd.comm->systemInfo.useUpdateGroups; +} + void dd_cycles_add(const gmx_domdec_t* dd, float cycles, int ddCycl) { /* Note that the cycles value can be incorrect, either 0 or some diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index 61691d63e3..9fb46bc060 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -147,6 +147,9 @@ int dd_pme_maxshift_y(const gmx_domdec_t* dd); /*! 
\brief Return whether constraints, not including settles, cross domain boundaries */ bool ddHaveSplitConstraints(const gmx_domdec_t& dd); +/*! \brief Return whether update groups are used */ +bool ddUsesUpdateGroups(const gmx_domdec_t& dd); + /*! \brief Return whether the DD has a single dimension with a single pulse * * The GPU halo exchange code requires a 1D single-pulse DD, and its diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index c6adc7b73c..832ff21620 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1032,11 +1032,31 @@ void do_force(FILE* fplog, } } - // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on - // the CPU, or for the computation of virial. At search steps the current coordinates are - // already on the host, hence copy is not needed. + // TODO Update this comment when introducing SimulationWorkload + // + // The conditions for gpuHaloExchange e.g. using GPU buffer + // operations were checked before construction, so here we can + // just use it and assert upon any conditions. + gmx::GpuHaloExchange* gpuHaloExchange = + (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr); + const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr); + GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True), + "Must use coordinate buffer ops with GPU halo exchange"); + const bool useGpuForcesHaloExchange = + ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True); + + // Copy coordinate from the GPU if update is on the GPU and there + // are forces to be computed on the CPU, or for the computation of + // virial, or if host-side data will be transferred from this task + // to a remote task for halo exchange or PME-PP communication. At + // search steps the current coordinates are already on the host, + // hence copy is not needed. 
+ const bool haveHostPmePpComms = + !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication; + const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication; if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch - && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial)) + && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial + || haveHostPmePpComms || haveHostHaloExchangeComms)) { stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local); stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local); @@ -1223,19 +1243,6 @@ void do_force(FILE* fplog, launchPmeGpuFftAndGather(fr->pmedata, wcycle); } - // TODO Update this comment when introducing SimulationWorkload - // - // The conditions for gpuHaloExchange e.g. using GPU buffer - // operations were checked before construction, so here we can - // just use it and assert upon any conditions. - gmx::GpuHaloExchange* gpuHaloExchange = - (havePPDomainDecomposition(cr) ? 
cr->dd->gpuHaloExchange.get() : nullptr); - const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr); - GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True), - "Must use coordinate buffer ops with GPU halo exchange"); - const bool useGpuForcesHaloExchange = - ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True); - /* Communicate coordinates and sum dipole if necessary + do non-local pair search */ if (havePPDomainDecomposition(cr)) @@ -1638,9 +1645,10 @@ void do_force(FILE* fplog, wallcycle_stop(wcycle, ewcFORCE); } - // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops - // TODO refoactor this and unify with below default-path call to the same function - if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication) + // If on GPU PME-PP comms or GPU update path, receive forces from PME before GPU buffer ops + // TODO refactor this and unify with below default-path call to the same function + if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) + && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate)) { /* In case of node-splitting, the PP nodes receive the long-range * forces, virial and energy from the PME nodes here. @@ -1757,8 +1765,9 @@ void do_force(FILE* fplog, } } - // TODO refoactor this and unify with above PME-PP GPU communication path call to the same function - if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication) + // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function + if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication + && !simulationWork.useGpuUpdate) { /* In case of node-splitting, the PP nodes receive the long-range * forces, virial and energy from the PME nodes here. 
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index a5cdccce96..ff3d0f88b8 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -334,8 +334,13 @@ void gmx::LegacySimulator::do_md() if (useGpuForUpdate) { - GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr), - "Domain decomposition is not supported with the GPU update.\n"); + GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr + || constr->numConstraintsTotal() == 0, + "Constraints in domain decomposition are only supported with update " + "groups if using GPU update.\n"); + GMX_RELEASE_ASSERT(ir->eConstrAlg != econtSHAKE || constr == nullptr + || constr->numConstraintsTotal() == 0, + "SHAKE is not supported with GPU update."); GMX_RELEASE_ASSERT(useGpuForPme || (useGpuForNonbonded && simulationWork.useGpuBufferOps), "Either PME or short-ranged non-bonded interaction tasks must run on " "the GPU to use GPU update.\n"); diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 9b30f4d60c..2a1a0be8b0 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -99,6 +99,7 @@ #include "gromacs/mdlib/qmmm.h" #include "gromacs/mdlib/sighandler.h" #include "gromacs/mdlib/stophandler.h" +#include "gromacs/mdlib/updategroups.h" #include "gromacs/mdrun/mdmodules.h" #include "gromacs/mdrun/simulationcontext.h" #include "gromacs/mdrunutility/handlerestart.h" @@ -175,6 +176,8 @@ struct DevelopmentFeatureFlags //! True if the Buffer ops development feature is enabled // TODO: when the trigger of the buffer ops offload is fully automated this should go away bool enableGpuBufferOps = false; + //! If true, forces 'mdrun -update auto' default to 'gpu' when running with DD + bool forceGpuUpdateDefaultWithDD = false; //! True if the GPU halo exchange development feature is enabled bool enableGpuHaloExchange = false; //! 
True if the PME PP direct communication GPU development feature is enabled @@ -209,6 +212,7 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md #pragma GCC diagnostic ignored "-Wunused-result" devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded; + devFlags.forceGpuUpdateDefaultWithDD = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr); devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA)); devFlags.enableGpuPmePPComm = @@ -224,6 +228,15 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md "GMX_USE_GPU_BUFFER_OPS environment variable."); } + if (devFlags.forceGpuUpdateDefaultWithDD) + { + GMX_LOG(mdlog.warning) + .asParagraph() + .appendTextFormatted( + "NOTE: This run will default to '-update gpu' as requested by the " + "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable."); + } + if (devFlags.enableGpuHaloExchange) { if (useGpuForNonbonded) @@ -890,19 +903,6 @@ int Mdrunner::mdrunner() const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode); - // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides, - // decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving. - // NOTE: When the simulationWork is constructed, the useGpuForUpdate overrides the devFlags.enableGpuBufferOps. - try - { - useGpuForUpdate = decideWhetherToUseGpuForUpdate( - useDomainDecomposition, useGpuForPme, useGpuForNonbonded, updateTarget, - gpusWereDetected, *inputrec, mtop, doEssentialDynamics, - gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0, doRerun); - } - GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR - - // Build restraints. // TODO: hide restraint implementation details from Mdrunner. 
// There is nothing unique about restraints at this point as far as the @@ -1178,6 +1178,22 @@ int Mdrunner::mdrunner() // Note that local state still does not exist yet. } + // The GPU update is decided here because we need to know whether the constraints or + // SETTLEs can span across the domain borders (i.e. whether or not update groups are + // defined). This is only known after DD is initialized, hence the decision on using GPU + // update is made this late. + try + { + const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false; + + useGpuForUpdate = decideWhetherToUseGpuForUpdate( + devFlags.forceGpuUpdateDefaultWithDD, useDomainDecomposition, useUpdateGroups, + useGpuForPme, useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop, + doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, + replExParams.exchangeInterval > 0, doRerun); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR + if (PAR(cr)) { /* After possible communicator splitting in make_dd_communicators.
diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index b7eff1d3bf..fc1f4faf60 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -489,7 +489,9 @@ bool decideWhetherToUseGpusForBonded(const bool useGpuForNonbonded, return gpusWereDetected && usingOurCpuForPmeOrEwald; } -bool decideWhetherToUseGpuForUpdate(const bool isDomainDecomposition, +bool decideWhetherToUseGpuForUpdate(const bool forceGpuUpdateDefaultWithDD, + const bool isDomainDecomposition, + const bool useUpdateGroups, const bool useGpuForPme, const bool useGpuForNonbonded, const TaskTarget updateTarget, @@ -507,11 +509,20 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecomposition, return false; } + const bool hasAnyConstraints = gmx_mtop_interaction_count(mtop, IF_CONSTRAINT) > 0; + std::string errorMessage; - if (isDomainDecomposition) + if (isDomainDecomposition && hasAnyConstraints && !useUpdateGroups) + { + errorMessage += + "Domain decomposition is only supported with constraints when update groups are " + "used. This means constraining all bonds is not supported, except for small " + "molecules, and box sizes close to half the pair-list cutoff are not supported.\n "; + } + if (inputrec.eConstrAlg == econtSHAKE && hasAnyConstraints && gmx_mtop_ftype_count(mtop, F_CONSTR) > 0) { - errorMessage += "Domain decomposition is not supported.\n"; + errorMessage += "SHAKE constraints are not supported.\n"; } // Using the GPU-version of update if: // 1. 
PME is on the GPU (there should be a copy of coordinates on GPU for PME spread), or @@ -606,6 +617,11 @@ bool decideWhetherToUseGpuForUpdate(const bool isDomainDecomposition, return false; } + if (isDomainDecomposition) + { + return forceGpuUpdateDefaultWithDD; + } + return true; } diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index ace5f39cc2..b01aa97c42 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -231,23 +231,27 @@ bool decideWhetherToUseGpusForBonded(bool useGpuForNonbonded, /*! \brief Decide whether to use GPU for update. * - * \param[in] isDomainDecomposition Whether there more than one domain. - * \param[in] useGpuForPme Whether GPUs will be used for PME interactions. - * \param[in] useGpuForNonbonded Whether GPUs will be used for nonbonded interactions. - * \param[in] updateTarget User choice for running simulation on GPU. - * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. - * \param[in] inputrec The user input. - * \param[in] mtop The global topology. - * \param[in] useEssentialDynamics If essential dynamics is active. - * \param[in] doOrientationRestraints If orientation restraints are enabled. - * \param[in] useReplicaExchange If this is a REMD simulation. - * \param[in] doRerun It this is a rerun. + * \param[in] forceGpuUpdateDefaultWithDD If update should run on GPU with DD by default. + * \param[in] isDomainDecomposition Whether there is more than one domain. + * \param[in] useUpdateGroups Whether update groups are used. + * \param[in] useGpuForPme Whether GPUs will be used for PME interactions. + * \param[in] useGpuForNonbonded Whether GPUs will be used for nonbonded interactions. + * \param[in] updateTarget User choice for running simulation on GPU. + * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. + * \param[in] inputrec The user input.
+ * \param[in] mtop The global topology. + * \param[in] useEssentialDynamics If essential dynamics is active. + * \param[in] doOrientationRestraints If orientation restraints are enabled. + * \param[in] useReplicaExchange If this is a REMD simulation. + * \param[in] doRerun If this is a rerun. * * \returns Whether complete simulation can be run on GPU. * \throws std::bad_alloc If out of memory * InconsistentInputError If the user requirements are inconsistent. */ -bool decideWhetherToUseGpuForUpdate(bool isDomainDecomposition, +bool decideWhetherToUseGpuForUpdate(bool forceGpuUpdateDefaultWithDD, + bool isDomainDecomposition, + bool useUpdateGroups, bool useGpuForPme, bool useGpuForNonbonded, TaskTarget updateTarget, -- 2.22.0