From badb7d80fa41ad9a363c1eb0bcfea72b6d3722e8 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Mon, 30 Dec 2019 12:17:20 +0100 Subject: [PATCH] Fix gpuupdate matrix 1. Set GPU direct communications environment variables in update matrix to allow for multi-rank testing. 2. Fall back to the CPU update if PME is not fully offloaded in the DD case and with a PME-only rank (required for GPU direct communications). Change-Id: I66283a446b0cc58fd0165226b01bf58ce9ec90f1 --- admin/builds/gromacs.py | 2 ++ src/gromacs/mdlib/sim_util.cpp | 2 +- src/gromacs/mdrun/runner.cpp | 7 ++++--- src/gromacs/taskassignment/decidegpuusage.cpp | 10 ++++++++-- src/gromacs/taskassignment/decidegpuusage.h | 7 +++++-- 5 files changed, 20 insertions(+), 8 deletions(-) diff --git a/admin/builds/gromacs.py b/admin/builds/gromacs.py index e429c623bc..a734f2cd8b 100644 --- a/admin/builds/gromacs.py +++ b/admin/builds/gromacs.py @@ -186,6 +186,8 @@ def do_build(context): # GPU update flag enables GPU update+constraints as well as buffer ops (dependency) if context.opts.gpuupdate: context.env.set_env_var('GMX_FORCE_UPDATE_DEFAULT_GPU', "1") + context.env.set_env_var('GMX_GPU_DD_COMMS', "1") + context.env.set_env_var('GMX_GPU_PME_PP_COMMS', "1") regressiontests_path = context.workspace.get_project_dir(Project.REGRESSIONTESTS) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index c8d3cbb713..91761324e0 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1301,7 +1301,7 @@ void do_force(FILE* fplog, // Note: GPU update + DD without direct communication is not supported, // a waitCoordinatesReadyOnHost() should be issued if it will be. 
GMX_ASSERT(!simulationWork.useGpuUpdate, - "GPU update is not supported with halo exchange"); + "GPU update is not supported with CPU halo exchange"); dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); } diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index f5b9cb1b7a..4eea987110 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1191,9 +1191,10 @@ int Mdrunner::mdrunner() const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false; useGpuForUpdate = decideWhetherToUseGpuForUpdate( - devFlags.forceGpuUpdateDefault, useDomainDecomposition, useUpdateGroups, useGpuForPme, - useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop, doEssentialDynamics, - gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0, doRerun); + devFlags.forceGpuUpdateDefault, useDomainDecomposition, useUpdateGroups, pmeRunMode, + domdecOptions.numPmeRanks > 0, useGpuForNonbonded, updateTarget, gpusWereDetected, + *inputrec, mtop, doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, + replExParams.exchangeInterval > 0, doRerun); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 9b069fd890..3bb192d446 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -492,7 +492,8 @@ bool decideWhetherToUseGpusForBonded(const bool useGpuForNonbonded, bool decideWhetherToUseGpuForUpdate(const bool forceGpuUpdateDefault, const bool isDomainDecomposition, const bool useUpdateGroups, - const bool useGpuForPme, + const PmeRunMode pmeRunMode, + const bool havePmeOnlyRank, const bool useGpuForNonbonded, const TaskTarget updateTarget, const bool gpusWereDetected, @@ -536,11 +537,16 @@ bool decideWhetherToUseGpuForUpdate(const bool forceGpuUpdateDefault, // Using the GPU-version of update if: // 1. 
PME is on the GPU (there should be a copy of coordinates on GPU for PME spread), or // 2. Non-bonded interactions are on the GPU. - if (!(useGpuForPme || useGpuForNonbonded)) + if (pmeRunMode == PmeRunMode::CPU && !useGpuForNonbonded) { errorMessage += "Either PME or short-ranged non-bonded interaction tasks must run on the GPU.\n"; } + // Since only direct GPU communications are supported with GPU update, PME should be fully offloaded in DD and PME only cases. + if (pmeRunMode != PmeRunMode::GPU && (isDomainDecomposition || havePmeOnlyRank)) + { + errorMessage += "PME should run on GPU.\n"; + } if (!gpusWereDetected) { errorMessage += "Compatible GPUs must have been found.\n"; diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index 45ba2d333f..94db669b4f 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -48,6 +48,7 @@ struct gmx_hw_info_t; struct gmx_mtop_t; struct t_inputrec; +enum class PmeRunMode; namespace gmx { @@ -235,7 +236,8 @@ bool decideWhetherToUseGpusForBonded(bool useGpuForNonbonded, * \param[in] forceGpuUpdateDefault If update should run on GPU by default. * \param[in] isDomainDecomposition Whether there more than one domain. * \param[in] useUpdateGroups If the constraints can be split across domains. - * \param[in] useGpuForPme Whether GPUs will be used for PME interactions. + * \param[in] pmeRunMode PME running mode: CPU, GPU or mixed. + * \param[in] havePmeOnlyRank If there is a PME-only rank in the simulation. * \param[in] useGpuForNonbonded Whether GPUs will be used for nonbonded interactions. * \param[in] updateTarget User choice for running simulation on GPU. * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. 
@@ -253,7 +255,8 @@ bool decideWhetherToUseGpusForBonded(bool useGpuForNonbonded, bool decideWhetherToUseGpuForUpdate(bool forceGpuUpdateDefault, bool isDomainDecomposition, bool useUpdateGroups, - bool useGpuForPme, + PmeRunMode pmeRunMode, + bool havePmeOnlyRank, bool useGpuForNonbonded, TaskTarget updateTarget, bool gpusWereDetected, -- 2.22.0