From: Mark Abraham Date: Tue, 5 Dec 2017 09:27:21 +0000 (+1100) Subject: Fix mdrun -nb auto -pme auto when GPUs are absent X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a0d89bf8e412554a58e803715847b0f9dd920886;p=alexxy%2Fgromacs.git Fix mdrun -nb auto -pme auto when GPUs are absent The logic was flawed such that GPUs were "selected" for use even though none had been detected. That led to the GPU behaviour of avoiding using separate PME ranks. Also made a minor fix to the logic for emulation. The new interpretation of mdrun -gpu_id does not need to trigger an error when GPU IDs have been supplied along with the emulation environment variable. Fixes #2315 Change-Id: I68da27c9bfef9f73b9dae4f04f196066d2efb1e2 --- diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 6002bb913b..d5132add21 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -233,11 +233,11 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuForNonbo } bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarget, - const std::vector &gpuIdsToUse, const std::vector &userGpuTaskAssignment, const EmulateGpuNonbonded emulateGpuNonbonded, const bool usingVerletScheme, - const bool nonbondedOnGpuIsUseful) + const bool nonbondedOnGpuIsUseful, + const bool gpusWereDetected) { if (nonbondedTarget == TaskTarget::Cpu) { @@ -262,7 +262,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarg ("Nonbonded interactions on the GPU were required, which is inconsistent " "with choosing emulation. Make no more than one of these choices.")); } - if (!gpuIdsToUse.empty() || !userGpuTaskAssignment.empty()) + if (!userGpuTaskAssignment.empty()) { GMX_THROW(InconsistentInputError ("GPU ID usage was specified, as was GPU emulation. 
Make no more than one of these choices.")); @@ -306,13 +306,18 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarg return true; } - // We still don't know whether it is an error if no GPUs are found - // because we don't know the duty of this rank, yet. For example, - // a node with only PME ranks and -pme cpu is OK if there are not - // GPUs. + if (nonbondedTarget == TaskTarget::Gpu) + { + // We still don't know whether it is an error if no GPUs are found + // because we don't know the duty of this rank, yet. For example, + // a node with only PME ranks and -pme cpu is OK if there are not + // GPUs. + return true; + } - // If we get here, then the user permitted or required GPUs. - return true; + // If we get here, then the user permitted GPUs, which we should + // use for nonbonded interactions. + return gpusWereDetected; } bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, @@ -320,7 +325,8 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, const std::vector &userGpuTaskAssignment, const bool canUseGpuForPme, const int numRanksPerSimulation, - const int numPmeRanksPerSimulation) + const int numPmeRanksPerSimulation, + const bool gpusWereDetected) { if (pmeTarget == TaskTarget::Cpu) { @@ -389,11 +395,13 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, return true; } + // If we get here, then the user permitted GPUs. if (numRanksPerSimulation == 1) { - // PME can run well on a single GPU shared with NB when - // there is one rank, so we permit mdrun to try that. - return true; + // PME can run well on a single GPU shared with NB when there + // is one rank, so we permit mdrun to try that if we have + // detected GPUs. 
+ return gpusWereDetected; } // Not enough support for PME on GPUs for anything else diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index 438e00b8a6..7e01c99eab 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -133,22 +133,22 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuFor * consistency checks. * * \param[in] nonbondedTarget The user's choice for mdrun -nb for where to assign short-ranged nonbonded interaction tasks. - * \param[in] gpuIdsToUse The compatible GPUs that the user permitted us to use. * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] emulateGpuNonbonded Whether we will emulate GPU calculation of nonbonded interactions. * \param[in] usingVerletScheme Whether the nonbondeds are using the Verlet scheme. * \param[in] nonbondedOnGpuIsUseful Whether computing nonbonded interactions on a GPU is useful for this calculation. + * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. * * \returns Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs. * * \throws std::bad_alloc If out of memory * InconsistentInputError If the user requirements are inconsistent. */ bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarget, - const std::vector &gpuIdsToUse, const std::vector &userGpuTaskAssignment, const EmulateGpuNonbonded emulateGpuNonbonded, const bool usingVerletScheme, - const bool nonbondedOnGpuIsUseful); + const bool nonbondedOnGpuIsUseful, + const bool gpusWereDetected); /*! \brief Decide whether the simulation will try to run tasks of * different types on GPUs. @@ -172,6 +172,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarg * \param[in] canUseGpuForPme Whether the form of PME chosen can run on a GPU * \param[in] numRanksPerSimulation The number of ranks in each simulation. 
* \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. + * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. * * \returns Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs. * @@ -182,7 +183,8 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, const std::vector &userGpuTaskAssignment, const bool canUseGpuForPme, const int numRanksPerSimulation, - const int numPmeRanksPerSimulation); + const int numPmeRanksPerSimulation, + const bool gpusWereDetected); } diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index d0d7a0cb8b..7dabeb32e7 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -647,12 +647,20 @@ int Mdrunner::mdrunner() bool useGpuForPme = false; try { - useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, + // It's possible that there are different numbers of GPUs on + // different nodes, which is the user's responsibility to + // handle. If unsuitable, we will notice that during task + // assignment. 
+ bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0; + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET, - gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun)); + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun), + gpusWereDetected); auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); - useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, + canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks, + gpusWereDetected); pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU); if ((pmeRunMode == PmeRunMode::GPU) && (pmeFftTarget == TaskTarget::Cpu)) {