From e87a5310c36130ac1d26867f8f7e597dcd9fd513 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Wed, 23 Aug 2017 16:06:44 +0200 Subject: [PATCH] Extend task assignment code Existing behaviour is largely unchanged, apart from some details of how conditions that prevent task assignment are handled, and when. However it is not feasible in the longer term to continue to implement a way for gmx mdrun -gpu_id to imply the thread-MPI rank split, so that is disabled now, along with a useful error message. Instead, for both real and thread MPI, -gpu_id now limits the available GPU IDs (issuing an error if there are any duplicates), somewhat like CUDA_VISIBLE_DEVICES. The new mdrun -gputasks option specifies a full GPU task assignment, and must be accompanied by a choice of ranks and what kind of device recevies tasks of each type. Documentation is updated accordingly. Aspects of the implementation anticipate the extension to support long-ranged PME interactions on GPUs, and others in future, so that the task assignment on a node now takes the form of a container of tasks, potentially of different types, on each rank of the node. A flat vector of ints is no longer sufficient. Errors e.g. from inconsistent user input are now handled with exceptions, so that the runner can take the responsibility of reporting those correctly, rather than always aborting the program at the point where the issue is detected. gmx tune_pme now explicitly only supports the new form of -gpu_id, though it would not be difficult to support -gputasks if there was need. Change-Id: I0c149913bd43418d374171f5f95dad7f25d3cfe4 --- docs/user-guide/environment-variables.rst | 8 +- docs/user-guide/mdrun-features.rst | 2 +- docs/user-guide/mdrun-performance.rst | 61 +-- .../ewald/tests/testhardwarecontexts.cpp | 2 +- src/gromacs/gmxana/gmx_tune_pme.cpp | 106 +++-- src/gromacs/hardware/detecthardware.cpp | 1 - src/gromacs/hardware/hw_info.h | 7 +- src/gromacs/taskassignment/CMakeLists.txt | 5 +- src/gromacs/taskassignment/decidegpuusage.cpp | 398 ++++++++++++++++++ src/gromacs/taskassignment/decidegpuusage.h | 185 ++++++++ .../taskassignment/findallgputasks.cpp | 221 ++++++++++ src/gromacs/taskassignment/findallgputasks.h | 65 +++ src/gromacs/taskassignment/hardwareassign.cpp | 258 ------------ src/gromacs/taskassignment/hardwareassign.h | 125 ------ src/gromacs/taskassignment/reportgpuusage.cpp | 142 +++++++ src/gromacs/taskassignment/reportgpuusage.h | 83 ++++ .../taskassignment/resourcedivision.cpp | 91 ++-- src/gromacs/taskassignment/resourcedivision.h | 4 +- src/gromacs/taskassignment/taskassignment.cpp | 296 +++++++++++++ src/gromacs/taskassignment/taskassignment.h | 123 ++++++ src/gromacs/taskassignment/usergpuids.h | 11 +- src/programs/mdrun/mdrun.cpp | 41 +- src/programs/mdrun/runner.cpp | 361 ++++++++++------ 23 files changed, 1904 insertions(+), 692 deletions(-) create mode 100644 src/gromacs/taskassignment/decidegpuusage.cpp create mode 100644 src/gromacs/taskassignment/decidegpuusage.h create mode 100644 src/gromacs/taskassignment/findallgputasks.cpp create mode 100644 src/gromacs/taskassignment/findallgputasks.h delete mode 100644 src/gromacs/taskassignment/hardwareassign.cpp delete mode 100644 src/gromacs/taskassignment/hardwareassign.h create mode 100644 src/gromacs/taskassignment/reportgpuusage.cpp create mode 100644 src/gromacs/taskassignment/reportgpuusage.h create mode 100644 src/gromacs/taskassignment/taskassignment.cpp create mode 100644 src/gromacs/taskassignment/taskassignment.h diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst index 217629c61c..4d6d780bf2 100644 --- a/docs/user-guide/environment-variables.rst +++ b/docs/user-guide/environment-variables.rst @@ -209,9 +209,15 @@ Performance and Run Control ``GMX_GPU_ID`` set in the same way as ``mdrun -gpu_id``, ``GMX_GPU_ID`` - allows the user to specify different GPU id-s, which can be useful for selecting different + allows the user to specify different GPU IDs for different ranks, which can be useful for selecting different devices on different compute nodes in a cluster. Cannot be used in conjunction with ``mdrun -gpu_id``. +``GMX_GPUTASKS`` + set in the same way as ``mdrun -gputasks``, ``GMX_GPUTASKS`` allows the mapping + of GPU tasks to GPU device IDs to be different on different ranks, if e.g. the MPI + runtime permits this variable to be different for different ranks. Cannot be used + in conjunction with ``mdrun -gputasks``. Has all the same requirements as ``mdrun -gputasks``. + ``GMX_IGNORE_FSYNC_FAILURE_ENV`` allow :ref:`gmx mdrun` to continue even if a file is missing. diff --git a/docs/user-guide/mdrun-features.rst b/docs/user-guide/mdrun-features.rst index f18afcdba1..f4456d5818 100644 --- a/docs/user-guide/mdrun-features.rst +++ b/docs/user-guide/mdrun-features.rst @@ -98,7 +98,7 @@ and output files are found in directories ``a``, ``b``, ``c``, and ``d``. :: - mpirun -np 32 gmx_mpi mdrun -multidir a b c d -gpu_id 0000000011111111 + mpirun -np 32 gmx_mpi mdrun -multidir a b c d -gputasks 0000000011111111 Starts the same multi-simulation as before. On a machine with two physical nodes and two GPUs per node, there will be 16 MPI ranks per diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst index 62d1925409..fcfa4f61a9 100644 --- a/docs/user-guide/mdrun-performance.rst +++ b/docs/user-guide/mdrun-performance.rst @@ -205,12 +205,6 @@ behavior. the total number of OpenMP threads per separate PME ranks. The default, 0, copies the value from ``-ntomp``. -``-gpu_id`` - A string that specifies the ID numbers of the GPUs to be - used by corresponding PP ranks on this node. For example, - "0011" specifies that the lowest two PP ranks use GPU 0, - and the other two use GPU 1. - ``-pin`` Can be set to "auto," "on" or "off" to control whether mdrun will attempt to set the affinity of threads to cores. @@ -253,6 +247,29 @@ behavior. Setting "cpu" requires that no GPU is used. Setting "gpu" requires that a compatible GPU be available and will be used. +``-gpu_id`` + A string that specifies the ID numbers of the GPUs that + are available to be used by ranks on this node. For example, + "12" specifies that the GPUs with IDs 1 and 2 (as reported + by the GPU runtime) can be used by mdrun. This is useful + when sharing a node with other computations, or if a GPU + is best used to support a display. If many GPUs are + present, a comma may be used to separate the IDs, so + "12,13" would make GPUs 12 and 13 available to mdrun. + It could be necessary to use different GPUs on different + nodes of a simulation, in which case the environment + variable ``GMX_GPU_ID`` can be set differently for the ranks + on different nodes to achieve that result. + +``-gputasks`` + A string that specifies the ID numbers of the GPUs to be + used by corresponding GPU tasks on this node. For example, + "0011" specifies that the first two GPU tasks will use GPU 0, + and the other two use GPU 1. When using this option, the + number of ranks must be known to mdrun, as well as where + tasks of different types should be run, such as by using + ``-nb gpu``. + Examples for mdrun on one node ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -298,7 +315,7 @@ CPU cores between them using OpenMP threads. :: - gmx mdrun -ntmpi 4 -gpu_id "1122" + gmx mdrun -ntmpi 4 -nb gpu -gputasks 1122 Starts mdrun using four thread-MPI ranks, and maps them to GPUs with IDs 1 and 2. The CPU cores available will @@ -437,7 +454,7 @@ each. :: - mpirun -np 4 gmx mdrun -ntomp 6 -gpu_id 00 + mpirun -np 4 gmx mdrun -ntomp 6 -nb gpu -gputasks 00 Starts :ref:`mdrun_mpi` on a machine with two nodes, using four total ranks, each rank with six OpenMP threads, @@ -445,7 +462,7 @@ and both ranks on a node sharing GPU with ID 0. :: - mpirun -np 8 gmx mdrun -ntomp 3 -gpu_id 0000 + mpirun -np 8 gmx mdrun -ntomp 3 -gputasks 0000 Using a same/similar hardware as above, starts :ref:`mdrun_mpi` on a machine with two nodes, using @@ -456,21 +473,23 @@ on the same hardware. :: - mpirun -np 20 gmx_mpi mdrun -ntomp 4 -gpu_id 0 + mpirun -np 20 gmx_mpi mdrun -ntomp 4 -gputasks 00 Starts :ref:`mdrun_mpi` with 20 ranks, and assigns the CPU cores evenly across ranks each to one OpenMP thread. This setup is likely to be suitable when there are ten nodes, each with one GPU, and each node -has two sockets. +has two sockets each of four cores. :: - mpirun -np 20 gmx_mpi mdrun -gpu_id 00 + mpirun -np 10 gmx_mpi mdrun -gpu_id 1 Starts :ref:`mdrun_mpi` with 20 ranks, and assigns the CPU cores evenly across ranks each to one OpenMP thread. This setup is likely to be -suitable when there are ten nodes, each with one GPU, and each node -has two sockets. +suitable when there are ten nodes, each with two GPUs, but another +job on each node is using GPU 0. The job scheduler should set the +affinity of threads of both jobs to their allocated cores, or the +performance of mdrun will suffer greatly. :: @@ -478,15 +497,9 @@ has two sockets. Starts :ref:`mdrun_mpi` with 20 ranks. This setup is likely to be suitable when there are ten nodes, each with two -GPUs. - -:: - - mpirun -np 40 gmx_mpi mdrun -gpu_id 0011 - -Starts :ref:`mdrun_mpi` with 40 ranks. This setup is likely -to be suitable when there are ten nodes, each with two -GPUs, and OpenMP performs poorly on the hardware. +GPUs, but there is no need to specify ``-gpu_id`` for the +normal case where all the GPUs on the node are available +for use. Controlling the domain decomposition algorithm ---------------------------------------------- @@ -800,7 +813,7 @@ of 2. So it can be useful go through the checklist. * For CUDA, use the newest CUDA availabe for your GPU to take advantage of the latest performance enhancements. * Use a recent GPU driver. - * If compiling on a cluster head node, make sure that ``GMX_CPU_ACCELERATION`` + * If compiling on a cluster head node, make sure that ``GMX_SIMD`` is appropriate for the compute nodes. Run setup diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp index cad2af2595..3a5b29f2b3 100644 --- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp +++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp @@ -104,7 +104,7 @@ void PmeTestEnvironment::SetUp() // Constructing contexts for all compatible GPUs - will be empty on non-GPU builds TestHardwareContexts gpuContexts; - for (int gpuIndex : hardwareInfo_->compatibleGpus) + for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info)) { char stmp[200] = {}; get_gpu_device_info_string(stmp, hardwareInfo_->gpu_info, gpuIndex); diff --git a/src/gromacs/gmxana/gmx_tune_pme.cpp b/src/gromacs/gmxana/gmx_tune_pme.cpp index 4c46036ec9..517c171f3a 100644 --- a/src/gromacs/gmxana/gmx_tune_pme.cpp +++ b/src/gromacs/gmxana/gmx_tune_pme.cpp @@ -754,14 +754,14 @@ static void check_mdrun_works(gmx_bool bThreads, } /* Handles the no-GPU case by emitting an empty string. */ -static std::string make_gpu_id_command_line(int numRanks, int numPmeRanks, const std::vector &gpu_ids) +static std::string make_gpu_id_command_line(const char *eligible_gpu_ids) { /* If the user has given no eligible GPU IDs, or we're trying the - * default behaviour, then there is nothing for g_tune_pme to give + * default behaviour, then there is nothing for tune_pme to give * to mdrun -gpu_id */ - if (!gpu_ids.empty() && numPmeRanks > -1) + if (eligible_gpu_ids != nullptr) { - return "-gpu_id " + gmx::makeGpuIdString(gpu_ids, numRanks - numPmeRanks); + return gmx::formatString("-gpu_id %s", eligible_gpu_ids); } @@ -769,18 +769,17 @@ static std::string make_gpu_id_command_line(int numRanks, int numPmeRanks, const } static void launch_simulation( - gmx_bool bLaunch, /* Should the simulation be launched? */ - FILE *fp, /* General log file */ - gmx_bool bThreads, /* whether to use threads */ - char *cmd_mpirun, /* Command for mpirun */ - char *cmd_np, /* Switch for -np or -ntmpi or empty */ - char *cmd_mdrun, /* Command for mdrun */ - char *args_for_mdrun, /* Arguments for mdrun */ - const char *simulation_tpr, /* This tpr will be simulated */ - int nnodes, /* Number of ranks to use */ - int nPMEnodes, /* Number of PME ranks to use */ - const std::vector &gpu_ids) /* Vector of GPU IDs for - * constructing mdrun command lines */ + gmx_bool bLaunch, /* Should the simulation be launched? */ + FILE *fp, /* General log file */ + gmx_bool bThreads, /* whether to use threads */ + char *cmd_mpirun, /* Command for mpirun */ + char *cmd_np, /* Switch for -np or -ntmpi or empty */ + char *cmd_mdrun, /* Command for mdrun */ + char *args_for_mdrun, /* Arguments for mdrun */ + const char *simulation_tpr, /* This tpr will be simulated */ + int nPMEnodes, /* Number of PME ranks to use */ + const char *eligible_gpu_ids) /* Available GPU IDs for + * constructing mdrun command lines */ { char *command; @@ -789,7 +788,7 @@ static void launch_simulation( * (200 extra chars for -npme ... etc. options should suffice): */ snew(command, std::strlen(cmd_mpirun)+std::strlen(cmd_mdrun)+std::strlen(cmd_np)+std::strlen(args_for_mdrun)+std::strlen(simulation_tpr)+200); - auto cmd_gpu_ids = make_gpu_id_command_line(nnodes, nPMEnodes, gpu_ids); + auto cmd_gpu_ids = make_gpu_id_command_line(eligible_gpu_ids); /* Note that the -passall options requires args_for_mdrun to be at the end * of the command line string */ @@ -1376,30 +1375,30 @@ static void make_sure_it_runs(char *mdrun_cmd_line, int length, FILE *fp, } static void do_the_tests( - FILE *fp, /* General g_tune_pme output file */ - char **tpr_names, /* Filenames of the input files to test */ - int maxPMEnodes, /* Max fraction of nodes to use for PME */ - int minPMEnodes, /* Min fraction of nodes to use for PME */ - int npme_fixed, /* If >= -1, test fixed number of PME - * nodes only */ - const char *npmevalues_opt, /* Which -npme values should be tested */ - t_perf **perfdata, /* Here the performace data is stored */ - int *pmeentries, /* Entries in the nPMEnodes list */ - int repeats, /* Repeat each test this often */ - int nnodes, /* Total number of nodes = nPP + nPME */ - int nr_tprs, /* Total number of tpr files to test */ - gmx_bool bThreads, /* Threads or MPI? */ - char *cmd_mpirun, /* mpirun command string */ - char *cmd_np, /* "-np", "-n", whatever mpirun needs */ - char *cmd_mdrun, /* mdrun command string */ - char *cmd_args_bench, /* arguments for mdrun in a string */ - const t_filenm *fnm, /* List of filenames from command line */ - int nfile, /* Number of files specified on the cmdl. */ - int presteps, /* DLB equilibration steps, is checked */ - gmx_int64_t cpt_steps, /* Time step counter in the checkpoint */ - gmx_bool bCheck, /* Check whether benchmark mdrun works */ - const std::vector &gpu_ids) /* GPU IDs for - * constructing mdrun command lines */ + FILE *fp, /* General tune_pme output file */ + char **tpr_names, /* Filenames of the input files to test */ + int maxPMEnodes, /* Max fraction of nodes to use for PME */ + int minPMEnodes, /* Min fraction of nodes to use for PME */ + int npme_fixed, /* If >= -1, test fixed number of PME + * nodes only */ + const char *npmevalues_opt, /* Which -npme values should be tested */ + t_perf **perfdata, /* Here the performace data is stored */ + int *pmeentries, /* Entries in the nPMEnodes list */ + int repeats, /* Repeat each test this often */ + int nnodes, /* Total number of nodes = nPP + nPME */ + int nr_tprs, /* Total number of tpr files to test */ + gmx_bool bThreads, /* Threads or MPI? */ + char *cmd_mpirun, /* mpirun command string */ + char *cmd_np, /* "-np", "-n", whatever mpirun needs */ + char *cmd_mdrun, /* mdrun command string */ + char *cmd_args_bench, /* arguments for mdrun in a string */ + const t_filenm *fnm, /* List of filenames from command line */ + int nfile, /* Number of files specified on the cmdl. */ + int presteps, /* DLB equilibration steps, is checked */ + gmx_int64_t cpt_steps, /* Time step counter in the checkpoint */ + gmx_bool bCheck, /* Check whether benchmark mdrun works */ + const char *eligible_gpu_ids) /* GPU IDs for + * constructing mdrun command lines */ { int i, nr, k, ret, count = 0, totaltests; int *nPMEnodes = nullptr; @@ -1487,7 +1486,7 @@ static void do_the_tests( { pd = &perfdata[k][i]; - auto cmd_gpu_ids = make_gpu_id_command_line(nnodes, nPMEnodes[i], gpu_ids); + auto cmd_gpu_ids = make_gpu_id_command_line(eligible_gpu_ids); /* Loop over the repeats for each scenario: */ for (nr = 0; nr < repeats; nr++) @@ -2131,11 +2130,10 @@ int gmx_tune_pme(int argc, char *argv[]) "optimized parameters, use the command line option [TT]-launch[tt].[PAR]", "Basic support for GPU-enabled [TT]mdrun[tt] exists. Give a string containing the IDs", "of the GPUs that you wish to use in the optimization in the [TT]-gpu_id[tt]", - "command-line argument. Unlike [TT]mdrun -gpu_id[tt], this does not imply a mapping", - "but merely the eligible set. [TT]g_tune_pme[tt] will construct calls to", - "mdrun that use this set appropriately, assuming that PP ranks with low indices", - "should map to GPUs with low indices, and increasing both monotonically", - "over the respective sets.[PAR]", + "command-line argument. This works exactly like [TT]mdrun -gpu_id[tt], does not imply a mapping,", + "and merely declares the eligible set of GPU devices. [TT]gmx-tune_pme[tt] will construct calls to", + "mdrun that use this set appropriately. [TT]gmx-tune_pme[tt] does not support", + "[TT]-gputasks[tt].[PAR]", }; int nnodes = 1; @@ -2184,7 +2182,7 @@ int gmx_tune_pme(int argc, char *argv[]) double seconds; static t_filenm fnm[] = { - /* g_tune_pme */ + /* tune_pme */ { efOUT, "-p", "perf", ffWRITE }, { efLOG, "-err", "bencherr", ffWRITE }, { efTPR, "-so", "tuned", ffWRITE }, @@ -2261,7 +2259,7 @@ int gmx_tune_pme(int argc, char *argv[]) t_pargs pa[] = { /***********************/ - /* g_tune_pme options: */ + /* tune_pme options: */ /***********************/ { "-mdrun", FALSE, etSTR, {&cmd_mdrun}, "Command line to run a simulation, e.g. 'gmx mdrun' or 'mdrun_mpi'" }, @@ -2304,11 +2302,11 @@ int gmx_tune_pme(int argc, char *argv[]) { "-check", FALSE, etBOOL, {&bCheck}, "Before the benchmark runs, check whether mdrun works in parallel" }, { "-gpu_id", FALSE, etSTR, {&eligible_gpu_ids}, - "List of GPU device id-s that are eligible for use (unlike mdrun, does not imply any mapping)" }, + "List of unique GPU device IDs that are eligible for use" }, /******************/ /* mdrun options: */ /******************/ - /* We let g_tune_pme parse and understand these options, because we need to + /* We let tune_pme parse and understand these options, because we need to * prevent that they appear on the mdrun command line for the benchmarks */ { "-append", FALSE, etBOOL, {&bAppendFiles}, "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names (for launch only)" }, @@ -2414,8 +2412,6 @@ int gmx_tune_pme(int argc, char *argv[]) maxPMEfraction, minPMEfraction, npme_fixed, bench_nsteps, fnm, NFILE, sim_part, presteps, asize(pa), pa); - /* Check any GPU IDs passed make sense, and fill the data structure for them */ - auto gpu_ids = gmx::parseUserGpuIds(eligible_gpu_ids); /* Determine the maximum and minimum number of PME nodes to test, * the actual list of settings is build in do_the_tests(). */ @@ -2562,7 +2558,7 @@ int gmx_tune_pme(int argc, char *argv[]) GMX_RELEASE_ASSERT(npmevalues_opt[0] != nullptr, "Options inconsistency; npmevalues_opt[0] is NULL"); do_the_tests(fp, tpr_names, maxPMEnodes, minPMEnodes, npme_fixed, npmevalues_opt[0], perfdata, &pmeentries, repeats, nnodes, ntprs, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, - cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck, gpu_ids); + cmd_args_bench, fnm, NFILE, presteps, cpt_steps, bCheck, eligible_gpu_ids); fprintf(fp, "\nTuning took%8.1f minutes.\n", (gmx_gettime()-seconds)/60.0); @@ -2591,7 +2587,7 @@ int gmx_tune_pme(int argc, char *argv[]) /* Now start the real simulation if the user requested it ... */ launch_simulation(bLaunch, fp, bThreads, cmd_mpirun, cmd_np, cmd_mdrun, - cmd_args_launch, simulation_tpr, nnodes, best_npme, gpu_ids); + cmd_args_launch, simulation_tpr, best_npme, eligible_gpu_ids); } gmx_ffclose(fp); diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp index 48f9a8c2c7..af9fd2edc6 100644 --- a/src/gromacs/hardware/detecthardware.cpp +++ b/src/gromacs/hardware/detecthardware.cpp @@ -496,7 +496,6 @@ gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog, const t_commrec * gmx_detect_gpus(mdlog, cr); gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo); - hwinfo_g->compatibleGpus = getCompatibleGpus(hwinfo_g->gpu_info); } /* increase the reference counter */ n_hwinfo++; diff --git a/src/gromacs/hardware/hw_info.h b/src/gromacs/hardware/hw_info.h index 29e59de023..e86d8cdc4b 100644 --- a/src/gromacs/hardware/hw_info.h +++ b/src/gromacs/hardware/hw_info.h @@ -55,7 +55,6 @@ struct gmx_hw_info_t { /* Data for our local physical node */ struct gmx_gpu_info_t gpu_info; /* Information about GPUs detected in the system */ - std::vector compatibleGpus; /* Contains the device IDs of all GPUs that are compatible */ int nthreads_hw_avail; /* Number of hardware threads available; this number is based on the number of CPUs reported as available @@ -113,8 +112,10 @@ struct gmx_hw_opt_t int core_pinning_stride = 0; //! Logical core pinning offset. int core_pinning_offset = 0; - //! Empty, or a GPU task-assignment string provided by the user. - std::string gpuIdTaskAssignment = ""; + //! Empty, or a string provided by the user declaring (unique) GPU IDs available for mdrun to use. + std::string gpuIdsAvailable = ""; + //! Empty, or a string provided by the user mapping GPU tasks to devices. + std::string userGpuTaskAssignment = ""; }; #endif diff --git a/src/gromacs/taskassignment/CMakeLists.txt b/src/gromacs/taskassignment/CMakeLists.txt index 67c7be03ae..27f179abc3 100644 --- a/src/gromacs/taskassignment/CMakeLists.txt +++ b/src/gromacs/taskassignment/CMakeLists.txt @@ -33,8 +33,11 @@ # the research papers on the package. Check out http://www.gromacs.org. gmx_add_libgromacs_sources( - hardwareassign.cpp + decidegpuusage.cpp + findallgputasks.cpp + reportgpuusage.cpp resourcedivision.cpp + taskassignment.cpp usergpuids.cpp ) diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp new file mode 100644 index 0000000000..d20ca674d6 --- /dev/null +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -0,0 +1,398 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Defines functionality for deciding whether tasks will run on GPUs. + * + * \author Mark Abraham + * \ingroup module_taskassignment + */ + +#include "gmxpre.h" + +#include "decidegpuusage.h" + +#include "config.h" + +#include +#include + +#include +#include + +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/hardwaretopology.h" +#include "gromacs/hardware/hw_info.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/topology/topology.h" +#include "gromacs/utility/baseversion.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/stringutil.h" + + +namespace gmx +{ + +namespace +{ + +//! Helper variable to localise the text of an often repeated message. +const char * g_specifyEverythingFormatString = + "When you use mdrun -gputasks, %s must be set to non-default " + "values, so that the device IDs can be interpreted correctly." +#if GMX_GPU != GMX_GPU_NONE + " If you simply want to restrict which GPUs are used, then it is " + "better to use mdrun -gpu_id. Otherwise, setting the " +# if GMX_GPU == GMX_GPU_CUDA + "CUDA_VISIBLE_DEVICES" +# elif GMX_GPU == GMX_GPU_OPENCL + // Technically there is no portable way to do this offered by the + // OpenCL standard, but the only current relevant case for GROMACS + // is AMD OpenCL, which offers this variable. + "GPU_DEVICE_ORDINAL" +# else +# error "Unreachable branch" +# endif + " environment variable in your bash profile or job " + "script may be more convenient." +#endif +; + +} // namespace + +bool +decideWhetherToUseGpusForNonbondedWithThreadMpi(const TaskTarget nonbondedTarget, + const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const EmulateGpuNonbonded emulateGpuNonbonded, + const bool usingVerletScheme, + const bool nonbondedOnGpuIsUseful, + const int numRanksPerSimulation) +{ + // First, exclude all cases where we can't run NB on GPUs. + if (nonbondedTarget == TaskTarget::Cpu || + emulateGpuNonbonded == EmulateGpuNonbonded::Yes || + !usingVerletScheme || + !nonbondedOnGpuIsUseful) + { + // If the user required NB on GPUs, we issue an error later. + return false; + } + + // We now know that NB on GPUs makes sense, if we have any. + + if (!userGpuTaskAssignment.empty()) + { + // Specifying -gputasks requires specifying everything. + if (nonbondedTarget == TaskTarget::Auto || + numRanksPerSimulation < 1) + { + GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "-nb and -ntmpi"))); + } + return true; + } + + if (nonbondedTarget == TaskTarget::Gpu) + { + return true; + } + + // Because this is thread-MPI, we already know about the GPUs that + // all potential ranks can use, and can use that in a global + // decision that will later be consistent. + auto haveGpus = !gpuIdsToUse.empty(); + + // If we get here, then the user permitted or required GPUs. + return haveGpus; +} + +bool +decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuForNonbonded, + const TaskTarget pmeTarget, + const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const bool canUseGpuForPme, + const int numRanksPerSimulation) +{ + // First, exclude all cases where we can't run PME on GPUs. + if ((pmeTarget == TaskTarget::Cpu) || + !useGpuForNonbonded || + !canUseGpuForPme) + { + // PME can't run on a GPU. If the user required that, we issue + // an error later. + return false; + } + + // We now know that PME on GPUs might make sense, if we have any. + + if (!userGpuTaskAssignment.empty()) + { + // Follow the user's choice of GPU task assignment, if we + // can. Checking that their IDs are for compatible GPUs comes + // later. + + // Specifying -gputasks requires specifying everything. + if (pmeTarget == TaskTarget::Auto || + numRanksPerSimulation < 1) + { + GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "all of -nb, -pme, and -ntmpi"))); + } + + // PME on GPUs is only supported in a single case + if (pmeTarget == TaskTarget::Gpu) + { + if (numRanksPerSimulation > 1) + { + GMX_THROW(InconsistentInputError + ("When you run mdrun -pme gpu -gputasks, you must supply a PME .tpr file and use a single rank.")); + } + return true; + } + + // pmeTarget == TaskTarget::Auto + return numRanksPerSimulation == 1; + } + + // Because this is thread-MPI, we already know about the GPUs that + // all potential ranks can use, and can use that in a global + // decision that will later be consistent. + + if (pmeTarget == TaskTarget::Gpu) + { + if (numRanksPerSimulation > 1) + { + GMX_THROW(NotImplementedError + ("PME tasks were required to run on GPUs, but that is not implemented with " + "more than one rank. Use a single rank, or permit PME tasks to be assigned " + "to the CPU.")); + } + return true; + } + + if (numRanksPerSimulation == 1) + { + // PME can run well on a GPU shared with NB, and we permit + // mdrun to default to try that. + return gpuIdsToUse.size() >= 1; + } + + if (numRanksPerSimulation < 1) + { + // Full automated mode for thread-MPI (the default). PME can + // run well on a GPU shared with NB, and we permit mdrun to + // default to it if there is only one GPU available. + return (gpuIdsToUse.size() == 1); + } + + // Not enough support for PME on GPUs for anything else + return false; +} + +bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarget, + const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const EmulateGpuNonbonded emulateGpuNonbonded, + const bool usingVerletScheme, + const bool nonbondedOnGpuIsUseful) +{ + if (nonbondedTarget == TaskTarget::Cpu) + { + if (!userGpuTaskAssignment.empty()) + { + GMX_THROW(InconsistentInputError + ("A GPU task assignment was specified, but nonbonded interactions were " + "assigned to the CPU. Make no more than one of these choices.")); + } + + return false; + } + + // TODO refactor all these TaskTarget::Gpu checks into one place? + // e.g. use a subfunction that handles only the cases where + // TaskTargets are not Cpu? + if (emulateGpuNonbonded == EmulateGpuNonbonded::Yes) + { + if (nonbondedTarget == TaskTarget::Gpu) + { + GMX_THROW(InconsistentInputError + ("Nonbonded interactions on the GPU were required, which is inconsistent " + "with choosing emulation. Make no more than one of these choices.")); + } + if (!gpuIdsToUse.empty() || !userGpuTaskAssignment.empty()) + { + GMX_THROW(InconsistentInputError + ("GPU ID usage was specified, as was GPU emulation. Make no more than one of these choices.")); + } + + return false; + } + + if (!usingVerletScheme) + { + if (nonbondedTarget == TaskTarget::Gpu) + { + GMX_THROW(InconsistentInputError + ("Nonbonded interactions on the GPU were required, which requires using " + "the Verlet scheme. Either use the Verlet scheme, or do not require using GPUs.")); + } + + return false; + } + + if (!nonbondedOnGpuIsUseful) + { + if (nonbondedTarget == TaskTarget::Gpu) + { + GMX_THROW(InconsistentInputError + ("Nonbonded interactions on the GPU were required, but this would not be " + "useful. Probably you should not require using GPUs.")); + } + + return false; + } + + if (!userGpuTaskAssignment.empty()) + { + // Specifying -gputasks requires specifying everything. + if (nonbondedTarget == TaskTarget::Auto) + { + GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "-nb and -ntmpi"))); + } + + return true; + } + + // We still don't know whether it is an error if no GPUs are found + // because we don't know the duty of this rank, yet. For example, + // a node with only PME ranks and -pme cpu is OK if there are not + // GPUs. + + // If we get here, then the user permitted or required GPUs. + return true; +} + +bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, + const TaskTarget pmeTarget, + const std::vector &userGpuTaskAssignment, + const bool canUseGpuForPme, + const int numRanksPerSimulation) +{ + if (pmeTarget == TaskTarget::Cpu) + { + return false; + } + + if (!useGpuForNonbonded) + { + if (pmeTarget == TaskTarget::Gpu) + { + GMX_THROW(NotImplementedError + ("The PME on the GPU is only supported when nonbonded interactions run on GPUs also.")); + } + return false; + } + + if (!canUseGpuForPme) + { + if (pmeTarget == TaskTarget::Gpu) + { + // TODO Pass in the inputrec so we can give more help here? + GMX_THROW(NotImplementedError + ("The input simulation did not use PME in a way that is supported on the GPU.")); + } + return false; + } + + if (pmeTarget == TaskTarget::Cpu) + { + if (!userGpuTaskAssignment.empty()) + { + GMX_THROW(InconsistentInputError + ("A GPU task assignment was specified, but PME interactions were " + "assigned to the CPU. Make no more than one of these choices.")); + } + + return false; + } + + if (!userGpuTaskAssignment.empty()) + { + // Specifying -gputasks requires specifying everything. + if (pmeTarget == TaskTarget::Auto) + { + GMX_THROW(InconsistentInputError(formatString(g_specifyEverythingFormatString, "all of -nb, -pme, and -ntmpi"))); + } + + return true; + } + + // We still don't know whether it is an error if no GPUs are found + // because we don't know the duty of this rank, yet. For example, + // a node with only PME ranks and -pme cpu is OK if there are not + // GPUs. + + if (pmeTarget == TaskTarget::Gpu) + { + if (numRanksPerSimulation > 1) + { + GMX_THROW(NotImplementedError + ("PME tasks were required to run on GPUs, but that is not implemented with " + "more than one rank. Use a single rank, or permit PME tasks to be assigned " + "to the CPU.")); + } + return true; + } + + if (numRanksPerSimulation == 1) + { + // PME can run well on a single GPU shared with NB when + // there is one rank, so we permit mdrun to try that. + return true; + } + + // Not enough support for PME on GPUs for anything else + return false; +} + +} // namespace diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h new file mode 100644 index 0000000000..a1738429b4 --- /dev/null +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -0,0 +1,185 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \libinternal \file + * \brief Declares functionality for deciding whether tasks will run on GPUs. + * + * \author Mark Abraham + * \ingroup module_taskassignment + * \inlibraryapi + */ + +#ifndef GMX_TASKASSIGNMENT_DECIDEGPUUSAGE_H +#define GMX_TASKASSIGNMENT_DECIDEGPUUSAGE_H + +#include + +struct gmx_hw_info_t; + +enum class EmulateGpuNonbonded : bool; + +namespace gmx +{ + +//! Record where a compute task is targetted. +enum class TaskTarget : int +{ + Auto, + Cpu, + Gpu +}; + +/*! \brief Decide whether this thread-MPI simulation will run + * nonbonded tasks on GPUs. + * + * The number of GPU tasks and devices influences both the choice of + * the number of ranks, and checks upon any such choice made by the + * user. So we need to consider this before any automated choice of + * the number of thread-MPI ranks. + * + * \param[in] nonbondedTarget The user's choice for mdrun -nb for where to assign short-ranged nonbonded interaction tasks. + * \param[in] gpuIdsToUse The compatible GPUs that the user permitted us to use. + * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. + * \param[in] emulateGpuNonbonded Whether we will emulate GPU calculation of nonbonded interactions. + * \param[in] usingVerletScheme Whether the nonbondeds are using the Verlet scheme. + * \param[in] nonbondedOnGpuIsUseful Whether computing nonbonded interactions on a GPU is useful for this calculation. + * \param[in] numRanksPerSimulation The number of ranks in each simulation. + * + * \returns Whether the simulation will run nonbonded tasks on GPUs. + * + * \throws std::bad_alloc If out of memory + * InconsistentInputError If the user requirements are inconsistent. */ +bool decideWhetherToUseGpusForNonbondedWithThreadMpi(const TaskTarget nonbondedTarget, + const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const EmulateGpuNonbonded emulateGpuNonbonded, + const bool usingVerletScheme, + const bool nonbondedOnGpuIsUseful, + const int numRanksPerSimulation); + +/*! \brief Decide whether this thread-MPI simulation will run + * PME tasks on GPUs. + * + * The number of GPU tasks and devices influences both the choice of + * the number of ranks, and checks upon any such choice made by the + * user. So we need to consider this before any automated choice of + * the number of thread-MPI ranks. + * + * \param[in] useGpuForNonbonded Whether GPUs will be used for nonbonded interactions. + * \param[in] pmeTarget The user's choice for mdrun -pme for where to assign long-ranged PME nonbonded interaction tasks. + * \param[in] gpuIdsToUse The compatible GPUs that the user permitted us to use. + * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. + * \param[in] canUseGpuForPme Whether the form of PME chosen can run on a GPU + * \param[in] numRanksPerSimulation The number of ranks in each simulation. + * + * \returns Whether the simulation will run PME tasks on GPUs. + * + * \throws std::bad_alloc If out of memory + * InconsistentInputError If the user requirements are inconsistent. */ +bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuForNonbonded, + const TaskTarget pmeTarget, + const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const bool canUseGpuForPme, + const int numRanksPerSimulation); + +/*! \brief Decide whether the simulation will try to run nonbonded + * tasks on GPUs. + * + * The final decision cannot be made until after the duty of the rank + * is known. But we need to know if nonbonded will run on GPUs for + * setting up DD (particularly rlist) and determining duty. If the + * user requires GPUs for the tasks of that duty, then it will be an + * error when none are found. + * + * With thread-MPI, calls have been made to + * decideWhetherToUseGpusForNonbondedWithThreadMpi() and + * decideWhetherToUseGpusForPmeWithThreadMpi() to help determine + * the number of ranks and run some checks, but the final + * decision is made in this routine, along with many more + * consistency checks. + * + * \param[in] nonbondedTarget The user's choice for mdrun -nb for where to assign short-ranged nonbonded interaction tasks. + * \param[in] gpuIdsToUse The compatible GPUs that the user permitted us to use. + * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. + * \param[in] emulateGpuNonbonded Whether we will emulate GPU calculation of nonbonded interactions. + * \param[in] usingVerletScheme Whether the nonbondeds are using the Verlet scheme. + * \param[in] nonbondedOnGpuIsUseful Whether computing nonbonded interactions on a GPU is useful for this calculation. + * + * \returns Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs. + * + * \throws std::bad_alloc If out of memory + * InconsistentInputError If the user requirements are inconsistent. */ +bool decideWhetherToUseGpusForNonbonded(const TaskTarget nonbondedTarget, + const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const EmulateGpuNonbonded emulateGpuNonbonded, + const bool usingVerletScheme, + const bool nonbondedOnGpuIsUseful); + +/*! \brief Decide whether the simulation will try to run tasks of + * different types on GPUs. + * + * The final decision cannot be made until after the duty of the rank + * is known. But we need to know if nonbonded will run on GPUs for + * setting up DD (particularly rlist) and determining duty. If the + * user requires GPUs for the tasks of that duty, then it will be an + * error when none are found. + * + * With thread-MPI, calls have been made to + * decideWhetherToUseGpusForNonbondedWithThreadMpi() and + * decideWhetherToUseGpusForPmeWithThreadMpi() to help determine + * the number of ranks and run some checks, but the final + * decision is made in this routine, along with many more + * consistency checks. + * + * \param[in] useGpuForNonbonded Whether GPUs will be used for nonbonded interactions. + * \param[in] pmeTarget The user's choice for mdrun -pme for where to assign long-ranged PME nonbonded interaction tasks. + * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. + * \param[in] canUseGpuForPme Whether the form of PME chosen can run on a GPU + * \param[in] numRanksPerSimulation The number of ranks in each simulation. + * + * \returns Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs. + * + * \throws std::bad_alloc If out of memory + * InconsistentInputError If the user requirements are inconsistent. */ +bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, + const TaskTarget pmeTarget, + const std::vector &userGpuTaskAssignment, + const bool canUseGpuForPme, + const int numRanksPerSimulation); + +} + +#endif diff --git a/src/gromacs/taskassignment/findallgputasks.cpp b/src/gromacs/taskassignment/findallgputasks.cpp new file mode 100644 index 0000000000..a9c63e5e48 --- /dev/null +++ b/src/gromacs/taskassignment/findallgputasks.cpp @@ -0,0 +1,221 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Defines routine for collecting all GPU tasks found on ranks of a node. + * + * \author Mark Abraham + * \ingroup module_taskassignment + */ +#include "gmxpre.h" + +#include "findallgputasks.h" + +#include "config.h" + +#include +#include + +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" + +namespace gmx +{ + +namespace +{ + +//! Constant used to help minimize preprocessing of code. +constexpr bool g_usingMpi = GMX_MPI; + +//! Helper function to prepare to all-gather the vector of non-bonded tasks on this node. +static std::vector allgather(const int &input, + int numRanks, + MPI_Comm communicator) +{ + std::vector result(numRanks); + if (g_usingMpi && numRanks > 1) + { + // TODO This works as an MPI_Allgather, but thread-MPI does + // not implement that. It's only intra-node communication, and + // happens rarely, so not worth optimizing (yet). Also + // thread-MPI segfaults with 1 rank. +#if GMX_MPI + int root = 0; + // Calling a C API with the const T * from data() doesn't seem + // to compile warning-free with all versions of MPI headers. + // + // TODO Make an allgather template to deal with this nonsense. + MPI_Gather(const_cast(&input), + 1, + MPI_INT, + const_cast(result.data()), + 1, + MPI_INT, + root, + communicator); + MPI_Bcast(const_cast(result.data()), + result.size(), + MPI_INT, + root, + communicator); +#else + GMX_UNUSED_VALUE(communicator); +#endif + } + else + { + result[0] = input; + } + + return result; +} + +//! Helper function to compute allgatherv displacements. +static std::vector computeDisplacements(ArrayRef extentOnEachRank, + int numRanks) +{ + std::vector displacements(numRanks + 1); + displacements[0] = 0; + std::partial_sum(std::begin(extentOnEachRank), std::end(extentOnEachRank), std::begin(displacements) + 1); + return displacements; +} + +//! Helper function to all-gather the vector of all GPU tasks on ranks of this node. +static std::vector allgatherv(ArrayRef input, + ArrayRef extentOnEachRank, + ArrayRef displacementForEachRank, + MPI_Comm communicator) +{ + // Now allocate the vector and do the allgatherv + int totalExtent = displacementForEachRank.back(); + + std::vector result; + result.reserve(totalExtent); + if (g_usingMpi && extentOnEachRank.size() > 1 && totalExtent > 0) + { + result.resize(totalExtent); + // TODO This works as an MPI_Allgatherv, but thread-MPI does + // not implement that. It's only intra-node communication, and + // happens rarely, so not worth optimizing (yet). Also + // thread-MPI segfaults with 1 rank and with zero totalExtent. +#if GMX_MPI + int root = 0; + // Calling a C API with the const T * from data() doesn't seem to compile reliably. + // TODO Make an allgatherv template to deal with this nonsense. + MPI_Gatherv(const_cast(input.data()), + input.size(), + MPI_INT, + const_cast(result.data()), + const_cast(extentOnEachRank.data()), + const_cast(displacementForEachRank.data()), + MPI_INT, + root, + communicator); + MPI_Bcast(const_cast(result.data()), + result.size(), + MPI_INT, + root, + communicator); +#else + GMX_UNUSED_VALUE(communicator); +#endif + } + else + { + for (const auto &gpuTask : input) + { + result.push_back(gpuTask); + } + } + return result; +} + +} // namespace + +/*! \brief Returns container of all tasks on all ranks of this node + * that are eligible for GPU execution. + * + * Perform all necessary communication for preparing for task + * assignment. Separating this aspect makes it possible to unit test + * the logic of task assignment. */ +GpuTasksOnRanks +findAllGpuTasksOnThisNode(ArrayRef gpuTasksOnThisRank, + int numRanksOnThisNode, + MPI_Comm communicator) +{ + // Find out how many GPU tasks are on each rank on this node. + auto numGpuTasksOnEachRankOfThisNode = + allgather(gpuTasksOnThisRank.size(), numRanksOnThisNode, communicator); + + /* Collect on each rank of this node a vector describing all + * GPU tasks on this node, in ascending order of rank. This + * requires a vector allgather. The displacements indicate where + * the GPU tasks on each rank of this node start and end within + * the vector. */ + auto displacementsForEachRank = computeDisplacements(numGpuTasksOnEachRankOfThisNode, numRanksOnThisNode); + auto gpuTasksOnThisNode = allgatherv(gpuTasksOnThisRank, numGpuTasksOnEachRankOfThisNode, + displacementsForEachRank, communicator); + + /* Next, we re-use the displacements to break up the vector + * of GPU tasks into something that can be indexed like + * gpuTasks[rankIndex][taskIndex]. */ + GpuTasksOnRanks gpuTasksOnRanksOfThisNode; + // TODO This would be nicer if we had a good abstraction for "pair + // of iterators that point to adjacent container elements" or + // "iterator that points to the first of a pair of valid adjacent + // container elements, or end". + GMX_ASSERT(displacementsForEachRank.size() > 1, "Even with one rank, there's always both a start and end displacement"); + auto currentDisplacementIt = displacementsForEachRank.begin(); + auto nextDisplacementIt = currentDisplacementIt + 1; + do + { + gpuTasksOnRanksOfThisNode.emplace_back(std::vector()); + for (auto taskOnThisRankIndex = *currentDisplacementIt; taskOnThisRankIndex != *nextDisplacementIt; ++taskOnThisRankIndex) + { + gpuTasksOnRanksOfThisNode.back().push_back(gpuTasksOnThisNode[taskOnThisRankIndex]); + } + + currentDisplacementIt = nextDisplacementIt; + ++nextDisplacementIt; + } + while (nextDisplacementIt != displacementsForEachRank.end()); + + return gpuTasksOnRanksOfThisNode; +} + +} // namespace diff --git a/src/gromacs/taskassignment/findallgputasks.h b/src/gromacs/taskassignment/findallgputasks.h new file mode 100644 index 0000000000..06069d3f5b --- /dev/null +++ b/src/gromacs/taskassignment/findallgputasks.h @@ -0,0 +1,65 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal + * \file + * \brief Declares routine for collecting all GPU tasks found on ranks of a node. + * + * \author Mark Abraham + * \ingroup module_taskassignment + */ +#ifndef GMX_TASKASSIGNMENT_FINDALLGPUTASKS_H +#define GMX_TASKASSIGNMENT_FINDALLGPUTASKS_H + +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/gmxmpi.h" + +namespace gmx +{ + +/*! \brief Returns container of all tasks on all ranks of this node + * that are eligible for GPU execution. + * + * Perform all necessary communication for preparing for task + * assignment. Separating this aspect makes it possible to unit test + * the logic of task assignment. */ +GpuTasksOnRanks +findAllGpuTasksOnThisNode(ArrayRef gpuTasksOnThisRank, + int numRanksOnThisNode, + MPI_Comm communicator); + +} // namespace + +#endif diff --git a/src/gromacs/taskassignment/hardwareassign.cpp b/src/gromacs/taskassignment/hardwareassign.cpp deleted file mode 100644 index 19896f00c1..0000000000 --- a/src/gromacs/taskassignment/hardwareassign.cpp +++ /dev/null @@ -1,258 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2016,2017, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -#include "gmxpre.h" - -#include "hardwareassign.h" - -#include "config.h" - -#include - -#include -#include -#include -#include - -#include "gromacs/gmxlib/network.h" -#include "gromacs/gpu_utils/gpu_utils.h" -#include "gromacs/hardware/gpu_hw_info.h" -#include "gromacs/hardware/hw_info.h" -#include "gromacs/mdtypes/commrec.h" -#include "gromacs/taskassignment/usergpuids.h" -#include "gromacs/utility/cstringutil.h" -#include "gromacs/utility/exceptions.h" -#include "gromacs/utility/fatalerror.h" -#include "gromacs/utility/gmxassert.h" -#include "gromacs/utility/logger.h" -#include "gromacs/utility/smalloc.h" -#include "gromacs/utility/stringutil.h" -#include "gromacs/utility/sysinfo.h" - -#define HOSTNAMELEN 80 - -namespace gmx -{ - -/*! \brief This function is responsible for the automated mapping the - * GPUs to the processes on a single node. - * - * This selects the GPUs we will use. This is an operation local to each physical node. - * If we have less MPI ranks than GPUs, we will waste some GPUs. - * - * \param[in] compatibleGpus Vector of GPUs that are compatible - * \param[in] nrank Number of PP GPU ranks on the node. - * \param[in] rank Index of PP GPU rank on the node. - * - * \returns The assignment of GPU tasks on ranks of this node to GPU devices on this node. - */ -static std::vector assign_rank_gpu_ids(const std::vector &compatibleGpus, - int nrank, int rank) -{ - int numCompatibleGpus = static_cast(compatibleGpus.size()); - GMX_RELEASE_ASSERT(nrank >= 1, - gmx::formatString("Invalid limit (%d) for the number of GPUs (detected %d compatible GPUs)", - rank, numCompatibleGpus).c_str()); - - if (numCompatibleGpus == 0) - { - char host[HOSTNAMELEN]; - - gmx_gethostname(host, HOSTNAMELEN); - gmx_fatal(FARGS, "A GPU was requested on host %s, but no compatible GPUs were detected. All nodes with PP ranks need to have GPUs. If you intended to use GPU acceleration in a parallel run, you can either avoid using the nodes that don't have GPUs or place PME ranks on these nodes.", host); - } - - int nshare; - - nshare = 1; - if (nrank > numCompatibleGpus) - { - if (nrank % numCompatibleGpus == 0) - { - nshare = nrank/numCompatibleGpus; - } - else - { - if (rank == 0) - { - gmx_fatal(FARGS, "The number of MPI ranks (%d) in a physical node is not a multiple of the number of GPUs (%d). Select a different number of MPI ranks or use the -gpu_id option to manually specify the GPU to be used.", - nrank, numCompatibleGpus); - } - -#if GMX_MPI - /* We use a global barrier to prevent ranks from continuing with - * an invalid setup. - */ - MPI_Barrier(MPI_COMM_WORLD); -#endif - } - } - - /* Here we will waste GPUs when nrank < numCompatibleGpus */ - std::vector taskAssignment; - taskAssignment.resize(std::min(numCompatibleGpus*nshare, nrank)); - for (size_t i = 0; i != taskAssignment.size(); ++i) - { - /* TODO: improve this implementation: either sort GPUs or remove the weakest here */ - taskAssignment[i] = compatibleGpus[i/nshare]; - } - return taskAssignment; -} - -std::vector mapPpRanksToGpus(bool rankCanUseGpu, - const t_commrec *cr, - const gmx_gpu_info_t &gpu_info, - const std::vector &compatibleGpus, - const std::vector &userGpuIds) -{ - std::vector taskAssignment; - - if (!rankCanUseGpu) - { - return taskAssignment; - } - - if (!userGpuIds.empty()) - { - checkUserGpuIds(gpu_info, compatibleGpus, userGpuIds); - taskAssignment = userGpuIds; - } - else - { - taskAssignment = assign_rank_gpu_ids(compatibleGpus, cr->nrank_pp_intranode, cr->rank_pp_intranode); - } - return taskAssignment; -} - -} // namespace - -/*! \brief Return the number of PP rank pairs that share a GPU device between them. - * - * Sharing GPUs among multiple PP ranks is possible via either user or - * automated selection. */ -static int gmx_count_gpu_dev_shared(const std::vector &gpuTaskAssignment, - bool userSetGpuIds) -{ - int same_count = 0; - - if (userSetGpuIds) - { - GMX_RELEASE_ASSERT(!gpuTaskAssignment.empty(), - "The user cannot choose an empty set of GPU IDs, code is wrong somewhere"); - size_t ngpu = gpuTaskAssignment.size(); - - for (size_t i = 0; i < ngpu - 1; i++) - { - for (size_t j = i + 1; j < ngpu; j++) - { - same_count += (gpuTaskAssignment[i] == - gpuTaskAssignment[j]); - } - } - } - - return same_count; -} - -/* Count and return the number of unique GPUs (per node) selected. - * - * As sharing GPUs among multiple PP ranks is possible, the number of - * GPUs used (per node) can be different from the number of GPU IDs - * used. - */ -static size_t gmx_count_gpu_dev_unique(const std::vector &gpuTaskAssignment) -{ - std::set uniqIds; - for (const auto &deviceId : gpuTaskAssignment) - { - uniqIds.insert(deviceId); - } - return uniqIds.size(); -} - -void reportGpuUsage(const gmx::MDLogger &mdlog, - const gmx_gpu_info_t &gpu_info, - bool userSetGpuIds, - const std::vector &gpuTaskAssignment, - size_t numPpRanks, - bool bPrintHostName) -{ - if (gpuTaskAssignment.empty()) - { - return; - } - - std::string output; - { - std::string gpuIdsString = - formatAndJoin(gpuTaskAssignment, ",", gmx::StringFormatter("%d")); - size_t numGpusInUse = gmx_count_gpu_dev_unique(gpuTaskAssignment); - bool bPluralGpus = numGpusInUse > 1; - - if (bPrintHostName) - { - char host[STRLEN]; - gmx_gethostname(host, STRLEN); - output += gmx::formatString("On host %s ", host); - } - output += gmx::formatString("%zu GPU%s %sselected for this run.\n" - "Mapping of GPU ID%s to the %d PP rank%s in this node: %s\n", - numGpusInUse, bPluralGpus ? "s" : "", - userSetGpuIds ? "user-" : "auto-", - bPluralGpus ? "s" : "", - numPpRanks, - (numPpRanks > 1) ? "s" : "", - gpuIdsString.c_str()); - } - - int same_count = gmx_count_gpu_dev_shared(gpuTaskAssignment, userSetGpuIds); - - if (same_count > 0) - { - output += gmx::formatString("NOTE: You assigned %s to multiple ranks.\n", - same_count > 1 ? "GPU IDs" : "a GPU ID"); - } - - if (static_cast(gpu_info.n_dev_compatible) > numPpRanks) - { - /* TODO In principle, this warning could be warranted only on - * ranks on some nodes, but we lack the infrastructure to do a - * good job of reporting that. */ - output += gmx::formatString("NOTE: potentially sub-optimal launch configuration using fewer\n" - " PP ranks on a node than GPUs available on that node.\n"); - } - - /* NOTE: this print is only for and on one physical node */ - GMX_LOG(mdlog.warning).appendText(output); -} diff --git a/src/gromacs/taskassignment/hardwareassign.h b/src/gromacs/taskassignment/hardwareassign.h deleted file mode 100644 index 5dc3498a52..0000000000 --- a/src/gromacs/taskassignment/hardwareassign.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2016,2017, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -/*! \defgroup module_taskassignment Assigning simulation tasks to hardware (taskassignment) - * \ingroup group_mdrun - * \brief Provides code that manages assignment of simulation tasks to hardware. - */ -/*! \libinternal - * \file - * \brief Declares high-level functionality for managing assigning - * tasks on ranks of a node to hardware on that node. - * - * \author Mark Abraham - * \ingroup module_taskassignment - * \inlibraryapi - */ -#ifndef GMX_TASKASSIGNMENT_HARDWAREASSIGN_H -#define GMX_TASKASSIGNMENT_HARDWAREASSIGN_H - -#include -#include - -#include "gromacs/utility/basedefinitions.h" - -struct gmx_gpu_info_t; -struct gmx_hw_info_t; -struct t_commrec; - -namespace gmx -{ - -class MDLogger; - -/*! \brief Parse a GPU assignment string into digits - * - * \param[in] gpuTaskAssignment String like "013" or "0,1,3" typically - * supplied by the user to mdrun -gpu_id. - * - * \returns A vector of integer GPU ids, like {0, 1, 3}. - * - * \throws std::bad_alloc If out of memory. - * InvalidInputError If an invalid character is found (ie not a digit or ','). - */ -std::vector parseGpuTaskAssignment(const std::string &gpuTaskAssignment); - -/*! \brief Assign PP ranks to valid GPU IDs. - * - * Will return a validated mapping from PP ranks (ie tasks that can - * run on GPUs) to the device IDs of compatible GPUs on their node. - * This will be from any non-empty assignment in \c userGpuIds, otherwise a - * default automated mapping is generated. - * - * Note that PME-only ranks have always ignored mdrun -gpu_id, so do - * not attempt to validate -gpu_id. They should continue this behaviour - * until PME tasks can use GPUs. - * - * \param[in] rankCanUseGpu Whether this rank can execute a task on a GPU. - * \param[in] cr Communication record. - * \param[in] gpu_info Information detected about GPUs - * \param[in] compatibleGpus Vector of GPUs that are compatible - * \param[in] userGpuIds The GPU ID task assignment string from the user. - * - * \returns A valid GPU selection. - */ -std::vector mapPpRanksToGpus(bool rankCanUseGpu, - const t_commrec *cr, - const gmx_gpu_info_t &gpu_info, - const std::vector &compatibleGpus, - const std::vector &userGpuIds); - -} // namespace - -/*! \brief Log a report on how GPUs are (or could be) being used on - * the ranks of the physical node of rank 0 of the simulation. - * - * \todo It could be useful to report also whether any nodes differed, - * and in what way. - * - * \param[out] mdlog Logging object. - * \param[in] gpu_info Information detected about GPUs - * \param[in] userSetGpuIds Whether the user selected the GPU ids - * \param[in] gpuTaskAssignment The selected GPU IDs. - * \param[in] numPpRanks Number of PP ranks per node - * \param[in] bPrintHostName Print the hostname in the usage information - * - * \throws std::bad_alloc if out of memory */ -void reportGpuUsage(const gmx::MDLogger &mdlog, - const gmx_gpu_info_t &gpu_info, - bool userSetGpuIds, - const std::vector &gpuTaskAssignment, - size_t numPpRanks, - bool bPrintHostName); - -#endif diff --git a/src/gromacs/taskassignment/reportgpuusage.cpp b/src/gromacs/taskassignment/reportgpuusage.cpp new file mode 100644 index 0000000000..fa153e01de --- /dev/null +++ b/src/gromacs/taskassignment/reportgpuusage.cpp @@ -0,0 +1,142 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief Defines routine for reporting GPU usage. + * + * \author Mark Abraham + * \ingroup module_taskassignment + */ +#include "gmxpre.h" + +#include "reportgpuusage.h" + +#include +#include + +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/stringutil.h" +#include "gromacs/utility/sysinfo.h" + +namespace gmx +{ + +namespace +{ + +/*! \brief Count and return the number of unique GPUs (per node) selected. + * + * As sharing GPUs among multiple ranks is possible, the number of + * GPUs used (per node) can be different from the number of GPU IDs + * used. + */ +static size_t countUniqueGpuIdsUsed(const GpuTaskAssignments &gpuTaskAssignmentOnRanksOfThisNode) +{ + std::set uniqueIds; + for (const auto &assignmentsOnRank : gpuTaskAssignmentOnRanksOfThisNode) + { + for (const auto &assignmentOfTask : assignmentsOnRank) + { + uniqueIds.insert(assignmentOfTask.deviceId_); + } + } + return uniqueIds.size(); +} + +} // namespace + +void +reportGpuUsage(const MDLogger &mdlog, + bool userSetGpuIds, + const GpuTaskAssignments &gpuTaskAssignmentOnRanksOfThisNode, + size_t numGpuTasksOnThisNode, + size_t numRanks, + bool bPrintHostName) +{ + size_t numGpusInUse = countUniqueGpuIdsUsed(gpuTaskAssignmentOnRanksOfThisNode); + if (numGpusInUse == 0) + { + return; + } + + std::string output; + { + std::string gpuIdsString; + const char *currentSeparator = ""; + const char *separator = ","; + for (const auto &assignmentsOnRank : gpuTaskAssignmentOnRanksOfThisNode) + { + if (assignmentsOnRank.empty()) + { + gpuIdsString += currentSeparator; + gpuIdsString += "none"; + currentSeparator = separator; + } + else + { + for (const auto &assignmentOnRank : assignmentsOnRank) + { + const char *rankType = (assignmentOnRank.task_ == GpuTask::Nonbonded ? "PP" : "PME"); + gpuIdsString += currentSeparator; + gpuIdsString += formatString("%s:%d", rankType, assignmentOnRank.deviceId_); + currentSeparator = separator; + } + } + } + bool bPluralGpus = numGpusInUse > 1; + + if (bPrintHostName) + { + char host[STRLEN]; + gmx_gethostname(host, STRLEN); + output += gmx::formatString("On host %s ", host); + } + output += gmx::formatString("%zu GPU%s %sselected for this run.\n" + "Mapping of GPU IDs to the %d GPU task%s in the %d rank%s on this node:\n %s\n", + numGpusInUse, bPluralGpus ? "s" : "", + userSetGpuIds ? "user-" : "auto-", + numGpuTasksOnThisNode, + (numGpuTasksOnThisNode > 1) ? "s" : "", + numRanks, + (numRanks > 1) ? "s" : "", + gpuIdsString.c_str()); + } + + /* NOTE: this print is only for and on one physical node */ + GMX_LOG(mdlog.warning).appendText(output); +} + +} // namespace diff --git a/src/gromacs/taskassignment/reportgpuusage.h b/src/gromacs/taskassignment/reportgpuusage.h new file mode 100644 index 0000000000..8fba70b560 --- /dev/null +++ b/src/gromacs/taskassignment/reportgpuusage.h @@ -0,0 +1,83 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \defgroup module_taskassignment Assigning simulation tasks to hardware (taskassignment) + * \ingroup group_mdrun + * \brief Provides code that manages assignment of simulation tasks to hardware. + */ +/*! \internal + * \file + * \brief Declares routine for reporting GPU usage. + * + * \author Mark Abraham + * \ingroup module_taskassignment + */ +#ifndef GMX_TASKASSIGNMENT_REPORTGPUUSAGE_H +#define GMX_TASKASSIGNMENT_REPORTGPUUSAGE_H + +#include + +#include "gromacs/taskassignment/taskassignment.h" + +namespace gmx +{ + +class MDLogger; + +/*! \brief Log a report on how GPUs are being used on + * the ranks of the physical node of rank 0 of the simulation. + * + * \todo It could be useful to report also whether any nodes differed, + * and in what way. + * + * \param[in] mdlog Logging object. + * \param[in] userSetGpuIds Whether the user selected the GPU ids + * \param[in] gpuTaskAssignmentOnRanksOfThisNode The selected GPU IDs. + * \param[in] numGpuTasksOnThisNode The number of GPU tasks on this node. + * \param[in] numPpRanks Number of PP ranks on this node + * \param[in] bPrintHostName Print the hostname in the usage information + * + * \throws std::bad_alloc if out of memory */ +void +reportGpuUsage(const MDLogger &mdlog, + bool userSetGpuIds, + const GpuTaskAssignments &gpuTaskAssignmentOnRanksOfThisNode, + size_t numGpuTasksOnThisNode, + size_t numPpRanks, + bool bPrintHostName); + + +} // namespace + +#endif diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp index 705bbc8c5e..aae46bd895 100644 --- a/src/gromacs/taskassignment/resourcedivision.cpp +++ b/src/gromacs/taskassignment/resourcedivision.cpp @@ -51,6 +51,7 @@ #include +#include "gromacs/ewald/pme.h" #include "gromacs/hardware/cpuinfo.h" #include "gromacs/hardware/detecthardware.h" #include "gromacs/hardware/hardwaretopology.h" @@ -59,7 +60,6 @@ #include "gromacs/mdtypes/commrec.h" #include "gromacs/mdtypes/inputrec.h" #include "gromacs/mdtypes/md_enums.h" -#include "gromacs/taskassignment/hardwareassign.h" #include "gromacs/topology/mtop_util.h" #include "gromacs/topology/topology.h" #include "gromacs/utility/baseversion.h" @@ -82,13 +82,16 @@ //! Constant used to help minimize preprocessed code static const bool bHasOmpSupport = GMX_OPENMP; -#if GMX_THREAD_MPI -/* The minimum number of atoms per tMPI thread. With fewer atoms than this, - * the number of threads will get lowered. +/*! \brief The minimum number of atoms per thread-MPI thread when GPUs + * are present. With fewer atoms than this, the number of thread-MPI + * ranks will get lowered. */ static const int min_atoms_per_mpi_thread = 90; +/*! \brief The minimum number of atoms per GPU with thread-MPI + * active. With fewer atoms than this, the number of thread-MPI ranks + * will get lowered. + */ static const int min_atoms_per_gpu = 900; -#endif /* GMX_THREAD_MPI */ /**@{*/ /*! \brief Constants for implementing default divisions of threads */ @@ -276,9 +279,7 @@ gmx_unused static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo, return nrank; } - -#if GMX_THREAD_MPI - +//! Return whether hyper threading is enabled. static bool gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop) { @@ -288,6 +289,7 @@ gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop) namespace { +//! Handles checks for algorithms that must use a single rank. class SingleRankChecker { public: @@ -332,9 +334,9 @@ class SingleRankChecker */ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, gmx_hw_opt_t *hw_opt, - const std::vector &userGpuIds, - int numPmeRanks, + const std::vector &gpuIdsToUse, bool nonbondedOnGpu, + bool pmeOnGpu, const t_inputrec *inputrec, const gmx_mtop_t *mtop, const gmx::MDLogger &mdlog, @@ -346,31 +348,20 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo; const gmx::HardwareTopology &hwTop = *hwinfo->hardwareTopology; - /* If the user made a GPU task assignment, that sets the number of thread-MPI ranks. */ - int numGpuIdsSupplied = static_cast(userGpuIds.size()); - - /* TODO Here we handle the case where the user set GPU IDs, and - further below we handle the case where the algorithm does not - support multiple ranks. We need also to handle the case where - the user set multiple GPU IDs for an algorithm that cannot - handle multiple ranks. */ - if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0) + if (pmeOnGpu) { - /* If the user chose both mdrun -nt -gpu_id, is that consistent? */ - if (numPmeRanks <= 0) + GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) && + pme_gpu_supports_input(inputrec, nullptr), + "PME can't be on GPUs unless we are using PME"); + + // A single rank is all that is supported with PME on GPUs + if (hw_opt->nthreads_tmpi < 1) { - if (hw_opt->nthreads_tot > 0 && - (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0) - { - gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied); - } - return numGpuIdsSupplied; + return 1; } - else + if (hw_opt->nthreads_tmpi > 1) { - gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs " - "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what " - "distribution of threads to ranks you require."); + gmx_fatal(FARGS, "PME on GPUs is only supported with a single rank"); } } @@ -393,45 +384,15 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str()); } GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str()); - - if (numGpuIdsSupplied > 1) - { - gmx_fatal(FARGS, "You supplied %d GPU IDs but only 1 rank can be used " - "by this simulation. Supply only one GPU ID.", numGpuIdsSupplied); - } return 1; } } if (hw_opt->nthreads_tmpi > 0) { - if (numPmeRanks <= 0) - { - int numPpRanks = hw_opt->nthreads_tmpi; - if ((numGpuIdsSupplied > 0) && - (numGpuIdsSupplied != numPpRanks)) - { - gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d " - "GPU IDs supplied. The number of particle-particle (PP) ranks and the " - "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied); - } - } - else - { - int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks; - if ((numGpuIdsSupplied > 0) && - (numGpuIdsSupplied != numPpRanks)) - { - gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d " - "GPU IDs supplied. The number of particle-particle ranks and the " - "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied); - } - } /* Trivial, return the user's choice right away */ return hw_opt->nthreads_tmpi; } - GMX_RELEASE_ASSERT(numGpuIdsSupplied == 0, - "If mdrun -gpu_id had information, the number of ranks should have already been chosen"); // Now implement automatic selection of number of thread-MPI ranks nthreads_hw = hwinfo->nthreads_hw_avail; @@ -454,7 +415,7 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, /* nonbondedOnGpu might be false e.g. because this simulation uses * the group scheme, or is a rerun with energy groups. */ - ngpu = (nonbondedOnGpu ? hwinfo->gpu_info.n_dev_compatible : 0); + ngpu = (nonbondedOnGpu ? static_cast(gpuIdsToUse.size()) : 0); if (inputrec->cutoff_scheme == ecutsGROUP) { @@ -569,7 +530,6 @@ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, return nrank; } -#endif /* GMX_THREAD_MPI */ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo, @@ -719,12 +679,13 @@ void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo, //! Dump a \c hw_opt to \c fp. static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt) { - fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n", + fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s' gputasks '%s'\n", hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp, hw_opt->nthreads_omp_pme, - hw_opt->gpuIdTaskAssignment.c_str()); + hw_opt->gpuIdsAvailable.c_str(), + hw_opt->userGpuTaskAssignment.c_str()); } void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt, diff --git a/src/gromacs/taskassignment/resourcedivision.h b/src/gromacs/taskassignment/resourcedivision.h index cc97752042..95f719ed30 100644 --- a/src/gromacs/taskassignment/resourcedivision.h +++ b/src/gromacs/taskassignment/resourcedivision.h @@ -72,9 +72,9 @@ class MDLogger; */ int get_nthreads_mpi(const gmx_hw_info_t *hwinfo, gmx_hw_opt_t *hw_opt, - const std::vector &userGpuIds, - int numPmeRanks, + const std::vector &gpuIdsToUse, bool nonbondedOnGpu, + bool pmeOnGpu, const t_inputrec *inputrec, const gmx_mtop_t *mtop, const gmx::MDLogger &mdlog, diff --git a/src/gromacs/taskassignment/taskassignment.cpp b/src/gromacs/taskassignment/taskassignment.cpp new file mode 100644 index 0000000000..2b5655fae7 --- /dev/null +++ b/src/gromacs/taskassignment/taskassignment.cpp @@ -0,0 +1,296 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Defines helper and factory functionality for task assignment. + * + * Note that the GPU ID assignment could potentially match many + * different kinds of simulation setups, including ranks from multiple + * simulations, ranks from the same simulation, and/or ranks with duty + * only for particular tasks (e.g. PME-only ranks). Which GPU ID + * assignments are valid will naturally depend on the other run-time + * options given to mdrun, and the current capabilities of the + * implementation. + * + * \author Mark Abraham + * \ingroup module_taskassignment + */ +#include "gmxpre.h" + +#include "taskassignment.h" + +#include "config.h" + +#include +#include + +#include "gromacs/hardware/hw_info.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/taskassignment/usergpuids.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/stringutil.h" +#include "gromacs/utility/sysinfo.h" + +#include "findallgputasks.h" +#include "reportgpuusage.h" + +namespace gmx +{ + +namespace +{ + +/*! \brief Build data structure of types of GPU tasks on a rank, + * together with the mapped GPU device IDs, for all GPU tasks on all + * the ranks of this node. + * + * \param[in] gpuTasksOnRanksOfThisNode For each rank on this node, the set of tasks + * that are eligible to run on GPUs. + * \param[in] gpuIds The user-supplied GPU IDs. + */ +static GpuTaskAssignments +buildTaskAssignment(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode, + ArrayRef gpuIds) +{ + GpuTaskAssignments gpuTaskAssignmentOnRanksOfThisNode(gpuTasksOnRanksOfThisNode.size()); + + // Loop over the ranks on this node, and the tasks on each + // rank. For each task, take the next device ID from those + // provided by the user, to build a vector of mappings of task to + // ID, for each rank on this node. Note that if there have not + // been any GPU tasks identified, then gpuIds can be empty. + auto currentGpuId = gpuIds.begin(); + auto gpuTaskAssignmentOnRank = gpuTaskAssignmentOnRanksOfThisNode.begin(); + for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode) + { + gpuTaskAssignmentOnRank->reserve(gpuTasksOnRank.size()); + for (const auto &gpuTaskType : gpuTasksOnRank) + { + GMX_RELEASE_ASSERT(currentGpuId != gpuIds.end(), "Indexing out of range for GPU tasks"); + gpuTaskAssignmentOnRank->push_back({gpuTaskType, *currentGpuId}); + ++currentGpuId; + } + GMX_RELEASE_ASSERT(gpuTaskAssignmentOnRank->size() == gpuTasksOnRank.size(), + "Mismatch in number of GPU tasks on a rank with the number of elements in the resulting task assignment"); + ++gpuTaskAssignmentOnRank; + } + + return gpuTaskAssignmentOnRanksOfThisNode; +} + +/*! \brief Return whether a GPU device is shared between any ranks. + * + * Sharing GPUs among multiple ranks is possible via either user or + * automated selection. */ +static bool isAnyGpuSharedBetweenRanks(const GpuTaskAssignments &gpuTaskAssignments) +{ + // Loop over all ranks i, looking on all higher ranks j whether + // any tasks on them share GPU device IDs. + // + // TODO Should this functionality also consider whether tasks on + // the same rank are sharing a device? + for (size_t i = 0; i < gpuTaskAssignments.size(); ++i) + { + for (const auto &taskOnRankI : gpuTaskAssignments[i]) + { + for (size_t j = i+1; j < gpuTaskAssignments.size(); ++j) + { + for (const auto &taskOnRankJ : gpuTaskAssignments[j]) + { + if (taskOnRankI.deviceId_ == taskOnRankJ.deviceId_) + { + return true; + } + } + } + } + } + return false; +} + +//! Logs to \c mdlog information that may help a user learn how to let mdrun make a task assignment that runs faster. +void logPerformanceHints(const MDLogger &mdlog, + size_t numCompatibleGpus, + size_t numGpuTasksOnThisNode, + const GpuTaskAssignments &gpuTaskAssignments) +{ + if (numCompatibleGpus > numGpuTasksOnThisNode) + { + /* TODO In principle, this warning could be warranted only on + * some nodes, but we lack the infrastructure to do a good job + * of reporting that. */ + GMX_LOG(mdlog.warning).asParagraph(). + appendText("NOTE: You assigned the GPU tasks on a node such that some GPUs " + "available on that node are unused, which might not be optimal."); + } + + if (isAnyGpuSharedBetweenRanks(gpuTaskAssignments)) + { + GMX_LOG(mdlog.warning).asParagraph(). + appendText("NOTE: You assigned the same GPU ID(s) to multiple ranks, which is a good idea if you have measured the performance of alternatives."); + } +} + +//! Counts all the GPU tasks on this node. +size_t countGpuTasksOnThisNode(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode) +{ + size_t numGpuTasksOnThisNode = 0; + for (const auto &gpuTasksOnRank : gpuTasksOnRanksOfThisNode) + { + numGpuTasksOnThisNode += gpuTasksOnRank.size(); + } + return numGpuTasksOnThisNode; +} + +} // namespace + +GpuTaskAssignments::value_type +runTaskAssignment(const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const gmx_hw_info_t &hardwareInfo, + const MDLogger &mdlog, + const t_commrec *cr, + const std::vector &gpuTasksOnThisRank) +{ + /* Communicate among ranks on this node to find each task that can + * be executed on a GPU, on each rank. */ + auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, + cr->nrank_intranode, + cr->mpi_comm_physicalnode); + auto numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode); + + GpuTaskAssignments taskAssignmentOnRanksOfThisNode; + try + { + // Use the GPU IDs from the user if they supplied + // them. Otherwise, choose from the compatible GPUs. + // + // GPU ID assignment strings, if provided, cover all the ranks + // on a node. If nodes or the process placement on them are + // heterogeneous, then the GMX_GPU_ID environment variable + // must be set by a user who also wishes to direct GPU ID + // assignment. Thus this implementation of task assignment + // can assume it has a GPU ID assignment appropriate for the + // node upon which its process is running. + // + // Valid GPU ID assignments are `an ordered set of digits that + // identify GPU device IDs (e.g. as understood by the GPU + // runtime, and subject to environment modification such as + // with CUDA_VISIBLE_DEVICES) that will be used for the + // GPU-suitable tasks on all of the ranks of that node. + ArrayRef gpuIdsForTaskAssignment; + std::vector generatedGpuIds; + if (userGpuTaskAssignment.empty()) + { + generatedGpuIds = makeGpuIds(gpuIdsToUse, numGpuTasksOnThisNode); + gpuIdsForTaskAssignment = generatedGpuIds; + } + else + { + if (numGpuTasksOnThisNode != userGpuTaskAssignment.size()) + { + // TODO Decorating the message with hostname should be + // the job of an error-reporting module. + char host[STRLEN]; + gmx_gethostname(host, STRLEN); + + GMX_THROW(InconsistentInputError + (formatString("There were %zu GPU tasks assigned on node %s, but %zu GPU tasks were " + "identified, and these must match. Reconsider your GPU task assignment, " + "number of ranks, or your use of the -nb, -pme, and -npme options.", userGpuTaskAssignment.size(), + host, numGpuTasksOnThisNode))); + } + // Did the user choose compatible GPUs? + checkUserGpuIds(hardwareInfo.gpu_info, gpuIdsToUse, userGpuTaskAssignment); + + gpuIdsForTaskAssignment = userGpuTaskAssignment; + } + taskAssignmentOnRanksOfThisNode = + buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment); + + } + catch (const std::exception &ex) + { + // TODO This implementation is quite similar to that of + // processExceptionAsFatalError (which implements + // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear + // how we should involve MPI in the implementation of error + // handling. + if (cr->rank_intranode == 0) + { + printFatalErrorMessage(stderr, ex); + } + + if (PAR(cr)) + { +#if GMX_MPI + MPI_Barrier(cr->mpi_comm_mysim); +#endif + } + if (MULTISIM(cr)) + { +#if GMX_MPI + MPI_Barrier(cr->ms->mpi_comm_masters); +#endif + } + + gmx_exit_on_fatal_error(ExitType_Abort, 1); + } + + reportGpuUsage(mdlog, !userGpuTaskAssignment.empty(), taskAssignmentOnRanksOfThisNode, + numGpuTasksOnThisNode, cr->nrank_intranode, cr->nnodes > 1); + + // If the user chose a task assignment, give them some hints where appropriate. + if (!userGpuTaskAssignment.empty()) + { + logPerformanceHints(mdlog, gpuIdsToUse.size(), + numGpuTasksOnThisNode, + taskAssignmentOnRanksOfThisNode); + } + + return taskAssignmentOnRanksOfThisNode[cr->rank_intranode]; + + // TODO There is no check that mdrun -nb gpu or -pme gpu or + // -gpu_id is actually being implemented such that nonbonded tasks + // are being run on compatible GPUs, on all applicable ranks. That + // would require communication. +} + +} // namespace diff --git a/src/gromacs/taskassignment/taskassignment.h b/src/gromacs/taskassignment/taskassignment.h new file mode 100644 index 0000000000..294230543e --- /dev/null +++ b/src/gromacs/taskassignment/taskassignment.h @@ -0,0 +1,123 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2017, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \defgroup module_taskassignment Assigning simulation tasks to hardware (taskassignment) + * \ingroup group_mdrun + * \brief Provides code that manages assignment of simulation tasks to hardware. + */ +/*! \libinternal + * \file + * \brief Declares high-level functionality for managing assigning + * tasks on ranks of a node to hardware on that node, and the factory + * function to build the correct flavours of gmx::INodeTaskAssigner + * required to implement the user's requirements. + * + * \author Mark Abraham + * \ingroup module_taskassignment + * \inlibraryapi + */ +#ifndef GMX_TASKASSIGNMENT_TASKASSIGNMENT_H +#define GMX_TASKASSIGNMENT_TASKASSIGNMENT_H + +#include + +struct gmx_hw_info_t; +struct t_commrec; + +namespace gmx +{ + +class MDLogger; + +/*! \brief Types of compute tasks that can be run on a GPU. + * + * These names refer to existing practice in GROMACS, which is not + * strictly accurate. */ +enum class GpuTask : int +{ + //! Short-ranged interactions. + Nonbonded, + //! Long-ranged interactions. + Pme +}; + +/*! \libinternal + * \brief Specifies the GPU deviceID_ available for task_ to use. */ +struct GpuTaskMapping +{ + //! The type of this GPU task. + GpuTask task_; + //! Device ID on this node to which this GPU task is mapped. + int deviceId_; +}; + +//! Container of GPU tasks on a rank, specifying the task type and GPU device ID, e.g. potentially ready for consumption by the modules on that rank. +using GpuTaskAssignment = std::vector ; +//! Container of compute tasks suitable to run on a GPU e.g. on each rank of a node. +using GpuTasksOnRanks = std::vector< std::vector >; +//! Container of RankGpuTaskAssignments e.g. for all ranks on a node. +using GpuTaskAssignments = std::vector; + +/*! \brief Coordinate the final stages of task assignment and + * reporting, and return the assignment for this rank. + * + * Communicates between ranks on a node to coordinate task assignment + * between them onto available hardware, e.g. accelerators. + * + * Releases the taskAssigner once its work is complete. + * + * \param[in] gpuIdsToUse The compatible GPUs that the user permitted us to use. + * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. + * \param[in] hardwareInfo The detected hardware + * \param[in] mdlog Logging object to write to. + * \param[in] cr Communication object. + * \param[in] gpuTasksOnThisRank Information about what GPU tasks + * exist on this rank. + * + * \returns A GPU task assignment for this rank. + * + * \throws std::bad_alloc If out of memory. + * InconsistentInputError If user and/or detected inputs are inconsistent. + */ +GpuTaskAssignments::value_type +runTaskAssignment(const std::vector &gpuIdsToUse, + const std::vector &userGpuTaskAssignment, + const gmx_hw_info_t &hardwareInfo, + const MDLogger &mdlog, + const t_commrec *cr, + const std::vector &gpuTasksOnThisRank); + +} // namespace + +#endif diff --git a/src/gromacs/taskassignment/usergpuids.h b/src/gromacs/taskassignment/usergpuids.h index 280a819f0a..2209fe258a 100644 --- a/src/gromacs/taskassignment/usergpuids.h +++ b/src/gromacs/taskassignment/usergpuids.h @@ -60,7 +60,7 @@ namespace gmx /*! \brief Parse a GPU ID string into a container describing the task types and associated device IDs. * * \param[in] gpuIdString String like "013" or "0,1,3" typically - * supplied by the user to mdrun -gpu_id. + * supplied by the user to mdrun -gpu_id or -gputasks. * Must contain only decimal digits, or only decimal * digits separated by comma delimiters. A terminal * comma is accceptable (and required to specify a @@ -86,16 +86,17 @@ makeGpuIds(const std::vector &compatibleGpus, size_t numGpuTasks); /*! \brief Convert a container of GPU deviced IDs to a string that - * can be used by gmx tune_pme as input to mdrun -gpu_id. + * can be used by gmx tune_pme as input to mdrun -gputasks. * - * Produce a valid input for mdrun -gpu_id that refers to the device + * Produce a valid input for mdrun -gputasks that refers to the device * IDs in \c gpuIds but produces a mapping for \c - * totalNumberOfTasks tasks. + * totalNumberOfTasks tasks. Note that gmx tune_pme does not + * currently support filling mdrun -gputasks. * * \param[in] gpuIds Container of device IDs * \param[in] totalNumberOfTasks Total number of tasks for the output mapping produced by the returned string. * - * \returns A string that is suitable to pass to mdrun -gpu_id. + * \returns A string that is suitable to pass to mdrun -gputasks. * * \throws std::bad_alloc If out of memory. */ diff --git a/src/programs/mdrun/mdrun.cpp b/src/programs/mdrun/mdrun.cpp index 8cab874a42..9af3bb7ca3 100644 --- a/src/programs/mdrun/mdrun.cpp +++ b/src/programs/mdrun/mdrun.cpp @@ -251,7 +251,8 @@ int Mdrunner::mainFunction(int argc, char *argv[]) const char *nbpu_opt_choices[] = { nullptr, "auto", "cpu", "gpu", nullptr }; gmx_bool bTryToAppendFiles = TRUE; - const char *gpuIdTaskAssignment = ""; + const char *gpuIdsAvailable = ""; + const char *userGpuTaskAssignment = ""; ImdOptions &imdOptions = mdrunOptions.imdOptions; @@ -277,8 +278,10 @@ int Mdrunner::mainFunction(int argc, char *argv[]) "The lowest logical core number to which mdrun should pin the first thread" }, { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride}, "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" }, - { "-gpu_id", FALSE, etSTR, {&gpuIdTaskAssignment}, - "List of GPU device id-s to use, specifies the per-node PP rank to GPU mapping" }, + { "-gpu_id", FALSE, etSTR, {&gpuIdsAvailable}, + "List of unique GPU device IDs available to use" }, + { "-gputasks", FALSE, etSTR, {&userGpuTaskAssignment}, + "List of GPU device IDs, mapping each PP task on each node to a device" }, { "-ddcheck", FALSE, etBOOL, {&domdecOptions.checkBondedInteractions}, "Check for all bonded interactions with DD" }, { "-ddbondcomm", FALSE, etBOOL, {&domdecOptions.useBondedCommunication}, @@ -389,24 +392,42 @@ int Mdrunner::mainFunction(int argc, char *argv[]) return 0; } - // Handle the option that permits the user to select a GPU task - // assignment, which could be in an environment variable (so that - // there is a way to customize it, when using MPI in heterogeneous - // contexts). + // Handle the options that permits the user to either declare + // which compatible GPUs are availble for use, or to select a GPU + // task assignment. Either could be in an environment variable (so + // that there is a way to customize it, when using MPI in + // heterogeneous contexts). { // TODO Argument parsing can't handle std::string. We should // fix that by changing the parsing, once more of the roles of // handling, validating and implementing defaults for user // command-line options have been seperated. - hw_opt.gpuIdTaskAssignment = gpuIdTaskAssignment; + hw_opt.gpuIdsAvailable = gpuIdsAvailable; + hw_opt.userGpuTaskAssignment = userGpuTaskAssignment; + const char *env = getenv("GMX_GPU_ID"); if (env != nullptr) { - if (!hw_opt.gpuIdTaskAssignment.empty()) + if (!hw_opt.gpuIdsAvailable.empty()) { gmx_fatal(FARGS, "GMX_GPU_ID and -gpu_id can not be used at the same time"); } - hw_opt.gpuIdTaskAssignment = env; + hw_opt.gpuIdsAvailable = env; + } + + env = getenv("GMX_GPUTASKS"); + if (env != nullptr) + { + if (!hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "GMX_GPUTASKS and -gputasks can not be used at the same time"); + } + hw_opt.userGpuTaskAssignment = env; + } + + if (!hw_opt.gpuIdsAvailable.empty() && !hw_opt.userGpuTaskAssignment.empty()) + { + gmx_fatal(FARGS, "-gpu_id and -gputasks cannot be used at the same time"); } } diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp index ed9209d564..53c55ff09f 100644 --- a/src/programs/mdrun/runner.cpp +++ b/src/programs/mdrun/runner.cpp @@ -100,8 +100,9 @@ #include "gromacs/pbcutil/pbc.h" #include "gromacs/pulling/pull.h" #include "gromacs/pulling/pull_rotation.h" -#include "gromacs/taskassignment/hardwareassign.h" +#include "gromacs/taskassignment/decidegpuusage.h" #include "gromacs/taskassignment/resourcedivision.h" +#include "gromacs/taskassignment/taskassignment.h" #include "gromacs/taskassignment/usergpuids.h" #include "gromacs/timing/wallcycle.h" #include "gromacs/topology/mtop_util.h" @@ -117,6 +118,7 @@ #include "gromacs/utility/pleasecite.h" #include "gromacs/utility/programcontext.h" #include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" #include "deform.h" #include "md.h" @@ -134,13 +136,6 @@ matrix deform_init_box_tpx; //! MPI variable for use in pressure scaling tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER; -#if GMX_THREAD_MPI -/* The minimum number of atoms per tMPI thread. With fewer atoms than this, - * the number of threads will get lowered. - */ -#define MIN_ATOMS_PER_MPI_THREAD 90 -#define MIN_ATOMS_PER_GPU 900 - namespace gmx { @@ -209,6 +204,7 @@ t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) // Mdrunner. spawnedMdrunner.fnm = dup_tfn(this->nfile, fnm); +#if GMX_THREAD_MPI /* now spawn new threads that start mdrunner_start_fn(), while the main thread returns, we set thread affinity later */ if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, @@ -216,14 +212,15 @@ t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) { GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads")); } +#else + GMX_UNUSED_VALUE(mdrunner_start_fn); +#endif return reinitialize_commrec_for_this_thread(cr); } } // namespace -#endif /* GMX_THREAD_MPI */ - /*! \brief Initialize variables for Verlet scheme simulation */ static void prepare_verlet_scheme(FILE *fplog, t_commrec *cr, @@ -318,47 +315,12 @@ static void override_nsteps_cmdline(const gmx::MDLogger &mdlog, namespace gmx { -//! Halt the run if there are inconsistences between user choices to run with GPUs and/or hardware detection. -static void exitIfCannotForceGpuRun(bool requirePhysicalGpu, - EmulateGpuNonbonded emulateGpuNonbonded, - bool useVerletScheme, - bool compatibleGpusFound) -{ - /* Was GPU acceleration either explicitly (-nb gpu) or implicitly - * (gpu ID passed) requested? */ - if (!requirePhysicalGpu) - { - return; - } - - if (GMX_GPU == GMX_GPU_NONE) - { - gmx_fatal(FARGS, "GPU acceleration requested, but %s was compiled without GPU support!", - gmx::getProgramContext().displayName()); - } - - if (emulateGpuNonbonded == EmulateGpuNonbonded::Yes) - { - gmx_fatal(FARGS, "GPU emulation cannot be requested together with GPU acceleration!"); - } - - if (!useVerletScheme) - { - gmx_fatal(FARGS, "GPU acceleration requested, but can't be used without cutoff-scheme=Verlet"); - } - - if (!compatibleGpusFound) - { - gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected."); - } -} - -/*! \brief Return whether GPU acceleration is useful with the given settings. +/*! \brief Return whether GPU acceleration of nonbondeds is useful with the given settings. * * If not, logs a message about falling back to CPU code. */ -static bool gpuAccelerationIsUseful(const MDLogger &mdlog, - const t_inputrec *ir, - bool doRerun) +static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger &mdlog, + const t_inputrec *ir, + bool doRerun) { if (doRerun && ir->opts.ngener > 1) { @@ -430,6 +392,31 @@ static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr) return builder.build(); } +//! Make a TaskTarget from an mdrun argument string. +static TaskTarget findTaskTarget(const char *optionString) +{ + TaskTarget returnValue = TaskTarget::Auto; + + if (strncmp(optionString, "auto", 3) == 0) + { + returnValue = TaskTarget::Auto; + } + else if (strncmp(optionString, "cpu", 3) == 0) + { + returnValue = TaskTarget::Cpu; + } + else if (strncmp(optionString, "gpu", 3) == 0) + { + returnValue = TaskTarget::Gpu; + } + else + { + GMX_ASSERT(false, "Option string should have been checked for sanity already"); + } + + return returnValue; +} + int Mdrunner::mdrunner() { matrix box; @@ -467,27 +454,44 @@ int Mdrunner::mdrunner() bool doMembed = opt2bSet("-membed", nfile, fnm); bool doRerun = mdrunOptions.rerun; - /* Handle GPU-related user options. Later, we check consistency - * with things like whether support is compiled, or tMPI thread - * count. */ + // Handle task-assignment related user options. EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ? EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); - std::vector userGpuIds; + std::vector gpuIdsAvailable; try { - userGpuIds = parseUserGpuIds(hw_opt.gpuIdTaskAssignment); + gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable); + // TODO We could put the GPU IDs into a std::map to find + // duplicates, but for the small numbers of IDs involved, this + // code is simple and fast. + for (size_t i = 0; i != gpuIdsAvailable.size(); ++i) + { + for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j) + { + if (gpuIdsAvailable[i] == gpuIdsAvailable[j]) + { + GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str()))); + } + } + } } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; - bool forceUseCpu = (strncmp(nbpu_opt, "cpu", 3) == 0); - if (!userGpuIds.empty() && forceUseCpu) + std::vector userGpuTaskAssignment; + try { - gmx_fatal(FARGS, "GPU IDs were specified, and short-ranged interactions were assigned to the CPU. Make no more than one of these choices."); + userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment); } - bool forceUsePhysicalGpu = (strncmp(nbpu_opt, "gpu", 3) == 0) || !userGpuIds.empty(); - bool tryUsePhysicalGpu = (strncmp(nbpu_opt, "auto", 4) == 0) && userGpuIds.empty() && (emulateGpuNonbonded == EmulateGpuNonbonded::No); - GMX_RELEASE_ASSERT(!(forceUsePhysicalGpu && tryUsePhysicalGpu), "Must either force use of " - "GPUs for short-ranged interactions, or try to use them, not both."); + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + auto nonbondedTarget = findTaskTarget(nbpu_opt); + // TODO Connect these to actual mdrun arguments and some functionality + const char *pme_opt = "cpu"; + auto pmeTarget = findTaskTarget(pme_opt); + + // TODO find a sensible home and behaviour for this + //const char *pme_fft_opt = "auto"; + //auto pmeFftTarget = findTaskTarget(pme_fft_opt); + const PmeRunMode pmeRunMode = PmeRunMode::CPU; //TODO this is a placeholder as PME on GPU is not permitted yet //TODO should there exist a PmeRunMode::None value for consistency? @@ -501,6 +505,33 @@ int Mdrunner::mdrunner() gmx_print_detected_hardware(fplog, cr, mdlog, hwinfo); + std::vector gpuIdsToUse; + auto compatibleGpus = getCompatibleGpus(hwinfo->gpu_info); + if (gpuIdsAvailable.empty()) + { + gpuIdsToUse = compatibleGpus; + } + else + { + for (const auto &availableGpuId : gpuIdsAvailable) + { + bool availableGpuIsCompatible = false; + for (const auto &compatibleGpuId : compatibleGpus) + { + if (availableGpuId == compatibleGpuId) + { + availableGpuIsCompatible = true; + break; + } + } + if (!availableGpuIsCompatible) + { + gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId); + } + gpuIdsToUse.push_back(availableGpuId); + } + } + if (fplog != nullptr) { /* Print references after all software/hardware printing */ @@ -523,45 +554,21 @@ int Mdrunner::mdrunner() /* Read (nearly) all data required for the simulation */ read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, globalState.get(), mtop); - exitIfCannotForceGpuRun(forceUsePhysicalGpu, - emulateGpuNonbonded, - inputrec->cutoff_scheme == ecutsVERLET, - compatibleGpusFound(hwinfo->gpu_info)); - - if (inputrec->cutoff_scheme == ecutsVERLET) - { - /* TODO This logic could run later, e.g. before -npme -1 - is handled. If inputrec has already been communicated, - then the resulting tryUsePhysicalGpu does not need to - be communicated. */ - if ((tryUsePhysicalGpu || forceUsePhysicalGpu) && - !gpuAccelerationIsUseful(mdlog, inputrec, doRerun)) - { - /* Fallback message printed by nbnxn_acceleration_supported */ - if (forceUsePhysicalGpu) - { - gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings"); - } - tryUsePhysicalGpu = false; - } - } - else + if (inputrec->cutoff_scheme != ecutsVERLET) { if (nstlist_cmdline > 0) { gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme"); } - if (compatibleGpusFound(hwinfo->gpu_info)) + if (!compatibleGpus.empty()) { GMX_LOG(mdlog.warning).asParagraph().appendText( "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" " To use a GPU, set the mdp option: cutoff-scheme = Verlet"); } - tryUsePhysicalGpu = false; } } - bool nonbondedOnGpu = (tryUsePhysicalGpu || forceUsePhysicalGpu) && compatibleGpusFound(hwinfo->gpu_info); /* Check and update the hardware options for internal consistency */ check_and_update_hw_opt_1(&hw_opt, cr, domdecOptions.numPmeRanks); @@ -570,8 +577,7 @@ int Mdrunner::mdrunner() gmx_check_thread_affinity_set(mdlog, cr, &hw_opt, hwinfo->nthreads_hw_avail, FALSE); -#if GMX_THREAD_MPI - if (SIMMASTER(cr)) + if (GMX_THREAD_MPI && SIMMASTER(cr)) { if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0) { @@ -584,6 +590,26 @@ int Mdrunner::mdrunner() */ check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // If the user specified the number of ranks, then we must + // respect that, but in default mode, we need to allow for + // the number of GPUs to choose the number of ranks. + + useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi + (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded, + inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun), + hw_opt.nthreads_tmpi); + auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); + auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi + (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, + canUseGpuForPme, hw_opt.nthreads_tmpi); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; /* Determine how many thread-MPI ranks to start. * * TODO Over-writing the user-supplied value here does @@ -591,9 +617,9 @@ int Mdrunner::mdrunner() * correctly. */ hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, &hw_opt, - userGpuIds, - domdecOptions.numPmeRanks, - nonbondedOnGpu, + gpuIdsToUse, + useGpuForNonbonded, + useGpuForPme, inputrec, mtop, mdlog, doMembed); @@ -606,16 +632,37 @@ int Mdrunner::mdrunner() // reinitialize_commrec_for_this_thread. Find a way to express // this better. } -#endif /* END OF CAUTION: cr is now reliable */ if (PAR(cr)) { /* now broadcast everything to the non-master nodes/threads: */ init_parallel(cr, inputrec, mtop); + } - gmx_bcast_sim(sizeof(nonbondedOnGpu), &nonbondedOnGpu, cr); + // Now each rank knows the inputrec that SIMMASTER read and used, + // and (if applicable) cr->nnodes has been assigned the number of + // thread-MPI ranks that have been chosen. The ranks can now all + // run the task-deciding functions and will agree on the result + // without needing to communicate. + // + // TODO Should we do the communication in debug mode to support + // having an assertion? + // + // Note that these variables describe only their own node. + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, + emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun)); + auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype); + auto canUseGpuForPme = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes); } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + // TODO: Error handling mdModules.assignOptionsToModules(*inputrec->params, nullptr); @@ -687,7 +734,7 @@ int Mdrunner::mdrunner() domdecOptions.numPmeRanks = 0; } - if (nonbondedOnGpu && domdecOptions.numPmeRanks < 0) + if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0) { /* With GPUs we don't automatically use PME-only ranks. PME ranks can * improve performance with many threads per GPU, since our OpenMP @@ -790,7 +837,7 @@ int Mdrunner::mdrunner() if (inputrec->cutoff_scheme == ecutsVERLET) { prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, mtop, box, - nonbondedOnGpu || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); + useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); } if (PAR(cr) && !(EI_TPI(inputrec->eI) || @@ -871,44 +918,72 @@ int Mdrunner::mdrunner() } #endif - // Contains the ID of the GPU used by each PP rank on this node, - // indexed by that rank. Empty if no GPUs are selected for use on - // this node. - std::vector gpuTaskAssignment; - if (nonbondedOnGpu) - { - // Currently the DD code assigns duty to ranks that can - // include PP work that currently can be executed on a single - // GPU, if present and compatible. This has to be coordinated - // across PP ranks on a node, with possible multiple devices - // or sharing devices on a node, either from the user - // selection, or automatically. - // - // GPU ID assignment strings, if provided, cover all the ranks on - // a node. If nodes or the process placement on them are - // heterogeneous, then the GMX_GPU_ID environment variable must be - // set by a user who also wishes to direct GPU ID assignment. - // Thus the implementation of task assignment can assume it has a - // GPU ID assignment appropriate for the node upon which its - // process is running. - // - // Valid GPU ID assignments are an ordered set of digits that - // identify GPU device IDs (e.g. as understood by the GPU runtime, - // and subject to environment modification such as with - // CUDA_VISIBLE_DEVICES) that will be used for the GPU-suitable - // tasks on all of the ranks of that node. - bool rankCanUseGpu = thisRankHasDuty(cr, DUTY_PP); - gpuTaskAssignment = mapPpRanksToGpus(rankCanUseGpu, cr, hwinfo->gpu_info, hwinfo->compatibleGpus, userGpuIds); - } - - reportGpuUsage(mdlog, hwinfo->gpu_info, !userGpuIds.empty(), - gpuTaskAssignment, cr->nrank_pp_intranode, cr->nnodes > 1); - - if (!gpuTaskAssignment.empty()) - { - GMX_RELEASE_ASSERT(cr->nrank_pp_intranode == static_cast(gpuTaskAssignment.size()), - "The number of PP ranks on each node must equal the number of GPU tasks used on each node"); + // Build a data structure that expresses which kinds of non-bonded + // task are handled by this rank. + // + // TODO Later, this might become a loop over all registered modules + // relevant to the mdp inputs, to find those that have such tasks. + // + // TODO This could move before init_domain_decomposition() as part + // of refactoring that separates the responsibility for duty + // assignment from setup for communication between tasks, and + // setup for tasks handled with a domain (ie including short-ranged + // tasks, bonded tasks, etc.). + // + // Note that in general useGpuForNonbonded, etc. can have a value + // that is inconsistent with the presence of actual GPUs on any + // rank, and that is not known to be a problem until the + // duty of the ranks on a node become node. + // + // TODO Later we might need the concept of computeTasksOnThisRank, + // from which we construct gpuTasksOnThisRank. + // + // Currently the DD code assigns duty to ranks that can + // include PP work that currently can be executed on a single + // GPU, if present and compatible. This has to be coordinated + // across PP ranks on a node, with possible multiple devices + // or sharing devices on a node, either from the user + // selection, or automatically. + auto haveGpus = !gpuIdsToUse.empty(); + std::vector gpuTasksOnThisRank; + if (thisRankHasDuty(cr, DUTY_PP)) + { + if (useGpuForNonbonded) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Nonbonded); + } + else if (nonbondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected."); + } + } + } + // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not. + if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME))) + { + if (useGpuForPme) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Pme); + } + else if (pmeTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected."); + } + } + } + + GpuTaskAssignment gpuTaskAssignment; + try + { + // Produce the task assignment for this rank. + gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo, + mdlog, cr, gpuTasksOnThisRank); } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; /* Prevent other ranks from continuing after an issue was found * and reported as a fatal error. @@ -923,27 +998,33 @@ int Mdrunner::mdrunner() { MPI_Barrier(cr->mpi_comm_mysim); } + if (MULTISIM(cr)) + { + MPI_Barrier(cr->ms->mpi_comm_masters); + } #endif /* Now that we know the setup is consistent, check for efficiency */ check_resource_division_efficiency(hwinfo, hw_opt.nthreads_tot, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet, cr, mdlog); - gmx_device_info_t *shortRangedDeviceInfo = nullptr; - int shortRangedDeviceId = -1; + gmx_device_info_t *nonbondedDeviceInfo = nullptr; + int nonbondedDeviceId = -1; if (thisRankHasDuty(cr, DUTY_PP)) { if (!gpuTaskAssignment.empty()) { - shortRangedDeviceId = gpuTaskAssignment[cr->rank_pp_intranode]; - shortRangedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, shortRangedDeviceId); + GMX_RELEASE_ASSERT(gpuTaskAssignment.size() == 1, "A valid GPU assignment can only have one task per rank"); + GMX_RELEASE_ASSERT(gpuTaskAssignment[0].task_ == gmx::GpuTask::Nonbonded, "A valid GPU assignment can only include short-ranged tasks"); + nonbondedDeviceId = gpuTaskAssignment[0].deviceId_; + nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId); } } if (DOMAINDECOMP(cr)) { /* When we share GPUs over ranks, we need to know this for the DLB */ - dd_setup_dlb_resource_sharing(cr, shortRangedDeviceId); + dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId); } /* getting number of PP/PME threads @@ -989,7 +1070,7 @@ int Mdrunner::mdrunner() opt2fn("-table", nfile, fnm), opt2fn("-tablep", nfile, fnm), getFilenm("-tableb", nfile, fnm), - shortRangedDeviceInfo, + nonbondedDeviceInfo, FALSE, pforce); @@ -1226,7 +1307,7 @@ int Mdrunner::mdrunner() } /* Free GPU memory and context */ - free_gpu_resources(fr, cr, shortRangedDeviceInfo); + free_gpu_resources(fr, cr, nonbondedDeviceInfo); if (doMembed) { -- 2.22.0