From e9ca6039cd82cc4184372fee701b212a4b2d1ee1 Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Wed, 22 Apr 2015 11:42:57 +0200
Subject: [PATCH] Refactored mdrun resource division

The functions concerned with dividing MPI/thread resources have been
moved from runner.cpp to the new file resource-division.cpp.

Set min_atoms_per_mpi_rank to 1 for NM and TPI, to avoid a compiler
warning and to support NM on systems with only a few atoms.

Change-Id: I94f770c47b4b5ca03de8f5a29a165631796204ad
---
 src/programs/mdrun/resource-division.cpp | 422 +++++++++++++++++++++++
 src/programs/mdrun/resource-division.h   |  64 ++++
 src/programs/mdrun/runner.cpp            | 360 +------------------
 3 files changed, 487 insertions(+), 359 deletions(-)
 create mode 100644 src/programs/mdrun/resource-division.cpp
 create mode 100644 src/programs/mdrun/resource-division.h

diff --git a/src/programs/mdrun/resource-division.cpp b/src/programs/mdrun/resource-division.cpp
new file mode 100644
index 0000000000..70f69c7d20
--- /dev/null
+++ b/src/programs/mdrun/resource-division.cpp
@@ -0,0 +1,422 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#include "gmxpre.h"
+
+#include "resource-division.h"
+
+#include "config.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <algorithm>
+
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/utility/fatalerror.h"
+
+
+#ifdef GMX_THREAD_MPI
+/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
+ * the number of threads will get lowered.
+ */
+static const int min_atoms_per_mpi_thread = 90;
+static const int min_atoms_per_gpu        = 900;
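[Review note, not part of the patch: these two constants bound the automatically chosen rank count by system size, i.e. at most natoms/90 thread-MPI ranks on CPUs and natoms/900 ranks when each rank drives a GPU. As a worked example, a 6000-atom system allows at most 66 CPU ranks but only 6 GPU-backed ranks. For NM and TPI the limit is set to 1 atom per rank further down, since those integrators divide dimensions/steps rather than atoms over the ranks; the old limit of 0 (see the removed runner.cpp code below) could never reduce the rank count and, presumably, risked a division-by-zero warning in natoms/min_atoms_per_mpi_rank.]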
+
+static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+                                        const gmx_hw_opt_t  *hw_opt,
+                                        int                  nthreads_tot,
+                                        int                  ngpu)
+{
+    int nthreads_tmpi;
+
+    /* There are no separate PME nodes here, as we ensured in
+     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+     * and a conditional ensures we would not have ended up here.
+     * Note that separate PME nodes might be switched on later.
+     */
+    if (ngpu > 0)
+    {
+        nthreads_tmpi = ngpu;
+        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+        {
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+    else if (hw_opt->nthreads_omp > 0)
+    {
+        /* Here we could oversubscribe; when we do, we issue a warning later */
+        nthreads_tmpi = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
+    }
+    else
+    {
+        /* TODO choose nthreads_omp based on hardware topology
+           when we have a hardware topology detection library */
+        /* In general, when running up to 4 threads, OpenMP should be faster.
+         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
+         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
+         * even on two CPUs it's usually faster (but with many OpenMP threads
+         * it could be faster not to use HT; currently we always use HT).
+         * On Nehalem/Westmere we want to avoid running 16 threads over
+         * two CPUs with HT, so we need a limit<16; thus we use 12.
+         * A reasonable limit for Intel Sandy and Ivy Bridge,
+         * not knowing the topology, is 16 threads.
+         * Below we check for Intel and AVX, which for now includes
+         * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
+         * model numbers we ensure also future Intel CPUs are covered.
+         */
+        const int nthreads_omp_always_faster           =  4;
+        const int nthreads_omp_always_faster_Nehalem   = 12;
+        const int nthreads_omp_always_faster_Intel_AVX = 16;
+        gmx_bool  bIntelAVX;
+
+        bIntelAVX =
+            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
+             gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_AVX));
+
+        if (nthreads_tot <= nthreads_omp_always_faster ||
+            ((gmx_cpuid_is_intel_nehalem(hwinfo->cpuid_info) && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
+             (bIntelAVX && nthreads_tot <= nthreads_omp_always_faster_Intel_AVX)))
+        {
+            /* Use pure OpenMP parallelization */
+            nthreads_tmpi = 1;
+        }
+        else
+        {
+            /* Don't use OpenMP parallelization */
+            nthreads_tmpi = nthreads_tot;
+        }
+    }
+
+    return nthreads_tmpi;
+}
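[Review note, not part of the patch: the heuristic above reduces to a small decision tree. A minimal standalone sketch, with the GROMACS hardware queries replaced by plain parameters; divide_ranks, is_intel_avx and is_nehalem are illustrative names, not GROMACS API:

#include <stdio.h>

static int divide_ranks(int nthreads_tot, int ngpu, int nthreads_omp_req,
                        int is_intel_avx, int is_nehalem)
{
    if (ngpu > 0)
    {
        /* One thread-MPI rank per GPU, capped by the total thread count */
        return (nthreads_tot > 0 && nthreads_tot < ngpu) ? nthreads_tot : ngpu;
    }
    if (nthreads_omp_req > 0)
    {
        /* Honor the requested OpenMP width; always at least one rank */
        int nranks = nthreads_tot/nthreads_omp_req;
        return nranks > 1 ? nranks : 1;
    }
    /* Same cut-offs as the patch: pure OpenMP up to 4 threads in general,
     * up to 12 on Nehalem/Westmere, up to 16 on AVX-capable Intel CPUs */
    if (nthreads_tot <= 4 ||
        (is_nehalem && nthreads_tot <= 12) ||
        (is_intel_avx && nthreads_tot <= 16))
    {
        return 1;            /* one rank, pure OpenMP */
    }
    return nthreads_tot;     /* one rank per thread, no OpenMP */
}

int main(void)
{
    printf("%d\n", divide_ranks(16, 2, 0, 1, 0)); /* 2: one rank per GPU   */
    printf("%d\n", divide_ranks(16, 0, 4, 0, 0)); /* 4: 16/4 OpenMP width  */
    printf("%d\n", divide_ranks(16, 0, 0, 1, 0)); /* 1: <=16 on Intel AVX  */
    printf("%d\n", divide_ranks(32, 0, 0, 1, 0)); /* 32: pure thread-MPI   */
    return 0;
}
]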
+
+
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At this point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
+int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+                     const gmx_hw_opt_t  *hw_opt,
+                     const t_inputrec    *inputrec,
+                     const gmx_mtop_t    *mtop,
+                     const t_commrec     *cr,
+                     FILE                *fplog)
+{
+    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
+    int      min_atoms_per_mpi_rank;
+    gmx_bool bCanUseGPU;
+
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        /* Trivial, return right away */
+        return hw_opt->nthreads_tmpi;
+    }
+
+    nthreads_hw = hwinfo->nthreads_hw_avail;
+
+    /* How many total (#tMPI*#OpenMP) threads can we start? */
+    if (hw_opt->nthreads_tot > 0)
+    {
+        nthreads_tot_max = hw_opt->nthreads_tot;
+    }
+    else
+    {
+        nthreads_tot_max = nthreads_hw;
+    }
+
+    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
+                  hwinfo->gpu_info.n_dev_compatible > 0);
+    if (bCanUseGPU)
+    {
+        ngpu = hwinfo->gpu_info.n_dev_compatible;
+    }
+    else
+    {
+        ngpu = 0;
+    }
+
+    if (inputrec->cutoff_scheme == ecutsGROUP)
+    {
+        /* We checked this before, but it doesn't hurt to do it once more */
+        assert(hw_opt->nthreads_omp == 1);
+    }
+
+    nthreads_tmpi =
+        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+
+    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+    {
+        /* Dims/steps are divided over the nodes instead of splitting the atoms.
+         * With NM we can't have more ranks than #atoms*#dim. With TPI it's
+         * unlikely we have fewer atoms than ranks, and if so, communication
+         * would become a bottleneck, so we set the limit to 1 atom/rank.
+         */
+        min_atoms_per_mpi_rank = 1;
+    }
+    else
+    {
+        if (bCanUseGPU)
+        {
+            min_atoms_per_mpi_rank = min_atoms_per_gpu;
+        }
+        else
+        {
+            min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
+        }
+    }
+
+    /* Check if an algorithm does not support parallel simulation. */
+    if (nthreads_tmpi != 1 &&
+        ( inputrec->eI == eiLBFGS ||
+          inputrec->coulombtype == eelEWALD ) )
+    {
+        nthreads_tmpi = 1;
+
+        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+        {
+            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+        }
+    }
+    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_rank)
+    {
+        /* The thread number was chosen automatically, but there are too many
+           threads (too few atoms per thread) */
+        nthreads_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);
+
+        /* Avoid partial use of Hyper-Threading */
+        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
+        {
+            nthreads_new = nthreads_hw/2;
+        }
+
+        /* Avoid large prime numbers in the thread count */
+        if (nthreads_new >= 6)
+        {
+            /* Use only 6,8,10 with additional factors of 2 */
+            int fac;
+
+            fac = 2;
+            while (3*fac*2 <= nthreads_new)
+            {
+                fac *= 2;
+            }
+
+            nthreads_new = (nthreads_new/fac)*fac;
+        }
+        else
+        {
+            /* Avoid 5 */
+            if (nthreads_new == 5)
+            {
+                nthreads_new = 4;
+            }
+        }
+
+        nthreads_tmpi = nthreads_new;
+
+        fprintf(stderr, "\n");
+        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
+        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
+        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+    }
+
+    return nthreads_tmpi;
+}
+#endif /* GMX_THREAD_MPI */
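[Review note, not part of the patch: the prime-avoidance rounding above keeps automatic thread counts of the form 6, 8 or 10 times a power of two (or below 6), so domain decomposition never has to factor a large prime. A self-contained sketch of just that rounding step, with round_thread_count as an illustrative name:

#include <stdio.h>

static int round_thread_count(int nthreads_new)
{
    if (nthreads_new >= 6)
    {
        int fac = 2;

        /* Double fac while 6*fac still fits; afterwards 3*fac <= n < 6*fac */
        while (3*fac*2 <= nthreads_new)
        {
            fac *= 2;
        }
        /* Round down to a multiple of fac; n/fac is now 3, 4 or 5 */
        nthreads_new = (nthreads_new/fac)*fac;
    }
    else if (nthreads_new == 5)
    {
        nthreads_new = 4;
    }
    return nthreads_new;
}

int main(void)
{
    int n;

    /* 7 -> 6, 11 -> 10, 13 -> 12, 23 -> 20: no large prime factors */
    for (n = 4; n <= 24; n++)
    {
        printf("%2d -> %2d\n", n, round_thread_count(n));
    }
    return 0;
}
]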
+
+
+static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
+{
+    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+            hw_opt->nthreads_tot,
+            hw_opt->nthreads_tmpi,
+            hw_opt->nthreads_omp,
+            hw_opt->nthreads_omp_pme,
+            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
+}
+
+/* Checks we can do when we don't (yet) know the cut-off scheme */
+void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
+                               gmx_bool      bIsSimMaster)
+{
+    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
+
+#ifndef GMX_THREAD_MPI
+    if (hw_opt->nthreads_tot > 0)
+    {
+        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
+    }
+    if (hw_opt->nthreads_tmpi > 0)
+    {
+        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
+    }
+#endif
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
+    }
+    hw_opt->nthreads_omp = 1;
+#endif
+
+    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+    {
+        /* We have the same number of OpenMP threads for PP and PME processes,
+         * thus we can perform several consistency checks.
+         */
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
+        }
+
+        if (hw_opt->nthreads_omp > 0 &&
+            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+        {
+            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
+        }
+
+        if (hw_opt->nthreads_tmpi > 0 &&
+            hw_opt->nthreads_omp <= 0)
+        {
+            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+        }
+    }
+
+#ifndef GMX_OPENMP
+    if (hw_opt->nthreads_omp > 1)
+    {
+        gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
+    }
+#endif
+
+    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+    {
+        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
+    }
+
+    if (hw_opt->nthreads_tot == 1)
+    {
+        hw_opt->nthreads_tmpi = 1;
+
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
+                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    /* Parse GPU IDs, if provided.
+     * We check consistency with the tMPI thread count later.
+     */
+    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
+
+#ifdef GMX_THREAD_MPI
+    if (hw_opt->gpu_opt.n_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
+    {
+        /* Set the number of MPI threads equal to the number of GPUs */
+        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;
+
+        if (hw_opt->nthreads_tot > 0 &&
+            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
+        {
+            /* We have more GPUs than total threads requested.
+             * We choose to (later) generate a mismatch error,
+             * instead of launching more threads than requested.
+             */
+            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
+        }
+    }
+#endif
+
+    if (debug)
+    {
+        print_hw_opt(debug, hw_opt);
+    }
+}
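[Review note, not part of the patch: the consistency rules enforced above reduce to nt == ntmpi * ntomp whenever all three counts are given, plus divisibility when only two are given. A compact standalone sketch; counts_consistent is an illustrative name:

#include <stdio.h>

/* 0 means "not set", matching the gmx_hw_opt_t convention */
static int counts_consistent(int nt, int ntmpi, int ntomp)
{
    if (nt <= 0)
    {
        return 1;                 /* no total to check against */
    }
    if (ntmpi > 0 && ntomp > 0)
    {
        return nt == ntmpi*ntomp; /* total = ranks x threads per rank */
    }
    if (ntmpi > 0)
    {
        return nt % ntmpi == 0;   /* OpenMP width must come out integer */
    }
    if (ntomp > 0)
    {
        return nt % ntomp == 0;   /* rank count must come out integer */
    }
    return 1;
}

int main(void)
{
    printf("%d\n", counts_consistent(12, 3, 4)); /* 1: 12 == 3*4             */
    printf("%d\n", counts_consistent(12, 5, 0)); /* 0: 12 not divisible by 5 */
    printf("%d\n", counts_consistent(12, 3, 5)); /* 0: 3*5 != 12             */
    return 0;
}
]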
+
+/* Checks we can do when we know the cut-off scheme */
+void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
+                               int           cutoff_scheme)
+{
+    if (cutoff_scheme == ecutsGROUP)
+    {
+        /* We only have OpenMP support for PME-only nodes */
+        if (hw_opt->nthreads_omp > 1)
+        {
+            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+                      ecutscheme_names[cutoff_scheme],
+                      ecutscheme_names[ecutsVERLET]);
+        }
+        hw_opt->nthreads_omp = 1;
+    }
+
+    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+    {
+        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+    }
+
+    if (debug)
+    {
+        print_hw_opt(debug, hw_opt);
+    }
+}
diff --git a/src/programs/mdrun/resource-division.h b/src/programs/mdrun/resource-division.h
new file mode 100644
index 0000000000..b662f7100a
--- /dev/null
+++ b/src/programs/mdrun/resource-division.h
@@ -0,0 +1,64 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_RESOURCE_DIVISION_H
+#define GMX_RESOURCE_DIVISION_H
+
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec_fwd.h"
+
+/* Return the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At this point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
+int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+                     const gmx_hw_opt_t  *hw_opt,
+                     const t_inputrec    *inputrec,
+                     const gmx_mtop_t    *mtop,
+                     const t_commrec     *cr,
+                     FILE                *fplog);
+
+/* Checks we can do when we don't (yet) know the cut-off scheme */
+void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
+                               gmx_bool      bIsSimMaster);
+
+/* Checks we can do when we know the cut-off scheme */
+void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
+                               int           cutoff_scheme);
+
+#endif /* GMX_RESOURCE_DIVISION_H */
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 3f520dd22e..613611896c 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -91,6 +91,7 @@
 #include "deform.h"
 #include "membed.h"
 #include "repl_ex.h"
+#include "resource-division.h"
 
 #ifdef GMX_FAHCORE
 #include "corewrap.h"
@@ -268,208 +269,6 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
     return crn;
 }
 
-
-static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
-                                        const gmx_hw_opt_t  *hw_opt,
-                                        int                  nthreads_tot,
-                                        int                  ngpu)
-{
-    int nthreads_tmpi;
-
-    /* There are no separate PME nodes here, as we ensured in
-     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
-     * and a conditional ensures we would not have ended up here.
-     * Note that separate PME nodes might be switched on later.
-     */
-    if (ngpu > 0)
-    {
-        nthreads_tmpi = ngpu;
-        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
-        {
-            nthreads_tmpi = nthreads_tot;
-        }
-    }
-    else if (hw_opt->nthreads_omp > 0)
-    {
-        /* Here we could oversubscribe, when we do, we issue a warning later */
-        nthreads_tmpi = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
-    }
-    else
-    {
-        /* TODO choose nthreads_omp based on hardware topology
-           when we have a hardware topology detection library */
-        /* In general, when running up to 4 threads, OpenMP should be faster.
-         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
-         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
-         * even on two CPUs it's usually faster (but with many OpenMP threads
-         * it could be faster not to use HT, currently we always use HT).
-         * On Nehalem/Westmere we want to avoid running 16 threads over
-         * two CPUs with HT, so we need a limit<16; thus we use 12.
-         * A reasonable limit for Intel Sandy and Ivy bridge,
-         * not knowing the topology, is 16 threads.
-         * Below we check for Intel and AVX, which for now includes
-         * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
-         * model numbers we ensure also future Intel CPUs are covered.
-         */
-        const int nthreads_omp_always_faster           =  4;
-        const int nthreads_omp_always_faster_Nehalem   = 12;
-        const int nthreads_omp_always_faster_Intel_AVX = 16;
-        gmx_bool  bIntelAVX;
-
-        bIntelAVX =
-            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
-             gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_AVX));
-
-        if (nthreads_tot <= nthreads_omp_always_faster ||
-            ((gmx_cpuid_is_intel_nehalem(hwinfo->cpuid_info) && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
-             (bIntelAVX && nthreads_tot <= nthreads_omp_always_faster_Intel_AVX)))
-        {
-            /* Use pure OpenMP parallelization */
-            nthreads_tmpi = 1;
-        }
-        else
-        {
-            /* Don't use OpenMP parallelization */
-            nthreads_tmpi = nthreads_tot;
-        }
-    }
-
-    return nthreads_tmpi;
-}
-
-
-/* Get the number of threads to use for thread-MPI based on how many
- * were requested, which algorithms we're using,
- * and how many particles there are.
- * At the point we have already called check_and_update_hw_opt.
- * Thus all options should be internally consistent and consistent
- * with the hardware, except that ntmpi could be larger than #GPU.
- */
-static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
-                            gmx_hw_opt_t        *hw_opt,
-                            t_inputrec          *inputrec, gmx_mtop_t *mtop,
-                            const t_commrec     *cr,
-                            FILE                *fplog)
-{
-    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
-    int      min_atoms_per_mpi_thread;
-    gmx_bool bCanUseGPU;
-
-    if (hw_opt->nthreads_tmpi > 0)
-    {
-        /* Trivial, return right away */
-        return hw_opt->nthreads_tmpi;
-    }
-
-    nthreads_hw = hwinfo->nthreads_hw_avail;
-
-    /* How many total (#tMPI*#OpenMP) threads can we start? */
-    if (hw_opt->nthreads_tot > 0)
-    {
-        nthreads_tot_max = hw_opt->nthreads_tot;
-    }
-    else
-    {
-        nthreads_tot_max = nthreads_hw;
-    }
-
-    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
-                  hwinfo->gpu_info.n_dev_compatible > 0);
-
-    if (bCanUseGPU)
-    {
-        ngpu = hwinfo->gpu_info.n_dev_compatible;
-    }
-    else
-    {
-        ngpu = 0;
-    }
-
-    if (inputrec->cutoff_scheme == ecutsGROUP)
-    {
-        /* We checked this before, but it doesn't hurt to do it once more */
-        assert(hw_opt->nthreads_omp == 1);
-    }
-
-    nthreads_tmpi =
-        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
-
-    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
-    {
-        /* Dims/steps are divided over the nodes iso splitting the atoms */
-        min_atoms_per_mpi_thread = 0;
-    }
-    else
-    {
-        if (bCanUseGPU)
-        {
-            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
-        }
-        else
-        {
-            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
-        }
-    }
-
-    /* Check if an algorithm does not support parallel simulation. */
-    if (nthreads_tmpi != 1 &&
-        ( inputrec->eI == eiLBFGS ||
-          inputrec->coulombtype == eelEWALD ) )
-    {
-        nthreads_tmpi = 1;
-
-        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
-        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
-        {
-            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
-        }
-    }
-    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
-    {
-        /* the thread number was chosen automatically, but there are too many
-           threads (too few atoms per thread) */
-        nthreads_new = std::max(1, mtop->natoms/min_atoms_per_mpi_thread);
-
-        /* Avoid partial use of Hyper-Threading */
-        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
-            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
-        {
-            nthreads_new = nthreads_hw/2;
-        }
-
-        /* Avoid large prime numbers in the thread count */
-        if (nthreads_new >= 6)
-        {
-            /* Use only 6,8,10 with additional factors of 2 */
-            int fac;
-
-            fac = 2;
-            while (3*fac*2 <= nthreads_new)
-            {
-                fac *= 2;
-            }
-
-            nthreads_new = (nthreads_new/fac)*fac;
-        }
-        else
-        {
-            /* Avoid 5 */
-            if (nthreads_new == 5)
-            {
-                nthreads_new = 4;
-            }
-        }
-
-        nthreads_tmpi = nthreads_new;
-
-        fprintf(stderr, "\n");
-        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
-        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
-        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
-    }
-
-    return nthreads_tmpi;
-}
 #endif /* GMX_THREAD_MPI */
 
 
@@ -754,163 +553,6 @@ static void prepare_verlet_scheme(FILE *fplog,
     }
 }
 
-static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
-{
-    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
-            hw_opt->nthreads_tot,
-            hw_opt->nthreads_tmpi,
-            hw_opt->nthreads_omp,
-            hw_opt->nthreads_omp_pme,
-            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
-}
-
-/* Checks we can do when we don't (yet) know the cut-off scheme */
-static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
-                                      gmx_bool      bIsSimMaster)
-{
-    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
-
-#ifndef GMX_THREAD_MPI
-    if (hw_opt->nthreads_tot > 0)
-    {
-        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
-    }
-    if (hw_opt->nthreads_tmpi > 0)
-    {
-        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
-    }
-#endif
-
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
-    {
-        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
-    }
-    hw_opt->nthreads_omp = 1;
-#endif
-
-    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
-    {
-        /* We have the same number of OpenMP threads for PP and PME processes,
-         * thus we can perform several consistency checks.
-         */
-        if (hw_opt->nthreads_tmpi > 0 &&
-            hw_opt->nthreads_omp > 0 &&
-            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
-        {
-            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
-                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
-        }
-
-        if (hw_opt->nthreads_tmpi > 0 &&
-            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
-        {
-            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
-                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
-        }
-
-        if (hw_opt->nthreads_omp > 0 &&
-            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
-        {
-            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
-                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
-        }
-
-        if (hw_opt->nthreads_tmpi > 0 &&
-            hw_opt->nthreads_omp <= 0)
-        {
-            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
-        }
-    }
-
-#ifndef GMX_OPENMP
-    if (hw_opt->nthreads_omp > 1)
-    {
-        gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
-    }
-#endif
-
-    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
-    {
-        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
-    }
-
-    if (hw_opt->nthreads_tot == 1)
-    {
-        hw_opt->nthreads_tmpi = 1;
-
-        if (hw_opt->nthreads_omp > 1)
-        {
-            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
-                      hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
-        }
-        hw_opt->nthreads_omp = 1;
-    }
-
-    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
-    {
-        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
-    }
-
-    /* Parse GPU IDs, if provided.
-     * We check consistency with the tMPI thread count later.
-     */
-    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
-
-#ifdef GMX_THREAD_MPI
-    if (hw_opt->gpu_opt.n_dev_use > 0
-        &&
-        hw_opt->nthreads_tmpi == 0)
-    {
-        /* Set the number of MPI threads equal to the number of GPUs */
-        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;
-
-        if (hw_opt->nthreads_tot > 0 &&
-            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
-        {
-            /* We have more GPUs than total threads requested.
-             * We choose to (later) generate a mismatch error,
-             * instead of launching more threads than requested.
-             */
-            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
-        }
-    }
-#endif
-
-    if (debug)
-    {
-        print_hw_opt(debug, hw_opt);
-    }
-}
-
-/* Checks we can do when we know the cut-off scheme */
-static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
-                                      int           cutoff_scheme)
-{
-    if (cutoff_scheme == ecutsGROUP)
-    {
-        /* We only have OpenMP support for PME only nodes */
-        if (hw_opt->nthreads_omp > 1)
-        {
-            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
-                      ecutscheme_names[cutoff_scheme],
-                      ecutscheme_names[ecutsVERLET]);
-        }
-        hw_opt->nthreads_omp = 1;
-    }
-
-    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
-    {
-        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
-    }
-
-    if (debug)
-    {
-        print_hw_opt(debug, hw_opt);
-    }
-}
-
-
 /* Override the value in inputrec with value passed on the command line (if any) */
 static void override_nsteps_cmdline(FILE *fplog,
                                     gmx_int64_t nsteps_cmdline,
-- 
2.22.0
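[Review note, not part of the patch: after this change, runner.cpp consumes the three moved functions through resource-division.h. Judging from the function comments ("don't (yet) know the cut-off scheme" vs. "know the cut-off scheme"), the intended call order is roughly as sketched below; the call sites themselves are not shown in this diff, so the argument names here are illustrative:

/* 1. Before the cut-off scheme is known: basic option sanity checks */
check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr));

/* 2. With thread-MPI: pick the rank count before spawning threads */
#ifdef GMX_THREAD_MPI
hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo, hw_opt,
                                         inputrec, mtop, cr, fplog);
#endif

/* 3. Once the cut-off scheme is known: scheme-dependent checks */
check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
]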