/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "resource-division.h"

#include "config.h"

#include <assert.h>

#include <algorithm>

#include "gromacs/legacyheaders/gmx_detect_hardware.h"
#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/md_logging.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/utility/fatalerror.h"


#ifdef GMX_THREAD_MPI
/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 * the number of threads will get lowered.
 */
static const int min_atoms_per_mpi_thread =  90;
static const int min_atoms_per_gpu        = 900;
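
/* For example, a 9000-atom system is limited by these constants to at
 * most 9000/90 = 100 tMPI threads, or 9000/900 = 10 thread-MPI ranks
 * when each rank drives a GPU.
 */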

static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int nthreads_tmpi;

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        /* Start with one tMPI rank per GPU */
        nthreads_tmpi = ngpu;
        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
        {
            nthreads_tmpi = nthreads_tot;
        }
    }
    else if (hw_opt->nthreads_omp > 0)
    {
        /* Here we could oversubscribe, when we do, we issue a warning later */
        nthreads_tmpi = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
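        /* For example, -nt 24 -ntomp 6 yields max(1, 24/6) = 4 tMPI
         * ranks; if nthreads_omp exceeds nthreads_tot we still start
         * 1 rank and warn about the oversubscription later.
         */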
    }
    else
    {
        /* TODO choose nthreads_omp based on hardware topology
           when we have a hardware topology detection library */
        /* In general, when running up to 4 threads, OpenMP should be faster.
         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
         * even on two CPUs it's usually faster (but with many OpenMP threads
         * it could be faster not to use HT, currently we always use HT).
         * On Nehalem/Westmere we want to avoid running 16 threads over
         * two CPUs with HT, so we need a limit<16; thus we use 12.
         * A reasonable limit for Intel Sandy and Ivy Bridge,
         * not knowing the topology, is 16 threads.
         * Below we check for Intel and AVX, which for now includes
         * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
         * model numbers we ensure also future Intel CPUs are covered.
         */
        const int nthreads_omp_always_faster           =  4;
        const int nthreads_omp_always_faster_Nehalem   = 12;
        const int nthreads_omp_always_faster_Intel_AVX = 16;
        gmx_bool  bIntelAVX;

        bIntelAVX =
            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
             gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_AVX));

        if (nthreads_tot <= nthreads_omp_always_faster ||
            ((gmx_cpuid_is_intel_nehalem(hwinfo->cpuid_info) && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
             (bIntelAVX && nthreads_tot <= nthreads_omp_always_faster_Intel_AVX)))
        {
            /* Use pure OpenMP parallelization */
            nthreads_tmpi = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nthreads_tmpi = nthreads_tot;
        }
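
        /* Illustrative example: with no GPUs and -ntomp not set, a
         * 16-thread Intel AVX machine takes the first branch (1 tMPI
         * rank, OpenMP across all 16 threads), while a 64-thread
         * machine falls through to 64 single-threaded tMPI ranks.
         */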
    }

    return nthreads_tmpi;
}


/* Get the number of threads to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
 * and how many particles there are.
 * At this point we have already called check_and_update_hw_opt.
 * Thus all options should be internally consistent and consistent
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                     const gmx_hw_opt_t  *hw_opt,
                     const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const t_commrec     *cr,
                     FILE                *fplog)
{
    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
    int      min_atoms_per_mpi_rank;
    gmx_bool bCanUseGPU;

    if (hw_opt->nthreads_tmpi > 0)
    {
        /* Trivial, return right away */
        return hw_opt->nthreads_tmpi;
    }

    nthreads_hw = hwinfo->nthreads_hw_avail;

    /* How many total (#tMPI*#OpenMP) threads can we start? */
    if (hw_opt->nthreads_tot > 0)
    {
        nthreads_tot_max = hw_opt->nthreads_tot;
    }
    else
    {
        nthreads_tot_max = nthreads_hw;
    }

    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
                  hwinfo->gpu_info.n_dev_compatible > 0);
    if (bCanUseGPU)
    {
        ngpu = hwinfo->gpu_info.n_dev_compatible;
    }
    else
    {
        ngpu = 0;
    }

    if (inputrec->cutoff_scheme == ecutsGROUP)
    {
        /* We checked this before, but it doesn't hurt to do it once more */
        assert(hw_opt->nthreads_omp == 1);
    }

    nthreads_tmpi =
        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);

    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
    {
        /* Dims/steps are divided over the nodes instead of splitting
         * the atoms. With NM we can't have more ranks than #atoms*#dim.
         * With TPI it's unlikely we have fewer atoms than ranks, and if
         * so, communication would become a bottleneck, so we set the
         * limit to 1 atom/rank.
         */
        min_atoms_per_mpi_rank = 1;
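        /* E.g. an NM run on a 100-atom system can thus still use up to
         * roughly 100 ranks (1 atom/rank), subject to the rounding
         * applied further below.
         */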
    }
    else
    {
        if (bCanUseGPU)
        {
            min_atoms_per_mpi_rank = min_atoms_per_gpu;
        }
        else
        {
            min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
        }
    }

    /* Check if an algorithm does not support parallel simulation;
     * this applies to L-BFGS minimization and plain (non-PME) Ewald.
     */
    if (nthreads_tmpi != 1 &&
        ( inputrec->eI == eiLBFGS ||
          inputrec->coulombtype == eelEWALD ) )
    {
        nthreads_tmpi = 1;

        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
        {
            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
        }
    }
    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_rank)
    {
        /* The thread number was chosen automatically, but there are too
         * many threads (too few atoms per thread).
         */
        nthreads_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);
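        /* E.g. 1000 atoms at 90 atoms/rank gives at most
         * max(1, 1000/90) = 11 ranks before the rounding below.
         */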

        /* Avoid partial use of Hyper-Threading */
        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
        {
            nthreads_new = nthreads_hw/2;
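            /* E.g. with 12 hardware threads (6 cores with HT), counts
             * of 7..11 would use HT on only some cores, so we lower
             * the count to 6.
             */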
        }

        /* Avoid large prime numbers in the thread count */
        if (nthreads_new >= 6)
        {
            /* Use only 6,8,10 with additional factors of 2 */
            int fac;

            fac = 2;
            while (3*fac*2 <= nthreads_new)
            {
                fac *= 2;
            }

            nthreads_new = (nthreads_new/fac)*fac;
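            /* Worked example: nthreads_new = 11 keeps fac = 2, since
             * 3*2*2 > 11, and rounds down to (11/2)*2 = 10; 22 raises
             * fac to 4 and gives 20.
             */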
        }
        else
        {
            /* Avoid 5 */
            if (nthreads_new == 5)
            {
                nthreads_new = 4;
            }
        }

        nthreads_tmpi = nthreads_new;

        fprintf(stderr, "\n");
        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
    }

    return nthreads_tmpi;
}
#endif /* GMX_THREAD_MPI */


static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
{
    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
            hw_opt->nthreads_tot,
            hw_opt->nthreads_tmpi,
            hw_opt->nthreads_omp,
            hw_opt->nthreads_omp_pme,
            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
}

/* Checks we can do when we don't (yet) know the cut-off scheme */
void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
                               gmx_bool      bIsSimMaster)
{
    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);

#ifndef GMX_THREAD_MPI
    if (hw_opt->nthreads_tot > 0)
    {
        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
    if (hw_opt->nthreads_tmpi > 0)
    {
        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
#endif

#ifndef GMX_OPENMP
    if (hw_opt->nthreads_omp > 1)
    {
        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
    }
    hw_opt->nthreads_omp = 1;
#endif

    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
    {
        /* We have the same number of OpenMP threads for PP and PME processes,
         * thus we can perform several consistency checks.
         */
        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
        }
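        /* E.g. -nt 8 -ntmpi 2 -ntomp 3 fails here, since 2*3 != 8. */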

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
        }

        if (hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
        }

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp <= 0)
        {
            /* Deduce the OpenMP thread count from the two other counts */
            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
        }
    }

#ifndef GMX_OPENMP
    if (hw_opt->nthreads_omp > 1)
    {
        gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
    }
#endif

    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
    {
        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
    }

    if (hw_opt->nthreads_tot == 1)
    {
        hw_opt->nthreads_tmpi = 1;

        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
        }
        hw_opt->nthreads_omp = 1;
    }

    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    /* Parse GPU IDs, if provided.
     * We check consistency with the tMPI thread count later.
     */
    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
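    /* E.g. "mdrun -gpu_id 01" requests device IDs 0 and 1, so
     * gpu_opt.n_dev_use becomes 2 in the block below.
     */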

#ifdef GMX_THREAD_MPI
    if (hw_opt->gpu_opt.n_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
    {
        /* Set the number of MPI threads equal to the number of GPUs */
        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;

        if (hw_opt->nthreads_tot > 0 &&
            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
        {
            /* We have more GPUs than total threads requested.
             * We choose to (later) generate a mismatch error,
             * instead of launching more threads than requested.
             */
            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
        }
    }
#endif

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }
}

/* Checks we can do when we know the cut-off scheme */
void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
                               int           cutoff_scheme)
{
    if (cutoff_scheme == ecutsGROUP)
    {
        /* We only have OpenMP support for PME-only nodes */
        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
                      ecutscheme_names[cutoff_scheme],
                      ecutscheme_names[ecutsVERLET]);
        }
        hw_opt->nthreads_omp = 1;
    }

    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }
}