/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "resource-division.h"

#include "config.h"

#include <assert.h>

#include <algorithm>

#include "gromacs/legacyheaders/gmx_detect_hardware.h"
#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/md_logging.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/utility/fatalerror.h"


#ifdef GMX_THREAD_MPI
/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
 * the number of threads will get lowered.
 */
static const int min_atoms_per_mpi_thread =  90;
static const int min_atoms_per_gpu        = 900;
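
/* For example, a 9000-atom system is limited by these constants to at
 * most 9000/90 = 100 tMPI threads, or 9000/900 = 10 thread-MPI ranks
 * when each rank drives a GPU.
 */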

static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int nthreads_tmpi;

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        /* Start with one tMPI rank per GPU */
        nthreads_tmpi = ngpu;
        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
        {
            nthreads_tmpi = nthreads_tot;
        }
    }
    else if (hw_opt->nthreads_omp > 0)
    {
        /* Here we could oversubscribe, when we do, we issue a warning later */
        nthreads_tmpi = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
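        /* For example, -nt 24 -ntomp 6 yields max(1, 24/6) = 4 tMPI
         * ranks; if nthreads_omp exceeds nthreads_tot we still start
         * 1 rank and warn about the oversubscription later.
         */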
    }
    else
    {
        /* TODO choose nthreads_omp based on hardware topology
           when we have a hardware topology detection library */
        /* In general, when running up to 4 threads, OpenMP should be faster.
         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
         * even on two CPUs it's usually faster (but with many OpenMP threads
         * it could be faster not to use HT, currently we always use HT).
         * On Nehalem/Westmere we want to avoid running 16 threads over
         * two CPUs with HT, so we need a limit<16; thus we use 12.
         * A reasonable limit for Intel Sandy and Ivy Bridge,
         * not knowing the topology, is 16 threads.
         * Below we check for Intel and AVX, which for now includes
         * Sandy/Ivy Bridge, Haswell/Broadwell. By checking for AVX instead of
         * model numbers we ensure also future Intel CPUs are covered.
         */
        const int nthreads_omp_always_faster           =  4;
        const int nthreads_omp_always_faster_Nehalem   = 12;
        const int nthreads_omp_always_faster_Intel_AVX = 16;
        gmx_bool  bIntelAVX;

        bIntelAVX =
            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
             gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_AVX));

        if (nthreads_tot <= nthreads_omp_always_faster ||
            ((gmx_cpuid_is_intel_nehalem(hwinfo->cpuid_info) && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
             (bIntelAVX && nthreads_tot <= nthreads_omp_always_faster_Intel_AVX)))
        {
            /* Use pure OpenMP parallelization */
            nthreads_tmpi = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nthreads_tmpi = nthreads_tot;
        }
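
        /* Illustrative example: with no GPUs and -ntomp not set, a
         * 16-thread Intel AVX machine takes the first branch (1 tMPI
         * rank, OpenMP across all 16 threads), while a 64-thread
         * machine falls through to 64 single-threaded tMPI ranks.
         */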
    }

    return nthreads_tmpi;
}


/* Get the number of threads to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
 * and how many particles there are.
 * At this point we have already called check_and_update_hw_opt.
 * Thus all options should be internally consistent and consistent
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                     const gmx_hw_opt_t  *hw_opt,
                     const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const t_commrec     *cr,
                     FILE                *fplog)
{
    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
    int      min_atoms_per_mpi_rank;
    gmx_bool bCanUseGPU;

    if (hw_opt->nthreads_tmpi > 0)
    {
        /* Trivial, return right away */
        return hw_opt->nthreads_tmpi;
    }

    nthreads_hw = hwinfo->nthreads_hw_avail;

    /* How many total (#tMPI*#OpenMP) threads can we start? */
    if (hw_opt->nthreads_tot > 0)
    {
        nthreads_tot_max = hw_opt->nthreads_tot;
    }
    else
    {
        nthreads_tot_max = nthreads_hw;
    }

    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
                  hwinfo->gpu_info.n_dev_compatible > 0);
    if (bCanUseGPU)
    {
        ngpu = hwinfo->gpu_info.n_dev_compatible;
    }
    else
    {
        ngpu = 0;
    }

    if (inputrec->cutoff_scheme == ecutsGROUP)
    {
        /* We checked this before, but it doesn't hurt to do it once more */
        assert(hw_opt->nthreads_omp == 1);
    }

    nthreads_tmpi =
        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);

    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
    {
        /* Dims/steps are divided over the nodes instead of splitting
         * the atoms. With NM we can't have more ranks than #atoms*#dim.
         * With TPI it's unlikely we have fewer atoms than ranks, and if
         * so, communication would become a bottleneck, so we set the
         * limit to 1 atom/rank.
         */
        min_atoms_per_mpi_rank = 1;
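        /* E.g. an NM run on a 100-atom system can thus still use up to
         * roughly 100 ranks (1 atom/rank), subject to the rounding
         * applied further below.
         */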
    }
    else
    {
        if (bCanUseGPU)
        {
            min_atoms_per_mpi_rank = min_atoms_per_gpu;
        }
        else
        {
            min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
        }
    }

    /* Check if an algorithm does not support parallel simulation;
     * this applies to L-BFGS minimization and plain (non-PME) Ewald.
     */
    if (nthreads_tmpi != 1 &&
        ( inputrec->eI == eiLBFGS ||
          inputrec->coulombtype == eelEWALD ) )
    {
        nthreads_tmpi = 1;

        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
        {
            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
        }
    }
    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_rank)
    {
        /* The thread number was chosen automatically, but there are too
         * many threads (too few atoms per thread).
         */
        nthreads_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);
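        /* E.g. 1000 atoms at 90 atoms/rank gives at most
         * max(1, 1000/90) = 11 ranks before the rounding below.
         */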

        /* Avoid partial use of Hyper-Threading */
        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
        {
            nthreads_new = nthreads_hw/2;
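            /* E.g. with 12 hardware threads (6 cores with HT), counts
             * of 7..11 would use HT on only some cores, so we lower
             * the count to 6.
             */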
        }

        /* Avoid large prime numbers in the thread count */
        if (nthreads_new >= 6)
        {
            /* Use only 6,8,10 with additional factors of 2 */
            int fac;

            fac = 2;
            while (3*fac*2 <= nthreads_new)
            {
                fac *= 2;
            }

            nthreads_new = (nthreads_new/fac)*fac;
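            /* Worked example: nthreads_new = 11 keeps fac = 2, since
             * 3*2*2 > 11, and rounds down to (11/2)*2 = 10; 22 raises
             * fac to 4 and gives 20.
             */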
        }
        else
        {
            /* Avoid 5 */
            if (nthreads_new == 5)
            {
                nthreads_new = 4;
            }
        }

        nthreads_tmpi = nthreads_new;

        fprintf(stderr, "\n");
        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
    }

    return nthreads_tmpi;
}
#endif /* GMX_THREAD_MPI */


static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
{
    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
            hw_opt->nthreads_tot,
            hw_opt->nthreads_tmpi,
            hw_opt->nthreads_omp,
            hw_opt->nthreads_omp_pme,
            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
}

/* Checks we can do when we don't (yet) know the cut-off scheme */
void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
                               gmx_bool      bIsSimMaster)
{
    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);

#ifndef GMX_THREAD_MPI
    if (hw_opt->nthreads_tot > 0)
    {
        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
    if (hw_opt->nthreads_tmpi > 0)
    {
        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
    }
#endif

#ifndef GMX_OPENMP
    if (hw_opt->nthreads_omp > 1)
    {
        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
    }
    hw_opt->nthreads_omp = 1;
#endif

    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
    {
        /* We have the same number of OpenMP threads for PP and PME processes,
         * thus we can perform several consistency checks.
         */
        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
        }
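        /* E.g. -nt 8 -ntmpi 2 -ntomp 3 fails here, since 2*3 != 8. */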

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
        }

        if (hw_opt->nthreads_omp > 0 &&
            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
        {
            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
        }

        if (hw_opt->nthreads_tmpi > 0 &&
            hw_opt->nthreads_omp <= 0)
        {
            /* Deduce the OpenMP thread count from the two other counts */
            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
        }
    }

#ifndef GMX_OPENMP
    if (hw_opt->nthreads_omp > 1)
    {
        gmx_fatal(FARGS, "OpenMP threads are requested, but GROMACS was compiled without OpenMP support");
    }
#endif

    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
    {
        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
    }

    if (hw_opt->nthreads_tot == 1)
    {
        hw_opt->nthreads_tmpi = 1;

        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
        }
        hw_opt->nthreads_omp = 1;
    }

    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    /* Parse GPU IDs, if provided.
     * We check consistency with the tMPI thread count later.
     */
    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
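    /* E.g. "mdrun -gpu_id 01" requests device IDs 0 and 1, so
     * gpu_opt.n_dev_use becomes 2 in the block below.
     */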

#ifdef GMX_THREAD_MPI
    if (hw_opt->gpu_opt.n_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
    {
        /* Set the number of MPI threads equal to the number of GPUs */
        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;

        if (hw_opt->nthreads_tot > 0 &&
            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
        {
            /* We have more GPUs than total threads requested.
             * We choose to (later) generate a mismatch error,
             * instead of launching more threads than requested.
             */
            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
        }
    }
#endif

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }
}

/* Checks we can do when we know the cut-off scheme */
void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
                               int           cutoff_scheme)
{
    if (cutoff_scheme == ecutsGROUP)
    {
        /* We only have OpenMP support for PME-only nodes */
        if (hw_opt->nthreads_omp > 1)
        {
            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
                      ecutscheme_names[cutoff_scheme],
                      ecutscheme_names[ecutsVERLET]);
        }
        hw_opt->nthreads_omp = 1;
    }

    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
    {
        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
    }

    if (debug)
    {
        print_hw_opt(debug, hw_opt);
    }
}