src/gromacs/taskassignment/resourcedivision.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2015,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Defines utility functionality for dividing resources and
  37  * checking for consistency and usefulness.
  38  *
  39  * \author Mark Abraham <mark.j.abraham@gmail.com>
  40  * \ingroup module_taskassignment
  41  */
  42
  43 #include "gmxpre.h"
  44
  45 #include "resourcedivision.h"
  46
  47 #include "config.h"
  48
  49 #include <stdlib.h>
  50 #include <string.h>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/hardware/cpuinfo.h"
  55 #include "gromacs/hardware/detecthardware.h"
  56 #include "gromacs/hardware/hardwaretopology.h"
  57 #include "gromacs/hardware/hw_info.h"
  58 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  59 #include "gromacs/mdtypes/commrec.h"
  60 #include "gromacs/mdtypes/inputrec.h"
  61 #include "gromacs/mdtypes/md_enums.h"
  62 #include "gromacs/taskassignment/hardwareassign.h"
  63 #include "gromacs/topology/topology.h"
  64 #include "gromacs/utility/baseversion.h"
  65 #include "gromacs/utility/fatalerror.h"
  66 #include "gromacs/utility/gmxassert.h"
  67 #include "gromacs/utility/logger.h"
  68 #include "gromacs/utility/stringutil.h"
  69
  70
  71 /* DISCLAIMER: All the atom count and thread numbers below are heuristic.
  72  * The real switching points will depend on the system simulation,
  73  * the algorithms used and the hardware it's running on, as well as if there
  74  * are other jobs running on the same machine. We try to take into account
  75  * factors that have a large influence, such as recent Intel CPUs being
  76  * much better at wide multi-threading. The remaining factors should
  77  * (hopefully) have a small influence, such that the performance just before
  78  * and after a switch point doesn't change too much.
  79  */
  80
  81 //! Constant used to help minimize preprocessed code
  82 static const bool bHasOmpSupport = GMX_OPENMP;
  83
  84 #if GMX_THREAD_MPI
  85 /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
  86  * the number of threads will get lowered.
  87  */
  88 static const int min_atoms_per_mpi_thread =  90;
  89 static const int min_atoms_per_gpu        = 900;
  90 #endif /* GMX_THREAD_MPI */
  91
  92 /**@{*/
  93 /*! \brief Constants for implementing default divisions of threads */
  94
  95 /* TODO choose nthreads_omp based on hardware topology
  96    when we have a hardware topology detection library */
  97 /* First we consider the case of no MPI (1 MPI rank).
  98  * In general, when running up to 8 threads, OpenMP should be faster.
  99  * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 100  * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 101  * even on two CPUs it's usually faster (but with many OpenMP threads
 102  * it could be faster not to use HT, currently we always use HT).
 103  * On Nehalem/Westmere we want to avoid running 16 threads over
 104  * two CPUs with HT, so we need a limit<16; thus we use 12.
 105  * A reasonable limit for Intel Sandy and Ivy bridge,
 106  * not knowing the topology, is 16 threads.
 107  * Below we check for Intel and AVX, which for now includes
 108  * Sandy/Ivy Bridge, Has/Broadwell. By checking for AVX instead of
 109  * model numbers we ensure also future Intel CPUs are covered.
 110  */
 111 const int nthreads_omp_faster_default   =  8;
 112 const int nthreads_omp_faster_Nehalem   = 12;
 113 const int nthreads_omp_faster_Intel_AVX = 16;
 114 const int nthreads_omp_faster_AMD_Ryzen = 16;
 115 /* For CPU only runs the fastest options are usually MPI or OpenMP only.
 116  * With one GPU, using MPI only is almost never optimal, so we need to
 117  * compare running pure OpenMP with combined MPI+OpenMP. This means higher
 118  * OpenMP threads counts can still be ok. Multiplying the numbers above
 119  * by a factor of 2 seems to be a good estimate.
 120  */
 121 const int nthreads_omp_faster_gpu_fac   =  2;
 122
 123 /* This is the case with MPI (2 or more MPI PP ranks).
 124  * By default we will terminate with a fatal error when more than 8
 125  * OpenMP thread are (indirectly) requested, since using less threads
 126  * nearly always results in better performance.
 127  * With thread-mpi and multiple GPUs or one GPU and too many threads
 128  * we first try 6 OpenMP threads and then less until the number of MPI ranks
 129  * is divisible by the number of GPUs.
 130  */
 131 #if GMX_OPENMP && GMX_MPI
 132 const int nthreads_omp_mpi_ok_max              =  8;
 133 const int nthreads_omp_mpi_ok_min_cpu          =  1;
 134 #endif
 135 const int nthreads_omp_mpi_ok_min_gpu          =  2;
 136 const int nthreads_omp_mpi_target_max          =  6;
 137
 138 /**@}*/
 139
 140 /*! \brief Returns the maximum OpenMP thread count for which using a single MPI rank
 141  * should be faster than using multiple ranks with the same total thread count.
 142  */
 143 static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
 144 {
 145     int nth;
 146
 147     if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Intel &&
 148         cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx))
 149     {
 150         nth = nthreads_omp_faster_Intel_AVX;
 151     }
 152     else if (gmx::cpuIsX86Nehalem(cpuInfo))
 153     {
 154         // Intel Nehalem
 155         nth = nthreads_omp_faster_Nehalem;
 156     }
 157     else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
 158     {
 159         // AMD Ryzen
 160         nth = nthreads_omp_faster_AMD_Ryzen;
 161     }
 162     else
 163     {
 164         nth = nthreads_omp_faster_default;
 165     }
 166
 167     if (bUseGPU)
 168     {
 169         nth *= nthreads_omp_faster_gpu_fac;
 170     }
 171
 172     nth = std::min(nth, GMX_OPENMP_MAX_THREADS);
 173
 174     return nth;
 175 }
 176
 177 /*! \brief Returns that maximum OpenMP thread count that passes the efficiency check */
 178 gmx_unused static int nthreads_omp_efficient_max(int gmx_unused       nrank,
 179                                                  const gmx::CpuInfo  &cpuInfo,
 180                                                  gmx_bool             bUseGPU)
 181 {
 182 #if GMX_OPENMP && GMX_MPI
 183     if (nrank > 1)
 184     {
 185         return nthreads_omp_mpi_ok_max;
 186     }
 187     else
 188 #endif
 189     {
 190         return nthreads_omp_faster(cpuInfo, bUseGPU);
 191     }
 192 }
 193
 194 /*! \brief Return the number of thread-MPI ranks to use.
 195  * This is chosen such that we can always obey our own efficiency checks.
 196  */
 197 gmx_unused static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 198                                                    const gmx_hw_opt_t  &hw_opt,
 199                                                    int                  nthreads_tot,
 200                                                    int                  ngpu)
 201 {
 202     int                 nrank;
 203     const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
 204
 205     GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");
 206
 207     /* There are no separate PME nodes here, as we ensured in
 208      * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 209      * and a conditional ensures we would not have ended up here.
 210      * Note that separate PME nodes might be switched on later.
 211      */
 212     if (ngpu > 0)
 213     {
 214         nrank = ngpu;
 215
 216         /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
 217          * if we simply start as many ranks as GPUs. To avoid this, we start as few
 218          * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
 219          * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
 220          * this code has no effect.
 221          */
 222         GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
 223         while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
 224         {
 225             nrank--;
 226         }
 227
 228         if (nthreads_tot < nrank)
 229         {
 230             /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
 231             nrank = nthreads_tot;
 232         }
 233         else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
 234                  (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
 235         {
 236             /* The high OpenMP thread count will likely result in sub-optimal
 237              * performance. Increase the rank count to reduce the thread count
 238              * per rank. This will lead to GPU sharing by MPI ranks/threads.
 239              */
 240             int nshare;
 241
 242             /* Increase the rank count as long as have we more than 6 OpenMP
 243              * threads per rank or the number of hardware threads is not
 244              * divisible by the rank count. Don't go below 2 OpenMP threads.
 245              */
 246             nshare = 1;
 247             do
 248             {
 249                 nshare++;
 250                 nrank = ngpu*nshare;
 251             }
 252             while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
 253                    (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
 254         }
 255     }
 256     else if (hw_opt.nthreads_omp > 0)
 257     {
 258         /* Here we could oversubscribe, when we do, we issue a warning later */
 259         nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
 260     }
 261     else
 262     {
 263         if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
 264         {
 265             /* Use pure OpenMP parallelization */
 266             nrank = 1;
 267         }
 268         else
 269         {
 270             /* Don't use OpenMP parallelization */
 271             nrank = nthreads_tot;
 272         }
 273     }
 274
 275     return nrank;
 276 }
 277
 278
 279 #if GMX_THREAD_MPI
 280
 281 static bool
 282 gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
 283 {
 284     return (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic && hwTop.machine().sockets[0].cores[0].hwThreads.size() > 1);
 285 }
 286
 287 namespace
 288 {
 289
 290 class SingleRankChecker
 291 {
 292     public:
 293         //! Constructor
 294         SingleRankChecker() : value_(false), reasons_() {}
 295         /*! \brief Call this function for each possible condition
 296             under which a single rank is required, along with a string
 297             describing the constraint when it is applied. */
 298         void applyConstraint(bool condition, const char *description)
 299         {
 300             if (condition)
 301             {
 302                 value_ = true;
 303                 reasons_.push_back(gmx::formatString("%s only supports a single rank.", description));
 304             }
 305         }
 306         //! After applying any conditions, is a single rank required?
 307         bool mustUseOneRank() const
 308         {
 309             return value_;
 310         }
 311         /*! \brief Return a formatted string to use when writing a
 312             message when a single rank is required, (or empty if no
 313             constraint exists.) */
 314         std::string getMessage() const
 315         {
 316             return formatAndJoin(reasons_, "\n", gmx::IdentityFormatter());
 317         }
 318     private:
 319         bool                     value_;
 320         std::vector<std::string> reasons_;
 321 };
 322
 323 } // namespace
 324
 325 /* Get the number of MPI ranks to use for thread-MPI based on how many
 326  * were requested, which algorithms we're using,
 327  * and how many particles there are.
 328  * At the point we have already called check_and_update_hw_opt.
 329  * Thus all options should be internally consistent and consistent
 330  * with the hardware, except that ntmpi could be larger than #GPU.
 331  */
 332 int get_nthreads_mpi(const gmx_hw_info_t    *hwinfo,
 333                      gmx_hw_opt_t           *hw_opt,
 334                      const std::vector<int> &userGpuIds,
 335                      int                     numPmeRanks,
 336                      bool                    nonbondedOnGpu,
 337                      const t_inputrec       *inputrec,
 338                      const gmx_mtop_t       *mtop,
 339                      const gmx::MDLogger    &mdlog,
 340                      bool                    doMembed)
 341 {
 342     int                          nthreads_hw, nthreads_tot_max, nrank, ngpu;
 343     int                          min_atoms_per_mpi_rank;
 344
 345     const gmx::CpuInfo          &cpuInfo = *hwinfo->cpuInfo;
 346     const gmx::HardwareTopology &hwTop   = *hwinfo->hardwareTopology;
 347
 348     /* If the user made a GPU task assignment, that sets the number of thread-MPI ranks. */
 349     int  numGpuIdsSupplied = static_cast<int>(userGpuIds.size());
 350
 351     /* TODO Here we handle the case where the user set GPU IDs, and
 352        further below we handle the case where the algorithm does not
 353        support multiple ranks. We need also to handle the case where
 354        the user set multiple GPU IDs for an algorithm that cannot
 355        handle multiple ranks. */
 356     if (hw_opt->nthreads_tmpi < 1 && numGpuIdsSupplied > 0)
 357     {
 358         /* If the user chose both mdrun -nt -gpu_id, is that consistent? */
 359         if (numPmeRanks <= 0)
 360         {
 361             if (hw_opt->nthreads_tot > 0 &&
 362                 (hw_opt->nthreads_tot % numGpuIdsSupplied) != 0)
 363             {
 364                 gmx_fatal(FARGS, "Cannot run %d total threads with %d GPU ranks. Choose the total number of threads to be a multiple of the number of GPU ranks.", hw_opt->nthreads_tot, numGpuIdsSupplied);
 365             }
 366             return numGpuIdsSupplied;
 367         }
 368         else
 369         {
 370             gmx_fatal(FARGS, "The combination of choosing a number of PME ranks, and specific GPU IDs "
 371                       "is not supported. Use also -ntmpi and/or -ntomp and -ntomp_pme to specify what "
 372                       "distribution of threads to ranks you require.");
 373         }
 374     }
 375
 376     {
 377         /* Check if an algorithm does not support parallel simulation.  */
 378         // TODO This might work better if e.g. implemented algorithms
 379         // had to define a function that returns such requirements,
 380         // and a description string.
 381         SingleRankChecker checker;
 382         checker.applyConstraint(inputrec->eI == eiLBFGS, "L-BFGS minimization");
 383         checker.applyConstraint(inputrec->coulombtype == eelEWALD, "Plain Ewald electrostatics");
 384         checker.applyConstraint(doMembed, "Membrane embedding");
 385         if (checker.mustUseOneRank())
 386         {
 387             std::string message = checker.getMessage();
 388             if (hw_opt->nthreads_tmpi > 1)
 389             {
 390                 gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
 391             }
 392             GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());
 393
 394             if (numGpuIdsSupplied > 1)
 395             {
 396                 gmx_fatal(FARGS, "You supplied %d GPU IDs but only 1 rank can be used "
 397                           "by this simulation. Supply only one GPU ID.", numGpuIdsSupplied);
 398             }
 399             return 1;
 400         }
 401     }
 402
 403     if (hw_opt->nthreads_tmpi > 0)
 404     {
 405         if (numPmeRanks <= 0)
 406         {
 407             int numPpRanks = hw_opt->nthreads_tmpi;
 408             if ((numGpuIdsSupplied > 0) &&
 409                 (numGpuIdsSupplied != numPpRanks))
 410             {
 411                 gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d "
 412                           "GPU IDs supplied. The number of particle-particle (PP) ranks and the "
 413                           "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numGpuIdsSupplied);
 414             }
 415         }
 416         else
 417         {
 418             int numPpRanks = hw_opt->nthreads_tmpi - numPmeRanks;
 419             if ((numGpuIdsSupplied > 0) &&
 420                 (numGpuIdsSupplied != numPpRanks))
 421             {
 422                 gmx_fatal(FARGS, "Cannot run %d thread-MPI total ranks with %d PME ranks and %d "
 423                           "GPU IDs supplied. The number of particle-particle ranks and the "
 424                           "number of GPU IDs must match.", hw_opt->nthreads_tmpi, numPmeRanks, numGpuIdsSupplied);
 425             }
 426         }
 427         /* Trivial, return the user's choice right away */
 428         return hw_opt->nthreads_tmpi;
 429     }
 430     GMX_RELEASE_ASSERT(numGpuIdsSupplied == 0,
 431                        "If mdrun -gpu_id had information, the number of ranks should have already been chosen");
 432
 433     // Now implement automatic selection of number of thread-MPI ranks
 434     nthreads_hw = hwinfo->nthreads_hw_avail;
 435
 436     if (nthreads_hw <= 0)
 437     {
 438         /* This should normally not happen, but if it does, we handle it */
 439         gmx_fatal(FARGS, "The number of available hardware threads can not be detected, please specify the number of MPI ranks and the number of OpenMP threads (if supported) manually with options -ntmpi and -ntomp, respectively");
 440     }
 441
 442     /* How many total (#tMPI*#OpenMP) threads can we start? */
 443     if (hw_opt->nthreads_tot > 0)
 444     {
 445         nthreads_tot_max = hw_opt->nthreads_tot;
 446     }
 447     else
 448     {
 449         nthreads_tot_max = nthreads_hw;
 450     }
 451
 452     /* nonbondedOnGpu might be false e.g. because this simulation uses
 453      * the group scheme, or is a rerun with energy groups. */
 454     ngpu = (nonbondedOnGpu ? hwinfo->gpu_info.n_dev_compatible : 0);
 455
 456     if (inputrec->cutoff_scheme == ecutsGROUP)
 457     {
 458         /* We checked this before, but it doesn't hurt to do it once more */
 459         GMX_RELEASE_ASSERT(hw_opt->nthreads_omp == 1, "The group scheme only supports one OpenMP thread per rank");
 460     }
 461
 462     nrank =
 463         get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);
 464
 465     if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 466     {
 467         /* Dims/steps are divided over the nodes iso splitting the atoms.
 468          * With NM we can't have more ranks than #atoms*#dim. With TPI it's
 469          * unlikely we have fewer atoms than ranks, and if so, communication
 470          * would become a bottleneck, so we set the limit to 1 atom/rank.
 471          */
 472         min_atoms_per_mpi_rank = 1;
 473     }
 474     else
 475     {
 476         if (ngpu >= 1)
 477         {
 478             min_atoms_per_mpi_rank = min_atoms_per_gpu;
 479         }
 480         else
 481         {
 482             min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
 483         }
 484     }
 485
 486     if (mtop->natoms/nrank < min_atoms_per_mpi_rank)
 487     {
 488         int nrank_new;
 489
 490         /* the rank number was chosen automatically, but there are too few
 491            atoms per rank, so we need to reduce the rank count */
 492         nrank_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);
 493
 494         /* Avoid partial use of Hyper-Threading */
 495         if (gmxSmtIsEnabled(hwTop) &&
 496             nrank_new > nthreads_hw/2 && nrank_new < nthreads_hw)
 497         {
 498             nrank_new = nthreads_hw/2;
 499         }
 500
 501         /* If the user specified the total thread count, ensure this is
 502          * divisible by the number of ranks.
 503          * It is quite likely that we have too many total threads compared
 504          * to the size of the system, but if the user asked for this many
 505          * threads we should respect that.
 506          */
 507         while (hw_opt->nthreads_tot > 0 &&
 508                hw_opt->nthreads_tot % nrank_new != 0)
 509         {
 510             nrank_new--;
 511         }
 512
 513         /* Avoid large prime numbers in the rank count */
 514         if (nrank_new >= 6)
 515         {
 516             /* Use only 6,8,10 with additional factors of 2 */
 517             int fac;
 518
 519             fac = 2;
 520             while (3*fac*2 <= nrank_new)
 521             {
 522                 fac *= 2;
 523             }
 524
 525             nrank_new = (nrank_new/fac)*fac;
 526         }
 527         else
 528         {
 529             /* Avoid 5, since small system won't fit 5 domains along
 530              * a dimension. This might lead to waisting some cores, but this
 531              * will have a small impact in this regime of very small systems.
 532              */
 533             if (nrank_new == 5)
 534             {
 535                 nrank_new = 4;
 536             }
 537         }
 538
 539         nrank = nrank_new;
 540
 541         /* We reduced the number of tMPI ranks, which means we might violate
 542          * our own efficiency checks if we simply use all hardware threads.
 543          */
 544         if (bHasOmpSupport && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
 545         {
 546             /* The user set neither the total nor the OpenMP thread count,
 547              * we should use all hardware threads, unless we will violate
 548              * our own efficiency limitation on the thread count.
 549              */
 550             int  nt_omp_max;
 551
 552             nt_omp_max = nthreads_omp_efficient_max(nrank, cpuInfo, ngpu >= 1);
 553
 554             if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
 555             {
 556                 /* Limit the number of OpenMP threads to start */
 557                 hw_opt->nthreads_omp = nt_omp_max;
 558             }
 559         }
 560
 561         fprintf(stderr, "\n");
 562         fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
 563         fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
 564         fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 565     }
 566
 567     return nrank;
 568 }
 569 #endif /* GMX_THREAD_MPI */
 570
 571
 572 void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
 573                                         int                  numTotalThreads,
 574                                         bool                 willUsePhysicalGpu,
 575                                         gmx_bool             bNtOmpOptionSet,
 576                                         t_commrec           *cr,
 577                                         const gmx::MDLogger &mdlog)
 578 {
 579 #if GMX_OPENMP && GMX_MPI
 580     int         nth_omp_min, nth_omp_max;
 581     char        buf[1000];
 582 #if GMX_THREAD_MPI
 583     const char *mpi_option = " (option -ntmpi)";
 584 #else
 585     const char *mpi_option = "";
 586 #endif
 587
 588     /* This function should be called after thread-MPI (when configured) and
 589      * OpenMP have been initialized. Check that here.
 590      */
 591 #if GMX_THREAD_MPI
 592     GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
 593 #endif
 594     GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");
 595
 596     nth_omp_min = gmx_omp_nthreads_get(emntDefault);
 597     nth_omp_max = gmx_omp_nthreads_get(emntDefault);
 598
 599     bool anyRankIsUsingGpus = willUsePhysicalGpu;
 600     /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
 601     if (cr->nnodes + cr->npmenodes > 1)
 602     {
 603         int count[3], count_max[3];
 604
 605         count[0] = -nth_omp_min;
 606         count[1] =  nth_omp_max;
 607         count[2] =  willUsePhysicalGpu;
 608
 609         MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);
 610
 611         /* In case of an inhomogeneous run setup we use the maximum counts */
 612         nth_omp_min        = -count_max[0];
 613         nth_omp_max        =  count_max[1];
 614         anyRankIsUsingGpus = count_max[2] > 0;
 615     }
 616
 617     int nthreads_omp_mpi_ok_min;
 618
 619     if (!anyRankIsUsingGpus)
 620     {
 621         nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
 622     }
 623     else
 624     {
 625         /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
 626          * cases where the user specifies #ranks == #cores.
 627          */
 628         nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
 629     }
 630
 631     if (DOMAINDECOMP(cr) && cr->nnodes > 1)
 632     {
 633         if (nth_omp_max < nthreads_omp_mpi_ok_min ||
 634             nth_omp_max > nthreads_omp_mpi_ok_max)
 635         {
 636             /* Note that we print target_max here, not ok_max */
 637             sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
 638                     nth_omp_max,
 639                     nthreads_omp_mpi_ok_min,
 640                     nthreads_omp_mpi_target_max);
 641
 642             if (bNtOmpOptionSet)
 643             {
 644                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
 645             }
 646             else
 647             {
 648                 /* This fatal error, and the one below, is nasty, but it's
 649                  * probably the only way to ensure that all users don't waste
 650                  * a lot of resources, since many users don't read logs/stderr.
 651                  */
 652                 gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
 653             }
 654         }
 655     }
 656     else
 657     {
 658         const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
 659
 660         /* No domain decomposition (or only one domain) */
 661         if (nth_omp_max > nthreads_omp_faster(cpuInfo, anyRankIsUsingGpus))
 662         {
 663             /* To arrive here, the user/system set #ranks and/or #OMPthreads */
 664             gmx_bool bEnvSet;
 665             char     buf2[256];
 666
 667             bEnvSet = (getenv("OMP_NUM_THREADS") != nullptr);
 668
 669             if (bNtOmpOptionSet || bEnvSet)
 670             {
 671                 sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
 672             }
 673             else
 674             {
 675                 sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
 676                         cr->nnodes + cr->npmenodes,
 677                         cr->nnodes + cr->npmenodes == 1 ? "" : "s",
 678                         numTotalThreads > 0 ? numTotalThreads : hwinfo->nthreads_hw_avail,
 679                         hwinfo->nphysicalnode > 1 ? "on a node " : "",
 680                         nth_omp_max);
 681             }
 682             sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
 683                     buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);
 684
 685             /* We can not quit with a fatal error when OMP_NUM_THREADS is set
 686              * with different values per rank or node, since in that case
 687              * the user can not set -ntomp to override the error.
 688              */
 689             if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
 690             {
 691                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
 692             }
 693             else
 694             {
 695                 gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
 696             }
 697         }
 698     }
 699 #else /* GMX_OPENMP && GMX_MPI */
 700       /* No OpenMP and/or MPI: it doesn't make much sense to check */
 701     GMX_UNUSED_VALUE(bNtOmpOptionSet);
 702     GMX_UNUSED_VALUE(numTotalThreads);
 703     GMX_UNUSED_VALUE(willUsePhysicalGpu);
 704     GMX_UNUSED_VALUE(cr);
 705     /* Check if we have more than 1 physical core, if detected,
 706      * or more than 1 hardware thread if physical cores were not detected.
 707      */
 708     if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
 709     {
 710         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
 711     }
 712 #endif /* GMX_OPENMP && GMX_MPI */
 713 }
 714
 715
 716 //! Dump a \c hw_opt to \c fp.
 717 static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
 718 {
 719     fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 720             hw_opt->nthreads_tot,
 721             hw_opt->nthreads_tmpi,
 722             hw_opt->nthreads_omp,
 723             hw_opt->nthreads_omp_pme,
 724             hw_opt->gpuIdTaskAssignment.c_str());
 725 }
 726
 727 void check_and_update_hw_opt_1(gmx_hw_opt_t    *hw_opt,
 728                                const t_commrec *cr,
 729                                int              nPmeRanks)
 730 {
 731     /* Currently hw_opt only contains default settings or settings supplied
 732      * by the user on the command line.
 733      */
 734     if (hw_opt->nthreads_omp < 0)
 735     {
 736         gmx_fatal(FARGS, "The number of OpenMP threads supplied on the command line is %d, which is negative and not allowed", hw_opt->nthreads_omp);
 737     }
 738
 739     /* Check for OpenMP settings stored in environment variables, which can
 740      * potentially be different on different MPI ranks.
 741      */
 742     gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, SIMMASTER(cr));
 743
 744     /* Check restrictions on the user supplied options before modifying them.
 745      * TODO: Put the user values in a const struct and preserve them.
 746      */
 747 #if !GMX_THREAD_MPI
 748     if (hw_opt->nthreads_tot > 0)
 749     {
 750         gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
 751     }
 752     if (hw_opt->nthreads_tmpi > 0)
 753     {
 754         gmx_fatal(FARGS, "Setting the number of thread-MPI ranks is only supported with thread-MPI and GROMACS was compiled without thread-MPI");
 755     }
 756 #endif
 757
 758     if (bHasOmpSupport)
 759     {
 760         /* Check restrictions on PME thread related options set by the user */
 761
 762         if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 763         {
 764             gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
 765         }
 766
 767         if (hw_opt->nthreads_omp_pme >= 1 &&
 768             hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 769             nPmeRanks <= 0)
 770         {
 771             /* This can result in a fatal error on many MPI ranks,
 772              * but since the thread count can differ per rank,
 773              * we can't easily avoid this.
 774              */
 775             gmx_fatal(FARGS, "You need to explicitly specify the number of PME ranks (-npme) when using different number of OpenMP threads for PP and PME ranks");
 776         }
 777     }
 778     else
 779     {
 780         /* GROMACS was configured without OpenMP support */
 781
 782         if (hw_opt->nthreads_omp > 1 || hw_opt->nthreads_omp_pme > 1)
 783         {
 784             gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but GROMACS was compiled without OpenMP support");
 785         }
 786         hw_opt->nthreads_omp     = 1;
 787         hw_opt->nthreads_omp_pme = 1;
 788     }
 789
 790     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 791     {
 792         /* We have the same number of OpenMP threads for PP and PME ranks,
 793          * thus we can perform several consistency checks.
 794          */
 795         if (hw_opt->nthreads_tmpi > 0 &&
 796             hw_opt->nthreads_omp > 0 &&
 797             hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 798         {
 799             gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI ranks (%d) times the OpenMP threads (%d) requested",
 800                       hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
 801         }
 802
 803         if (hw_opt->nthreads_tmpi > 0 &&
 804             hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 805         {
 806             gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI ranks requested (%d)",
 807                       hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
 808         }
 809
 810         if (hw_opt->nthreads_omp > 0 &&
 811             hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 812         {
 813             gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 814                       hw_opt->nthreads_tot, hw_opt->nthreads_omp);
 815         }
 816     }
 817
 818     if (hw_opt->nthreads_tot > 0)
 819     {
 820         if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
 821         {
 822             gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
 823                       hw_opt->nthreads_omp, hw_opt->nthreads_tot);
 824         }
 825
 826         if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
 827         {
 828             gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
 829                       hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
 830         }
 831     }
 832
 833     if (debug)
 834     {
 835         print_hw_opt(debug, hw_opt);
 836     }
 837
 838     /* Asserting this simplifies the hardware resource division later
 839      * on. */
 840     GMX_RELEASE_ASSERT(!(hw_opt->nthreads_omp_pme >= 1 && hw_opt->nthreads_omp <= 0),
 841                        "PME thread count should only be set when the normal thread count is also set");
 842 }
 843
 844 void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
 845                                int           cutoff_scheme)
 846 {
 847     if (cutoff_scheme == ecutsGROUP)
 848     {
 849         /* We only have OpenMP support for PME only nodes */
 850         if (hw_opt->nthreads_omp > 1)
 851         {
 852             gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 853                       ecutscheme_names[cutoff_scheme],
 854                       ecutscheme_names[ecutsVERLET]);
 855         }
 856         hw_opt->nthreads_omp = 1;
 857     }
 858 }
 859
 860 void check_and_update_hw_opt_3(gmx_hw_opt_t *hw_opt)
 861 {
 862 #if GMX_THREAD_MPI
 863     GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
 864
 865     /* If the user set the total number of threads on the command line
 866      * and did not specify the number of OpenMP threads, set the latter here.
 867      */
 868     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 869     {
 870         hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 871
 872         if (!bHasOmpSupport && hw_opt->nthreads_omp > 1)
 873         {
 874             gmx_fatal(FARGS, "You (indirectly) asked for OpenMP threads by setting -nt > -ntmpi, but GROMACS was compiled without OpenMP support");
 875         }
 876     }
 877 #endif
 878
 879     GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");
 880
 881     /* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
 882     if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 883     {
 884         hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 885     }
 886
 887     if (debug)
 888     {
 889         print_hw_opt(debug, hw_opt);
 890     }
 891 }