src/gromacs/mdrunutility/threadaffinity.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "threadaffinity.h"
  38
  39 #include "config.h"
  40
  41 #include <cerrno>
  42 #include <cstdio>
  43 #include <cstring>
  44
  45 #if HAVE_SCHED_AFFINITY
  46 #  include <sched.h>
  47 #  include <sys/syscall.h>
  48 #endif
  49
  50 #include "thread_mpi/threads.h"
  51
  52 #include "gromacs/hardware/hardwaretopology.h"
  53 #include "gromacs/hardware/hw_info.h"
  54 #include "gromacs/mdtypes/commrec.h"
  55 #include "gromacs/utility/basenetwork.h"
  56 #include "gromacs/utility/cstringutil.h"
  57 #include "gromacs/utility/exceptions.h"
  58 #include "gromacs/utility/fatalerror.h"
  59 #include "gromacs/utility/gmxassert.h"
  60 #include "gromacs/utility/gmxomp.h"
  61 #include "gromacs/utility/logger.h"
  62 #include "gromacs/utility/physicalnodecommunicator.h"
  63 #include "gromacs/utility/programcontext.h"
  64 #include "gromacs/utility/smalloc.h"
  65 #include "gromacs/utility/unique_cptr.h"
  66
  67 namespace
  68 {
  69
  70 class DefaultThreadAffinityAccess : public gmx::IThreadAffinityAccess
  71 {
  72     public:
  73         virtual bool isThreadAffinitySupported() const
  74         {
  75             return tMPI_Thread_setaffinity_support() == TMPI_SETAFFINITY_SUPPORT_YES;
  76         }
  77         virtual bool setCurrentThreadAffinityToCore(int core)
  78         {
  79             const int ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
  80             return ret == 0;
  81         }
  82 };
  83
  84 //! Global instance of DefaultThreadAffinityAccess
  85 DefaultThreadAffinityAccess g_defaultAffinityAccess;
  86
  87 } // namespace
  88
  89 gmx::IThreadAffinityAccess::~IThreadAffinityAccess()
  90 {
  91 }
  92
  93 static bool invalidWithinSimulation(const t_commrec *cr, bool invalidLocally)
  94 {
  95 #if GMX_MPI
  96     if (cr->nnodes > 1)
  97     {
  98         int value = invalidLocally ? 1 : 0;
  99         int globalValue;
 100         MPI_Reduce(&value, &globalValue, 1, MPI_INT, MPI_LOR, MASTERRANK(cr),
 101                    cr->mpi_comm_mysim);
 102         return SIMMASTER(cr) ? (globalValue != 0) : invalidLocally;
 103     }
 104 #else
 105     GMX_UNUSED_VALUE(cr);
 106 #endif
 107     return invalidLocally;
 108 }
 109
 110 static bool
 111 get_thread_affinity_layout(const gmx::MDLogger &mdlog,
 112                            const t_commrec *cr,
 113                            const gmx::HardwareTopology &hwTop,
 114                            int   threads,
 115                            bool  affinityIsAutoAndNumThreadsIsNotAuto,
 116                            int pin_offset, int * pin_stride,
 117                            int **localityOrder,
 118                            bool *issuedWarning)
 119 {
 120     int                          hwThreads;
 121     int                          hwThreadsPerCore = 0;
 122     bool                         bPickPinStride;
 123     bool                         haveTopology;
 124     bool                         invalidValue;
 125
 126     haveTopology = (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic);
 127
 128     if (pin_offset < 0)
 129     {
 130         gmx_fatal(FARGS, "Negative thread pinning offset requested");
 131     }
 132     if (*pin_stride < 0)
 133     {
 134         gmx_fatal(FARGS, "Negative thread pinning stride requested");
 135     }
 136
 137     if (haveTopology)
 138     {
 139         hwThreads           = hwTop.machine().logicalProcessorCount;
 140         // Just use the value for the first core
 141         hwThreadsPerCore    = hwTop.machine().sockets[0].cores[0].hwThreads.size();
 142         snew(*localityOrder, hwThreads);
 143         int i = 0;
 144         for (auto &s : hwTop.machine().sockets)
 145         {
 146             for (auto &c : s.cores)
 147             {
 148                 for (auto &t : c.hwThreads)
 149                 {
 150                     (*localityOrder)[i++] = t.logicalProcessorId;
 151                 }
 152             }
 153         }
 154     }
 155     else
 156     {
 157         /* topology information not available or invalid, ignore it */
 158         hwThreads       = hwTop.machine().logicalProcessorCount;
 159         *localityOrder  = nullptr;
 160     }
 161     // Only warn about the first problem per node.  Otherwise, the first test
 162     // failing would essentially always cause also the other problems get
 163     // reported, leading to bogus warnings.  The order in the conditionals
 164     // with this variable is important, since the MPI_Reduce() in
 165     // invalidWithinSimulation() needs to always happen.
 166     bool alreadyWarned = false;
 167     invalidValue = (hwThreads <= 0);
 168     if (invalidWithinSimulation(cr, invalidValue))
 169     {
 170         /* We don't know anything about the hardware, don't pin */
 171         GMX_LOG(mdlog.warning).asParagraph().appendText(
 172                 "NOTE: No information on available cores, thread pinning disabled.");
 173         alreadyWarned = true;
 174     }
 175     bool validLayout = !invalidValue;
 176
 177     if (affinityIsAutoAndNumThreadsIsNotAuto)
 178     {
 179         invalidValue = (threads != hwThreads);
 180         bool warn = (invalidValue && threads > 1 && threads < hwThreads);
 181         if (invalidWithinSimulation(cr, warn) && !alreadyWarned)
 182         {
 183             GMX_LOG(mdlog.warning).asParagraph().appendText(
 184                     "NOTE: The number of threads is not equal to the number of (logical) cores\n"
 185                     "      and the -pin option is set to auto: will not pin threads to cores.\n"
 186                     "      This can lead to significant performance degradation.\n"
 187                     "      Consider using -pin on (and -pinoffset in case you run multiple jobs).");
 188             alreadyWarned = true;
 189         }
 190         validLayout = validLayout && !invalidValue;
 191     }
 192
 193     invalidValue = (threads > hwThreads);
 194     if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
 195     {
 196         GMX_LOG(mdlog.warning).asParagraph().appendText(
 197                 "NOTE: Oversubscribing the CPU, will not pin threads");
 198         alreadyWarned = true;
 199     }
 200     validLayout = validLayout && !invalidValue;
 201
 202     invalidValue = (pin_offset + threads > hwThreads);
 203     if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
 204     {
 205         GMX_LOG(mdlog.warning).asParagraph().appendText(
 206                 "WARNING: Requested offset too large for available cores, thread pinning disabled.");
 207         alreadyWarned = true;
 208
 209     }
 210     validLayout = validLayout && !invalidValue;
 211
 212     invalidValue   = false;
 213     /* do we need to choose the pinning stride? */
 214     bPickPinStride = (*pin_stride == 0);
 215
 216     if (bPickPinStride)
 217     {
 218         if (haveTopology && pin_offset + threads*hwThreadsPerCore <= hwThreads)
 219         {
 220             /* Put one thread on each physical core */
 221             *pin_stride = hwThreadsPerCore;
 222         }
 223         else
 224         {
 225             /* We don't know if we have SMT, and if we do, we don't know
 226              * if hw threads in the same physical core are consecutive.
 227              * Without SMT the pinning layout should not matter too much.
 228              * so we assume a consecutive layout and maximally spread out"
 229              * the threads at equal threads per core.
 230              * Note that IBM is the major non-x86 case with cpuid support
 231              * and probably threads are already pinned by the queuing system,
 232              * so we wouldn't end up here in the first place.
 233              */
 234             *pin_stride = (hwThreads - pin_offset)/threads;
 235         }
 236     }
 237     else
 238     {
 239         /* Check the placement of the thread with the largest index to make sure
 240          * that the offset & stride doesn't cause pinning beyond the last hardware thread. */
 241         invalidValue = (pin_offset + (threads-1)*(*pin_stride) >= hwThreads);
 242     }
 243     if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
 244     {
 245         /* We are oversubscribing, don't pin */
 246         GMX_LOG(mdlog.warning).asParagraph().appendText(
 247                 "WARNING: Requested stride too large for available cores, thread pinning disabled.");
 248         alreadyWarned = true;
 249     }
 250     validLayout = validLayout && !invalidValue;
 251
 252     if (validLayout)
 253     {
 254         GMX_LOG(mdlog.info).appendTextFormatted(
 255                 "Pinning threads with a%s logical core stride of %d",
 256                 bPickPinStride ? "n auto-selected" : " user-specified",
 257                 *pin_stride);
 258     }
 259
 260     *issuedWarning = alreadyWarned;
 261
 262     return validLayout;
 263 }
 264
 265 static bool set_affinity(const t_commrec *cr, int nthread_local, int intraNodeThreadOffset,
 266                          int offset, int core_pinning_stride, int *localityOrder,
 267                          gmx::IThreadAffinityAccess *affinityAccess)
 268 {
 269     // Set the per-thread affinity. In order to be able to check the success
 270     // of affinity settings, we will set nth_affinity_set to 1 on threads
 271     // where the affinity setting succeded and to 0 where it failed.
 272     // Reducing these 0/1 values over the threads will give the total number
 273     // of threads on which we succeeded.
 274
 275     // To avoid warnings from the static analyzer we initialize nth_affinity_set
 276     // to zero outside the OpenMP block, and then add to it inside the block.
 277     // The value will still always be 0 or 1 from each thread.
 278     int nth_affinity_set = 0;
 279 #pragma omp parallel num_threads(nthread_local) reduction(+:nth_affinity_set)
 280     {
 281         try
 282         {
 283             int      thread_id, thread_id_node;
 284             int      index, core;
 285
 286             thread_id      = gmx_omp_get_thread_num();
 287             thread_id_node = intraNodeThreadOffset + thread_id;
 288             index          = offset + thread_id_node*core_pinning_stride;
 289             if (localityOrder != nullptr)
 290             {
 291                 core = localityOrder[index];
 292             }
 293             else
 294             {
 295                 core = index;
 296             }
 297
 298             const bool ret = affinityAccess->setCurrentThreadAffinityToCore(core);
 299
 300             /* store the per-thread success-values of the setaffinity */
 301             nth_affinity_set += (ret ? 1 : 0);
 302
 303             if (debug)
 304             {
 305                 fprintf(debug, "On rank %2d, thread %2d, index %2d, core %2d the affinity setting returned %d\n",
 306                         cr->nodeid, gmx_omp_get_thread_num(), index, core, ret ? 1 : 0);
 307             }
 308         }
 309         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 310     }
 311
 312     if (nth_affinity_set > nthread_local)
 313     {
 314         char msg[STRLEN];
 315
 316         sprintf(msg, "Looks like we have set affinity for more threads than "
 317                 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
 318         gmx_incons(msg);
 319     }
 320
 321     /* check & warn if some threads failed to set their affinities */
 322     const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
 323     if (!allAffinitiesSet)
 324     {
 325         char sbuf1[STRLEN], sbuf2[STRLEN];
 326
 327         /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
 328         sbuf1[0] = sbuf2[0] = '\0';
 329         /* Only add rank info if we have more than one rank. */
 330         if (cr->nnodes > 1)
 331         {
 332 #if GMX_MPI
 333 #if GMX_THREAD_MPI
 334             sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
 335 #else           /* GMX_LIB_MPI */
 336             sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
 337 #endif
 338 #endif          /* GMX_MPI */
 339         }
 340
 341         if (nthread_local > 1)
 342         {
 343             sprintf(sbuf2, "for %d/%d thread%s ",
 344                     nthread_local - nth_affinity_set, nthread_local,
 345                     nthread_local > 1 ? "s" : "");
 346         }
 347
 348         // TODO: This output should also go through mdlog.
 349         fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
 350     }
 351     return allAffinitiesSet;
 352 }
 353
 354 void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator &physicalNodeComm,
 355                               int                                  numThreadsOnThisRank,
 356                               int                                 *numThreadsOnThisNode,
 357                               int                                 *intraNodeThreadOffset)
 358 {
 359     *intraNodeThreadOffset                  = 0;
 360     *numThreadsOnThisNode                   = numThreadsOnThisRank;
 361 #if GMX_MPI
 362     if (physicalNodeComm.size_ > 1)
 363     {
 364         /* We need to determine a scan of the thread counts in this
 365          * compute node. */
 366         MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
 367         /* MPI_Scan is inclusive, but here we need exclusive */
 368         *intraNodeThreadOffset -= numThreadsOnThisRank;
 369         /* Get the total number of threads on this physical node */
 370         MPI_Allreduce(&numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
 371     }
 372 #else
 373     GMX_UNUSED_VALUE(physicalNodeComm);
 374 #endif
 375
 376 }
 377
 378 /* Set CPU affinity. Can be important for performance.
 379    On some systems (e.g. Cray) CPU Affinity is set by default.
 380    But default assigning doesn't work (well) with only some ranks
 381    having threads. This causes very low performance.
 382    External tools have cumbersome syntax for setting affinity
 383    in the case that only some ranks have threads.
 384    Thus it is important that GROMACS sets the affinity internally
 385    if only PME is using threads.
 386  */
 387 void
 388 gmx_set_thread_affinity(const gmx::MDLogger         &mdlog,
 389                         const t_commrec             *cr,
 390                         const gmx_hw_opt_t          *hw_opt,
 391                         const gmx::HardwareTopology &hwTop,
 392                         int                          numThreadsOnThisRank,
 393                         int                          numThreadsOnThisNode,
 394                         int                          intraNodeThreadOffset,
 395                         gmx::IThreadAffinityAccess  *affinityAccess)
 396 {
 397     int *localityOrder = nullptr;
 398
 399     if (hw_opt->thread_affinity == threadaffOFF)
 400     {
 401         /* Nothing to do */
 402         return;
 403     }
 404
 405     if (affinityAccess == nullptr)
 406     {
 407         affinityAccess = &g_defaultAffinityAccess;
 408     }
 409
 410     /* If the tMPI thread affinity setting is not supported encourage the user
 411      * to report it as it's either a bug or an exotic platform which we might
 412      * want to support. */
 413     if (!affinityAccess->isThreadAffinitySupported())
 414     {
 415         /* we know Mac OS does not support setting thread affinity, so there's
 416            no point in warning the user in that case. In any other case
 417            the user might be able to do something about it. */
 418 #if !defined(__APPLE__)
 419         GMX_LOG(mdlog.warning).asParagraph().appendText(
 420                 "NOTE: Cannot set thread affinities on the current platform.");
 421 #endif  /* __APPLE__ */
 422         return;
 423     }
 424
 425     int  offset              = hw_opt->core_pinning_offset;
 426     int  core_pinning_stride = hw_opt->core_pinning_stride;
 427     if (offset != 0)
 428     {
 429         GMX_LOG(mdlog.warning).appendTextFormatted("Applying core pinning offset %d", offset);
 430     }
 431
 432     bool affinityIsAutoAndNumThreadsIsNotAuto =
 433         (hw_opt->thread_affinity == threadaffAUTO &&
 434          !hw_opt->totNumThreadsIsAuto);
 435     bool issuedWarning;
 436     bool validLayout
 437         = get_thread_affinity_layout(mdlog, cr, hwTop, numThreadsOnThisNode,
 438                                      affinityIsAutoAndNumThreadsIsNotAuto,
 439                                      offset, &core_pinning_stride, &localityOrder,
 440                                      &issuedWarning);
 441     const gmx::sfree_guard  localityOrderGuard(localityOrder);
 442
 443     bool                    allAffinitiesSet;
 444     if (validLayout)
 445     {
 446         allAffinitiesSet = set_affinity(cr, numThreadsOnThisRank, intraNodeThreadOffset,
 447                                         offset, core_pinning_stride, localityOrder,
 448                                         affinityAccess);
 449     }
 450     else
 451     {
 452         // Produce the warning if any rank fails.
 453         allAffinitiesSet = false;
 454     }
 455     if (invalidWithinSimulation(cr, !allAffinitiesSet) && !issuedWarning)
 456     {
 457         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Thread affinity was not set.");
 458     }
 459 }
 460
 461 /* Check the process affinity mask and if it is found to be non-zero,
 462  * will honor it and disable mdrun internal affinity setting.
 463  * Note that this will only work on Linux as we use a GNU feature.
 464  */
 465 void
 466 gmx_check_thread_affinity_set(const gmx::MDLogger &mdlog,
 467                               const t_commrec     *cr,
 468                               gmx_hw_opt_t        *hw_opt,
 469                               int  gmx_unused      nthreads_hw_avail,
 470                               gmx_bool             bAfterOpenmpInit)
 471 {
 472     GMX_RELEASE_ASSERT(hw_opt, "hw_opt must be a non-NULL pointer");
 473
 474     if (!bAfterOpenmpInit)
 475     {
 476         /* Check for externally set OpenMP affinity and turn off internal
 477          * pinning if any is found. We need to do this check early to tell
 478          * thread-MPI whether it should do pinning when spawning threads.
 479          * TODO: the above no longer holds, we should move these checks later
 480          */
 481         if (hw_opt->thread_affinity != threadaffOFF)
 482         {
 483             char *message;
 484             if (!gmx_omp_check_thread_affinity(&message))
 485             {
 486                 /* We only pin automatically with totNumThreadsIsAuto=true */
 487                 if (hw_opt->thread_affinity == threadaffON ||
 488                     hw_opt->totNumThreadsIsAuto)
 489                 {
 490                     GMX_LOG(mdlog.warning).asParagraph().appendText(message);
 491                 }
 492                 sfree(message);
 493                 hw_opt->thread_affinity = threadaffOFF;
 494             }
 495         }
 496
 497         /* With thread-MPI this is needed as pinning might get turned off,
 498          * which needs to be known before starting thread-MPI.
 499          * With thread-MPI hw_opt is processed here on the master rank
 500          * and passed to the other ranks later, so we only do this on master.
 501          */
 502         if (!SIMMASTER(cr))
 503         {
 504             return;
 505         }
 506 #if !GMX_THREAD_MPI
 507         return;
 508 #endif
 509     }
 510
 511 #if HAVE_SCHED_AFFINITY
 512     int       ret;
 513     cpu_set_t mask_current;
 514
 515     if (hw_opt->thread_affinity == threadaffOFF)
 516     {
 517         /* internal affinity setting is off, don't bother checking process affinity */
 518         return;
 519     }
 520
 521     CPU_ZERO(&mask_current);
 522     if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
 523     {
 524         /* failed to query affinity mask, will just return */
 525         if (debug)
 526         {
 527             fprintf(debug, "Failed to query affinity mask (error %d)", ret);
 528         }
 529         return;
 530     }
 531
 532     /* Before proceeding with the actual check, make sure that the number of
 533      * detected CPUs is >= the CPUs in the current set.
 534      * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
 535 #ifdef CPU_COUNT
 536     if (nthreads_hw_avail < CPU_COUNT(&mask_current))
 537     {
 538         if (debug)
 539         {
 540             fprintf(debug, "%d hardware threads detected, but %d was returned by CPU_COUNT",
 541                     nthreads_hw_avail, CPU_COUNT(&mask_current));
 542         }
 543         return;
 544     }
 545 #endif /* CPU_COUNT */
 546
 547     gmx_bool bAllSet = TRUE;
 548     for (int i = 0; (i < nthreads_hw_avail && i < CPU_SETSIZE); i++)
 549     {
 550         bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
 551     }
 552
 553 #if GMX_MPI
 554     int isInitialized;
 555     MPI_Initialized(&isInitialized);
 556     /* Before OpenMP initialization, thread-MPI is not yet initialized.
 557      * With thread-MPI bAllSet will be the same on all MPI-ranks, but the
 558      * MPI_Allreduce then functions as a barrier before setting affinities.
 559      */
 560     if (isInitialized)
 561     {
 562         gmx_bool  bAllSet_All;
 563
 564         MPI_Allreduce(&bAllSet, &bAllSet_All, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
 565         bAllSet = bAllSet_All;
 566     }
 567 #endif
 568
 569     if (!bAllSet)
 570     {
 571         if (hw_opt->thread_affinity == threadaffAUTO)
 572         {
 573             if (!bAfterOpenmpInit)
 574             {
 575                 GMX_LOG(mdlog.warning).asParagraph().appendText(
 576                         "Non-default thread affinity set, disabling internal thread affinity");
 577             }
 578             else
 579             {
 580                 GMX_LOG(mdlog.warning).asParagraph().appendText(
 581                         "Non-default thread affinity set probably by the OpenMP library,\n"
 582                         "disabling internal thread affinity");
 583             }
 584             hw_opt->thread_affinity = threadaffOFF;
 585         }
 586         else
 587         {
 588             /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
 589             if (bAfterOpenmpInit)
 590             {
 591                 GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
 592                         "Overriding thread affinity set outside %s",
 593                         gmx::getProgramContext().displayName());
 594             }
 595         }
 596
 597         if (debug)
 598         {
 599             fprintf(debug, "Non-default affinity mask found\n");
 600         }
 601     }
 602     else
 603     {
 604         if (debug)
 605         {
 606             fprintf(debug, "Default affinity mask found\n");
 607         }
 608     }
 609 #endif /* HAVE_SCHED_AFFINITY */
 610 }