2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "threadaffinity.h"
46 #if HAVE_SCHED_AFFINITY
50 #include "thread_mpi/threads.h"
52 #include "gromacs/hardware/hardwaretopology.h"
53 #include "gromacs/hardware/hw_info.h"
54 #include "gromacs/mdtypes/commrec.h"
55 #include "gromacs/utility/basenetwork.h"
56 #include "gromacs/utility/cstringutil.h"
57 #include "gromacs/utility/exceptions.h"
58 #include "gromacs/utility/fatalerror.h"
59 #include "gromacs/utility/gmxassert.h"
60 #include "gromacs/utility/gmxomp.h"
61 #include "gromacs/utility/logger.h"
62 #include "gromacs/utility/physicalnodecommunicator.h"
63 #include "gromacs/utility/programcontext.h"
64 #include "gromacs/utility/smalloc.h"
65 #include "gromacs/utility/unique_cptr.h"
// Default IThreadAffinityAccess implementation: delegates affinity queries
// and pinning requests to the thread-MPI affinity API.
// NOTE(review): the class braces/access specifiers (and any enclosing
// anonymous namespace) are not visible in this view of the file.
70 class DefaultThreadAffinityAccess : public gmx::IThreadAffinityAccess
// Reports whether thread-MPI can set thread affinity on this platform.
73     bool isThreadAffinitySupported() const override
75         return tMPI_Thread_setaffinity_support() == TMPI_SETAFFINITY_SUPPORT_YES;
// Pins the calling thread to logical core \p core via thread-MPI; the tMPI
// return code is captured in \c ret (the conversion to bool and the return
// statement are not visible in this view).
77     bool setCurrentThreadAffinityToCore(int core) override
79         const int ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
84 //! Global instance of DefaultThreadAffinityAccess
85 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
86 DefaultThreadAffinityAccess g_defaultAffinityAccess;
90 gmx::IThreadAffinityAccess::~IThreadAffinityAccess() {}
/* Returns whether an invalid condition was detected anywhere within the
 * simulation, as seen from this rank.
 *
 * With MPI, the local flag is OR-reduced (MPI_LOR) to the simulation master
 * over cr->mpi_comm_mysim; the master returns the reduced result, while
 * non-master ranks return only their local flag. Without MPI, the local
 * flag is returned unchanged.
 * NOTE(review): the preprocessor guards (presumably #if GMX_MPI / #else /
 * #endif) separating the two paths, and the declaration of globalValue, are
 * not visible in this view of the file — confirm against the full source.
 */
92 static bool invalidWithinSimulation(const t_commrec* cr, bool invalidLocally)
97     int value = invalidLocally ? 1 : 0;
99     MPI_Reduce(&value, &globalValue, 1, MPI_INT, MPI_LOR, MASTERRANK(cr), cr->mpi_comm_mysim);
100     return SIMMASTER(cr) ? (globalValue != 0) : invalidLocally;
103     GMX_UNUSED_VALUE(cr);
105     return invalidLocally;
/* Determines a valid thread-pinning layout (offset, stride, locality order)
 * for this node, and returns whether the resulting layout is usable.
 *
 * Validates the user-requested pin offset/stride (fatal error if negative),
 * builds a hardware-locality ordering of logical processors when topology
 * information is available, cross-checks the thread count against the
 * available hardware threads (emitting at most one warning per node), and
 * auto-selects a pinning stride when the user requested stride 0.
 *
 * NOTE(review): several parameters used below (cr, threads, pin_offset,
 * pin_stride, localityOrder, issuedWarning) and various braces/else branches
 * are not visible in this decimated view — confirm signatures and control
 * flow against the full source.
 */
108 static bool get_thread_affinity_layout(const gmx::MDLogger& mdlog,
110                                        const gmx::HardwareTopology& hwTop,
112                                        bool affinityIsAutoAndNumThreadsIsNotAuto,
119     int hwThreadsPerCore = 0;
// Basic support level or better means socket/core/hwthread info is usable.
124     haveTopology = (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic);
128         gmx_fatal(FARGS, "Negative thread pinning offset requested");
132         gmx_fatal(FARGS, "Negative thread pinning stride requested");
137         hwThreads = hwTop.machine().logicalProcessorCount;
138         // Just use the value for the first core
139         hwThreadsPerCore = hwTop.machine().sockets[0].cores[0].hwThreads.size();
140         snew(*localityOrder, hwThreads);
// Flatten the socket->core->hwthread hierarchy into a locality-ordered list
// of logical processor ids, so consecutive pinned threads stay close.
142         for (const auto& s : hwTop.machine().sockets)
144             for (const auto& c : s.cores)
146                 for (const auto& t : c.hwThreads)
148                     (*localityOrder)[i++] = t.logicalProcessorId;
155         /* topology information not available or invalid, ignore it */
156         hwThreads = hwTop.machine().logicalProcessorCount;
157         *localityOrder = nullptr;
159     // Only warn about the first problem per node. Otherwise, the first test
160     // failing would essentially always cause also the other problems get
161     // reported, leading to bogus warnings. The order in the conditionals
162     // with this variable is important, since the MPI_Reduce() in
163     // invalidWithinSimulation() needs to always happen.
164     bool alreadyWarned = false;
165     invalidValue = (hwThreads <= 0);
166     if (invalidWithinSimulation(cr, invalidValue))
168         /* We don't know anything about the hardware, don't pin */
169         GMX_LOG(mdlog.warning)
171                 .appendText("NOTE: No information on available cores, thread pinning disabled.");
172         alreadyWarned = true;
174     bool validLayout = !invalidValue;
176     if (affinityIsAutoAndNumThreadsIsNotAuto)
178         invalidValue = (threads != hwThreads);
// Only warn for a genuine partial-use case: more than one thread but fewer
// than the hardware offers; exact match or oversubscription is handled below.
179         bool warn = (invalidValue && threads > 1 && threads < hwThreads);
180         if (invalidWithinSimulation(cr, warn) && !alreadyWarned)
182             GMX_LOG(mdlog.warning)
185                             "NOTE: The number of threads is not equal to the number of (logical) "
187                             " and the -pin option is set to auto: will not pin threads to "
189                             " This can lead to significant performance degradation.\n"
190                             " Consider using -pin on (and -pinoffset in case you run multiple "
192             alreadyWarned = true;
194         validLayout = validLayout && !invalidValue;
197     invalidValue = (threads > hwThreads);
198     if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
200         GMX_LOG(mdlog.warning)
202                 .appendText("NOTE: Oversubscribing the CPU, will not pin threads");
203         alreadyWarned = true;
205     validLayout = validLayout && !invalidValue;
207     invalidValue = (pin_offset + threads > hwThreads);
208     if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
210         GMX_LOG(mdlog.warning)
213                         "WARNING: Requested offset too large for available cores, thread pinning "
215         alreadyWarned = true;
217     validLayout = validLayout && !invalidValue;
219     invalidValue = false;
220     /* do we need to choose the pinning stride? */
221     bPickPinStride = (*pin_stride == 0);
225         if (haveTopology && pin_offset + threads * hwThreadsPerCore <= hwThreads)
227             /* Put one thread on each physical core */
228             *pin_stride = hwThreadsPerCore;
232             /* We don't know if we have SMT, and if we do, we don't know
233              * if hw threads in the same physical core are consecutive.
234              * Without SMT the pinning layout should not matter too much,
235              * so we assume a consecutive layout and maximally spread out
236              * the threads at equal threads per core.
237              * Note that IBM is the major non-x86 case with cpuid support
238              * and probably threads are already pinned by the queuing system,
239              * so we wouldn't end up here in the first place.
241             *pin_stride = (hwThreads - pin_offset) / threads;
246         /* Check the placement of the thread with the largest index to make sure
247          * that the offset & stride doesn't cause pinning beyond the last hardware thread. */
248         invalidValue = (pin_offset + (threads - 1) * (*pin_stride) >= hwThreads);
250     if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
252         /* We are oversubscribing, don't pin */
253         GMX_LOG(mdlog.warning)
256                         "WARNING: Requested stride too large for available cores, thread pinning "
258         alreadyWarned = true;
260     validLayout = validLayout && !invalidValue;
265                 .appendTextFormatted("Pinning threads with a%s logical core stride of %d",
266                                      bPickPinStride ? "n auto-selected" : " user-specified",
// Report to the caller whether any warning was already issued, so it does
// not duplicate diagnostics.
270     *issuedWarning = alreadyWarned;
/* Pins each of this rank's threads to a core and returns whether every
 * thread was pinned successfully.
 *
 * Inside an OpenMP parallel region each thread computes its node-wide index
 * (intraNodeThreadOffset + local thread id), maps it through offset, stride
 * and the optional localityOrder table, and asks affinityAccess to pin it.
 * Successes are counted via an OpenMP reduction; if some threads failed, a
 * diagnostic naming the rank and the failing thread count is printed.
 *
 * NOTE(review): the declarations of index/core, several braces, and the
 * preprocessor guards around the rank-info sprintf calls are not visible in
 * this decimated view — confirm against the full source.
 */
275 static bool set_affinity(const t_commrec* cr,
277                          int intraNodeThreadOffset,
279                          int core_pinning_stride,
280                          const int* localityOrder,
281                          gmx::IThreadAffinityAccess* affinityAccess)
283     // Set the per-thread affinity. In order to be able to check the success
284     // of affinity settings, we will set nth_affinity_set to 1 on threads
285     // where the affinity setting succeeded and to 0 where it failed.
286     // Reducing these 0/1 values over the threads will give the total number
287     // of threads on which we succeeded.
289     // To avoid warnings from the static analyzer we initialize nth_affinity_set
290     // to zero outside the OpenMP block, and then add to it inside the block.
291     // The value will still always be 0 or 1 from each thread.
292     int nth_affinity_set = 0;
293 #pragma omp parallel num_threads(nthread_local) reduction(+ : nth_affinity_set)
297         int thread_id, thread_id_node;
300         thread_id = gmx_omp_get_thread_num();
301         thread_id_node = intraNodeThreadOffset + thread_id;
302         index = offset + thread_id_node * core_pinning_stride;
303         if (localityOrder != nullptr)
// Remap the linear index to a hardware-locality-ordered logical core id.
305             core = localityOrder[index];
312         const bool ret = affinityAccess->setCurrentThreadAffinityToCore(core);
314         /* store the per-thread success-values of the setaffinity */
315         nth_affinity_set += (ret ? 1 : 0);
320                     "On rank %2d, thread %2d, index %2d, core %2d the affinity setting "
323                     gmx_omp_get_thread_num(),
329     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
// Sanity check: the reduction can never legitimately exceed the thread count.
332     if (nth_affinity_set > nthread_local)
337                 "Looks like we have set affinity for more threads than "
338                 "we have (%d > %d)!\n",
344     /* check & warn if some threads failed to set their affinities */
345     const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
346     if (!allAffinitiesSet)
348         char sbuf1[STRLEN], sbuf2[STRLEN];
350         /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
351         sbuf1[0] = sbuf2[0] = '\0';
352         /* Only add rank info if we have more than one rank. */
357             sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
358 #    else /* GMX_LIB_MPI */
359             sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
364         if (nthread_local > 1)
367                     "for %d/%d thread%s ",
368                     nthread_local - nth_affinity_set,
370                     nthread_local > 1 ? "s" : "");
373         // TODO: This output should also go through mdlog.
374         fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
376     return allAffinitiesSet;
/* Computes, for this rank, the total number of threads on its physical node
 * and this rank's thread offset within that node.
 *
 * Single-rank nodes get offset 0 and the rank's own thread count. With more
 * than one rank on the node, MPI_Scan produces an inclusive prefix sum of
 * thread counts which is converted to an exclusive offset, and an allreduce
 * (the MPI_Allreduce call whose name is cut from this view at the line
 * producing numThreadsOnThisNode) yields the node-wide total.
 * NOTE(review): the #if GMX_MPI guards around the MPI section are not
 * visible in this decimated view — confirm against the full source.
 */
379 void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator& physicalNodeComm,
380                               int numThreadsOnThisRank,
381                               int* numThreadsOnThisNode,
382                               int* intraNodeThreadOffset)
384     *intraNodeThreadOffset = 0;
385     *numThreadsOnThisNode = numThreadsOnThisRank;
387     if (physicalNodeComm.size_ > 1)
389         /* We need to determine a scan of the thread counts in this
391         MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
392         /* MPI_Scan is inclusive, but here we need exclusive */
393         *intraNodeThreadOffset -= numThreadsOnThisRank;
394         /* Get the total number of threads on this physical node */
396                 &numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
399     GMX_UNUSED_VALUE(physicalNodeComm);
403 /* Set CPU affinity. Can be important for performance.
404    On some systems (e.g. Cray) CPU Affinity is set by default.
405    But default assigning doesn't work (well) with only some ranks
406    having threads. This causes very low performance.
407    External tools have cumbersome syntax for setting affinity
408    in the case that only some ranks have threads.
409    Thus it is important that GROMACS sets the affinity internally
410    if only PME is using threads.
/*
 * Top-level entry point: resolves the affinity backend, computes a pinning
 * layout via get_thread_affinity_layout(), applies it via set_affinity(),
 * and warns (once per simulation) when pinning could not be done.
 * NOTE(review): early returns after the Off/unsupported checks, the cr and
 * issuedWarning declarations, and several argument lines of the
 * get_thread_affinity_layout() call are not visible in this decimated view.
 */
412 void gmx_set_thread_affinity(const gmx::MDLogger& mdlog,
414                              const gmx_hw_opt_t* hw_opt,
415                              const gmx::HardwareTopology& hwTop,
416                              int numThreadsOnThisRank,
417                              int numThreadsOnThisNode,
418                              int intraNodeThreadOffset,
419                              gmx::IThreadAffinityAccess* affinityAccess)
421     int* localityOrder = nullptr;
// User explicitly disabled pinning; nothing to do.
423     if (hw_opt->threadAffinity == ThreadAffinity::Off)
// Allow tests to inject a mock backend; fall back to the tMPI-based default.
429     if (affinityAccess == nullptr)
431         affinityAccess = &g_defaultAffinityAccess;
434     /* If the tMPI thread affinity setting is not supported encourage the user
435      * to report it as it's either a bug or an exotic platform which we might
436      * want to support. */
437     if (!affinityAccess->isThreadAffinitySupported())
439         /* we know Mac OS does not support setting thread affinity, so there's
440            no point in warning the user in that case. In any other case
441            the user might be able to do something about it. */
442 #if !defined(__APPLE__)
443         GMX_LOG(mdlog.warning)
445                 .appendText("NOTE: Cannot set thread affinities on the current platform.");
446 #endif /* __APPLE__ */
450     int offset = hw_opt->core_pinning_offset;
451     int core_pinning_stride = hw_opt->core_pinning_stride;
454         GMX_LOG(mdlog.warning).appendTextFormatted("Applying core pinning offset %d", offset);
// -pin auto with an explicit -nt: layout checks are stricter in this case.
457     bool affinityIsAutoAndNumThreadsIsNotAuto =
458             (hw_opt->threadAffinity == ThreadAffinity::Auto && !hw_opt->totNumThreadsIsAuto);
460     bool validLayout = get_thread_affinity_layout(mdlog,
463                                                   numThreadsOnThisNode,
464                                                   affinityIsAutoAndNumThreadsIsNotAuto,
466                                                   &core_pinning_stride,
// RAII guard sfree()s the snew()-allocated locality table on every exit path.
469     const gmx::sfree_guard localityOrderGuard(localityOrder);
471     bool allAffinitiesSet;
474         allAffinitiesSet = set_affinity(
475                 cr, numThreadsOnThisRank, intraNodeThreadOffset, offset, core_pinning_stride, localityOrder, affinityAccess);
479         // Produce the warning if any rank fails.
480         allAffinitiesSet = false;
482     if (invalidWithinSimulation(cr, !allAffinitiesSet) && !issuedWarning)
484         GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Thread affinity was not set.");
488 /* Detects and returns whether we have the default affinity mask
490  * Returns true when we can query thread affinities and CPU count is
491  * consistent and we have default affinity mask on all ranks.
493  * Should be called simultaneously by all MPI ranks.
495 static bool detectDefaultAffinityMask(const int nthreads_hw_avail)
497     bool detectedDefaultAffinityMask = true;
499 #if HAVE_SCHED_AFFINITY
500     cpu_set_t mask_current;
501     CPU_ZERO(&mask_current);
// sched_getaffinity(0, ...) queries the calling thread's current mask.
// NOTE(review): the declaration of 'ret' is not visible in this view.
503     if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
505         /* failed to query affinity mask, will just return */
508             fprintf(debug, "Failed to query affinity mask (error %d)", ret);
510         detectedDefaultAffinityMask = false;
513     /* Before proceeding with the actual check, make sure that the number of
514      * detected CPUs is >= the CPUs in the current set.
515      * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
// A set larger than the detected hardware means the topology detection and
// the mask disagree; treat the mask as non-default.
517     if (detectedDefaultAffinityMask && nthreads_hw_avail < CPU_COUNT(&mask_current))
522                     "%d hardware threads detected, but %d was returned by CPU_COUNT",
524                     CPU_COUNT(&mask_current));
526         detectedDefaultAffinityMask = false;
528 #    endif /* CPU_COUNT */
530     if (detectedDefaultAffinityMask)
532         /* Here we check whether all bits of the affinity mask are set.
533          * Note that this mask can change while the program is executing,
534          * e.g. by the system dynamically enabling or disabling cores or
535          * by some scheduler changing the affinities. Thus we can not assume
536          * that the result is the same on all ranks within a node
537          * when running this code at different times.
539         bool allBitsAreSet = true;
540         for (int i = 0; (i < nthreads_hw_avail && i < CPU_SETSIZE); i++)
542             allBitsAreSet = allBitsAreSet && (CPU_ISSET(i, &mask_current) != 0);
546             fprintf(debug, "%s affinity mask found\n", allBitsAreSet ? "Default" : "Non-default");
550             detectedDefaultAffinityMask = false;
554     GMX_UNUSED_VALUE(nthreads_hw_avail);
555 #endif /* HAVE_SCHED_AFFINITY */
// Make the answer globally consistent: every rank must see "default" for the
// combined (MPI_LAND) result to be true. Only valid after MPI_Init.
558     int mpiIsInitialized;
559     MPI_Initialized(&mpiIsInitialized);
560     if (mpiIsInitialized)
562         bool maskToReduce = detectedDefaultAffinityMask;
563         MPI_Allreduce(&maskToReduce, &detectedDefaultAffinityMask, 1, MPI_C_BOOL, MPI_LAND, MPI_COMM_WORLD);
567     return detectedDefaultAffinityMask;
570 /* Check the process affinity mask and if it is found to be non-zero,
571 * will honor it and disable mdrun internal affinity setting.
572 * Note that this will only work on Linux as we use a GNU feature.
574 void gmx_check_thread_affinity_set(const gmx::MDLogger& mdlog,
575 gmx_hw_opt_t* hw_opt,
576 int gmx_unused nthreads_hw_avail,
577 gmx_bool bAfterOpenmpInit)
579 GMX_RELEASE_ASSERT(hw_opt, "hw_opt must be a non-NULL pointer");
581 if (!bAfterOpenmpInit)
583 /* Check for externally set OpenMP affinity and turn off internal
584 * pinning if any is found. We need to do this check early to tell
585 * thread-MPI whether it should do pinning when spawning threads.
586 * TODO: the above no longer holds, we should move these checks later
588 if (hw_opt->threadAffinity != ThreadAffinity::Off)
591 if (!gmx_omp_check_thread_affinity(&message))
593 /* We only pin automatically with totNumThreadsIsAuto=true */
594 if (hw_opt->threadAffinity == ThreadAffinity::On || hw_opt->totNumThreadsIsAuto)
596 GMX_LOG(mdlog.warning).asParagraph().appendText(message);
599 hw_opt->threadAffinity = ThreadAffinity::Off;
604 if (!detectDefaultAffinityMask(nthreads_hw_avail))
606 if (hw_opt->threadAffinity == ThreadAffinity::Auto)
608 if (!bAfterOpenmpInit)
610 GMX_LOG(mdlog.warning)
613 "Non-default thread affinity set, disabling internal thread "
618 GMX_LOG(mdlog.warning)
621 "Non-default thread affinity set probably by the OpenMP library,\n"
622 "disabling internal thread affinity");
624 hw_opt->threadAffinity = ThreadAffinity::Off;
628 /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
629 if (bAfterOpenmpInit)
631 GMX_LOG(mdlog.warning)
633 .appendTextFormatted("Overriding thread affinity set outside %s",
634 gmx::getProgramContext().displayName());