2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "threadaffinity.h"
46 #if HAVE_SCHED_AFFINITY
48 # include <sys/syscall.h>
51 #include "thread_mpi/threads.h"
53 #include "gromacs/hardware/hardwaretopology.h"
54 #include "gromacs/hardware/hw_info.h"
55 #include "gromacs/mdtypes/commrec.h"
56 #include "gromacs/utility/basenetwork.h"
57 #include "gromacs/utility/cstringutil.h"
58 #include "gromacs/utility/exceptions.h"
59 #include "gromacs/utility/fatalerror.h"
60 #include "gromacs/utility/gmxassert.h"
61 #include "gromacs/utility/gmxomp.h"
62 #include "gromacs/utility/logger.h"
63 #include "gromacs/utility/physicalnodecommunicator.h"
64 #include "gromacs/utility/programcontext.h"
65 #include "gromacs/utility/smalloc.h"
66 #include "gromacs/utility/unique_cptr.h"
71 class DefaultThreadAffinityAccess : public gmx::IThreadAffinityAccess
74 bool isThreadAffinitySupported() const override
76 return tMPI_Thread_setaffinity_support() == TMPI_SETAFFINITY_SUPPORT_YES;
78 bool setCurrentThreadAffinityToCore(int core) override
80 const int ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
85 //! Global instance of DefaultThreadAffinityAccess
86 DefaultThreadAffinityAccess g_defaultAffinityAccess;
90 gmx::IThreadAffinityAccess::~IThreadAffinityAccess() {}
92 static bool invalidWithinSimulation(const t_commrec* cr, bool invalidLocally)
97 int value = invalidLocally ? 1 : 0;
99 MPI_Reduce(&value, &globalValue, 1, MPI_INT, MPI_LOR, MASTERRANK(cr), cr->mpi_comm_mysim);
100 return SIMMASTER(cr) ? (globalValue != 0) : invalidLocally;
103 GMX_UNUSED_VALUE(cr);
105 return invalidLocally;
108 static bool get_thread_affinity_layout(const gmx::MDLogger& mdlog,
110 const gmx::HardwareTopology& hwTop,
112 bool affinityIsAutoAndNumThreadsIsNotAuto,
119 int hwThreadsPerCore = 0;
124 haveTopology = (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic);
128 gmx_fatal(FARGS, "Negative thread pinning offset requested");
132 gmx_fatal(FARGS, "Negative thread pinning stride requested");
137 hwThreads = hwTop.machine().logicalProcessorCount;
138 // Just use the value for the first core
139 hwThreadsPerCore = hwTop.machine().sockets[0].cores[0].hwThreads.size();
140 snew(*localityOrder, hwThreads);
142 for (auto& s : hwTop.machine().sockets)
144 for (auto& c : s.cores)
146 for (auto& t : c.hwThreads)
148 (*localityOrder)[i++] = t.logicalProcessorId;
155 /* topology information not available or invalid, ignore it */
156 hwThreads = hwTop.machine().logicalProcessorCount;
157 *localityOrder = nullptr;
159 // Only warn about the first problem per node. Otherwise, the first test
160 // failing would essentially always cause also the other problems get
161 // reported, leading to bogus warnings. The order in the conditionals
162 // with this variable is important, since the MPI_Reduce() in
163 // invalidWithinSimulation() needs to always happen.
164 bool alreadyWarned = false;
165 invalidValue = (hwThreads <= 0);
166 if (invalidWithinSimulation(cr, invalidValue))
168 /* We don't know anything about the hardware, don't pin */
169 GMX_LOG(mdlog.warning)
171 .appendText("NOTE: No information on available cores, thread pinning disabled.");
172 alreadyWarned = true;
174 bool validLayout = !invalidValue;
176 if (affinityIsAutoAndNumThreadsIsNotAuto)
178 invalidValue = (threads != hwThreads);
179 bool warn = (invalidValue && threads > 1 && threads < hwThreads);
180 if (invalidWithinSimulation(cr, warn) && !alreadyWarned)
182 GMX_LOG(mdlog.warning)
185 "NOTE: The number of threads is not equal to the number of (logical) "
187 " and the -pin option is set to auto: will not pin threads to "
189 " This can lead to significant performance degradation.\n"
190 " Consider using -pin on (and -pinoffset in case you run multiple "
192 alreadyWarned = true;
194 validLayout = validLayout && !invalidValue;
197 invalidValue = (threads > hwThreads);
198 if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
200 GMX_LOG(mdlog.warning)
202 .appendText("NOTE: Oversubscribing the CPU, will not pin threads");
203 alreadyWarned = true;
205 validLayout = validLayout && !invalidValue;
207 invalidValue = (pin_offset + threads > hwThreads);
208 if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
210 GMX_LOG(mdlog.warning)
213 "WARNING: Requested offset too large for available cores, thread pinning "
215 alreadyWarned = true;
217 validLayout = validLayout && !invalidValue;
219 invalidValue = false;
220 /* do we need to choose the pinning stride? */
221 bPickPinStride = (*pin_stride == 0);
225 if (haveTopology && pin_offset + threads * hwThreadsPerCore <= hwThreads)
227 /* Put one thread on each physical core */
228 *pin_stride = hwThreadsPerCore;
232 /* We don't know if we have SMT, and if we do, we don't know
233 * if hw threads in the same physical core are consecutive.
234 * Without SMT the pinning layout should not matter too much.
235 * so we assume a consecutive layout and maximally spread out"
236 * the threads at equal threads per core.
237 * Note that IBM is the major non-x86 case with cpuid support
238 * and probably threads are already pinned by the queuing system,
239 * so we wouldn't end up here in the first place.
241 *pin_stride = (hwThreads - pin_offset) / threads;
246 /* Check the placement of the thread with the largest index to make sure
247 * that the offset & stride doesn't cause pinning beyond the last hardware thread. */
248 invalidValue = (pin_offset + (threads - 1) * (*pin_stride) >= hwThreads);
250 if (invalidWithinSimulation(cr, invalidValue) && !alreadyWarned)
252 /* We are oversubscribing, don't pin */
253 GMX_LOG(mdlog.warning)
256 "WARNING: Requested stride too large for available cores, thread pinning "
258 alreadyWarned = true;
260 validLayout = validLayout && !invalidValue;
265 .appendTextFormatted("Pinning threads with a%s logical core stride of %d",
266 bPickPinStride ? "n auto-selected" : " user-specified", *pin_stride);
269 *issuedWarning = alreadyWarned;
274 static bool set_affinity(const t_commrec* cr,
276 int intraNodeThreadOffset,
278 int core_pinning_stride,
279 const int* localityOrder,
280 gmx::IThreadAffinityAccess* affinityAccess)
282 // Set the per-thread affinity. In order to be able to check the success
283 // of affinity settings, we will set nth_affinity_set to 1 on threads
284 // where the affinity setting succeded and to 0 where it failed.
285 // Reducing these 0/1 values over the threads will give the total number
286 // of threads on which we succeeded.
288 // To avoid warnings from the static analyzer we initialize nth_affinity_set
289 // to zero outside the OpenMP block, and then add to it inside the block.
290 // The value will still always be 0 or 1 from each thread.
291 int nth_affinity_set = 0;
292 #pragma omp parallel num_threads(nthread_local) reduction(+ : nth_affinity_set)
296 int thread_id, thread_id_node;
299 thread_id = gmx_omp_get_thread_num();
300 thread_id_node = intraNodeThreadOffset + thread_id;
301 index = offset + thread_id_node * core_pinning_stride;
302 if (localityOrder != nullptr)
304 core = localityOrder[index];
311 const bool ret = affinityAccess->setCurrentThreadAffinityToCore(core);
313 /* store the per-thread success-values of the setaffinity */
314 nth_affinity_set += (ret ? 1 : 0);
319 "On rank %2d, thread %2d, index %2d, core %2d the affinity setting "
321 cr->nodeid, gmx_omp_get_thread_num(), index, core, ret ? 1 : 0);
324 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
327 if (nth_affinity_set > nthread_local)
332 "Looks like we have set affinity for more threads than "
333 "we have (%d > %d)!\n",
334 nth_affinity_set, nthread_local);
338 /* check & warn if some threads failed to set their affinities */
339 const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
340 if (!allAffinitiesSet)
342 char sbuf1[STRLEN], sbuf2[STRLEN];
344 /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
345 sbuf1[0] = sbuf2[0] = '\0';
346 /* Only add rank info if we have more than one rank. */
351 sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
352 # else /* GMX_LIB_MPI */
353 sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
358 if (nthread_local > 1)
360 sprintf(sbuf2, "for %d/%d thread%s ", nthread_local - nth_affinity_set, nthread_local,
361 nthread_local > 1 ? "s" : "");
364 // TODO: This output should also go through mdlog.
365 fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
367 return allAffinitiesSet;
370 void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator& physicalNodeComm,
371 int numThreadsOnThisRank,
372 int* numThreadsOnThisNode,
373 int* intraNodeThreadOffset)
375 *intraNodeThreadOffset = 0;
376 *numThreadsOnThisNode = numThreadsOnThisRank;
378 if (physicalNodeComm.size_ > 1)
380 /* We need to determine a scan of the thread counts in this
382 MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
383 /* MPI_Scan is inclusive, but here we need exclusive */
384 *intraNodeThreadOffset -= numThreadsOnThisRank;
385 /* Get the total number of threads on this physical node */
386 MPI_Allreduce(&numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM,
387 physicalNodeComm.comm_);
390 GMX_UNUSED_VALUE(physicalNodeComm);
394 /* Set CPU affinity. Can be important for performance.
395 On some systems (e.g. Cray) CPU Affinity is set by default.
396 But default assigning doesn't work (well) with only some ranks
397 having threads. This causes very low performance.
398 External tools have cumbersome syntax for setting affinity
399 in the case that only some ranks have threads.
400 Thus it is important that GROMACS sets the affinity internally
401 if only PME is using threads.
403 void gmx_set_thread_affinity(const gmx::MDLogger& mdlog,
405 const gmx_hw_opt_t* hw_opt,
406 const gmx::HardwareTopology& hwTop,
407 int numThreadsOnThisRank,
408 int numThreadsOnThisNode,
409 int intraNodeThreadOffset,
410 gmx::IThreadAffinityAccess* affinityAccess)
412 int* localityOrder = nullptr;
414 if (hw_opt->threadAffinity == ThreadAffinity::Off)
420 if (affinityAccess == nullptr)
422 affinityAccess = &g_defaultAffinityAccess;
425 /* If the tMPI thread affinity setting is not supported encourage the user
426 * to report it as it's either a bug or an exotic platform which we might
427 * want to support. */
428 if (!affinityAccess->isThreadAffinitySupported())
430 /* we know Mac OS does not support setting thread affinity, so there's
431 no point in warning the user in that case. In any other case
432 the user might be able to do something about it. */
433 #if !defined(__APPLE__)
434 GMX_LOG(mdlog.warning)
436 .appendText("NOTE: Cannot set thread affinities on the current platform.");
437 #endif /* __APPLE__ */
441 int offset = hw_opt->core_pinning_offset;
442 int core_pinning_stride = hw_opt->core_pinning_stride;
445 GMX_LOG(mdlog.warning).appendTextFormatted("Applying core pinning offset %d", offset);
448 bool affinityIsAutoAndNumThreadsIsNotAuto =
449 (hw_opt->threadAffinity == ThreadAffinity::Auto && !hw_opt->totNumThreadsIsAuto);
451 bool validLayout = get_thread_affinity_layout(
452 mdlog, cr, hwTop, numThreadsOnThisNode, affinityIsAutoAndNumThreadsIsNotAuto, offset,
453 &core_pinning_stride, &localityOrder, &issuedWarning);
454 const gmx::sfree_guard localityOrderGuard(localityOrder);
456 bool allAffinitiesSet;
459 allAffinitiesSet = set_affinity(cr, numThreadsOnThisRank, intraNodeThreadOffset, offset,
460 core_pinning_stride, localityOrder, affinityAccess);
464 // Produce the warning if any rank fails.
465 allAffinitiesSet = false;
467 if (invalidWithinSimulation(cr, !allAffinitiesSet) && !issuedWarning)
469 GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Thread affinity was not set.");
473 /* Detects and returns whether we have the default affinity mask
475 * Returns true when we can query thread affinities and CPU count is
476 * consistent and we have default affinity mask on all ranks.
478 * Should be called simultaneously by all MPI ranks.
480 static bool detectDefaultAffinityMask(const int nthreads_hw_avail)
482 bool detectedDefaultAffinityMask = true;
484 #if HAVE_SCHED_AFFINITY
485 cpu_set_t mask_current;
486 CPU_ZERO(&mask_current);
488 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
490 /* failed to query affinity mask, will just return */
493 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
495 detectedDefaultAffinityMask = false;
498 /* Before proceeding with the actual check, make sure that the number of
499 * detected CPUs is >= the CPUs in the current set.
500 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
502 if (detectedDefaultAffinityMask && nthreads_hw_avail < CPU_COUNT(&mask_current))
506 fprintf(debug, "%d hardware threads detected, but %d was returned by CPU_COUNT",
507 nthreads_hw_avail, CPU_COUNT(&mask_current));
509 detectedDefaultAffinityMask = false;
511 # endif /* CPU_COUNT */
513 if (detectedDefaultAffinityMask)
515 /* Here we check whether all bits of the affinity mask are set.
516 * Note that this mask can change while the program is executing,
517 * e.g. by the system dynamically enabling or disabling cores or
518 * by some scheduler changing the affinities. Thus we can not assume
519 * that the result is the same on all ranks within a node
520 * when running this code at different times.
522 bool allBitsAreSet = true;
523 for (int i = 0; (i < nthreads_hw_avail && i < CPU_SETSIZE); i++)
525 allBitsAreSet = allBitsAreSet && (CPU_ISSET(i, &mask_current) != 0);
529 fprintf(debug, "%s affinity mask found\n", allBitsAreSet ? "Default" : "Non-default");
533 detectedDefaultAffinityMask = false;
537 GMX_UNUSED_VALUE(nthreads_hw_avail);
538 #endif /* HAVE_SCHED_AFFINITY */
541 int mpiIsInitialized;
542 MPI_Initialized(&mpiIsInitialized);
543 if (mpiIsInitialized)
545 bool detectedDefaultAffinityMaskOnAllRanks;
546 MPI_Allreduce(&detectedDefaultAffinityMask, &detectedDefaultAffinityMaskOnAllRanks, 1,
547 MPI_C_BOOL, MPI_LAND, MPI_COMM_WORLD);
548 detectedDefaultAffinityMask = detectedDefaultAffinityMaskOnAllRanks;
552 return detectedDefaultAffinityMask;
555 /* Check the process affinity mask and if it is found to be non-zero,
556 * will honor it and disable mdrun internal affinity setting.
557 * Note that this will only work on Linux as we use a GNU feature.
559 void gmx_check_thread_affinity_set(const gmx::MDLogger& mdlog,
560 gmx_hw_opt_t* hw_opt,
561 int gmx_unused nthreads_hw_avail,
562 gmx_bool bAfterOpenmpInit)
564 GMX_RELEASE_ASSERT(hw_opt, "hw_opt must be a non-NULL pointer");
566 if (!bAfterOpenmpInit)
568 /* Check for externally set OpenMP affinity and turn off internal
569 * pinning if any is found. We need to do this check early to tell
570 * thread-MPI whether it should do pinning when spawning threads.
571 * TODO: the above no longer holds, we should move these checks later
573 if (hw_opt->threadAffinity != ThreadAffinity::Off)
576 if (!gmx_omp_check_thread_affinity(&message))
578 /* We only pin automatically with totNumThreadsIsAuto=true */
579 if (hw_opt->threadAffinity == ThreadAffinity::On || hw_opt->totNumThreadsIsAuto)
581 GMX_LOG(mdlog.warning).asParagraph().appendText(message);
584 hw_opt->threadAffinity = ThreadAffinity::Off;
589 if (!detectDefaultAffinityMask(nthreads_hw_avail))
591 if (hw_opt->threadAffinity == ThreadAffinity::Auto)
593 if (!bAfterOpenmpInit)
595 GMX_LOG(mdlog.warning)
598 "Non-default thread affinity set, disabling internal thread "
603 GMX_LOG(mdlog.warning)
606 "Non-default thread affinity set probably by the OpenMP library,\n"
607 "disabling internal thread affinity");
609 hw_opt->threadAffinity = ThreadAffinity::Off;
613 /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
614 if (bAfterOpenmpInit)
616 GMX_LOG(mdlog.warning)
618 .appendTextFormatted("Overriding thread affinity set outside %s",
619 gmx::getProgramContext().displayName());