The order of logical cores on x86 is hardware and software dependent.
The cpuid topology reports this and this information is now used.
The mdrun -pinht option is generalized for SMT to -pinstride.
The mdrun -pinoffset option is now in logical (iso phyiscal) cores.
Thread-MPI no longer sets affinity, it's now all done in one place.
The option -pin is now an enum, default auto: only on when using all
cores and when no external affinity has been set.
A big NOTE is printed with auto when no pinning is used.
Option -pin on can now override thread affinity set outside mdrun.
Fixes #1122
All thread affinity code has been moved from runner.c to
gmx_thread_affinity.c.
Updated the mdrun manual for pinning also active without OpenMP.
Change-Id: Ibf0fe5882688de80c223640502c68e6170d4d044
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_THREAD_AFFINITY_H_
+#define GMX_THREAD_AFFINITY_H_
+#include "visibility.h"
+#include "types/commrec.h"
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} /* fixes auto-indentation problems */
+#endif
+
+/* Sets the thread affinity using the requested setting stored in hw_opt.
+ * The hardware topologu is requested from hwinfo, when present.
+ */
+GMX_LIBGMX_EXPORT
+void
+gmx_set_thread_affinity(FILE *fplog,
+ const t_commrec *cr,
+ gmx_hw_opt_t *hw_opt,
+ int nthreads_pme,
+ const gmx_hw_info_t *hwinfo,
+ const t_inputrec *inputrec);
+
+/* Check the process affinity mask and if it is found to be non-zero,
+ * will honor it and disable mdrun internal affinity setting.
+ * This function should be called first before the OpenMP library gets
+ * initialized with the last argument FALSE (which will detect affinity
+ * set by external tools like taskset), and later, after the OpenMP
+ * initialization, with the last argument TRUE to detect affinity changes
+ * made by the OpenMP library.
+ *
+ * Note that this will only work on Linux as we use a GNU feature.
+ */
+GMX_LIBGMX_EXPORT
+void
+gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
+ gmx_hw_opt_t *hw_opt, int ncpus,
+ gmx_bool bAfterOpenmpInit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GMX_THREAD_AFFINITY_H_ */
#define MD_TUNEPME (1<<20)
#define MD_TESTVERLET (1<<22)
+/* The options for the domain decomposition MPI task ordering */
enum {
ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR
};
+/* The options for the thread affinity setting, default: auto */
+enum {
+ threadaffSEL, threadaffAUTO, threadaffON, threadaffOFF, threadaffNR
+};
+
typedef struct {
int nthreads_tot; /* Total number of threads requested (TMPI) */
int nthreads_tmpi; /* Number of TMPI threads requested */
int nthreads_omp; /* Number of OpenMP threads requested */
int nthreads_omp_pme; /* As nthreads_omp, but for PME only nodes */
- gmx_bool bThreadPinning; /* Pin OpenMP threads to cores? */
- gmx_bool bPinHyperthreading; /* Pin pairs of threads to physical cores */
- int core_pinning_offset; /* Physical core pinning offset */
+ int thread_affinity; /* Thread affinity switch, see enum above */
+ int core_pinning_stride; /* Logical core pinning stride */
+ int core_pinning_offset; /* Logical core pinning offset */
char *gpu_id; /* GPU id's to use, each specified as chars */
} gmx_hw_opt_t;
char *kmp_env, *gomp_env;
/* no need to worry if internal thread pinning is turned off */
- if (!hw_opt->bThreadPinning)
+ if (hw_opt->thread_affinity == threadaffOFF)
{
return;
}
/* turn off internal pinning KMP_AFFINITY != "disabled" */
if (bKmpAffinitySet && (gmx_strncasecmp(kmp_env, "disabled", 8) != 0))
{
- md_print_warn(cr, fplog, "WARNING: KMP_AFFINITY set, will turn off %s internal affinity\n"
- " setting as the two can conflict and cause performance degradation.\n"
- " To keep using the %s internal affinity setting, set the\n"
- " KMP_AFFINITY=disabled environment variable.",
+ /* TODO: with -pin auto we should only warn when using all cores */
+ md_print_warn(cr, fplog,
+ "NOTE: KMP_AFFINITY set, will turn off %s internal affinity\n"
+ " setting as the two can conflict and cause performance degradation.\n"
+ " To keep using the %s internal affinity setting, set the\n"
+ " KMP_AFFINITY=disabled environment variable.",
ShortProgram(), ShortProgram());
- hw_opt->bThreadPinning = FALSE;
+ hw_opt->thread_affinity = threadaffOFF;
}
#endif /* __INTEL_COMPILER */
/* turn off internal pinning f GOMP_CPU_AFFINITY is set & non-empty */
if (bGompCpuAffinitySet && gomp_env != NULL && gomp_env != '\0')
{
+ /* TODO: with -pin auto we should only warn when using all cores */
md_print_warn(cr, fplog,
- "WARNING: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n"
- " setting as the two can conflict and cause performance degradation.\n"
- " To keep using the %s internal affinity setting, unset the\n"
- " GOMP_CPU_AFFINITY environment variable.",
+ "NOTE: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n"
+ " setting as the two can conflict and cause performance degradation.\n"
+ " To keep using the %s internal affinity setting, unset the\n"
+ " GOMP_CPU_AFFINITY environment variable.",
ShortProgram(), ShortProgram());
- hw_opt->bThreadPinning = FALSE;
+ hw_opt->thread_affinity = threadaffOFF;
}
#endif /* __INTEL_COMPILER || __GNUC__ */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/syscall.h>
+#endif
+#include <assert.h>
+#include <stdio.h>
+#include "typedefs.h"
+#include "types/commrec.h"
+#include "types/hw_info.h"
+#include "gmx_cpuid.h"
+#include "gmx_omp.h"
+#include "gmx_omp_nthreads.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "statutil.h"
+#include "gmx_thread_affinity.h"
+
+#include "thread_mpi/threads.h"
+
+
+static int
+get_thread_affinity_layout(FILE *fplog,
+ const t_commrec *cr,
+ const gmx_hw_info_t * hwinfo,
+ int nthreads,
+ int pin_offset, int * pin_stride,
+ const int **locality_order)
+{
+ int nhwthreads,npkg,ncores,nhwthreads_per_core,rc;
+ const int * pkg_id;
+ const int * core_id;
+ const int * hwthread_id;
+
+ if (pin_offset < 0)
+ {
+ gmx_fatal(FARGS,"Negative thread pinning offset requested");
+ }
+ if (*pin_stride < 0)
+ {
+ gmx_fatal(FARGS,"Negative thread pinning stride requested");
+ }
+
+ rc = gmx_cpuid_topology(hwinfo->cpuid_info, &nhwthreads, &npkg, &ncores,
+ &nhwthreads_per_core,
+ &pkg_id, &core_id, &hwthread_id, locality_order);
+
+ if (rc != 0)
+ {
+ nhwthreads = hwinfo->nthreads_hw_avail;
+ *locality_order = NULL;
+
+ if (nhwthreads <= 0)
+ {
+ /* We don't know anything about the hardware, don't pin */
+ md_print_warn(cr, fplog,
+ "We don't know how many logical cores we have, will not pin threads");
+
+ return -1;
+ }
+ }
+
+ if (pin_offset + nthreads > nhwthreads)
+ {
+ /* We are oversubscribing, don't pin */
+ md_print_warn(NULL, fplog,
+ "More threads requested than available logical cores, will not pin threads");
+
+ return -1;
+ }
+
+ /* Check if we need to choose the pinning stride */
+ if (*pin_stride == 0)
+ {
+ if (rc == 0 && pin_offset + nthreads*nhwthreads_per_core <= nhwthreads)
+ {
+ /* Put one thread on each physical core */
+ *pin_stride = nhwthreads_per_core;
+ }
+ else
+ {
+ /* We don't know if we have SMT, and if we do, we don't know
+ * if hw threads in the same physical core are consecutive.
+ * Without SMT the pinning layout should not matter too much.
+ * so we assume a consecutive layout and maximally spread out"
+ * the threads at equal threads per core.
+ * Note that IBM is the major non-x86 case with cpuid support
+ * and probably threads are already pinned by the queuing system,
+ * so we wouldn't end up here in the first place.
+ */
+ *pin_stride = (nhwthreads - pin_offset)/nthreads;
+ }
+
+ if (fplog != NULL)
+ {
+ fprintf(fplog,"Pinning threads with a logical core stride of %d\n",
+ *pin_stride);
+ }
+ }
+ else
+ {
+ if (pin_offset + nthreads*(*pin_stride) > nhwthreads)
+ {
+ /* We are oversubscribing, don't pin */
+ md_print_warn(NULL, fplog,
+ "The requested pinning stride is too large for the available logical cores, will not pin threads");
+
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/* Set CPU affinity. Can be important for performance.
+ On some systems (e.g. Cray) CPU Affinity is set by default.
+ But default assigning doesn't work (well) with only some ranks
+ having threads. This causes very low performance.
+ External tools have cumbersome syntax for setting affinity
+ in the case that only some ranks have threads.
+ Thus it is important that GROMACS sets the affinity internally
+ if only PME is using threads.
+*/
+void
+gmx_set_thread_affinity(FILE *fplog,
+ const t_commrec *cr,
+ gmx_hw_opt_t *hw_opt,
+ int nthreads_pme,
+ const gmx_hw_info_t *hwinfo,
+ const t_inputrec *inputrec)
+{
+ int nth_affinity_set, thread_id_node, thread_id,
+ nthread_local, nthread_node, nthread_hw_max, nphyscore;
+ int offset;
+ const int *locality_order;
+ int rc;
+
+ if (hw_opt->thread_affinity == threadaffOFF)
+ {
+ /* Nothing to do */
+ return;
+ }
+
+#ifndef __APPLE__
+ /* If the tMPI thread affinity setting is not supported encourage the user
+ * to report it as it's either a bug or an exotic platform which we might
+ * want to support. */
+ if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
+ {
+ md_print_warn(NULL, fplog,
+ "Can not set thread affinities on the current platform. On NUMA systems this\n"
+ "can cause performance degradation. If you think your platform should support\n"
+ "setting affinities, contact the GROMACS developers.");
+ return;
+ }
+#endif /* __APPLE__ */
+
+ /* threads on this MPI process or TMPI thread */
+ if (cr->duty & DUTY_PP)
+ {
+ nthread_local = gmx_omp_nthreads_get(emntNonbonded);
+ }
+ else
+ {
+ nthread_local = gmx_omp_nthreads_get(emntPME);
+ }
+
+ /* map the current process to cores */
+ thread_id_node = 0;
+ nthread_node = nthread_local;
+#ifdef GMX_MPI
+ if (PAR(cr) || MULTISIM(cr))
+ {
+ /* We need to determine a scan of the thread counts in this
+ * compute node.
+ */
+ MPI_Comm comm_intra;
+
+ MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
+ &comm_intra);
+ MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
+ /* MPI_Scan is inclusive, but here we need exclusive */
+ thread_id_node -= nthread_local;
+ /* Get the total number of threads on this physical node */
+ MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
+ MPI_Comm_free(&comm_intra);
+ }
+#endif
+
+ if (hw_opt->thread_affinity == threadaffAUTO &&
+ nthread_node != hwinfo->nthreads_hw_avail)
+ {
+ if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: The number of threads is not equal to the number of (logical) cores\n"
+ " and the -pin option is set to auto: will not pin thread to cores.\n"
+ " This can lead to significant performance degradation.\n"
+ " Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
+ }
+
+ return;
+ }
+
+ offset = 0;
+ if (hw_opt->core_pinning_offset != 0)
+ {
+ offset = hw_opt->core_pinning_offset;
+ md_print_info(cr,fplog,"Applying core pinning offset %d\n", offset);
+ }
+
+ rc = get_thread_affinity_layout(fplog, cr, hwinfo,
+ nthread_node,
+ offset, &hw_opt->core_pinning_stride,
+ &locality_order);
+ if (rc != 0)
+ {
+ /* Incompatible layout, don't pin, warning was already issued */
+ return;
+ }
+
+ /* Set the per-thread affinity. In order to be able to check the success
+ * of affinity settings, we will set nth_affinity_set to 1 on threads
+ * where the affinity setting succeded and to 0 where it failed.
+ * Reducing these 0/1 values over the threads will give the total number
+ * of threads on which we succeeded.
+ */
+ nth_affinity_set = 0;
+#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
+ reduction(+:nth_affinity_set)
+ {
+ int index,core;
+ gmx_bool setaffinity_ret;
+
+ thread_id = gmx_omp_get_thread_num();
+ thread_id_node += thread_id;
+ index = offset + thread_id_node*hw_opt->core_pinning_stride;
+ if (locality_order != NULL)
+ {
+ core = locality_order[index];
+ }
+ else
+ {
+ core = index;
+ }
+
+ setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
+
+ /* store the per-thread success-values of the setaffinity */
+ nth_affinity_set = (setaffinity_ret == 0);
+
+ if (debug)
+ {
+ fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
+ cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
+ }
+ }
+
+ if (nth_affinity_set > nthread_local)
+ {
+ char msg[STRLEN];
+
+ sprintf(msg, "Looks like we have set affinity for more threads than "
+ "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
+ gmx_incons(msg);
+ }
+ else
+ {
+ /* check & warn if some threads failed to set their affinities */
+ if (nth_affinity_set != nthread_local)
+ {
+ char sbuf1[STRLEN], sbuf2[STRLEN];
+
+ /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
+ sbuf1[0] = sbuf2[0] = '\0';
+#ifdef GMX_MPI
+#ifdef GMX_THREAD_MPI
+ sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
+#else /* GMX_LIB_MPI */
+ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
+#endif
+#endif /* GMX_MPI */
+
+ if (nthread_local > 1)
+ {
+ sprintf(sbuf2, "of %d/%d thread%s ",
+ nthread_local - nth_affinity_set, nthread_local,
+ (nthread_local - nth_affinity_set) > 1 ? "s" : "");
+ }
+
+ md_print_warn(NULL, fplog,
+ "NOTE: %sAffinity setting %sfailed.\n"
+ " This can cause performance degradation!",
+ sbuf1, sbuf2);
+ }
+ }
+}
+
+/* Check the process affinity mask and if it is found to be non-zero,
+ * will honor it and disable mdrun internal affinity setting.
+ * Note that this will only work on Linux as we use a GNU feature.
+ */
+void
+gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
+ gmx_hw_opt_t *hw_opt, int ncpus,
+ gmx_bool bAfterOpenmpInit)
+{
+#ifdef HAVE_SCHED_GETAFFINITY
+ cpu_set_t mask_current;
+ int i, ret, cpu_count, cpu_set;
+ gmx_bool bAllSet;
+
+ assert(hw_opt);
+ if (hw_opt->thread_affinity == threadaffOFF)
+ {
+ /* internal affinity setting is off, don't bother checking process affinity */
+ return;
+ }
+
+ CPU_ZERO(&mask_current);
+ if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
+ {
+ /* failed to query affinity mask, will just return */
+ if (debug)
+ {
+ fprintf(debug, "Failed to query affinity mask (error %d)", ret);
+ }
+ return;
+ }
+
+ /* Before proceeding with the actual check, make sure that the number of
+ * detected CPUs is >= the CPUs in the current set.
+ * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
+#ifdef CPU_COUNT
+ if (ncpus < CPU_COUNT(&mask_current))
+ {
+ if (debug)
+ {
+ fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
+ ncpus, CPU_COUNT(&mask_current));
+ }
+ return;
+ }
+#endif /* CPU_COUNT */
+
+ bAllSet = TRUE;
+ for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
+ {
+ bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
+ }
+
+ if (!bAllSet)
+ {
+ if (hw_opt->thread_affinity == threadaffAUTO)
+ {
+ if (!bAfterOpenmpInit)
+ {
+ md_print_warn(cr, fplog,
+ "Non-default thread affinity set, disabling internal thread affinity");
+ }
+ else
+ {
+ md_print_warn(cr, fplog,
+ "Non-default thread affinity set probably by the OpenMP library,\n"
+ "disabling internal thread affinity");
+ }
+ hw_opt->thread_affinity = threadaffOFF;
+ }
+ else
+ {
+ /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
+ if (bAfterOpenmpInit)
+ {
+ md_print_warn(cr, fplog,
+ "Overriding thread affinity set outside %s\n",
+ ShortProgram());
+ }
+ }
+
+ if (debug)
+ {
+ fprintf(debug, "Non-default affinity mask found\n");
+ }
+ }
+ else
+ {
+ if (debug)
+ {
+ fprintf(debug, "Default affinity mask found\n");
+ }
+ }
+#endif /* HAVE_SCHED_GETAFFINITY */
+}
"into particle and mesh contributions. The auto-tuning can be turned off",
"with the option [TT]-notunepme[tt].",
"[PAR]",
- "When compiled with OpenMP on Linux, [TT]mdrun[tt] pins threads to cores,",
+ "[TT]mdrun[tt] pins (sets affinity of) threads to specific cores,",
+ "when all (logical) cores on a compute node are used by [TT]mdrun[tt],",
+ "even when no multi-threading is used,",
"as this usually results in significantly better performance.",
- "If you don't want this, use [TT]-nopin[tt].",
- "With Intel CPUs with hyper-threading enabled, you should pin",
- "consecutive threads to the same physical core for optimal",
- "performance when you use virtual cores. This is done automatically",
- "when you use more than half of the virtual cores. It can also be set",
- "manually with [TT]-pinht[tt], e.g. for running multiple simulations",
- "on one compute node.",
+ "If the queuing systems or the OpenMP library pinned threads, we honor",
+ "this and don't pin again, even though the layout may be sub-optimal.",
+ "If you want to have [TT]mdrun[tt] override an already set thread affinity",
+ "or pin threads when using less cores, use [TT]-pin on[tt].",
+ "With SMT (simultaneous multithreading), e.g. Intel Hyper-Threading,",
+ "there are multiple logical cores per physical core.",
+ "The option [TT]-pinstride[tt] sets the stride in logical cores for",
+ "pinning consecutive threads. Without SMT, 1 is usually the best choice.",
+ "With Intel Hyper-Threading 2 is best when using half or less of the",
+ "logical cores, 1 otherwise. The default value of 0 do exactly that:",
+ "it minimizes the threads per logical core, to optimize performance.",
+ "If you want to run multiple mdrun jobs on the same physical node,"
+ "you should set [TT]-pinstride[tt] to 1 when using all logical cores.",
"When running multiple mdrun (or other) simulations on the same physical",
"node, some simulations need to start pinning from a non-zero core",
"to avoid overloading cores; with [TT]-pinoffset[tt] you can specify",
- "the offset in (physical) cores for pinning.",
+ "the offset in logical cores for pinning.",
"[PAR]",
"When [TT]mdrun[tt] is started using MPI with more than 1 process",
"or with thread-MPI with more than 1 thread, MPI parallelization is used.",
{ NULL, "interleave", "pp_pme", "cartesian", NULL };
const char *dddlb_opt[] =
{ NULL, "auto", "no", "yes", NULL };
+ const char *thread_aff_opt[threadaffNR+1] =
+ { NULL, "auto", "on", "off", NULL };
const char *nbpu_opt[] =
{ NULL, "auto", "cpu", "gpu", "gpu_cpu", NULL };
real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1;
output_env_t oenv=NULL;
const char *deviceOptions = "";
- gmx_hw_opt_t hw_opt={0,0,0,0,TRUE,FALSE,0,NULL};
+ gmx_hw_opt_t hw_opt={0,0,0,0,threadaffSEL,0,0,NULL};
t_pargs pa[] = {
"Number of OpenMP threads per MPI process/thread to start (0 is guess)" },
{ "-ntomp_pme", FALSE, etINT, {&hw_opt.nthreads_omp_pme},
"Number of OpenMP threads per MPI process/thread to start (0 is -ntomp)" },
- { "-pin", FALSE, etBOOL, {&hw_opt.bThreadPinning},
- "Pin OpenMP threads to cores" },
- { "-pinht", FALSE, etBOOL, {&hw_opt.bPinHyperthreading},
- "Always pin threads to Hyper-Threading cores" },
+ { "-pin", FALSE, etENUM, {thread_aff_opt},
+ "Fix threads (or processes) to specific cores" },
{ "-pinoffset", FALSE, etINT, {&hw_opt.core_pinning_offset},
- "Core offset for pinning (for running multiple mdrun processes on a single physical node)" },
+ "The starting logical core number for pinning to cores; used to avoid pinning threads from different mdrun instances to the same core" },
+ { "-pinstride", FALSE, etINT, {&hw_opt.core_pinning_stride},
+ "Pinning distance in logical cores for threads, use 0 to minimize the number of threads per physical core" },
{ "-gpu_id", FALSE, etSTR, {&hw_opt.gpu_id},
"List of GPU id's to use" },
{ "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
asize(desc),desc,0,NULL, &oenv);
-
/* we set these early because they might be used in init_multisystem()
Note that there is the potential for npme>nnodes until the number of
threads is set later on, if there's thread parallelization. That shouldn't
dd_node_order = nenum(ddno_opt);
cr->npmenodes = npme;
+ hw_opt.thread_affinity = nenum(thread_aff_opt);
+
/* now check the -multi and -multidir option */
if (opt2bSet("-multidir", NFILE, fnm))
{
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
-#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
-#define _GNU_SOURCE
-#include <sched.h>
-#include <sys/syscall.h>
-#endif
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include "gmx_fatal_collective.h"
#include "membed.h"
#include "gmx_omp.h"
-
-#include "thread_mpi/threads.h"
+#include "gmx_thread_affinity.h"
#ifdef GMX_LIB_MPI
#include <mpi.h>
mda->Flags=Flags;
/* now spawn new threads that start mdrunner_start_fn(), while
- the main thread returns */
- ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
- (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
+ the main thread returns, we set thread affinity later */
+ ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
mdrunner_start_fn, (void*)(mda) );
if (ret!=TMPI_SUCCESS)
return NULL;
gmx_mtop_remove_chargegroups(mtop);
}
-/* Check the process affinity mask. If it is non-zero, something
- * else has set the affinity, and mdrun should honor that and
- * not attempt to do its own thread pinning.
- *
- * This function should be called twice. Once before the OpenMP
- * library gets initialized with bAfterOpenMPInit=FALSE (which will
- * detect affinity set by external tools like taskset), and again
- * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
- * (which will detect affinity changes made by the OpenMP library).
- *
- * Note that this will only work on Linux, because we use a GNU
- * feature. */
-static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
- gmx_hw_opt_t *hw_opt, int ncpus,
- gmx_bool bAfterOpenmpInit)
-{
-#ifdef HAVE_SCHED_GETAFFINITY
- cpu_set_t mask_current;
- int i, ret, cpu_count, cpu_set;
- gmx_bool bAllSet;
-
- assert(hw_opt);
- if (!hw_opt->bThreadPinning)
- {
- /* internal affinity setting is off, don't bother checking process affinity */
- return;
- }
-
- CPU_ZERO(&mask_current);
- if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
- {
- /* failed to query affinity mask, will just return */
- if (debug)
- {
- fprintf(debug, "Failed to query affinity mask (error %d)", ret);
- }
- return;
- }
-
- /* Before proceeding with the actual check, make sure that the number of
- * detected CPUs is >= the CPUs in the current set.
- * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
-#ifdef CPU_COUNT
- if (ncpus < CPU_COUNT(&mask_current))
- {
- if (debug)
- {
- fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
- ncpus, CPU_COUNT(&mask_current));
- }
- return;
- }
-#endif /* CPU_COUNT */
-
- bAllSet = TRUE;
- for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
- {
- bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
- }
-
- if (!bAllSet)
- {
- if (!bAfterOpenmpInit)
- {
- md_print_warn(cr, fplog,
- "%s detected a non-default process affinity, "
- "so it will not attempt to pin its threads", ShortProgram());
- }
- else
- {
- md_print_warn(cr, fplog,
- "%s detected a non-default process affinity, "
- "probably set by the OpenMP library, "
- "so it will not attempt to pin its threads", ShortProgram());
- }
- hw_opt->bThreadPinning = FALSE;
-
- if (debug)
- {
- fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
- }
- }
- else
- {
- if (debug)
- {
- fprintf(debug, "Default affinity mask found\n");
- }
- }
-#endif /* HAVE_SCHED_GETAFFINITY */
-}
-
-/* Set CPU affinity. Can be important for performance.
- On some systems (e.g. Cray) CPU Affinity is set by default.
- But default assigning doesn't work (well) with only some ranks
- having threads. This causes very low performance.
- External tools have cumbersome syntax for setting affinity
- in the case that only some ranks have threads.
- Thus it is important that GROMACS sets the affinity internally
- if only PME is using threads.
-*/
-static void set_cpu_affinity(FILE *fplog,
- const t_commrec *cr,
- gmx_hw_opt_t *hw_opt,
- int nthreads_pme,
- const gmx_hw_info_t *hwinfo,
- const t_inputrec *inputrec)
-{
-#if defined GMX_THREAD_MPI
- /* With the number of TMPI threads equal to the number of cores
- * we already pinned in thread-MPI, so don't pin again here.
- */
- if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
- {
- return;
- }
-#endif
-
-#ifndef __APPLE__
- /* If the tMPI thread affinity setting is not supported encourage the user
- * to report it as it's either a bug or an exotic platform which we might
- * want to support. */
- if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
- {
- md_print_warn(NULL, fplog,
- "Can not set thread affinities on the current plarform. On NUMA systems this\n"
- "can cause performance degradation. If you think your platform should support\n"
- "setting affinities, contact the GROMACS developers.");
- return;
- }
-#endif /* __APPLE__ */
-
- if (hw_opt->bThreadPinning)
- {
- int nth_affinity_set, thread_id_node, thread_id,
- nthread_local, nthread_node, nthread_hw_max, nphyscore;
- int offset;
- char *env;
-
- /* threads on this MPI process or TMPI thread */
- if (cr->duty & DUTY_PP)
- {
- nthread_local = gmx_omp_nthreads_get(emntNonbonded);
- }
- else
- {
- nthread_local = gmx_omp_nthreads_get(emntPME);
- }
-
- /* map the current process to cores */
- thread_id_node = 0;
- nthread_node = nthread_local;
-#ifdef GMX_MPI
- if (PAR(cr) || MULTISIM(cr))
- {
- /* We need to determine a scan of the thread counts in this
- * compute node.
- */
- MPI_Comm comm_intra;
-
- MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
- &comm_intra);
- MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
- /* MPI_Scan is inclusive, but here we need exclusive */
- thread_id_node -= nthread_local;
- /* Get the total number of threads on this physical node */
- MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
- MPI_Comm_free(&comm_intra);
- }
-#endif
-
- offset = 0;
- if (hw_opt->core_pinning_offset > 0)
- {
- offset = hw_opt->core_pinning_offset;
- if (SIMMASTER(cr))
- {
- fprintf(stderr, "Applying core pinning offset %d\n", offset);
- }
- if (fplog)
- {
- fprintf(fplog, "Applying core pinning offset %d\n", offset);
- }
- }
-
- /* With Intel Hyper-Threading enabled, we want to pin consecutive
- * threads to physical cores when using more threads than physical
- * cores or when the user requests so.
- */
- nthread_hw_max = hwinfo->nthreads_hw_avail;
- nphyscore = -1;
- if (hw_opt->bPinHyperthreading ||
- (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
- nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
- {
- if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
- {
- /* We print to stderr on all processes, as we might have
- * different settings on different physical nodes.
- */
- if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
- {
- md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
- "but non-Intel CPU detected (vendor: %s)\n",
- gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
- }
- else
- {
- md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
- "but the CPU detected does not have Intel Hyper-Threading support "
- "(or it is turned off)\n");
- }
- }
- nphyscore = nthread_hw_max/2;
-
- if (SIMMASTER(cr))
- {
- fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
- nphyscore);
- }
- if (fplog)
- {
- fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
- nphyscore);
- }
- }
-
- /* Set the per-thread affinity. In order to be able to check the success
- * of affinity settings, we will set nth_affinity_set to 1 on threads
- * where the affinity setting succeded and to 0 where it failed.
- * Reducing these 0/1 values over the threads will give the total number
- * of threads on which we succeeded.
- */
- nth_affinity_set = 0;
-#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
- reduction(+:nth_affinity_set)
- {
- int core;
- gmx_bool setaffinity_ret;
-
- thread_id = gmx_omp_get_thread_num();
- thread_id_node += thread_id;
- if (nphyscore <= 0)
- {
- core = offset + thread_id_node;
- }
- else
- {
- /* Lock pairs of threads to the same hyperthreaded core */
- core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
- }
-
- setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
-
- /* store the per-thread success-values of the setaffinity */
- nth_affinity_set = (setaffinity_ret == 0);
-
- if (debug)
- {
- fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
- cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
- }
- }
-
- if (nth_affinity_set > nthread_local)
- {
- char msg[STRLEN];
-
- sprintf(msg, "Looks like we have set affinity for more threads than "
- "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
- gmx_incons(msg);
- }
- else
- {
- /* check & warn if some threads failed to set their affinities */
- if (nth_affinity_set != nthread_local)
- {
- char sbuf1[STRLEN], sbuf2[STRLEN];
-
- /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
- sbuf1[0] = sbuf2[0] = '\0';
-#ifdef GMX_MPI
-#ifdef GMX_THREAD_MPI
- sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
-#else /* GMX_LIB_MPI */
- sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
-#endif
-#endif /* GMX_MPI */
-
- if (nthread_local > 1)
- {
- sprintf(sbuf2, "of %d/%d thread%s ",
- nthread_local - nth_affinity_set, nthread_local,
- (nthread_local - nth_affinity_set) > 1 ? "s" : "");
- }
-
- md_print_warn(NULL, fplog,
- "NOTE: %sAffinity setting %sfailed.\n"
- " This can cause performance degradation!",
- sbuf1, sbuf2);
- }
- }
- }
-}
-
-
static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
int cutoff_scheme,
gmx_bool bIsSimMaster)
/* Check for externally set OpenMP affinity and turn off internal
* pinning if any is found. We need to do this check early to tell
* thread-MPI whether it should do pinning when spawning threads.
+ * TODO: the above no longer holds, we should move these checks down
*/
gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
* MPI processes because hwinfo is not available everywhere, but with
* thread-MPI it's needed as pinning might get turned off which needs
* to be known before starting thread-MPI. */
- check_cpu_affinity_set(fplog,
- NULL,
- hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+ gmx_check_thread_affinity_set(fplog,
+ NULL,
+ hw_opt, hwinfo->nthreads_hw_avail, FALSE);
#endif
#ifdef GMX_THREAD_MPI
}
/* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
- check_cpu_affinity_set(fplog, cr,
- hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+ gmx_check_thread_affinity_set(fplog, cr,
+ hw_opt, hwinfo->nthreads_hw_avail, FALSE);
#endif
/* now make sure the state is initialized and propagated */
snew(pmedata,1);
}
- /* Before setting affinity, check whether the affinity has changed
- * - which indicates that probably the OpenMP library has changed it since
- * we first checked). */
- check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+ if (hw_opt->thread_affinity != threadaffOFF)
+ {
+ /* Before setting affinity, check whether the affinity has changed
+ * - which indicates that probably the OpenMP library has changed it
+ * since we first checked).
+ */
+ gmx_check_thread_affinity_set(fplog, cr,
+ hw_opt, hwinfo->nthreads_hw_avail, TRUE);
- /* Set the CPU affinity */
- set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
+ /* Set the CPU affinity */
+ gmx_set_thread_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
+ }
/* Initiate PME if necessary,
* either on all nodes or on dedicated PME nodes only. */