* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \internal \file
+ * \brief Defines utility functionality for dividing resources and
+ * checking for consistency and usefulness.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_taskassignment
+ */
#include "gmxpre.h"
-#include "resource-division.h"
+#include "resourcedivision.h"
#include "config.h"
#include <algorithm>
-#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/ewald/pme.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
-#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/hw_info.h"
+#include "gromacs/math/functions.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
+#include "gromacs/utility/baseversion.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
* and after a switch point doesn't change too much.
*/
+//! Constant used to help minimize preprocessed code
static const bool bHasOmpSupport = GMX_OPENMP;
-#if GMX_THREAD_MPI
-/* The minimum number of atoms per tMPI thread. With fewer atoms than this,
- * the number of threads will get lowered.
+/*! \brief The minimum number of atoms per thread-MPI thread. With
+ * fewer atoms than this, the number of thread-MPI ranks will get
+ * lowered.
*/
static const int min_atoms_per_mpi_thread = 90;
+/*! \brief The minimum number of atoms per GPU with thread-MPI
+ * active. With fewer atoms than this, the number of thread-MPI ranks
+ * will get lowered.
+ */
static const int min_atoms_per_gpu = 900;
-#endif /* GMX_THREAD_MPI */
+
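+/* As a hedged illustration of the two thresholds above (documentation
+ * only, not extra logic): a 1500-atom system offered two GPUs would
+ * leave only 750 atoms per rank, below min_atoms_per_gpu, so the
+ * automatic setup would fall back to one thread-MPI rank and leave a
+ * GPU unused.
+ */
+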
+/**@{*/
+/*! \brief Constants for implementing default divisions of threads */
/* TODO choose nthreads_omp based on hardware topology
when we have a hardware topology detection library */
const int nthreads_omp_faster_default = 8;
const int nthreads_omp_faster_Nehalem = 12;
const int nthreads_omp_faster_Intel_AVX = 16;
+const int nthreads_omp_faster_AMD_Ryzen = 16;
/* For CPU only runs the fastest options are usually MPI or OpenMP only.
* With one GPU, using MPI only is almost never optimal, so we need to
* compare running pure OpenMP with combined MPI+OpenMP. This means higher
const int nthreads_omp_mpi_ok_min_gpu = 2;
const int nthreads_omp_mpi_target_max = 6;
+/**@}*/
-/* Returns the maximum OpenMP thread count for which using a single MPI rank
+/*! \brief Returns the maximum OpenMP thread count for which using a single MPI rank
* should be faster than using multiple ranks with the same total thread count.
*/
static int nthreads_omp_faster(const gmx::CpuInfo &cpuInfo, gmx_bool bUseGPU)
// Intel Nehalem
nth = nthreads_omp_faster_Nehalem;
}
+ else if (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() >= 23)
+ {
+ // AMD Ryzen
+ nth = nthreads_omp_faster_AMD_Ryzen;
+ }
else
{
nth = nthreads_omp_faster_default;
return nth;
}
-/* Returns that maximum OpenMP thread count that passes the efficiency check */
-static int nthreads_omp_efficient_max(int gmx_unused nrank,
- const gmx::CpuInfo &cpuInfo,
- gmx_bool bUseGPU)
+/*! \brief Returns the maximum OpenMP thread count that passes the efficiency check */
+gmx_unused static int nthreads_omp_efficient_max(int gmx_unused nrank,
+ const gmx::CpuInfo &cpuInfo,
+ gmx_bool bUseGPU)
{
#if GMX_OPENMP && GMX_MPI
if (nrank > 1)
}
}
-/* Return the number of thread-MPI ranks to use.
+/*! \brief Return the number of thread-MPI ranks to use.
* This is chosen such that we can always obey our own efficiency checks.
*/
-static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
- const gmx_hw_opt_t *hw_opt,
- int nthreads_tot,
- int ngpu)
+gmx_unused static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t &hw_opt,
+ int nthreads_tot,
+ int ngpu)
{
int nrank;
const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
*/
if (ngpu > 0)
{
- if (hw_opt->nthreads_omp > 0)
+ if (hw_opt.nthreads_omp > 0)
{
/* In this case it is unclear if we should use 1 rank per GPU
* or more or less, so we require also setting the number of ranks.
* If the user does not set the number of OpenMP threads, nthreads_omp==0 and
* this code has no effect.
*/
- GMX_RELEASE_ASSERT(hw_opt->nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
- while (nrank*hw_opt->nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
+ GMX_RELEASE_ASSERT(hw_opt.nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
+ while (nrank*hw_opt.nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
{
nrank--;
}
/* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
nrank = nthreads_tot;
}
- else if (gmx_gpu_sharing_supported() &&
- (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
- (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
+ else if (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
+ (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max))
{
/* The high OpenMP thread count will likely result in sub-optimal
* performance. Increase the rank count to reduce the thread count
(nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
}
}
- else if (hw_opt->nthreads_omp > 0)
+ else if (hw_opt.nthreads_omp > 0)
{
/* Here we could oversubscribe, when we do, we issue a warning later */
- nrank = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
+ nrank = std::max(1, nthreads_tot/hw_opt.nthreads_omp);
}
else
{
return nrank;
}
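+/* Hedged example of the division above, assuming the elided branches
+ * follow their comments: 32 total threads with 2 GPUs and nthreads_omp
+ * unset gives 32/2 = 16 threads per GPU, above
+ * nthreads_omp_mpi_target_max (6), so the rank count is raised until
+ * each rank's thread count is back in the efficient range.
+ */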
-
-static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo,
- int cutoff_scheme, gmx_bool bUseGpu)
-{
- /* This code relies on the fact that GPU are not detected when GPU
- * acceleration was disabled at run time by the user.
- */
- if (cutoff_scheme == ecutsVERLET &&
- bUseGpu &&
- hwinfo->gpu_info.n_dev_compatible > 0)
- {
- if (gmx_multiple_gpu_per_node_supported())
- {
- return hwinfo->gpu_info.n_dev_compatible;
- }
- else
- {
- if (hwinfo->gpu_info.n_dev_compatible > 1)
- {
- md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
- }
- return 1;
- }
- }
- else
- {
- return 0;
- }
-}
-
-
-#if GMX_THREAD_MPI
-
+//! Return whether hyper-threading (SMT) is enabled.
static bool
gmxSmtIsEnabled(const gmx::HardwareTopology &hwTop)
{
namespace
{
+//! Handles checks for algorithms that must use a single rank.
class SingleRankChecker
{
public:
* Thus all options should be internally consistent and consistent
* with the hardware, except that ntmpi could be larger than #GPU.
*/
-int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
- gmx_hw_opt_t *hw_opt,
- const t_inputrec *inputrec,
- const gmx_mtop_t *mtop,
- const t_commrec *cr,
- FILE *fplog,
- gmx_bool bUseGpu,
- bool doMembed)
+int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
+ gmx_hw_opt_t *hw_opt,
+ const std::vector<int> &gpuIdsToUse,
+ bool nonbondedOnGpu,
+ bool pmeOnGpu,
+ const t_inputrec *inputrec,
+ const gmx_mtop_t *mtop,
+ const gmx::MDLogger &mdlog,
+ bool doMembed)
{
int nthreads_hw, nthreads_tot_max, nrank, ngpu;
int min_atoms_per_mpi_rank;
const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
const gmx::HardwareTopology &hwTop = *hwinfo->hardwareTopology;
+ if (pmeOnGpu)
+ {
+ GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) &&
+ pme_gpu_supports_input(inputrec, nullptr),
+ "PME can't be on GPUs unless we are using PME");
+
+    // PME on GPUs supports a single PME rank with PP running on the same or a few other ranks.
+    // For now, let's treat a separate PME GPU rank as opt-in.
+ if (hw_opt->nthreads_tmpi < 1)
+ {
+ return 1;
+ }
+ }
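+    // Hence, without an explicit -ntmpi value, PME on a GPU always runs
+    // on a single rank that also does the PP work; multi-rank setups
+    // are reached only when the user requests a rank count.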
+
{
/* Check if an algorithm does not support parallel simulation. */
// TODO This might work better if e.g. implemented algorithms
{
gmx_fatal(FARGS, "%s However, you asked for more than 1 thread-MPI rank, so mdrun cannot continue. Choose a single rank, or a different algorithm.", message.c_str());
}
- md_print_warn(cr, fplog, "%s Choosing to use only a single thread-MPI rank.", message.c_str());
+ GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("%s Choosing to use only a single thread-MPI rank.", message.c_str());
return 1;
}
}
if (hw_opt->nthreads_tmpi > 0)
{
- /* Trivial, return right away */
+ /* Trivial, return the user's choice right away */
return hw_opt->nthreads_tmpi;
}
+ // Now implement automatic selection of number of thread-MPI ranks
nthreads_hw = hwinfo->nthreads_hw_avail;
if (nthreads_hw <= 0)
nthreads_tot_max = nthreads_hw;
}
- ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme, bUseGpu);
+ /* nonbondedOnGpu might be false e.g. because this simulation uses
+ * the group scheme, or is a rerun with energy groups. */
+ ngpu = (nonbondedOnGpu ? static_cast<int>(gpuIdsToUse.size()) : 0);
if (inputrec->cutoff_scheme == ecutsGROUP)
{
}
nrank =
- get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
+ get_tmpi_omp_thread_division(hwinfo, *hw_opt, nthreads_tot_max, ngpu);
if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
{
return nrank;
}
-#endif /* GMX_THREAD_MPI */
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
- const gmx_hw_opt_t *hw_opt,
+ bool willUsePhysicalGpu,
gmx_bool bNtOmpOptionSet,
t_commrec *cr,
- FILE *fplog)
+ const gmx::MDLogger &mdlog)
{
+ GMX_UNUSED_VALUE(hwinfo);
#if GMX_OPENMP && GMX_MPI
- int nth_omp_min, nth_omp_max, ngpu;
+ int nth_omp_min, nth_omp_max;
char buf[1000];
#if GMX_THREAD_MPI
const char *mpi_option = " (option -ntmpi)";
*/
#if GMX_THREAD_MPI
GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
- GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
#endif
GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");
nth_omp_min = gmx_omp_nthreads_get(emntDefault);
nth_omp_max = gmx_omp_nthreads_get(emntDefault);
- ngpu = hw_opt->gpu_opt.n_dev_use;
+ bool anyRankIsUsingGpus = willUsePhysicalGpu;
/* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
if (cr->nnodes + cr->npmenodes > 1)
{
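+        /* Note: storing -nth_omp_min lets the single MPI_MAX reduction
+         * below recover both the minimum and the maximum OpenMP thread
+         * count present across the ranks. */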
count[0] = -nth_omp_min;
count[1] = nth_omp_max;
- count[2] = ngpu;
+ count[2] = willUsePhysicalGpu;
MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);
/* In case of an inhomogeneous run setup we use the maximum counts */
- nth_omp_min = -count_max[0];
- nth_omp_max = count_max[1];
- ngpu = count_max[2];
+ nth_omp_min = -count_max[0];
+ nth_omp_max = count_max[1];
+ anyRankIsUsingGpus = count_max[2] > 0;
}
int nthreads_omp_mpi_ok_min;
- if (ngpu == 0)
+ if (!anyRankIsUsingGpus)
{
nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
}
if (DOMAINDECOMP(cr) && cr->nnodes > 1)
{
if (nth_omp_max < nthreads_omp_mpi_ok_min ||
- (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
- nth_omp_max > nthreads_omp_mpi_ok_max))
+ nth_omp_max > nthreads_omp_mpi_ok_max)
{
/* Note that we print target_max here, not ok_max */
sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
if (bNtOmpOptionSet)
{
- md_print_warn(cr, fplog, "NOTE: %s\n", buf);
+ GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted("NOTE: %s", buf);
}
else
{
}
}
}
- else
- {
- const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;
-
- /* No domain decomposition (or only one domain) */
- if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
- nth_omp_max > nthreads_omp_faster(cpuInfo, ngpu > 0))
- {
- /* To arrive here, the user/system set #ranks and/or #OMPthreads */
- gmx_bool bEnvSet;
- char buf2[256];
-
- bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);
-
- if (bNtOmpOptionSet || bEnvSet)
- {
- sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
- }
- else
- {
- sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
- cr->nnodes + cr->npmenodes,
- cr->nnodes + cr->npmenodes == 1 ? "" : "s",
- hw_opt->nthreads_tot > 0 ? hw_opt->nthreads_tot : hwinfo->nthreads_hw_avail,
- hwinfo->nphysicalnode > 1 ? "on a node " : "",
- nth_omp_max);
- }
- sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
- buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);
-
- /* We can not quit with a fatal error when OMP_NUM_THREADS is set
- * with different values per rank or node, since in that case
- * the user can not set -ntomp to override the error.
- */
- if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
- {
- md_print_warn(cr, fplog, "NOTE: %s\n", buf);
- }
- else
- {
- gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
- }
- }
- }
#else /* GMX_OPENMP && GMX_MPI */
/* No OpenMP and/or MPI: it doesn't make much sense to check */
- GMX_UNUSED_VALUE(hw_opt);
GMX_UNUSED_VALUE(bNtOmpOptionSet);
+ GMX_UNUSED_VALUE(willUsePhysicalGpu);
+ GMX_UNUSED_VALUE(cr);
/* Check if we have more than 1 physical core, if detected,
* or more than 1 hardware thread if physical cores were not detected.
*/
-#if !GMX_OPENMP && !GMX_MPI
- if (hwinfo->hardwareTopology->numberOfCores() > 1)
+ if (!GMX_OPENMP && !GMX_MPI && hwinfo->hardwareTopology->numberOfCores() > 1)
{
- md_print_warn(cr, fplog, "NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core\n");
+ GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core");
}
-#else
- GMX_UNUSED_VALUE(hwinfo);
- GMX_UNUSED_VALUE(cr);
- GMX_UNUSED_VALUE(fplog);
-#endif
-
#endif /* GMX_OPENMP && GMX_MPI */
}
+//! Dump a \c hw_opt to \c fp.
static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
{
- fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s' gputasks '%s'\n",
hw_opt->nthreads_tot,
hw_opt->nthreads_tmpi,
hw_opt->nthreads_omp,
hw_opt->nthreads_omp_pme,
- hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
+ hw_opt->gpuIdsAvailable.c_str(),
+ hw_opt->userGpuTaskAssignment.c_str());
}
-/* Checks we can do when we don't (yet) know the cut-off scheme */
void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
const t_commrec *cr,
int nPmeRanks)
}
#endif
+ /* With thread-MPI the master thread sets hw_opt->totNumThreadsIsAuto.
+ * The other threads receive a partially processed hw_opt from the master
+ * thread and should not set hw_opt->totNumThreadsIsAuto again.
+ */
+ if (!GMX_THREAD_MPI || SIMMASTER(cr))
+ {
+ /* Check if mdrun is free to choose the total number of threads */
+ hw_opt->totNumThreadsIsAuto = (hw_opt->nthreads_omp == 0 && hw_opt->nthreads_omp_pme == 0 && hw_opt->nthreads_tot == 0);
+ }
+
if (bHasOmpSupport)
{
/* Check restrictions on PME thread related options set by the user */
}
}
- if (hw_opt->nthreads_tot == 1)
+ if (hw_opt->nthreads_tot > 0)
{
- hw_opt->nthreads_tmpi = 1;
-
- if (hw_opt->nthreads_omp > 1)
+ if (hw_opt->nthreads_omp > hw_opt->nthreads_tot)
{
- gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
- hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
+ gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads. Choose a total number of threads that is a multiple of the number of OpenMP threads.",
+ hw_opt->nthreads_omp, hw_opt->nthreads_tot);
}
- hw_opt->nthreads_omp = 1;
- hw_opt->nthreads_omp_pme = 1;
- }
- /* Parse GPU IDs, if provided.
- * We check consistency with the tMPI thread count later.
- */
- gmx_parse_gpu_ids(&hw_opt->gpu_opt);
-
-#if GMX_THREAD_MPI
- if (hw_opt->gpu_opt.n_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
- {
- /* Set the number of MPI threads equal to the number of GPUs */
- hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;
-
- if (hw_opt->nthreads_tot > 0 &&
- hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
+ if (hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
{
- /* We have more GPUs than total threads requested.
- * We choose to (later) generate a mismatch error,
- * instead of launching more threads than requested.
- */
- hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
+ gmx_fatal(FARGS, "You requested %d thread-MPI ranks with %d total threads. Choose a total number of threads that is a multiple of the number of thread-MPI ranks.",
+ hw_opt->nthreads_tmpi, hw_opt->nthreads_tot);
}
}
-#endif
if (debug)
{
"PME thread count should only be set when the normal thread count is also set");
}
-/* Checks we can do when we know the cut-off scheme */
void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
int cutoff_scheme)
{
}
}
-/* Checks we can do when we know the thread-MPI rank count */
-void check_and_update_hw_opt_3(gmx_hw_opt_t gmx_unused *hw_opt)
+void checkAndUpdateRequestedNumOpenmpThreads(gmx_hw_opt_t *hw_opt,
+ const gmx_hw_info_t &hwinfo,
+ const t_commrec *cr,
+ PmeRunMode pmeRunMode,
+ const gmx_mtop_t &mtop)
{
#if GMX_THREAD_MPI
GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
}
#endif
+    /* With both non-bonded and PME on GPU, the work left on the CPU is often
+     * (much) slower with SMT than without SMT. This is mostly the case with
+     * few atoms per core. Thus, if the number of threads is set to auto,
+     * we turn off SMT in that case. Note that PME on GPU implies that the
+     * non-bonded interactions are also computed on the GPU.
+     * We only need to do this when the number of hardware threads is larger
+     * than the number of cores. Note that a queuing system could limit
+     * the number of hardware threads available, but we are not trying to be
+     * too clever about that case here.
+     */
+    /* The thread reduction and synchronization costs go up roughly quadratically
+     * with the thread count, so we apply a threshold quadratic in #cores.
+     * Also, more cores per GPU usually means the CPU gets relatively faster
+     * than the GPU. A threshold of 1000 atoms per core^2 is reasonable
+     * for Intel x86 and AMD Threadripper.
+     */
+ constexpr int c_numAtomsPerCoreSquaredSmtThreshold = 1000;
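+    /* Worked illustration of this threshold (documentation only): with
+     * 8 cores per rank the cutoff is 1000*8^2 = 64000 atoms, so a
+     * 24000-atom single-rank run on an 8-core/16-thread CPU would get
+     * SMT disabled and use 8 OpenMP threads instead of 16. */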
+
+ /* Prepare conditions for deciding if we should disable SMT.
+ * We currently only limit SMT for simulations using a single rank.
+ * TODO: Consider limiting also for multi-rank simulations.
+ */
+ bool canChooseNumOpenmpThreads = (bHasOmpSupport && hw_opt->nthreads_omp <= 0);
+ bool haveSmtSupport = (hwinfo.hardwareTopology->supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic &&
+ hwinfo.hardwareTopology->machine().logicalProcessorCount > hwinfo.hardwareTopology->numberOfCores());
+ bool simRunsSingleRankNBAndPmeOnGpu = (cr->nnodes == 1 && pmeRunMode == PmeRunMode::GPU);
+
+ if (canChooseNumOpenmpThreads && haveSmtSupport &&
+ simRunsSingleRankNBAndPmeOnGpu)
+ {
+        /* Note that the queuing system might have limited us from using
+ * all detected ncore_tot physical cores. We are currently not
+ * checking for that here.
+ */
+ int numRanksTot = cr->nnodes*(MULTISIM(cr) ? cr->ms->nsim : 1);
+ int numAtomsPerRank = mtop.natoms/cr->nnodes;
+ int numCoresPerRank = hwinfo.ncore_tot/numRanksTot;
+ if (numAtomsPerRank < c_numAtomsPerCoreSquaredSmtThreshold*gmx::square(numCoresPerRank))
+ {
+ int numRanksInThisNode = (cr ? cr->nrank_intranode : 1);
+ /* Choose one OpenMP thread per physical core */
+ hw_opt->nthreads_omp = std::max(1, hwinfo.hardwareTopology->numberOfCores()/numRanksInThisNode);
+ }
+ }
+
GMX_RELEASE_ASSERT(bHasOmpSupport || hw_opt->nthreads_omp == 1, "Without OpenMP support, only one thread per rank can be used");
/* We are done with updating nthreads_omp, we can set nthreads_omp_pme */
print_hw_opt(debug, hw_opt);
}
}
+
+void checkHardwareOversubscription(int numThreadsOnThisRank,
+ const gmx::HardwareTopology &hwTop,
+ const t_commrec *cr,
+ const gmx::MDLogger &mdlog)
+{
+ if (hwTop.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
+ {
+ /* There is nothing we can check */
+ return;
+ }
+
+ int numRanksOnThisNode = 1;
+ int numThreadsOnThisNode = numThreadsOnThisRank;
+#if GMX_MPI
+ if (PAR(cr) || MULTISIM(cr))
+ {
+ /* Count the threads within this physical node */
+ MPI_Comm_size(cr->mpi_comm_physicalnode, &numRanksOnThisNode);
+ MPI_Allreduce(&numThreadsOnThisRank, &numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, cr->mpi_comm_physicalnode);
+ }
+#endif
+
+ if (numThreadsOnThisNode > hwTop.machine().logicalProcessorCount)
+ {
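+        /* The warning is assembled piecewise so that the two openings,
+         * "WARNING: On rank N: oversubscribing ..." with a real MPI
+         * library and "WARNING: Oversubscribing ..." without one, can
+         * share the remainder of the text. */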
+ std::string mesg = "WARNING: ";
+ if (GMX_LIB_MPI)
+ {
+ mesg += gmx::formatString("On rank %d: o", cr->sim_nodeid);
+ }
+ else
+ {
+ mesg += "O";
+ }
+ mesg += gmx::formatString("versubscribing the available %d logical CPU cores", hwTop.machine().logicalProcessorCount);
+ if (GMX_LIB_MPI)
+ {
+ mesg += " per node";
+ }
+ mesg += gmx::formatString(" with %d ", numThreadsOnThisNode);
+ if (numRanksOnThisNode == numThreadsOnThisNode)
+ {
+ if (GMX_THREAD_MPI)
+ {
+ mesg += "thread-MPI threads.";
+ }
+ else
+ {
+ mesg += "MPI processes.";
+ }
+ }
+ else
+ {
+ mesg += "threads.";
+ }
+ mesg += "\n This will cause considerable performance loss.";
+ /* Note that only the master rank logs to stderr and only ranks
+ * with an open log file write to log.
+ * TODO: When we have a proper parallel logging framework,
+ * the framework should add the rank and node numbers.
+ */
+        GMX_LOG(mdlog.warning).asParagraph().appendText(mesg);
+ }
+}
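+
+/* Hedged illustration of the check above: on a node with 16 logical
+ * cores, 4 ranks each running 8 OpenMP threads sum to 32 threads on
+ * that node and trigger the warning; 4 ranks with 4 threads each would
+ * pass silently.
+ */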