}
/* Split the PP communicator over the physical nodes */
/* TODO: See if we should store this (before), as it's also used for
- * for the nodecomm summution.
+ * the nodecomm summation.
*/
+ // TODO PhysicalNodeCommunicator could be extended/used to handle
+ // the need for per-node per-group communicators.
MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
&mpi_comm_pp_physicalnode);
MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
+#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
namespace gmx
{
//! Simple hardware initialization
static gmx_hw_info_t *hardwareInit()
{
- LoggerBuilder builder;
- LoggerOwner logOwner(builder.build());
- MDLogger log(logOwner.logger());
- return gmx_detect_hardware(log);
+ LoggerBuilder builder;
+ LoggerOwner logOwner(builder.build());
+ MDLogger log(logOwner.logger());
+ PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+ return gmx_detect_hardware(log, physicalNodeComm);
}
void PmeTestEnvironment::SetUp()
/* The source code in this file should be thread-safe.
Please keep it that way. */
-void gmx_fill_commrec_from_mpi(t_commrec *cr,
- const gmx_multisim_t *ms)
+void gmx_fill_commrec_from_mpi(t_commrec *cr)
{
#if !GMX_MPI
gmx_call("gmx_fill_commrec_from_mpi");
GMX_UNUSED_VALUE(cr);
- GMX_UNUSED_VALUE(ms);
#else
if (!gmx_mpi_initialized())
{
cr->nnodes = gmx_node_num();
cr->nodeid = gmx_node_rank();
- // TODO This communicator should be always available. Currently we
- // make it multiple times, and keep it only when relevant. But the
- // cost of an extra communicator is negligible in single-node
- // cases (both thread-MPI and real MPI) case, and we need it in
- // all multi-node MPI cases with more than one PP rank per node,
- // with and without GPUs. By always having it available, we also
- // don't need to protect calls to mpi_comm_physicalnode, etc.
- if (PAR(cr) || isMultiSim(ms))
- {
- MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(), cr->nodeid, &cr->mpi_comm_physicalnode);
- }
cr->sim_nodeid = cr->nodeid;
cr->mpi_comm_mysim = MPI_COMM_WORLD;
cr->mpi_comm_mygroup = MPI_COMM_WORLD;
-
#endif
}
snew(cr, 1);
- cr->mpi_comm_physicalnode = MPI_COMM_NULL;
#if GMX_LIB_MPI
- gmx_fill_commrec_from_mpi(cr, nullptr);
+ gmx_fill_commrec_from_mpi(cr);
#else
cr->mpi_comm_mysim = MPI_COMM_NULL;
cr->mpi_comm_mygroup = MPI_COMM_NULL;
void done_commrec(t_commrec *cr)
{
-#if GMX_MPI
- if (cr->mpi_comm_physicalnode != MPI_COMM_NULL)
- {
- MPI_Comm_free(&cr->mpi_comm_physicalnode);
- }
-#endif
if (nullptr != cr->dd)
{
// TODO: implement
sfree(cr);
}
-t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro,
- const gmx_multisim_t *ms)
+t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro)
{
#if GMX_THREAD_MPI
t_commrec *cr;
*cr = *cro;
/* and we start setting our own thread-specific values for things */
- gmx_fill_commrec_from_mpi(cr, ms);
+ gmx_fill_commrec_from_mpi(cr);
// TODO cr->duty should not be initialized here
cr->duty = (DUTY_PP | DUTY_PME);
return cr;
#else
GMX_UNUSED_VALUE(cro);
- GMX_UNUSED_VALUE(ms);
return nullptr;
#endif
}
#if GMX_MPI
int n, rank;
+ // TODO PhysicalNodeCommunicator could be extended/used to handle
+ // the need for per-node per-group communicators.
MPI_Comm_size(cr->mpi_comm_mygroup, &n);
MPI_Comm_rank(cr->mpi_comm_mygroup, &rank);
#endif
}
-void gmx_init_intranode_counters(t_commrec *cr)
-{
- /* counters for PP+PME and PP-only processes on my physical node */
- int nrank_intranode, rank_intranode;
- /* thread-MPI is not initialized when not running in parallel */
-#if GMX_MPI && !GMX_THREAD_MPI
- int nrank_world, rank_world;
- int i, myhash, *hash, *hash_s, *hash_pp, *hash_pp_s;
-
- MPI_Comm_size(MPI_COMM_WORLD, &nrank_world);
- MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
-
- /* Get a (hopefully unique) hash that identifies our physical node */
- myhash = gmx_physicalnode_id_hash();
-
- /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */
- snew(hash, nrank_world);
- snew(hash_s, nrank_world);
- snew(hash_pp, nrank_world);
- snew(hash_pp_s, nrank_world);
-
- hash_s[rank_world] = myhash;
- hash_pp_s[rank_world] = thisRankHasDuty(cr, DUTY_PP) ? myhash : -1;
-
- MPI_Allreduce(hash_s, hash, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
- MPI_Allreduce(hash_pp_s, hash_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-
- nrank_intranode = 0;
- rank_intranode = 0;
- for (i = 0; i < nrank_world; i++)
- {
- if (hash[i] == myhash)
- {
- nrank_intranode++;
- if (i < rank_world)
- {
- rank_intranode++;
- }
- }
- }
- sfree(hash);
- sfree(hash_s);
- sfree(hash_pp);
- sfree(hash_pp_s);
-#else
- /* Serial or thread-MPI code: we run within a single physical node */
- nrank_intranode = cr->nnodes;
- rank_intranode = cr->sim_nodeid;
-#endif
-
- if (debug)
- {
- char sbuf[STRLEN];
- if (thisRankHasDuty(cr, DUTY_PP) && thisRankHasDuty(cr, DUTY_PME))
- {
- sprintf(sbuf, "PP+PME");
- }
- else
- {
- sprintf(sbuf, "%s", thisRankHasDuty(cr, DUTY_PP) ? "PP" : "PME");
- }
- fprintf(debug, "On %3s rank %d: nrank_intranode=%d, rank_intranode=%d\n",
- sbuf, cr->sim_nodeid,
- nrank_intranode, rank_intranode);
- }
-
- cr->nrank_intranode = nrank_intranode;
- cr->rank_intranode = rank_intranode;
-}
-
-
void gmx_barrier(const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
#endif
}
-void gmx_barrier_physical_node(const t_commrec gmx_unused *cr)
-{
-#if !GMX_MPI
- gmx_call("gmx_barrier_physical_node");
-#else
- MPI_Barrier(cr->mpi_comm_physicalnode);
-#endif
-}
-
void gmx_bcast(int gmx_unused nbytes, void gmx_unused *b, const t_commrec gmx_unused *cr)
{
#if !GMX_MPI
void done_commrec(t_commrec *cr);
/* Free memory associated with the commrec. */
-struct t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro,
- const gmx_multisim_t *ms);
+struct t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro);
/* Initialize communication records for thread-parallel simulations.
Must be called on all threads before any communication takes place by
thread-local versions (a small memory leak results because we don't
deallocate the old shared version). */
-void gmx_fill_commrec_from_mpi(t_commrec *cr,
- const gmx_multisim_t *ms);
+void gmx_fill_commrec_from_mpi(t_commrec *cr);
/* Continues t_commrec construction */
void gmx_setup_nodecomm(FILE *fplog, struct t_commrec *cr);
/* Sets up fast global communication for clusters with multi-core nodes */
-void gmx_init_intranode_counters(struct t_commrec *cr);
-/* Initializes intra-physical-node MPI process/thread counts and ID. */
-
void gmx_barrier(const struct t_commrec *cr);
/* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
-void gmx_barrier_physical_node(const struct t_commrec *cr);
-/* Wait till all processes in cr->mpi_comm_physical_node have reached the barrier */
-
void gmx_bcast(int nbytes, void *b, const struct t_commrec *cr);
/* Broadcast nbytes bytes from the master to cr->mpi_comm_mygroup */
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
static tMPI_Thread_mutex_t hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
//! Detect GPUs, if that makes sense to attempt.
-static void gmx_detect_gpus(const gmx::MDLogger &mdlog)
+static void gmx_detect_gpus(const gmx::MDLogger &mdlog,
+ const PhysicalNodeCommunicator &physicalNodeComm)
{
-#if GMX_LIB_MPI
- int rank_world;
- MPI_Comm physicalnode_comm;
-#endif
- bool isMasterRankOfNode;
-
hwinfo_g->gpu_info.bDetectGPUs =
(bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
if (!hwinfo_g->gpu_info.bDetectGPUs)
return;
}
- /* Under certain circumstances MPI ranks on the same physical node
- * can not simultaneously access the same GPU(s). Therefore we run
- * the detection only on one MPI rank per node and broadcast the info.
- * Note that with thread-MPI only a single thread runs this code.
- *
- * NOTE: We can't broadcast gpu_info with OpenCL as the device and platform
- * ID stored in the structure are unique for each rank (even if a device
- * is shared by multiple ranks).
- *
- * TODO: We should also do CPU hardware detection only once on each
- * physical node and broadcast it, instead of do it on every MPI rank.
- */
+ bool isMasterRankOfPhysicalNode = true;
#if GMX_LIB_MPI
- /* A split of MPI_COMM_WORLD over physical nodes is only required here,
- * so we create and destroy it locally.
- */
- MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
- MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
- rank_world, &physicalnode_comm);
- {
- int rankOnNode = -1;
- MPI_Comm_rank(physicalnode_comm, &rankOnNode);
- isMasterRankOfNode = (rankOnNode == 0);
- }
+ isMasterRankOfPhysicalNode = (physicalNodeComm.rank_ == 0);
#else
// We choose to run the detection only once with thread-MPI and
// use reference counting on the results of the detection to
// enforce it. But we can assert that this is true.
GMX_RELEASE_ASSERT(n_hwinfo == 0, "Cannot run GPU detection on non-master thread-MPI ranks");
- isMasterRankOfNode = true;
+ GMX_UNUSED_VALUE(physicalNodeComm);
+ isMasterRankOfPhysicalNode = true;
#endif
/* The OpenCL support requires us to run detection on all ranks.
* and send the information to the other ranks over MPI. */
bool allRanksMustDetectGpus = (GMX_GPU == GMX_GPU_OPENCL);
bool gpusCanBeDetected = false;
- if (isMasterRankOfNode || allRanksMustDetectGpus)
+ if (isMasterRankOfPhysicalNode || allRanksMustDetectGpus)
{
std::string errorMessage;
gpusCanBeDetected = canDetectGpus(&errorMessage);
if (!allRanksMustDetectGpus)
{
/* Broadcast the GPU info to the other ranks within this node */
- MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
+ MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalNodeComm.comm_);
if (hwinfo_g->gpu_info.n_dev > 0)
{
dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
- if (!isMasterRankOfNode)
+ if (!isMasterRankOfPhysicalNode)
{
hwinfo_g->gpu_info.gpu_dev =
(struct gmx_device_info_t *)malloc(dev_size);
}
MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
- 0, physicalnode_comm);
+ 0, physicalNodeComm.comm_);
MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
- 0, physicalnode_comm);
+ 0, physicalNodeComm.comm_);
}
}
-
- MPI_Comm_free(&physicalnode_comm);
#endif
}
//! Reduce the locally collected \p hwinfo_g over MPI ranks
-static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
+static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo,
+ const PhysicalNodeCommunicator &physicalNodeComm)
{
const int ncore = hwinfo_g->hardwareTopology->numberOfCores();
/* Zen has family=23, for now we treat future AMD CPUs like Zen */
cpuInfo.family() >= 23);
#if GMX_LIB_MPI
- int rank_id;
- int nrank, rank, nhwthread, ngpu, i;
+ int nhwthread, ngpu, i;
int gpu_hash;
- int *buf, *all;
- rank_id = gmx_physicalnode_id_hash();
- MPI_Comm_rank(MPI_COMM_WORLD, &rank);
- MPI_Comm_size(MPI_COMM_WORLD, &nrank);
nhwthread = hwinfo_g->nthreads_hw_avail;
ngpu = hwinfo_g->gpu_info.n_dev_compatible;
/* Create a unique hash of the GPU type(s) in this node */
gpu_hash ^= gmx_string_fullhash_func(stmp, gmx_string_hash_init);
}
- snew(buf, nrank);
- snew(all, nrank);
- buf[rank] = rank_id;
-
- MPI_Allreduce(buf, all, nrank, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-
- gmx_bool bFound;
- int nnode0, ncore0, nhwthread0, ngpu0, r;
-
- bFound = FALSE;
- ncore0 = 0;
- nnode0 = 0;
- nhwthread0 = 0;
- ngpu0 = 0;
- for (r = 0; r < nrank; r++)
- {
- if (all[r] == rank_id)
- {
- if (!bFound && r == rank)
- {
- /* We are the first rank in this physical node */
- nnode0 = 1;
- ncore0 = ncore;
- nhwthread0 = nhwthread;
- ngpu0 = ngpu;
- }
- bFound = TRUE;
- }
- }
-
- sfree(buf);
- sfree(all);
-
constexpr int numElementsCounts = 4;
std::array<int, numElementsCounts> countsReduced;
{
- std::array<int, numElementsCounts> countsLocal;
- /* Sum values from only intra-rank 0 so we get the sum over all nodes */
- countsLocal[0] = nnode0;
- countsLocal[1] = ncore0;
- countsLocal[2] = nhwthread0;
- countsLocal[3] = ngpu0;
+ std::array<int, numElementsCounts> countsLocal = {{0}};
+ // Organize to sum values from only one rank within each node,
+ // so we get the sum over all nodes.
+ bool isMasterRankOfPhysicalNode = (physicalNodeComm.rank_ == 0);
+ if (isMasterRankOfPhysicalNode)
+ {
+ countsLocal[0] = 1;
+ countsLocal[1] = ncore;
+ countsLocal[2] = nhwthread;
+ countsLocal[3] = ngpu;
+ }
MPI_Allreduce(countsLocal.data(), countsReduced.data(), countsLocal.size(),
MPI_INT, MPI_SUM, MPI_COMM_WORLD);
hwinfo_g->simd_suggest_max = static_cast<int>(simdSuggested(cpuInfo));
hwinfo_g->bIdenticalGPUs = TRUE;
hwinfo_g->haveAmdZenCpu = cpuIsAmdZen;
+ GMX_UNUSED_VALUE(physicalNodeComm);
#endif
}
#endif
}
-gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog)
+gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog,
+ const PhysicalNodeCommunicator &physicalNodeComm)
{
int ret;
{
hwinfo_g = compat::make_unique<gmx_hw_info_t>();
+ /* TODO: We should also do CPU hardware detection only once on each
+ * physical node and broadcast it, instead of doing it on every MPI rank. */
hwinfo_g->cpuInfo = new gmx::CpuInfo(gmx::CpuInfo::detect());
hardwareTopologyPrepareDetection();
hwinfo_g->gpu_info.n_dev_compatible = 0;
hwinfo_g->gpu_info.gpu_dev = nullptr;
- gmx_detect_gpus(mdlog);
- gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo);
+ gmx_detect_gpus(mdlog, physicalNodeComm);
+ gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo, physicalNodeComm);
}
/* increase the reference counter */
n_hwinfo++;
namespace gmx
{
class MDLogger;
+class PhysicalNodeCommunicator;
/*! \brief Run detection, consistency checks, and make available on all ranks.
*
*
* May do communication on MPI_COMM_WORLD when compiled with real MPI.
*/
-gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog);
+gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog,
+ const PhysicalNodeCommunicator &physicalNodeComm);
/*! \brief Free the hwinfo structure */
void gmx_hardware_info_free();
{
class ForceWithVirial;
class MDLogger;
+class PhysicalNodeCommunicator;
}
void calc_vir(int nxf, rvec x[], rvec f[], tensor vir,
float *cycles_pme);
/* Call all the force routines */
-void free_gpu_resources(const t_forcerec *fr,
- const t_commrec *cr,
- const gmx_multisim_t *ms);
+void free_gpu_resources(const t_forcerec *fr,
+ const gmx::PhysicalNodeCommunicator &physicalNodeCommunicator);
#endif
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
* \todo Remove physical node barrier from this function after making sure
* that it's not needed anymore (with a shared GPU run).
*/
-void free_gpu_resources(const t_forcerec *fr,
- const t_commrec *cr,
- const gmx_multisim_t *ms)
+void free_gpu_resources(const t_forcerec *fr,
+ const gmx::PhysicalNodeCommunicator &physicalNodeCommunicator)
{
bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU;
* Note: it is safe to not call the barrier on the ranks which do not use GPU,
* but it is easier and more futureproof to call it on the whole node.
*/
- if (GMX_THREAD_MPI && (PAR(cr) || isMultiSim(ms)))
+ if (GMX_THREAD_MPI)
{
- gmx_barrier_physical_node(cr);
+ physicalNodeCommunicator.barrier();
}
}
int omp_nthreads_pme_req,
gmx_bool gmx_unused bThisNodePMEOnly,
gmx_bool bFullOmpSupport,
- int nppn,
+ int numRanksOnThisNode,
gmx_bool bSepPME)
{
int nth;
/* max available threads per node */
nth = nthreads_hw_avail;
- /* divide the threads among the MPI processes/tMPI threads */
- if (nth >= nppn)
+ /* divide the threads among the MPI ranks */
+ if (nth >= numRanksOnThisNode)
{
- nth /= nppn;
+ nth /= numRanksOnThisNode;
}
else
{
void gmx_omp_nthreads_init(const gmx::MDLogger &mdlog, t_commrec *cr,
int nthreads_hw_avail,
+ int numRanksOnThisNode,
int omp_nthreads_req,
int omp_nthreads_pme_req,
gmx_bool bThisNodePMEOnly,
gmx_bool bFullOmpSupport)
{
- int nppn;
gmx_bool bSepPME;
const bool bOMP = GMX_OPENMP;
- /* number of MPI processes/threads per physical node */
- nppn = cr->nrank_intranode;
-
bSepPME = (thisRankHasDuty(cr, DUTY_PP) != thisRankHasDuty(cr, DUTY_PME));
manage_number_of_openmp_threads(mdlog, cr, bOMP,
nthreads_hw_avail,
omp_nthreads_req, omp_nthreads_pme_req,
bThisNodePMEOnly, bFullOmpSupport,
- nppn, bSepPME);
+ numRanksOnThisNode, bSepPME);
#if GMX_THREAD_MPI
/* Non-master threads have to wait for the OpenMP management to be
* done, so that code elsewhere that uses OpenMP can be certain
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
 * This function should be called only once during the initialization of mdrun. */
void gmx_omp_nthreads_init(const gmx::MDLogger &fplog, t_commrec *cr,
int nthreads_hw_avail,
+ int numRanksOnThisNode,
int omp_nthreads_req,
int omp_nthreads_pme_req,
gmx_bool bCurrNodePMEOnly,
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
{
MockThreadAffinityAccess::MockThreadAffinityAccess()
- : supported_(true), physicalNodeId_(0)
+ : supported_(true)
{
using ::testing::_;
using ::testing::Return;
snew(cr_, 1);
cr_->nnodes = gmx_node_num();
cr_->nodeid = gmx_node_rank();
- cr_->rank_intranode = cr_->nodeid;
cr_->duty = DUTY_PP;
#if GMX_MPI
cr_->mpi_comm_mysim = MPI_COMM_WORLD;
#endif
hwOpt_.thread_affinity = threadaffAUTO;
hwOpt_.totNumThreadsIsAuto = false;
+ physicalNodeId_ = 0;
}
ThreadAffinityTestHelper::~ThreadAffinityTestHelper()
#include "gromacs/hardware/hw_info.h"
#include "gromacs/mdrunutility/threadaffinity.h"
#include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/stringutil.h"
#include "testutils/loggertest.h"
~MockThreadAffinityAccess();
void setSupported(bool supported) { supported_ = supported; }
- void setPhysicalNodeId(int nodeId) { physicalNodeId_ = nodeId; }
virtual bool isThreadAffinitySupported() const { return supported_; }
- virtual int physicalNodeId() const { return physicalNodeId_; }
MOCK_METHOD1(setCurrentThreadAffinityToCore, bool(int core));
private:
bool supported_;
- int physicalNodeId_;
};
class ThreadAffinityTestHelper
void setPhysicalNodeId(int nodeId)
{
- affinityAccess_.setPhysicalNodeId(nodeId);
+ physicalNodeId_ = nodeId;
}
void setLogicalProcessorCount(int logicalProcessorCount);
{
setLogicalProcessorCount(1);
}
+ gmx::PhysicalNodeCommunicator comm(MPI_COMM_WORLD, physicalNodeId_);
int numThreadsOnThisNode, indexWithinNodeOfFirstThreadOnThisRank;
- analyzeThreadsOnThisNode(cr_, nullptr, &affinityAccess_,
+ analyzeThreadsOnThisNode(comm,
numThreadsOnThisRank,
&numThreadsOnThisNode,
&indexWithinNodeOfFirstThreadOnThisRank);
std::unique_ptr<HardwareTopology> hwTop_;
MockThreadAffinityAccess affinityAccess_;
LoggerTestHelper logHelper_;
+ int physicalNodeId_;
};
} // namespace test
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/unique_cptr.h"
{
return tMPI_Thread_setaffinity_support() == TMPI_SETAFFINITY_SUPPORT_YES;
}
- virtual int physicalNodeId() const
- {
- return gmx_physicalnode_id_hash();
- }
virtual bool setCurrentThreadAffinityToCore(int core)
{
const int ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
return allAffinitiesSet;
}
-void analyzeThreadsOnThisNode(const t_commrec *cr,
- const gmx_multisim_t *ms,
- gmx::IThreadAffinityAccess *affinityAccess,
- int numThreadsOnThisRank,
- int *numThreadsOnThisNode,
- int *intraNodeThreadOffset)
+void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator &physicalNodeComm,
+ int numThreadsOnThisRank,
+ int *numThreadsOnThisNode,
+ int *intraNodeThreadOffset)
{
*intraNodeThreadOffset = 0;
*numThreadsOnThisNode = numThreadsOnThisRank;
#if GMX_MPI
- if (PAR(cr) || isMultiSim(ms))
+ if (physicalNodeComm.size_ > 1)
{
- if (affinityAccess == nullptr)
- {
- affinityAccess = &g_defaultAffinityAccess;
- }
-
/* We need to determine a scan of the thread counts in this
- * compute node.
- */
- MPI_Comm comm_intra;
-
- MPI_Comm_split(MPI_COMM_WORLD,
- affinityAccess->physicalNodeId(), cr->rank_intranode,
- &comm_intra);
- MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, comm_intra);
+ * compute node. */
+ MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
/* MPI_Scan is inclusive, but here we need exclusive */
*intraNodeThreadOffset -= numThreadsOnThisRank;
/* Get the total number of threads on this physical node */
- MPI_Allreduce(&numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, comm_intra);
- MPI_Comm_free(&comm_intra);
+ MPI_Allreduce(&numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
}
#else
- GMX_UNUSED_VALUE(cr);
- GMX_UNUSED_VALUE(ms);
- GMX_UNUSED_VALUE(affinityAccess);
+ GMX_UNUSED_VALUE(physicalNodeComm);
#endif
}
class HardwareTopology;
class MDLogger;
+class PhysicalNodeCommunicator;
class IThreadAffinityAccess
{
public:
virtual bool isThreadAffinitySupported() const = 0;
- virtual int physicalNodeId() const = 0;
virtual bool setCurrentThreadAffinityToCore(int core) = 0;
protected:
} // namespace gmx
/*! \brief Communicates within physical nodes to discover the
- * distribution of threads over ranks.
- *
- * See gmx_set_thread_affinity(), which consumes this output.
- *
- * \param[in] cr Communication handler.
- * \param[in] ms Multi-simulation handler.
- * \param[in] affinityAccess Interface for low-level access to affinity details.
- * \param[in] numThreadsOnThisRank The number of threads on this rank.
- * \param[out] numThreadsOnThisNode On exit, the number of threads on all ranks of this node.
- * \param[out] intraNodeThreadOffset On exit, the index of the first hardware thread of this rank
- * in the set of all the threads of all MPI ranks within a node (ordered by MPI rank ID).
- */
-void analyzeThreadsOnThisNode(const t_commrec *cr,
- const gmx_multisim_t *ms,
- gmx::IThreadAffinityAccess *affinityAccess,
- int numThreadsOnThisRank,
- int *numThreadsOnThisNode,
- int *intraNodeThreadOffset);
+ * distribution of threads over ranks. */
+void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator &physicalNodeComm,
+ int numThreadsOnThisRank,
+ int *numThreadsOnThisNode,
+ int *intraNodeThreadOffset);
/*! \brief
* Sets the thread affinity using the requested setting stored in hw_opt.
MPI_Comm mpi_comm_mygroup; /* subset of mpi_comm_mysim including only
the ranks in the same group (PP or PME) */
- /* MPI ranks and a communicator within a physical node for hardware access */
- MPI_Comm mpi_comm_physicalnode; /* communicator for all ranks of the physical node
- * NOTE: this communicator should only be used during initialization and finalization, as it can contain ranks from PP, PME and multiple simulations with multisim
- */
- int nrank_intranode; /* nr of ranks on this physical node */
- int rank_intranode; /* our rank on this physical node */
-
gmx_nodecomm_t nc;
/* For domain decomposition */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <numeric>
#include <vector>
-#include "gromacs/mdtypes/commrec.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
namespace gmx
{
* assignment. Separating this aspect makes it possible to unit test
* the logic of task assignment. */
GpuTasksOnRanks
-findAllGpuTasksOnThisNode(ArrayRef<const GpuTask> gpuTasksOnThisRank,
- int numRanksOnThisNode,
- MPI_Comm communicator)
+findAllGpuTasksOnThisNode(ArrayRef<const GpuTask> gpuTasksOnThisRank,
+ const PhysicalNodeCommunicator &physicalNodeComm)
{
+ int numRanksOnThisNode = physicalNodeComm.size_;
+ MPI_Comm communicator = physicalNodeComm.comm_;
// Find out how many GPU tasks are on each rank on this node.
- auto numGpuTasksOnEachRankOfThisNode =
+ auto numGpuTasksOnEachRankOfThisNode =
allgather(gpuTasksOnThisRank.size(), numRanksOnThisNode, communicator);
/* Collect on each rank of this node a vector describing all
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2017, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/taskassignment/taskassignment.h"
#include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/gmxmpi.h"
namespace gmx
{
+class PhysicalNodeCommunicator;
+
/*! \brief Returns container of all tasks on all ranks of this node
* that are eligible for GPU execution.
*
* assignment. Separating this aspect makes it possible to unit test
* the logic of task assignment. */
GpuTasksOnRanks
-findAllGpuTasksOnThisNode(ArrayRef<const GpuTask> gpuTasksOnThisRank,
- int numRanksOnThisNode,
- MPI_Comm communicator);
+findAllGpuTasksOnThisNode(ArrayRef<const GpuTask> gpuTasksOnThisRank,
+ const PhysicalNodeCommunicator &physicalNodeComm);
} // namespace
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/stringutil.h"
const gmx_hw_info_t &hwinfo,
const t_commrec *cr,
const gmx_multisim_t *ms,
+ int numRanksOnThisNode,
PmeRunMode pmeRunMode,
const gmx_mtop_t &mtop)
{
int numCoresPerRank = hwinfo.ncore_tot/numRanksTot;
if (numAtomsPerRank < c_numAtomsPerCoreSquaredSmtThreshold*gmx::square(numCoresPerRank))
{
- int numRanksInThisNode = (cr ? cr->nrank_intranode : 1);
/* Choose one OpenMP thread per physical core */
- hw_opt->nthreads_omp = std::max(1, hwinfo.hardwareTopology->numberOfCores()/numRanksInThisNode);
+ hw_opt->nthreads_omp = std::max(1, hwinfo.hardwareTopology->numberOfCores()/numRanksOnThisNode);
}
}
}
}
-void checkHardwareOversubscription(int numThreadsOnThisRank,
- const gmx::HardwareTopology &hwTop,
- const t_commrec *cr,
- const gmx_multisim_t *ms,
- const gmx::MDLogger &mdlog)
+namespace gmx
{
- if (hwTop.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
+
+void checkHardwareOversubscription(int numThreadsOnThisRank,
+ int rank,
+ const HardwareTopology &hwTop,
+ const PhysicalNodeCommunicator &comm,
+ const MDLogger &mdlog)
+{
+ if (hwTop.supportLevel() < HardwareTopology::SupportLevel::LogicalProcessorCount)
{
/* There is nothing we can check */
return;
}
- int numRanksOnThisNode = 1;
+ int numRanksOnThisNode = comm.size_;
int numThreadsOnThisNode = numThreadsOnThisRank;
-#if GMX_MPI
- if (PAR(cr) || isMultiSim(ms))
+ /* Avoid MPI calls with uninitialized thread-MPI communicators */
+ if (comm.size_ > 1)
{
+#if GMX_MPI
/* Count the threads within this physical node */
- MPI_Comm_size(cr->mpi_comm_physicalnode, &numRanksOnThisNode);
- MPI_Allreduce(&numThreadsOnThisRank, &numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, cr->mpi_comm_physicalnode);
- }
-#else
- GMX_UNUSED_VALUE(ms);
+ MPI_Allreduce(&numThreadsOnThisRank, &numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, comm.comm_);
#endif
+ }
if (numThreadsOnThisNode > hwTop.machine().logicalProcessorCount)
{
std::string mesg = "WARNING: ";
if (GMX_LIB_MPI)
{
- mesg += gmx::formatString("On rank %d: o", cr->sim_nodeid);
+ mesg += formatString("On rank %d: o", rank);
}
else
{
mesg += "O";
}
- mesg += gmx::formatString("versubscribing the available %d logical CPU cores", hwTop.machine().logicalProcessorCount);
+ mesg += formatString("versubscribing the available %d logical CPU cores", hwTop.machine().logicalProcessorCount);
if (GMX_LIB_MPI)
{
mesg += " per node";
}
- mesg += gmx::formatString(" with %d ", numThreadsOnThisNode);
+ mesg += formatString(" with %d ", numThreadsOnThisNode);
if (numRanksOnThisNode == numThreadsOnThisNode)
{
if (GMX_THREAD_MPI)
GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(mesg.c_str());
}
}
+
+} // namespace
{
class HardwareTopology;
class MDLogger;
+class PhysicalNodeCommunicator;
}
/*! \brief Return the number of threads to use for thread-MPI based on how many
const gmx_hw_info_t &hwinfo,
const t_commrec *cr,
const gmx_multisim_t *ms,
+ int numRanksOnThisNode,
PmeRunMode pmeRunMode,
const gmx_mtop_t &mtop);
+namespace gmx
+{
+
/*! \brief Warns for oversubscribing the hardware threads, when that is the case
*/
-void checkHardwareOversubscription(int numThreadsOnThisRank,
- const gmx::HardwareTopology &hwTop,
- const t_commrec *cr,
- const gmx_multisim_t *ms,
- const gmx::MDLogger &mdlog);
+void checkHardwareOversubscription(int numThreadsOnThisRank,
+ int rank,
+ const HardwareTopology &hwTop,
+ const PhysicalNodeCommunicator &comm,
+ const MDLogger &mdlog);
+
+} // namespace
#endif
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/utility/sysinfo.h"
} // namespace
GpuTaskAssignments::value_type
-runTaskAssignment(const std::vector<int> &gpuIdsToUse,
- const std::vector<int> &userGpuTaskAssignment,
- const gmx_hw_info_t &hardwareInfo,
- const MDLogger &mdlog,
- const t_commrec *cr,
- const gmx_multisim_t *ms,
- const std::vector<GpuTask> &gpuTasksOnThisRank)
+runTaskAssignment(const std::vector<int> &gpuIdsToUse,
+ const std::vector<int> &userGpuTaskAssignment,
+ const gmx_hw_info_t &hardwareInfo,
+ const MDLogger &mdlog,
+ const t_commrec *cr,
+ const gmx_multisim_t *ms,
+ const PhysicalNodeCommunicator &physicalNodeComm,
+ const std::vector<GpuTask> &gpuTasksOnThisRank)
{
/* Communicate among ranks on this node to find each task that can
* be executed on a GPU, on each rank. */
- auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank,
- cr->nrank_intranode,
- cr->mpi_comm_physicalnode);
+ auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank,
+ physicalNodeComm);
auto numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
GpuTaskAssignments taskAssignmentOnRanksOfThisNode;
ArrayRef<const int> compatibleGpusToUse = gpuIdsToUse;
// enforce the single device/rank restriction
- if (cr->nrank_intranode == 1 && !compatibleGpusToUse.empty())
+ if (physicalNodeComm.size_ == 1 && !compatibleGpusToUse.empty())
{
compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
}
// GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
// how we should involve MPI in the implementation of error
// handling.
- if (cr->rank_intranode == 0)
+ if (physicalNodeComm.rank_ == 0)
{
printFatalErrorMessage(stderr, ex);
}
}
reportGpuUsage(mdlog, !userGpuTaskAssignment.empty(), taskAssignmentOnRanksOfThisNode,
- numGpuTasksOnThisNode, cr->nrank_intranode, cr->nnodes > 1);
+ numGpuTasksOnThisNode, physicalNodeComm.size_, cr->nnodes > 1);
// If the user chose a task assignment, give them some hints where appropriate.
if (!userGpuTaskAssignment.empty())
taskAssignmentOnRanksOfThisNode);
}
- return taskAssignmentOnRanksOfThisNode[cr->rank_intranode];
+ return taskAssignmentOnRanksOfThisNode[physicalNodeComm.rank_];
// TODO There is no check that mdrun -nb gpu or -pme gpu or
// -gpu_id is actually being implemented such that nonbonded tasks
{
class MDLogger;
+class PhysicalNodeCommunicator;
/*! \brief Types of compute tasks that can be run on a GPU.
*
* \param[in] hardwareInfo The detected hardware
* \param[in] mdlog Logging object to write to.
* \param[in] cr Communication object.
- * \param[in] ms Handles multi-simulations.
+ * \param[in] ms Multi-simulation handler.
+ * \param[in] physicalNodeComm Communication object for this physical node.
* \param[in] gpuTasksOnThisRank Information about what GPU tasks
* exist on this rank.
*
* InconsistentInputError If user and/or detected inputs are inconsistent.
*/
GpuTaskAssignments::value_type
-runTaskAssignment(const std::vector<int> &gpuIdsToUse,
- const std::vector<int> &userGpuTaskAssignment,
- const gmx_hw_info_t &hardwareInfo,
- const MDLogger &mdlog,
- const t_commrec *cr,
- const gmx_multisim_t *ms,
- const std::vector<GpuTask> &gpuTasksOnThisRank);
+runTaskAssignment(const std::vector<int> &gpuIdsToUse,
+ const std::vector<int> &userGpuTaskAssignment,
+ const gmx_hw_info_t &hardwareInfo,
+ const MDLogger &mdlog,
+ const t_commrec *cr,
+ const gmx_multisim_t *ms,
+ const PhysicalNodeCommunicator &physicalNodeComm,
+ const std::vector<GpuTask> &gpuTasksOnThisRank);
//! Function for whether the task of \c mapping has value \c TaskType.
template<GpuTask TaskType>
typedef void* MPI_Request;
typedef void* MPI_Status;
typedef void* MPI_Group;
-#define MPI_COMM_NULL NULL
-#define MPI_GROUP_NULL NULL
-#define MPI_COMM_WORLD NULL
+#define MPI_COMM_NULL nullptr
+#define MPI_GROUP_NULL nullptr
+#define MPI_COMM_WORLD nullptr
#endif
#endif
//! \endcond
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Defines functionality for communicators across physical nodes.
+ *
+ * \ingroup module_utility
+ */
+#include "gmxpre.h"
+
+#include "physicalnodecommunicator.h"
+
+#include "config.h"
+
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/gmxmpi.h"
+
+namespace gmx
+{
+
+//! Frees the MPI communicator pointed to by \c comm, when that is appropriate.
+void
+MPI_Comm_free_wrapper(MPI_Comm *comm)
+{
+#if GMX_MPI
+    // With thread-MPI *comm is shared between ranks which causes issues with
+    // freeing. But all thread-mpi data is anyhow freed in tMPI_Finalize()
+    // and in practice *comm is always MPI_COMM_WORLD with thread-MPI.
+    // Only the thread-affinity test code uses *comm != MPI_COMM_WORLD.
+    if (!GMX_THREAD_MPI)
+    {
+        MPI_Comm_free(comm);
+    }
+#else
+    GMX_UNUSED_VALUE(comm);
+#endif
+}
+
+PhysicalNodeCommunicator::PhysicalNodeCommunicator(MPI_Comm world, int physicalNodeId)
+{
+#if GMX_MPI
+    int isInitialized;
+    MPI_Initialized(&isInitialized);
+    if (isInitialized)
+    {
+        int sizeOfWorld;
+        MPI_Comm_size(world, &sizeOfWorld);
+        if (sizeOfWorld > 1)
+        {
+            int rankWithinWorld;
+            MPI_Comm_rank(world, &rankWithinWorld);
+            // Ranks sharing physicalNodeId are grouped into one
+            // communicator; ordering within it follows the rank in world.
+            MPI_Comm_split(world, physicalNodeId, rankWithinWorld, &comm_);
+            // Hand ownership of the newly created communicator to the
+            // RAII guard, so it is freed (when appropriate) on destruction.
+            auto ptr = MPI_Comm_ptr(&comm_);
+            commGuard_.swap(ptr);
+            MPI_Comm_size(comm_, &size_);
+            MPI_Comm_rank(comm_, &rank_);
+        }
+        else
+        {
+            // Handle this trivial case separately, because thread-MPI
+            // doesn't have a valid communicator when there is only
+            // one rank.
+            comm_ = world;
+            size_ = 1;
+            rank_ = 0;
+        }
+    }
+    else
+    {
+        // MPI is compiled in but not (yet) initialized, so no
+        // communication is possible; behave as a single rank.
+        comm_ = MPI_COMM_NULL;
+        size_ = 1;
+        rank_ = 0;
+    }
+#else
+    // Trivial case when there is no MPI support.
+    GMX_UNUSED_VALUE(world);
+    GMX_UNUSED_VALUE(physicalNodeId);
+    comm_ = nullptr;
+    size_ = 1;
+    rank_ = 0;
+#endif
+}
+
+void PhysicalNodeCommunicator::barrier() const
+{
+#if GMX_MPI
+    // When size_ == 1 the communicator may be MPI_COMM_NULL (see the
+    // constructor), so only issue the barrier when there is actually
+    // more than one rank on this physical node to synchronize.
+    if (size_ > 1)
+    {
+        MPI_Barrier(comm_);
+    }
+#else
+    // Nothing to do without MPI support.
+#endif
+}
+
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief
+ * Declares functionality for communicators across physical nodes.
+ *
+ * \inlibraryapi
+ * \ingroup module_utility
+ */
+#ifndef GMX_UTILITY_PHYSICALNODECOMMUNICATOR_H
+#define GMX_UTILITY_PHYSICALNODECOMMUNICATOR_H
+
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/unique_cptr.h"
+
+namespace gmx
+{
+
+/*! \brief Wrapper function for RAII-style cleanup.
+ *
+ * This is needed to discard the return value so it can be used as a
+ * deleter by a smart pointer. */
+void
+MPI_Comm_free_wrapper(MPI_Comm *comm);
+
+//! Make a smart pointer for MPI communicators.
+using MPI_Comm_ptr = gmx::unique_cptr<MPI_Comm, MPI_Comm_free_wrapper>;
+
+/*! \libinternal \brief Holds a communicator for the physical node of this rank
+ *
+ * This communicator should only be used for appropriate tasks,
+ * e.g. during initialization and finalization. It can contain ranks
+ * from PP, PME and multiple simulations with multisim, so is not
+ * suited for general-purpose communication. */
+class PhysicalNodeCommunicator
+{
+    public:
+        /*! \brief Constructor.
+         *
+         * Communicates within \c world to make intra-communicator \c
+         * comm_ between all ranks that share \c physicalNodeId. */
+        PhysicalNodeCommunicator(MPI_Comm world, int physicalNodeId);
+        //! Communicator for all ranks on this physical node
+        MPI_Comm comm_;
+        //! Number of ranks on this physical node, corresponds to MPI_Comm_size of comm_.
+        int size_;
+        //! Rank ID within this physical node, corresponds to MPI_Comm_rank of comm_.
+        int rank_;
+        //! RAII handler for cleaning up \c comm_ only when appropriate.
+        MPI_Comm_ptr commGuard_;
+        //! Creates a barrier for all ranks on this physical node.
+        void barrier() const;
+};
+
+}      // namespace gmx
+
+#endif
logger.cpp
mutex.cpp
path.cpp
+ physicalnodecommunicator.cpp
stringutil.cpp
textreader.cpp
textwriter.cpp
typetraits.cpp
)
+
+gmx_add_mpi_unit_test(UtilityMpiUnitTests utility-mpi-test 4
+ physicalnodecommunicator-mpi.cpp
+ )
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include <array>
+
+#include <gtest/gtest.h>
+
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+#include "testutils/mpitest.h"
+
+namespace gmx
+{
+namespace
+{
+
+//! Check that construction succeeds on each of the four MPI test ranks.
+TEST(PhysicalNodeCommunicatorTest, CanConstruct)
+{
+    GMX_MPI_TEST(4);
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+}
+
+//! Check that the barrier call completes (i.e. all four ranks reach it).
+TEST(PhysicalNodeCommunicatorTest, CanCallBarrier)
+{
+    GMX_MPI_TEST(4);
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+    comm.barrier();
+}
+
+} // namespace
+} // namespace
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+#include <array>
+
+#include <gtest/gtest.h>
+
+#include "testutils/mpitest.h"
+
+namespace gmx
+{
+namespace
+{
+
+//! Check that construction succeeds in the single-rank / no-MPI build.
+TEST(PhysicalNodeCommunicatorTest, CanConstruct)
+{
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+}
+
+//! Check that the barrier call is a safe no-op with a single rank.
+TEST(PhysicalNodeCommunicatorTest, CanCallBarrier)
+{
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+    comm.barrier();
+}
+
+} // namespace
+} // namespace
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
{
threadMpiMdrunnerAccessBarrier();
- cr = reinitialize_commrec_for_this_thread(cr, ms);
+ cr = reinitialize_commrec_for_this_thread(cr);
GMX_RELEASE_ASSERT(!MASTER(cr), "reinitializeOnSpawnedThread should only be called on spawned threads");
GMX_UNUSED_VALUE(mdrunner_start_fn);
#endif
- return reinitialize_commrec_for_this_thread(cr, ms);
+ return reinitialize_commrec_for_this_thread(cr);
}
} // namespace
gmx::LoggerOwner logOwner(buildLogger(fplog, cr));
gmx::MDLogger mdlog(logOwner.logger());
- hwinfo = gmx_detect_hardware(mdlog);
+ // TODO The thread-MPI master rank makes a working
+ // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
+ // after the threads have been launched. This works because no use
+ // is made of that communicator until after the execution paths
+ // have rejoined. But it is likely that we can improve the way
+ // this is expressed, e.g. by expressly running detection only the
+ // master rank for thread-MPI, rather than relying on the mutex
+ // and reference count.
+ PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+ hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo);
// TODO Both master and spawned threads call dup_tfn and
// reinitialize_commrec_for_this_thread. Find a way to express
// this better.
+ physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
}
- /* END OF CAUTION: cr is now reliable */
+ // END OF CAUTION: cr and physicalNodeComm are now reliable
if (PAR(cr))
{
gmx_setup_nodecomm(fplog, cr);
}
- /* Initialize per-physical-node MPI process/thread ID and counters. */
- gmx_init_intranode_counters(cr);
#if GMX_MPI
if (isMultiSim(ms))
{
check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
/* Check and update the number of OpenMP threads requested */
- checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, pmeRunMode, *mtop);
+ checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
+ pmeRunMode, *mtop);
gmx_omp_nthreads_init(mdlog, cr,
hwinfo->nthreads_hw_avail,
+ physicalNodeComm.size_,
hw_opt.nthreads_omp,
hw_opt.nthreads_omp_pme,
!thisRankHasDuty(cr, DUTY_PP),
{
// Produce the task assignment for this rank.
gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
- mdlog, cr, ms, gpuTasksOnThisRank);
+ mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
numThreadsOnThisRank = nthreads_pme;
}
- checkHardwareOversubscription(numThreadsOnThisRank,
+ checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
*hwinfo->hardwareTopology,
- cr, ms, mdlog);
+ physicalNodeComm, mdlog);
if (hw_opt.thread_affinity != threadaffOFF)
{
&hw_opt, hwinfo->nthreads_hw_avail, TRUE);
int numThreadsOnThisNode, intraNodeThreadOffset;
- analyzeThreadsOnThisNode(cr, ms, nullptr, numThreadsOnThisRank, &numThreadsOnThisNode,
+ analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
&intraNodeThreadOffset);
/* Set the CPU affinity */
mdModules.reset(nullptr); // destruct force providers here as they might also use the GPU
/* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
- free_gpu_resources(fr, cr, ms);
+ free_gpu_resources(fr, physicalNodeComm);
free_gpu(nonbondedDeviceInfo);
free_gpu(pmeDeviceInfo);