Make a persistent physical node communicator

author Mark Abraham <mark.j.abraham@gmail.com>

Thu, 25 Jan 2018 14:47:35 +0000 (15:47 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 14 Mar 2018 13:37:43 +0000 (14:37 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Thu, 25 Jan 2018 14:47:35 +0000 (15:47 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 14 Mar 2018 13:37:43 +0000 (14:37 +0100)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index 83873afb1e4b9513687b927c3093311b05296a2b..eb5305dafdeeb88e32283c8c992bd002cd6c299a 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -5417,8 +5417,10 @@ void dd_setup_dlb_resource_sharing(t_commrec            *cr,
      }
      /* Split the PP communicator over the physical nodes */
      /* TODO: See if we should store this (before), as it's also used for
-     * for the nodecomm summution.
+     * for the nodecomm summation.
       */
+    // TODO PhysicalNodeCommunicator could be extended/used to handle
+    // the need for per-node per-group communicators.
      MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
                     &mpi_comm_pp_physicalnode);
      MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp

index 52da6d62371bb064c81ee4cad2703ab9d954f1d4..b62da2b215c88904e7deeae3580e0167b56f7b87 100644 (file)
--- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
@@ -48,8 +48,10 @@
  
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/hardware/hw_info.h"
+#include "gromacs/utility/basenetwork.h"
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  
  namespace gmx
  {
@@ -85,10 +87,11 @@ void callAddGlobalTestEnvironment()
  //! Simple hardware initialization
  static gmx_hw_info_t *hardwareInit()
  {
-    LoggerBuilder builder;
-    LoggerOwner   logOwner(builder.build());
-    MDLogger      log(logOwner.logger());
-    return gmx_detect_hardware(log);
+    LoggerBuilder                        builder;
+    LoggerOwner                          logOwner(builder.build());
+    MDLogger                             log(logOwner.logger());
+    PhysicalNodeCommunicator             physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+    return gmx_detect_hardware(log, physicalNodeComm);
  }
  
  void PmeTestEnvironment::SetUp()
diff --git a/src/gromacs/gmxlib/network.cpp b/src/gromacs/gmxlib/network.cpp

index 3a3257199750f9ee0bf8587e084f9445892be625..58aecdcfc0b335f0799152633dddc49971fa93a7 100644 (file)
--- a/src/gromacs/gmxlib/network.cpp
+++ b/src/gromacs/gmxlib/network.cpp
@@ -57,13 +57,11 @@
  /* The source code in this file should be thread-safe.
        Please keep it that way. */
  
-void gmx_fill_commrec_from_mpi(t_commrec            *cr,
-                               const gmx_multisim_t *ms)
+void gmx_fill_commrec_from_mpi(t_commrec *cr)
  {
  #if !GMX_MPI
      gmx_call("gmx_fill_commrec_from_mpi");
      GMX_UNUSED_VALUE(cr);
-    GMX_UNUSED_VALUE(ms);
  #else
      if (!gmx_mpi_initialized())
      {
@@ -72,21 +70,9 @@ void gmx_fill_commrec_from_mpi(t_commrec            *cr,
  
      cr->nnodes           = gmx_node_num();
      cr->nodeid           = gmx_node_rank();
-    // TODO This communicator should be always available. Currently we
-    // make it multiple times, and keep it only when relevant. But the
-    // cost of an extra communicator is negligible in single-node
-    // cases (both thread-MPI and real MPI) case, and we need it in
-    // all multi-node MPI cases with more than one PP rank per node,
-    // with and without GPUs. By always having it available, we also
-    // don't need to protect calls to mpi_comm_physicalnode, etc.
-    if (PAR(cr) || isMultiSim(ms))
-    {
-        MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(), cr->nodeid, &cr->mpi_comm_physicalnode);
-    }
      cr->sim_nodeid       = cr->nodeid;
      cr->mpi_comm_mysim   = MPI_COMM_WORLD;
      cr->mpi_comm_mygroup = MPI_COMM_WORLD;
-
  #endif
  }
  
@@ -96,9 +82,8 @@ t_commrec *init_commrec()
  
      snew(cr, 1);
  
-    cr->mpi_comm_physicalnode = MPI_COMM_NULL;
  #if GMX_LIB_MPI
-    gmx_fill_commrec_from_mpi(cr, nullptr);
+    gmx_fill_commrec_from_mpi(cr);
  #else
      cr->mpi_comm_mysim   = MPI_COMM_NULL;
      cr->mpi_comm_mygroup = MPI_COMM_NULL;
@@ -140,12 +125,6 @@ void done_mpi_in_place_buf(mpi_in_place_buf_t *buf)
  
  void done_commrec(t_commrec *cr)
  {
-#if GMX_MPI
-    if (cr->mpi_comm_physicalnode != MPI_COMM_NULL)
-    {
-        MPI_Comm_free(&cr->mpi_comm_physicalnode);
-    }
-#endif
      if (nullptr != cr->dd)
      {
          // TODO: implement
@@ -155,8 +134,7 @@ void done_commrec(t_commrec *cr)
      sfree(cr);
  }
  
-t_commrec *reinitialize_commrec_for_this_thread(const t_commrec      *cro,
-                                                const gmx_multisim_t *ms)
+t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro)
  {
  #if GMX_THREAD_MPI
      t_commrec *cr;
@@ -168,7 +146,7 @@ t_commrec *reinitialize_commrec_for_this_thread(const t_commrec      *cro,
      *cr = *cro;
  
      /* and we start setting our own thread-specific values for things */
-    gmx_fill_commrec_from_mpi(cr, ms);
+    gmx_fill_commrec_from_mpi(cr);
  
      // TODO cr->duty should not be initialized here
      cr->duty             = (DUTY_PP | DUTY_PME);
@@ -176,7 +154,6 @@ t_commrec *reinitialize_commrec_for_this_thread(const t_commrec      *cro,
      return cr;
  #else
      GMX_UNUSED_VALUE(cro);
-    GMX_UNUSED_VALUE(ms);
      return nullptr;
  #endif
  }
@@ -203,6 +180,8 @@ void gmx_setup_nodecomm(FILE gmx_unused *fplog, t_commrec *cr)
  #if GMX_MPI
      int n, rank;
  
+    // TODO PhysicalNodeCommunicator could be extended/used to handle
+    // the need for per-node per-group communicators.
      MPI_Comm_size(cr->mpi_comm_mygroup, &n);
      MPI_Comm_rank(cr->mpi_comm_mygroup, &rank);
  
@@ -269,77 +248,6 @@ void gmx_setup_nodecomm(FILE gmx_unused *fplog, t_commrec *cr)
  #endif
  }
  
-void gmx_init_intranode_counters(t_commrec *cr)
-{
-    /* counters for PP+PME and PP-only processes on my physical node */
-    int nrank_intranode, rank_intranode;
-    /* thread-MPI is not initialized when not running in parallel */
-#if GMX_MPI && !GMX_THREAD_MPI
-    int nrank_world, rank_world;
-    int i, myhash, *hash, *hash_s, *hash_pp, *hash_pp_s;
-
-    MPI_Comm_size(MPI_COMM_WORLD, &nrank_world);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
-
-    /* Get a (hopefully unique) hash that identifies our physical node */
-    myhash = gmx_physicalnode_id_hash();
-
-    /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */
-    snew(hash,   nrank_world);
-    snew(hash_s, nrank_world);
-    snew(hash_pp,   nrank_world);
-    snew(hash_pp_s, nrank_world);
-
-    hash_s[rank_world]    = myhash;
-    hash_pp_s[rank_world] = thisRankHasDuty(cr, DUTY_PP) ? myhash : -1;
-
-    MPI_Allreduce(hash_s,    hash,    nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(hash_pp_s, hash_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-
-    nrank_intranode    = 0;
-    rank_intranode     = 0;
-    for (i = 0; i < nrank_world; i++)
-    {
-        if (hash[i] == myhash)
-        {
-            nrank_intranode++;
-            if (i < rank_world)
-            {
-                rank_intranode++;
-            }
-        }
-    }
-    sfree(hash);
-    sfree(hash_s);
-    sfree(hash_pp);
-    sfree(hash_pp_s);
-#else
-    /* Serial or thread-MPI code: we run within a single physical node */
-    nrank_intranode    = cr->nnodes;
-    rank_intranode     = cr->sim_nodeid;
-#endif
-
-    if (debug)
-    {
-        char sbuf[STRLEN];
-        if (thisRankHasDuty(cr, DUTY_PP) && thisRankHasDuty(cr, DUTY_PME))
-        {
-            sprintf(sbuf, "PP+PME");
-        }
-        else
-        {
-            sprintf(sbuf, "%s", thisRankHasDuty(cr, DUTY_PP) ? "PP" : "PME");
-        }
-        fprintf(debug, "On %3s rank %d: nrank_intranode=%d, rank_intranode=%d\n",
-                sbuf, cr->sim_nodeid,
-                nrank_intranode, rank_intranode);
-    }
-
-    cr->nrank_intranode    = nrank_intranode;
-    cr->rank_intranode     = rank_intranode;
-}
-
-
  void gmx_barrier(const t_commrec gmx_unused *cr)
  {
  #if !GMX_MPI
@@ -349,15 +257,6 @@ void gmx_barrier(const t_commrec gmx_unused *cr)
  #endif
  }
  
-void gmx_barrier_physical_node(const t_commrec gmx_unused *cr)
-{
-#if !GMX_MPI
-    gmx_call("gmx_barrier_physical_node");
-#else
-    MPI_Barrier(cr->mpi_comm_physicalnode);
-#endif
-}
-
  void gmx_bcast(int gmx_unused nbytes, void gmx_unused *b, const t_commrec gmx_unused *cr)
  {
  #if !GMX_MPI
diff --git a/src/gromacs/gmxlib/network.h b/src/gromacs/gmxlib/network.h

index 95a163b2e894ed110f9fa88005c45533cd6eda47..e690525d68760ecfab263fc68b236b3e1f45c178 100644 (file)
--- a/src/gromacs/gmxlib/network.h
+++ b/src/gromacs/gmxlib/network.h
@@ -56,8 +56,7 @@ struct t_commrec *init_commrec(void);
  void done_commrec(t_commrec *cr);
  /* Free memory associated with the commrec. */
  
-struct t_commrec *reinitialize_commrec_for_this_thread(const t_commrec      *cro,
-                                                       const gmx_multisim_t *ms);
+struct t_commrec *reinitialize_commrec_for_this_thread(const t_commrec *cro);
  
  /* Initialize communication records for thread-parallel simulations.
     Must be called on all threads before any communication takes place by
@@ -65,22 +64,15 @@ struct t_commrec *reinitialize_commrec_for_this_thread(const t_commrec      *cro
     thread-local versions (a small memory leak results because we don't
     deallocate the old shared version).  */
  
-void gmx_fill_commrec_from_mpi(t_commrec            *cr,
-                               const gmx_multisim_t *ms);
+void gmx_fill_commrec_from_mpi(t_commrec            *cr);
  /* Continues t_commrec construction */
  
  void gmx_setup_nodecomm(FILE *fplog, struct t_commrec *cr);
  /* Sets up fast global communication for clusters with multi-core nodes */
  
-void gmx_init_intranode_counters(struct t_commrec *cr);
-/* Initializes intra-physical-node MPI process/thread counts and ID. */
-
  void gmx_barrier(const struct t_commrec *cr);
  /* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
  
-void gmx_barrier_physical_node(const struct t_commrec *cr);
-/* Wait till all processes in cr->mpi_comm_physical_node have reached the barrier */
-
  void gmx_bcast(int nbytes, void *b, const struct t_commrec *cr);
  /* Broadcast nbytes bytes from the master to cr->mpi_comm_mygroup */
  
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp

index 2f234e645ef45f795252b946871d7a32756e5ef6..ca2e9ca4176c02aa4abb207d001e8e97db574c26 100644 (file)
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -67,6 +67,7 @@
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/programcontext.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/stringutil.h"
@@ -108,14 +109,9 @@ static int                            n_hwinfo = 0;
  static tMPI_Thread_mutex_t            hw_info_lock = TMPI_THREAD_MUTEX_INITIALIZER;
  
  //! Detect GPUs, if that makes sense to attempt.
-static void gmx_detect_gpus(const gmx::MDLogger &mdlog)
+static void gmx_detect_gpus(const gmx::MDLogger            &mdlog,
+                            const PhysicalNodeCommunicator &physicalNodeComm)
  {
-#if GMX_LIB_MPI
-    int              rank_world;
-    MPI_Comm         physicalnode_comm;
-#endif
-    bool             isMasterRankOfNode;
-
      hwinfo_g->gpu_info.bDetectGPUs =
          (bGPUBinary && getenv("GMX_DISABLE_GPU_DETECTION") == nullptr);
      if (!hwinfo_g->gpu_info.bDetectGPUs)
@@ -123,36 +119,16 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog)
          return;
      }
  
-    /* Under certain circumstances MPI ranks on the same physical node
-     * can not simultaneously access the same GPU(s). Therefore we run
-     * the detection only on one MPI rank per node and broadcast the info.
-     * Note that with thread-MPI only a single thread runs this code.
-     *
-     * NOTE: We can't broadcast gpu_info with OpenCL as the device and platform
-     * ID stored in the structure are unique for each rank (even if a device
-     * is shared by multiple ranks).
-     *
-     * TODO: We should also do CPU hardware detection only once on each
-     * physical node and broadcast it, instead of do it on every MPI rank.
-     */
+    bool isMasterRankOfPhysicalNode = true;
  #if GMX_LIB_MPI
-    /* A split of MPI_COMM_WORLD over physical nodes is only required here,
-     * so we create and destroy it locally.
-     */
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
-    MPI_Comm_split(MPI_COMM_WORLD, gmx_physicalnode_id_hash(),
-                   rank_world, &physicalnode_comm);
-    {
-        int rankOnNode = -1;
-        MPI_Comm_rank(physicalnode_comm, &rankOnNode);
-        isMasterRankOfNode = (rankOnNode == 0);
-    }
+    isMasterRankOfPhysicalNode = (physicalNodeComm.rank_ == 0);
  #else
      // We choose to run the detection only once with thread-MPI and
      // use reference counting on the results of the detection to
      // enforce it. But we can assert that this is true.
      GMX_RELEASE_ASSERT(n_hwinfo == 0, "Cannot run GPU detection on non-master thread-MPI ranks");
-    isMasterRankOfNode = true;
+    GMX_UNUSED_VALUE(physicalNodeComm);
+    isMasterRankOfPhysicalNode = true;
  #endif
  
      /* The OpenCL support requires us to run detection on all ranks.
@@ -160,7 +136,7 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog)
       * and send the information to the other ranks over MPI. */
      bool allRanksMustDetectGpus = (GMX_GPU == GMX_GPU_OPENCL);
      bool gpusCanBeDetected      = false;
-    if (isMasterRankOfNode || allRanksMustDetectGpus)
+    if (isMasterRankOfPhysicalNode || allRanksMustDetectGpus)
      {
          std::string errorMessage;
          gpusCanBeDetected = canDetectGpus(&errorMessage);
@@ -185,7 +161,7 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog)
      if (!allRanksMustDetectGpus)
      {
          /* Broadcast the GPU info to the other ranks within this node */
-        MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
+        MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalNodeComm.comm_);
  
          if (hwinfo_g->gpu_info.n_dev > 0)
          {
@@ -193,24 +169,23 @@ static void gmx_detect_gpus(const gmx::MDLogger &mdlog)
  
              dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
  
-            if (!isMasterRankOfNode)
+            if (!isMasterRankOfPhysicalNode)
              {
                  hwinfo_g->gpu_info.gpu_dev =
                      (struct gmx_device_info_t *)malloc(dev_size);
              }
              MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
-                      0, physicalnode_comm);
+                      0, physicalNodeComm.comm_);
              MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
-                      0, physicalnode_comm);
+                      0, physicalNodeComm.comm_);
          }
      }
-
-    MPI_Comm_free(&physicalnode_comm);
  #endif
  }
  
  //! Reduce the locally collected \p hwinfo_g over MPI ranks
-static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
+static void gmx_collect_hardware_mpi(const gmx::CpuInfo             &cpuInfo,
+                                     const PhysicalNodeCommunicator &physicalNodeComm)
  {
      const int  ncore        = hwinfo_g->hardwareTopology->numberOfCores();
      /* Zen has family=23, for now we treat future AMD CPUs like Zen */
@@ -218,14 +193,9 @@ static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
                                 cpuInfo.family() >= 23);
  
  #if GMX_LIB_MPI
-    int       rank_id;
-    int       nrank, rank, nhwthread, ngpu, i;
+    int       nhwthread, ngpu, i;
      int       gpu_hash;
-    int      *buf, *all;
  
-    rank_id   = gmx_physicalnode_id_hash();
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &nrank);
      nhwthread = hwinfo_g->nthreads_hw_avail;
      ngpu      = hwinfo_g->gpu_info.n_dev_compatible;
      /* Create a unique hash of the GPU type(s) in this node */
@@ -246,48 +216,20 @@ static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
          gpu_hash ^= gmx_string_fullhash_func(stmp, gmx_string_hash_init);
      }
  
-    snew(buf, nrank);
-    snew(all, nrank);
-    buf[rank] = rank_id;
-
-    MPI_Allreduce(buf, all, nrank, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-
-    gmx_bool bFound;
-    int      nnode0, ncore0, nhwthread0, ngpu0, r;
-
-    bFound     = FALSE;
-    ncore0     = 0;
-    nnode0     = 0;
-    nhwthread0 = 0;
-    ngpu0      = 0;
-    for (r = 0; r < nrank; r++)
-    {
-        if (all[r] == rank_id)
-        {
-            if (!bFound && r == rank)
-            {
-                /* We are the first rank in this physical node */
-                nnode0     = 1;
-                ncore0     = ncore;
-                nhwthread0 = nhwthread;
-                ngpu0      = ngpu;
-            }
-            bFound = TRUE;
-        }
-    }
-
-    sfree(buf);
-    sfree(all);
-
      constexpr int                          numElementsCounts =  4;
      std::array<int, numElementsCounts>     countsReduced;
      {
-        std::array<int, numElementsCounts> countsLocal;
-        /* Sum values from only intra-rank 0 so we get the sum over all nodes */
-        countsLocal[0] = nnode0;
-        countsLocal[1] = ncore0;
-        countsLocal[2] = nhwthread0;
-        countsLocal[3] = ngpu0;
+        std::array<int, numElementsCounts> countsLocal = {{0}};
+        // Organize to sum values from only one rank within each node,
+        // so we get the sum over all nodes.
+        bool isMasterRankOfPhysicalNode = (physicalNodeComm.rank_ == 0);
+        if (isMasterRankOfPhysicalNode)
+        {
+            countsLocal[0] = 1;
+            countsLocal[1] = ncore;
+            countsLocal[2] = nhwthread;
+            countsLocal[3] = ngpu;
+        }
  
          MPI_Allreduce(countsLocal.data(), countsReduced.data(), countsLocal.size(),
                        MPI_INT, MPI_SUM, MPI_COMM_WORLD);
@@ -346,6 +288,7 @@ static void gmx_collect_hardware_mpi(const gmx::CpuInfo &cpuInfo)
      hwinfo_g->simd_suggest_max    = static_cast<int>(simdSuggested(cpuInfo));
      hwinfo_g->bIdenticalGPUs      = TRUE;
      hwinfo_g->haveAmdZenCpu       = cpuIsAmdZen;
+    GMX_UNUSED_VALUE(physicalNodeComm);
  #endif
  }
  
@@ -474,7 +417,8 @@ hardwareTopologyDoubleCheckDetection(const gmx::MDLogger gmx_unused         &mdl
  #endif
  }
  
-gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog)
+gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger            &mdlog,
+                                   const PhysicalNodeCommunicator &physicalNodeComm)
  {
      int ret;
  
@@ -490,6 +434,8 @@ gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog)
      {
          hwinfo_g = compat::make_unique<gmx_hw_info_t>();
  
+        /* TODO: We should also do CPU hardware detection only once on each
+         * physical node and broadcast it, instead of do it on every MPI rank. */
          hwinfo_g->cpuInfo             = new gmx::CpuInfo(gmx::CpuInfo::detect());
  
          hardwareTopologyPrepareDetection();
@@ -509,8 +455,8 @@ gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog)
          hwinfo_g->gpu_info.n_dev_compatible = 0;
          hwinfo_g->gpu_info.gpu_dev          = nullptr;
  
-        gmx_detect_gpus(mdlog);
-        gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo);
+        gmx_detect_gpus(mdlog, physicalNodeComm);
+        gmx_collect_hardware_mpi(*hwinfo_g->cpuInfo, physicalNodeComm);
      }
      /* increase the reference counter */
      n_hwinfo++;
diff --git a/src/gromacs/hardware/detecthardware.h b/src/gromacs/hardware/detecthardware.h

index 5ba14eb52be7104d734c4f38bc873fa744d204cd..d339448298db249ed2297e592b0dda902ed76451 100644 (file)
--- a/src/gromacs/hardware/detecthardware.h
+++ b/src/gromacs/hardware/detecthardware.h
@@ -41,6 +41,7 @@ struct gmx_hw_info_t;
  namespace gmx
  {
  class MDLogger;
+class PhysicalNodeCommunicator;
  
  /*! \brief Run detection, consistency checks, and make available on all ranks.
   *
@@ -52,7 +53,8 @@ class MDLogger;
   *
   * May do communication on MPI_COMM_WORLD when compiled with real MPI.
   */
-gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger &mdlog);
+gmx_hw_info_t *gmx_detect_hardware(const gmx::MDLogger            &mdlog,
+                                   const PhysicalNodeCommunicator &physicalNodeComm);
  
  /*! \brief Free the hwinfo structure */
  void gmx_hardware_info_free();
diff --git a/src/gromacs/mdlib/force.h b/src/gromacs/mdlib/force.h

index 93c1d10f452a3ed827d7d03be947da82becdc45c..55e889c07700fb0d18eaacea5d22a02f6bedd6c8 100644 (file)
--- a/src/gromacs/mdlib/force.h
+++ b/src/gromacs/mdlib/force.h
@@ -68,6 +68,7 @@ namespace gmx
  {
  class ForceWithVirial;
  class MDLogger;
+class PhysicalNodeCommunicator;
  }
  
  void calc_vir(int nxf, rvec x[], rvec f[], tensor vir,
@@ -214,8 +215,7 @@ void do_force_lowlevel(t_forcerec   *fr,
                         float        *cycles_pme);
  /* Call all the force routines */
  
-void free_gpu_resources(const t_forcerec            *fr,
-                        const t_commrec             *cr,
-                        const gmx_multisim_t        *ms);
+void free_gpu_resources(const t_forcerec                    *fr,
+                        const gmx::PhysicalNodeCommunicator &physicalNodeCommunicator);
  
  #endif
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 0a98e7ada40e614f0aa5bff7f4cf6a80c09a293b..b77cf9a602871528eace607f402344d26ff42114 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -95,6 +95,7 @@
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/pleasecite.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/strconvert.h"
@@ -3108,9 +3109,8 @@ void init_forcerec(FILE                             *fp,
   * \todo Remove physical node barrier from this function after making sure
   * that it's not needed anymore (with a shared GPU run).
   */
-void free_gpu_resources(const t_forcerec        *fr,
-                        const t_commrec         *cr,
-                        const gmx_multisim_t    *ms)
+void free_gpu_resources(const t_forcerec                    *fr,
+                        const gmx::PhysicalNodeCommunicator &physicalNodeCommunicator)
  {
      bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU;
  
@@ -3133,8 +3133,8 @@ void free_gpu_resources(const t_forcerec        *fr,
       * Note: it is safe to not call the barrier on the ranks which do not use GPU,
       * but it is easier and more futureproof to call it on the whole node.
       */
-    if (GMX_THREAD_MPI && (PAR(cr) || isMultiSim(ms)))
+    if (GMX_THREAD_MPI)
      {
-        gmx_barrier_physical_node(cr);
+        physicalNodeCommunicator.barrier();
      }
  }
diff --git a/src/gromacs/mdlib/gmx_omp_nthreads.cpp b/src/gromacs/mdlib/gmx_omp_nthreads.cpp

index d8df2950446a29187303b94c54f2f16ab88c3ed8..de97bf8689231af767dc9301d9684969841ed42a 100644 (file)
--- a/src/gromacs/mdlib/gmx_omp_nthreads.cpp
+++ b/src/gromacs/mdlib/gmx_omp_nthreads.cpp
@@ -235,7 +235,7 @@ static void manage_number_of_openmp_threads(const gmx::MDLogger &mdlog,
                                              int                  omp_nthreads_pme_req,
                                              gmx_bool gmx_unused  bThisNodePMEOnly,
                                              gmx_bool             bFullOmpSupport,
-                                            int                  nppn,
+                                            int                  numRanksOnThisNode,
                                              gmx_bool             bSepPME)
  {
      int      nth;
@@ -304,10 +304,10 @@ static void manage_number_of_openmp_threads(const gmx::MDLogger &mdlog,
          /* max available threads per node */
          nth = nthreads_hw_avail;
  
-        /* divide the threads among the MPI processes/tMPI threads */
-        if (nth >= nppn)
+        /* divide the threads among the MPI ranks */
+        if (nth >= numRanksOnThisNode)
          {
-            nth /= nppn;
+            nth /= numRanksOnThisNode;
          }
          else
          {
@@ -467,26 +467,23 @@ reportOpenmpSettings(const gmx::MDLogger &mdlog,
  
  void gmx_omp_nthreads_init(const gmx::MDLogger &mdlog, t_commrec *cr,
                             int nthreads_hw_avail,
+                           int numRanksOnThisNode,
                             int omp_nthreads_req,
                             int omp_nthreads_pme_req,
                             gmx_bool bThisNodePMEOnly,
                             gmx_bool bFullOmpSupport)
  {
-    int        nppn;
      gmx_bool   bSepPME;
  
      const bool bOMP = GMX_OPENMP;
  
-    /* number of MPI processes/threads per physical node */
-    nppn = cr->nrank_intranode;
-
      bSepPME = (thisRankHasDuty(cr, DUTY_PP) != thisRankHasDuty(cr, DUTY_PME));
  
      manage_number_of_openmp_threads(mdlog, cr, bOMP,
                                      nthreads_hw_avail,
                                      omp_nthreads_req, omp_nthreads_pme_req,
                                      bThisNodePMEOnly, bFullOmpSupport,
-                                    nppn, bSepPME);
+                                    numRanksOnThisNode, bSepPME);
  #if GMX_THREAD_MPI
      /* Non-master threads have to wait for the OpenMP management to be
       * done, so that code elsewhere that uses OpenMP can be certain
diff --git a/src/gromacs/mdlib/gmx_omp_nthreads.h b/src/gromacs/mdlib/gmx_omp_nthreads.h

index 510825bb83435829b5b673cf878d574550adc7b7..2fb02eef05436bbcbdd7553f35fcdbc5592db9a1 100644 (file)
--- a/src/gromacs/mdlib/gmx_omp_nthreads.h
+++ b/src/gromacs/mdlib/gmx_omp_nthreads.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -65,6 +65,7 @@ typedef enum module_nth
   * This function should caled only once during the initialization of mdrun. */
  void gmx_omp_nthreads_init(const gmx::MDLogger &fplog, t_commrec *cr,
                             int nthreads_hw_avail,
+                           int numRanksOnThisNode,
                             int omp_nthreads_req,
                             int omp_nthreads_pme_req,
                             gmx_bool bCurrNodePMEOnly,
diff --git a/src/gromacs/mdrunutility/tests/threadaffinitytest.cpp b/src/gromacs/mdrunutility/tests/threadaffinitytest.cpp

index 862322a5a9355210024d31c0f71041eacd51633d..4e9524c4112b07e9cd204b544aea20b825b483aa 100644 (file)
--- a/src/gromacs/mdrunutility/tests/threadaffinitytest.cpp
+++ b/src/gromacs/mdrunutility/tests/threadaffinitytest.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -52,7 +52,7 @@ namespace test
  {
  
  MockThreadAffinityAccess::MockThreadAffinityAccess()
-    : supported_(true), physicalNodeId_(0)
+    : supported_(true)
  {
      using ::testing::_;
      using ::testing::Return;
@@ -70,13 +70,13 @@ ThreadAffinityTestHelper::ThreadAffinityTestHelper()
      snew(cr_, 1);
      cr_->nnodes         = gmx_node_num();
      cr_->nodeid         = gmx_node_rank();
-    cr_->rank_intranode = cr_->nodeid;
      cr_->duty           = DUTY_PP;
  #if GMX_MPI
      cr_->mpi_comm_mysim = MPI_COMM_WORLD;
  #endif
      hwOpt_.thread_affinity     = threadaffAUTO;
      hwOpt_.totNumThreadsIsAuto = false;
+    physicalNodeId_            = 0;
  }
  
  ThreadAffinityTestHelper::~ThreadAffinityTestHelper()
diff --git a/src/gromacs/mdrunutility/tests/threadaffinitytest.h b/src/gromacs/mdrunutility/tests/threadaffinitytest.h

index e3d78db24f651d409ce5b123650ec42bc2570b05..31b47ef60cdf90ec67d9470af0612a8a2c9e9e1d 100644 (file)
--- a/src/gromacs/mdrunutility/tests/threadaffinitytest.h
+++ b/src/gromacs/mdrunutility/tests/threadaffinitytest.h
@@ -43,6 +43,7 @@
  #include "gromacs/hardware/hw_info.h"
  #include "gromacs/mdrunutility/threadaffinity.h"
  #include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/stringutil.h"
  
  #include "testutils/loggertest.h"
@@ -64,15 +65,12 @@ class MockThreadAffinityAccess : public IThreadAffinityAccess
          ~MockThreadAffinityAccess();
  
          void setSupported(bool supported) { supported_ = supported; }
-        void setPhysicalNodeId(int nodeId) { physicalNodeId_ = nodeId; }
  
          virtual bool isThreadAffinitySupported() const { return supported_; }
-        virtual int physicalNodeId() const { return physicalNodeId_; }
          MOCK_METHOD1(setCurrentThreadAffinityToCore, bool(int core));
  
      private:
          bool supported_;
-        int  physicalNodeId_;
  };
  
  class ThreadAffinityTestHelper
@@ -97,7 +95,7 @@ class ThreadAffinityTestHelper
  
          void setPhysicalNodeId(int nodeId)
          {
-            affinityAccess_.setPhysicalNodeId(nodeId);
+            physicalNodeId_ = nodeId;
          }
  
          void setLogicalProcessorCount(int logicalProcessorCount);
@@ -171,8 +169,9 @@ class ThreadAffinityTestHelper
              {
                  setLogicalProcessorCount(1);
              }
+            gmx::PhysicalNodeCommunicator comm(MPI_COMM_WORLD, physicalNodeId_);
              int numThreadsOnThisNode, indexWithinNodeOfFirstThreadOnThisRank;
-            analyzeThreadsOnThisNode(cr_, nullptr, &affinityAccess_,
+            analyzeThreadsOnThisNode(comm,
                                       numThreadsOnThisRank,
                                       &numThreadsOnThisNode,
                                       &indexWithinNodeOfFirstThreadOnThisRank);
@@ -187,6 +186,7 @@ class ThreadAffinityTestHelper
          std::unique_ptr<HardwareTopology>  hwTop_;
          MockThreadAffinityAccess           affinityAccess_;
          LoggerTestHelper                   logHelper_;
+        int                                physicalNodeId_;
  };
  
  } // namespace test
diff --git a/src/gromacs/mdrunutility/threadaffinity.cpp b/src/gromacs/mdrunutility/threadaffinity.cpp

index b316c4aa00e639b8c8f7b1558c2c7530b74f2fdd..c0a96bf819069af9f25d7f3905c43057c830fb8e 100644 (file)
--- a/src/gromacs/mdrunutility/threadaffinity.cpp
+++ b/src/gromacs/mdrunutility/threadaffinity.cpp
@@ -59,6 +59,7 @@
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/gmxomp.h"
  #include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/programcontext.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/unique_cptr.h"
@@ -73,10 +74,6 @@ class DefaultThreadAffinityAccess : public gmx::IThreadAffinityAccess
          {
              return tMPI_Thread_setaffinity_support() == TMPI_SETAFFINITY_SUPPORT_YES;
          }
-        virtual int physicalNodeId() const
-        {
-            return gmx_physicalnode_id_hash();
-        }
          virtual bool setCurrentThreadAffinityToCore(int core)
          {
              const int ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
@@ -354,42 +351,26 @@ static bool set_affinity(const t_commrec *cr, int nthread_local, int intraNodeTh
      return allAffinitiesSet;
  }
  
-void analyzeThreadsOnThisNode(const t_commrec            *cr,
-                              const gmx_multisim_t       *ms,
-                              gmx::IThreadAffinityAccess *affinityAccess,
-                              int                         numThreadsOnThisRank,
-                              int                        *numThreadsOnThisNode,
-                              int                        *intraNodeThreadOffset)
+void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator &physicalNodeComm,
+                              int                                  numThreadsOnThisRank,
+                              int                                 *numThreadsOnThisNode,
+                              int                                 *intraNodeThreadOffset)
  {
      *intraNodeThreadOffset                  = 0;
      *numThreadsOnThisNode                   = numThreadsOnThisRank;
  #if GMX_MPI
-    if (PAR(cr) || isMultiSim(ms))
+    if (physicalNodeComm.size_ > 1)
      {
-        if (affinityAccess == nullptr)
-        {
-            affinityAccess = &g_defaultAffinityAccess;
-        }
-
          /* We need to determine a scan of the thread counts in this
-         * compute node.
-         */
-        MPI_Comm comm_intra;
-
-        MPI_Comm_split(MPI_COMM_WORLD,
-                       affinityAccess->physicalNodeId(), cr->rank_intranode,
-                       &comm_intra);
-        MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, comm_intra);
+         * compute node. */
+        MPI_Scan(&numThreadsOnThisRank, intraNodeThreadOffset, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
          /* MPI_Scan is inclusive, but here we need exclusive */
          *intraNodeThreadOffset -= numThreadsOnThisRank;
          /* Get the total number of threads on this physical node */
-        MPI_Allreduce(&numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, comm_intra);
-        MPI_Comm_free(&comm_intra);
+        MPI_Allreduce(&numThreadsOnThisRank, numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, physicalNodeComm.comm_);
      }
  #else
-    GMX_UNUSED_VALUE(cr);
-    GMX_UNUSED_VALUE(ms);
-    GMX_UNUSED_VALUE(affinityAccess);
+    GMX_UNUSED_VALUE(physicalNodeComm);
  #endif
  
  }
diff --git a/src/gromacs/mdrunutility/threadaffinity.h b/src/gromacs/mdrunutility/threadaffinity.h

index 499aecb52e3c5b466eead7e2ecfb38f04334fd26..384842b79c9d65b6bb05b730fe5448f3590947e7 100644 (file)
--- a/src/gromacs/mdrunutility/threadaffinity.h
+++ b/src/gromacs/mdrunutility/threadaffinity.h
@@ -55,12 +55,12 @@ namespace gmx
  
  class HardwareTopology;
  class MDLogger;
+class PhysicalNodeCommunicator;
  
  class IThreadAffinityAccess
  {
      public:
          virtual bool isThreadAffinitySupported() const        = 0;
-        virtual int physicalNodeId() const                    = 0;
          virtual bool setCurrentThreadAffinityToCore(int core) = 0;
  
      protected:
@@ -70,24 +70,11 @@ class IThreadAffinityAccess
  } // namespace gmx
  
  /*! \brief Communicates within physical nodes to discover the
- * distribution of threads over ranks.
- *
- * See gmx_set_thread_affinity(), which consumes this output.
- *
- * \param[in]  cr                     Communication handler.
- * \param[in]  ms                     Multi-simulation handler.
- * \param[in]  affinityAccess         Interface for low-level access to affinity details.
- * \param[in]  numThreadsOnThisRank   The number of threads on this rank.
- * \param[out] numThreadsOnThisNode   On exit, the number of threads on all ranks of this node.
- * \param[out] intraNodeThreadOffset  On exit, the index of the first hardware thread of this rank
- *   in the set of all the threads of all MPI ranks within a node (ordered by MPI rank ID).
- */
-void analyzeThreadsOnThisNode(const t_commrec            *cr,
-                              const gmx_multisim_t       *ms,
-                              gmx::IThreadAffinityAccess *affinityAccess,
-                              int                         numThreadsOnThisRank,
-                              int                        *numThreadsOnThisNode,
-                              int                        *intraNodeThreadOffset);
+ * distribution of threads over ranks. */
+void analyzeThreadsOnThisNode(const gmx::PhysicalNodeCommunicator &physicalNodeComm,
+                              int                                  numThreadsOnThisRank,
+                              int                                 *numThreadsOnThisNode,
+                              int                                 *intraNodeThreadOffset);
  
  /*! \brief
   * Sets the thread affinity using the requested setting stored in hw_opt.
diff --git a/src/gromacs/mdtypes/commrec.h b/src/gromacs/mdtypes/commrec.h

index 15fdd97fd00dad3d17d9f7e2aa3c58612ae8fece..1003ca2dfd6d77cc4a90aa94e054c9ae5b1003d1 100644 (file)
--- a/src/gromacs/mdtypes/commrec.h
+++ b/src/gromacs/mdtypes/commrec.h
@@ -107,13 +107,6 @@ struct t_commrec {
      MPI_Comm mpi_comm_mygroup;         /* subset of mpi_comm_mysim including only
                                            the ranks in the same group (PP or PME) */
  
-    /* MPI ranks and a communicator within a physical node for hardware access */
-    MPI_Comm       mpi_comm_physicalnode; /* communicator for all ranks of the physical node
-                                           * NOTE: this communicator should only be used during initialization and finalization, as it can contain ranks from PP, PME and multiple simulations with multisim
-                                           */
-    int            nrank_intranode;       /* nr of ranks on this physical node */
-    int            rank_intranode;        /* our rank on this physical node */
-
      gmx_nodecomm_t nc;
  
      /* For domain decomposition */
diff --git a/src/gromacs/taskassignment/findallgputasks.cpp b/src/gromacs/taskassignment/findallgputasks.cpp

index a9c63e5e4892e78df28a72af890ac48c5e42a6f7..9b1ae2f103b0238b16e87ea10725680bb7e25feb 100644 (file)
--- a/src/gromacs/taskassignment/findallgputasks.cpp
+++ b/src/gromacs/taskassignment/findallgputasks.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2017, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -48,10 +48,10 @@
  #include <numeric>
  #include <vector>
  
-#include "gromacs/mdtypes/commrec.h"
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  
  namespace gmx
  {
@@ -174,12 +174,13 @@ static std::vector<GpuTask> allgatherv(ArrayRef<const GpuTask> input,
   * assignment. Separating this aspect makes it possible to unit test
   * the logic of task assignment. */
  GpuTasksOnRanks
-findAllGpuTasksOnThisNode(ArrayRef<const GpuTask> gpuTasksOnThisRank,
-                          int                     numRanksOnThisNode,
-                          MPI_Comm                communicator)
+findAllGpuTasksOnThisNode(ArrayRef<const GpuTask>         gpuTasksOnThisRank,
+                          const PhysicalNodeCommunicator &physicalNodeComm)
  {
+    int      numRanksOnThisNode = physicalNodeComm.size_;
+    MPI_Comm communicator       = physicalNodeComm.comm_;
      // Find out how many GPU tasks are on each rank on this node.
-    auto numGpuTasksOnEachRankOfThisNode =
+    auto     numGpuTasksOnEachRankOfThisNode =
          allgather(gpuTasksOnThisRank.size(), numRanksOnThisNode, communicator);
  
      /* Collect on each rank of this node a vector describing all
diff --git a/src/gromacs/taskassignment/findallgputasks.h b/src/gromacs/taskassignment/findallgputasks.h

index 06069d3f5bc926cc3d04eb94b622d58e33cfb221..6c84f8b6d6a2939cef4d4a2fbe4c9be3891893e6 100644 (file)
--- a/src/gromacs/taskassignment/findallgputasks.h
+++ b/src/gromacs/taskassignment/findallgputasks.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2017, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -44,11 +44,12 @@
  
  #include "gromacs/taskassignment/taskassignment.h"
  #include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/gmxmpi.h"
  
  namespace gmx
  {
  
+class PhysicalNodeCommunicator;
+
  /*! \brief Returns container of all tasks on all ranks of this node
   * that are eligible for GPU execution.
   *
@@ -56,9 +57,8 @@ namespace gmx
   * assignment. Separating this aspect makes it possible to unit test
   * the logic of task assignment. */
  GpuTasksOnRanks
-findAllGpuTasksOnThisNode(ArrayRef<const GpuTask> gpuTasksOnThisRank,
-                          int                     numRanksOnThisNode,
-                          MPI_Comm                communicator);
+findAllGpuTasksOnThisNode(ArrayRef<const GpuTask>         gpuTasksOnThisRank,
+                          const PhysicalNodeCommunicator &physicalNodeComm);
  
  } // namespace
  
diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp

index add2f55c7c3fb6c18f0ff8430e67101268313fca..e2805f1ff438c7ed2753aa02e2dfa0f4e68f2082 100644 (file)
--- a/src/gromacs/taskassignment/resourcedivision.cpp
+++ b/src/gromacs/taskassignment/resourcedivision.cpp
@@ -67,6 +67,7 @@
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/stringutil.h"
  
  
@@ -797,6 +798,7 @@ void checkAndUpdateRequestedNumOpenmpThreads(gmx_hw_opt_t         *hw_opt,
                                               const gmx_hw_info_t  &hwinfo,
                                               const t_commrec      *cr,
                                               const gmx_multisim_t *ms,
+                                             int                   numRanksOnThisNode,
                                               PmeRunMode            pmeRunMode,
                                               const gmx_mtop_t     &mtop)
  {
@@ -856,9 +858,8 @@ void checkAndUpdateRequestedNumOpenmpThreads(gmx_hw_opt_t         *hw_opt,
          int numCoresPerRank = hwinfo.ncore_tot/numRanksTot;
          if (numAtomsPerRank < c_numAtomsPerCoreSquaredSmtThreshold*gmx::square(numCoresPerRank))
          {
-            int numRanksInThisNode = (cr ? cr->nrank_intranode : 1);
              /* Choose one OpenMP thread per physical core */
-            hw_opt->nthreads_omp = std::max(1, hwinfo.hardwareTopology->numberOfCores()/numRanksInThisNode);
+            hw_opt->nthreads_omp = std::max(1, hwinfo.hardwareTopology->numberOfCores()/numRanksOnThisNode);
          }
      }
  
@@ -876,48 +877,49 @@ void checkAndUpdateRequestedNumOpenmpThreads(gmx_hw_opt_t         *hw_opt,
      }
  }
  
-void checkHardwareOversubscription(int                          numThreadsOnThisRank,
-                                   const gmx::HardwareTopology &hwTop,
-                                   const t_commrec             *cr,
-                                   const gmx_multisim_t        *ms,
-                                   const gmx::MDLogger         &mdlog)
+namespace gmx
  {
-    if (hwTop.supportLevel() < gmx::HardwareTopology::SupportLevel::LogicalProcessorCount)
+
+void checkHardwareOversubscription(int                             numThreadsOnThisRank,
+                                   int                             rank,
+                                   const HardwareTopology         &hwTop,
+                                   const PhysicalNodeCommunicator &comm,
+                                   const MDLogger                 &mdlog)
+{
+    if (hwTop.supportLevel() < HardwareTopology::SupportLevel::LogicalProcessorCount)
      {
          /* There is nothing we can check */
          return;
      }
  
-    int numRanksOnThisNode   = 1;
+    int numRanksOnThisNode   = comm.size_;
      int numThreadsOnThisNode = numThreadsOnThisRank;
-#if GMX_MPI
-    if (PAR(cr) || isMultiSim(ms))
+    /* Avoid MPI calls with uninitialized thread-MPI communicators */
+    if (comm.size_ > 1)
      {
+#if GMX_MPI
          /* Count the threads within this physical node */
-        MPI_Comm_size(cr->mpi_comm_physicalnode, &numRanksOnThisNode);
-        MPI_Allreduce(&numThreadsOnThisRank, &numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, cr->mpi_comm_physicalnode);
-    }
-#else
-    GMX_UNUSED_VALUE(ms);
+        MPI_Allreduce(&numThreadsOnThisRank, &numThreadsOnThisNode, 1, MPI_INT, MPI_SUM, comm.comm_);
  #endif
+    }
  
      if (numThreadsOnThisNode > hwTop.machine().logicalProcessorCount)
      {
          std::string mesg = "WARNING: ";
          if (GMX_LIB_MPI)
          {
-            mesg += gmx::formatString("On rank %d: o", cr->sim_nodeid);
+            mesg += formatString("On rank %d: o", rank);
          }
          else
          {
              mesg += "O";
          }
-        mesg     += gmx::formatString("versubscribing the available %d logical CPU cores", hwTop.machine().logicalProcessorCount);
+        mesg     += formatString("versubscribing the available %d logical CPU cores", hwTop.machine().logicalProcessorCount);
          if (GMX_LIB_MPI)
          {
              mesg += " per node";
          }
-        mesg     += gmx::formatString(" with %d ", numThreadsOnThisNode);
+        mesg     += formatString(" with %d ", numThreadsOnThisNode);
          if (numRanksOnThisNode == numThreadsOnThisNode)
          {
              if (GMX_THREAD_MPI)
@@ -942,3 +944,5 @@ void checkHardwareOversubscription(int                          numThreadsOnThis
          GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(mesg.c_str());
      }
  }
+
+} // namespace
diff --git a/src/gromacs/taskassignment/resourcedivision.h b/src/gromacs/taskassignment/resourcedivision.h

index 4bd2447d9c1af76cf3df4d3751dbd2e5839eb9df..91b6914862f0207e04af8d74be8c8fd9f6a95d81 100644 (file)
--- a/src/gromacs/taskassignment/resourcedivision.h
+++ b/src/gromacs/taskassignment/resourcedivision.h
@@ -62,6 +62,7 @@ namespace gmx
  {
  class HardwareTopology;
  class MDLogger;
+class PhysicalNodeCommunicator;
  }
  
  /*! \brief Return the number of threads to use for thread-MPI based on how many
@@ -113,15 +114,21 @@ void checkAndUpdateRequestedNumOpenmpThreads(gmx_hw_opt_t         *hw_opt,
                                               const gmx_hw_info_t  &hwinfo,
                                               const t_commrec      *cr,
                                               const gmx_multisim_t *ms,
+                                             int                   numRanksOnThisNode,
                                               PmeRunMode            pmeRunMode,
                                               const gmx_mtop_t     &mtop);
  
+namespace gmx
+{
+
  /*! \brief Warns for oversubscribing the hardware threads, when that is the case
   */
-void checkHardwareOversubscription(int                          numThreadsOnThisRank,
-                                   const gmx::HardwareTopology &hwTop,
-                                   const t_commrec             *cr,
-                                   const gmx_multisim_t        *ms,
-                                   const gmx::MDLogger         &mdlog);
+void checkHardwareOversubscription(int                             numThreadsOnThisRank,
+                                   int                             rank,
+                                   const HardwareTopology         &hwTop,
+                                   const PhysicalNodeCommunicator &comm,
+                                   const MDLogger                 &mdlog);
+
+} // namespace
  
  #endif
diff --git a/src/gromacs/taskassignment/taskassignment.cpp b/src/gromacs/taskassignment/taskassignment.cpp

index f9862902042b317c4616030f1a39ef8867a0f1f5..4f0c65cbf45f290cc595dd48b2a53a560613e105 100644 (file)
--- a/src/gromacs/taskassignment/taskassignment.cpp
+++ b/src/gromacs/taskassignment/taskassignment.cpp
@@ -65,6 +65,7 @@
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/logger.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/stringutil.h"
  #include "gromacs/utility/sysinfo.h"
  
@@ -182,19 +183,19 @@ size_t countGpuTasksOnThisNode(const GpuTasksOnRanks &gpuTasksOnRanksOfThisNode)
  }   // namespace
  
  GpuTaskAssignments::value_type
-runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
-                  const std::vector<int>     &userGpuTaskAssignment,
-                  const gmx_hw_info_t        &hardwareInfo,
-                  const MDLogger             &mdlog,
-                  const t_commrec            *cr,
-                  const gmx_multisim_t       *ms,
-                  const std::vector<GpuTask> &gpuTasksOnThisRank)
+runTaskAssignment(const std::vector<int>         &gpuIdsToUse,
+                  const std::vector<int>         &userGpuTaskAssignment,
+                  const gmx_hw_info_t            &hardwareInfo,
+                  const MDLogger                 &mdlog,
+                  const t_commrec                *cr,
+                  const gmx_multisim_t           *ms,
+                  const PhysicalNodeCommunicator &physicalNodeComm,
+                  const std::vector<GpuTask>     &gpuTasksOnThisRank)
  {
      /* Communicate among ranks on this node to find each task that can
       * be executed on a GPU, on each rank. */
-    auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank,
-                                                               cr->nrank_intranode,
-                                                               cr->mpi_comm_physicalnode);
+    auto               gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank,
+                                                                             physicalNodeComm);
      auto               numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
  
      GpuTaskAssignments taskAssignmentOnRanksOfThisNode;
@@ -223,7 +224,7 @@ runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
              ArrayRef<const int> compatibleGpusToUse = gpuIdsToUse;
  
              // enforce the single device/rank restriction
-            if (cr->nrank_intranode == 1 && !compatibleGpusToUse.empty())
+            if (physicalNodeComm.size_ == 1 && !compatibleGpusToUse.empty())
              {
                  compatibleGpusToUse = compatibleGpusToUse.subArray(0, 1);
              }
@@ -284,7 +285,7 @@ runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
          // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
          // how we should involve MPI in the implementation of error
          // handling.
-        if (cr->rank_intranode == 0)
+        if (physicalNodeComm.rank_ == 0)
          {
              printFatalErrorMessage(stderr, ex);
          }
@@ -306,7 +307,7 @@ runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
      }
  
      reportGpuUsage(mdlog, !userGpuTaskAssignment.empty(), taskAssignmentOnRanksOfThisNode,
-                   numGpuTasksOnThisNode, cr->nrank_intranode, cr->nnodes > 1);
+                   numGpuTasksOnThisNode, physicalNodeComm.size_, cr->nnodes > 1);
  
      // If the user chose a task assignment, give them some hints where appropriate.
      if (!userGpuTaskAssignment.empty())
@@ -316,7 +317,7 @@ runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
                              taskAssignmentOnRanksOfThisNode);
      }
  
-    return taskAssignmentOnRanksOfThisNode[cr->rank_intranode];
+    return taskAssignmentOnRanksOfThisNode[physicalNodeComm.rank_];
  
      // TODO There is no check that mdrun -nb gpu or -pme gpu or
      // -gpu_id is actually being implemented such that nonbonded tasks
diff --git a/src/gromacs/taskassignment/taskassignment.h b/src/gromacs/taskassignment/taskassignment.h

index 53936362c1b3f907ccdbed4f6733b4e7b2fe36ba..c6217a8b03e9cfe73eab141fee5c0528ce0b823a 100644 (file)
--- a/src/gromacs/taskassignment/taskassignment.h
+++ b/src/gromacs/taskassignment/taskassignment.h
@@ -60,6 +60,7 @@ namespace gmx
  {
  
  class MDLogger;
+class PhysicalNodeCommunicator;
  
  /*! \brief Types of compute tasks that can be run on a GPU.
   *
@@ -103,7 +104,8 @@ using GpuTaskAssignments = std::vector<GpuTaskAssignment>;
   * \param[in]  hardwareInfo               The detected hardware
   * \param[in]  mdlog                      Logging object to write to.
   * \param[in]  cr                         Communication object.
- * \param[in]  ms                         Handles multi-simulations.
+ * \param[in]  ms                         Multi-simulation handler.
+ * \param[in]  physicalNodeComm           Communication object for this physical node.
   * \param[in]  gpuTasksOnThisRank         Information about what GPU tasks
   *                                        exist on this rank.
   *
@@ -113,13 +115,14 @@ using GpuTaskAssignments = std::vector<GpuTaskAssignment>;
   *           InconsistentInputError  If user and/or detected inputs are inconsistent.
   */
  GpuTaskAssignments::value_type
-runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
-                  const std::vector<int>     &userGpuTaskAssignment,
-                  const gmx_hw_info_t        &hardwareInfo,
-                  const MDLogger             &mdlog,
-                  const t_commrec            *cr,
-                  const gmx_multisim_t       *ms,
-                  const std::vector<GpuTask> &gpuTasksOnThisRank);
+runTaskAssignment(const std::vector<int>         &gpuIdsToUse,
+                  const std::vector<int>         &userGpuTaskAssignment,
+                  const gmx_hw_info_t            &hardwareInfo,
+                  const MDLogger                 &mdlog,
+                  const t_commrec                *cr,
+                  const gmx_multisim_t           *ms,
+                  const PhysicalNodeCommunicator &physicalNodeComm,
+                  const std::vector<GpuTask>     &gpuTasksOnThisRank);
  
  //! Function for whether the task of \c mapping has value \c TaskType.
  template<GpuTask TaskType>
diff --git a/src/gromacs/utility/gmxmpi.h b/src/gromacs/utility/gmxmpi.h

index bf22d4c66f0fc6e6a6cb72baf27bdf183dab667e..f39eb1b3231d82262c852145414623ebc374a1f5 100644 (file)
--- a/src/gromacs/utility/gmxmpi.h
+++ b/src/gromacs/utility/gmxmpi.h
@@ -82,9 +82,9 @@ typedef void* MPI_Comm;
  typedef void* MPI_Request;
  typedef void* MPI_Status;
  typedef void* MPI_Group;
-#define MPI_COMM_NULL NULL
-#define MPI_GROUP_NULL NULL
-#define MPI_COMM_WORLD NULL
+#define MPI_COMM_NULL  nullptr
+#define MPI_GROUP_NULL nullptr
+#define MPI_COMM_WORLD nullptr
  #endif
  #endif
  //! \endcond
diff --git a/src/gromacs/utility/physicalnodecommunicator.cpp b/src/gromacs/utility/physicalnodecommunicator.cpp

new file mode 100644 (file)

index 0000000..6c912ad
--- /dev/null
+++ b/src/gromacs/utility/physicalnodecommunicator.cpp
@@ -0,0 +1,127 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Defines functionality for communicators across physical nodes.
+ *
+ * \ingroup module_utility
+ */
+#include "gmxpre.h"
+
+#include "physicalnodecommunicator.h"
+
+#include "config.h"
+
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/gmxmpi.h"
+
+namespace gmx
+{
+
+void
+MPI_Comm_free_wrapper(MPI_Comm *comm)
+{
+#if GMX_MPI
+    // With thread-MPI *comm is shared between ranks which causes issues with
+    // freeing. But all thread-mpi data is anyhow freed in tMPI_Finalize()
+    // and in practice *comm is always MPI_COMM_WORLD with thread-MPI.
+    // Only the thread-affinity test code uses *comm != MPI_COMM_WORLD.
+    if (!GMX_THREAD_MPI)
+    {
+        MPI_Comm_free(comm);
+    }
+#else
+    GMX_UNUSED_VALUE(comm);
+#endif
+}
+
+PhysicalNodeCommunicator::PhysicalNodeCommunicator(MPI_Comm world, int physicalNodeId)
+{
+#if GMX_MPI
+    int isInitialized;
+    MPI_Initialized(&isInitialized);
+    if (isInitialized)
+    {
+        int sizeOfWorld;
+        MPI_Comm_size(world, &sizeOfWorld);
+        if (sizeOfWorld > 1)
+        {
+            int rankWithinWorld;
+            MPI_Comm_rank(world, &rankWithinWorld);
+            MPI_Comm_split(world, physicalNodeId, rankWithinWorld, &comm_);
+            auto ptr = MPI_Comm_ptr(&comm_);
+            commGuard_.swap(ptr);
+            MPI_Comm_size(comm_, &size_);
+            MPI_Comm_rank(comm_, &rank_);
+        }
+        else
+        {
+            // Handle this trivial case separately, because thread-MPI
+            // doesn't have a valid communicator when there is only
+            // one rank.
+            comm_ = world;
+            size_ = 1;
+            rank_ = 0;
+        }
+    }
+    else
+    {
+        comm_ = MPI_COMM_NULL;
+        size_ = 1;
+        rank_ = 0;
+    }
+#else
+    // Trivial case when there is no MPI support or not initialized
+    GMX_UNUSED_VALUE(world);
+    GMX_UNUSED_VALUE(physicalNodeId);
+    comm_ = nullptr;
+    size_ = 1;
+    rank_ = 0;
+#endif
+}
+
+void PhysicalNodeCommunicator::barrier() const
+{
+#if GMX_MPI
+    if (size_ > 1)
+    {
+        MPI_Barrier(comm_);
+    }
+#else
+    // Nothing to do
+#endif
+}
+
+} // namespace
diff --git a/src/gromacs/utility/physicalnodecommunicator.h b/src/gromacs/utility/physicalnodecommunicator.h

new file mode 100644 (file)

index 0000000..f6d7405
--- /dev/null
+++ b/src/gromacs/utility/physicalnodecommunicator.h
@@ -0,0 +1,89 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ * \brief
+ * Declares functionality for communicators across physical nodes.
+ *
+ * \inlibraryapi
+ * \ingroup module_utility
+ */
+#ifndef GMX_MDTYPES_PHYSICALNODECOMMUNICATOR_H
+#define GMX_MDTYPES_PHYSICALNODECOMMUNICATOR_H
+
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/unique_cptr.h"
+
+namespace gmx
+{
+
+/*! \brief Wrapper function for RAII-style cleanup.
+ *
+ * This is needed to discard the return value so it can be used as a
+ * deleter by a smart pointer. */
+void
+MPI_Comm_free_wrapper(MPI_Comm *comm);
+
+//! Make a smart pointer for MPI communicators.
+using MPI_Comm_ptr = gmx::unique_cptr<MPI_Comm, MPI_Comm_free_wrapper>;
+
+/*! \libinternal \brief Holds a communicator for the physical node of this rank
+ *
+ * This communicator should only be used for appropriate tasks,
+ * e.g. during initialization and finalization. It can contain ranks
+ * from PP, PME and multiple simulations with multisim, so is not
+ * suited for general-purpose communication. */
+class PhysicalNodeCommunicator
+{
+    public:
+        /*! \brief Constructor.
+         *
+         * Communicates within \c world to make intra-communicator \c
+         * comm_ between all ranks that share \c physicalNodeId. */
+        PhysicalNodeCommunicator(MPI_Comm world, int physicalNodeId);
+        //! Communicator for all ranks on this physical node
+        MPI_Comm     comm_;
+        //! Number of ranks on this physical node, corresponds to MPI_Comm_size of comm.
+        int          size_;
+        //! Rank ID within this physical node, corresponds to MPI_Comm_rank of comm.
+        int          rank_;
+        //! RAII handler for cleaning up \c comm_ only when appropriate.
+        MPI_Comm_ptr commGuard_;
+        //! Creates a barrier for all ranks on this physical node.
+        void barrier() const;
+};
+
+} // namespace
+
+#endif
diff --git a/src/gromacs/utility/tests/CMakeLists.txt b/src/gromacs/utility/tests/CMakeLists.txt

index 0c67bc12ea798f3ee76c1eb00121bb71b630582b..1b0f13dfb293fb752e9b95f92dcfc0ea50ccd929 100644 (file)
--- a/src/gromacs/utility/tests/CMakeLists.txt
+++ b/src/gromacs/utility/tests/CMakeLists.txt
@@ -41,8 +41,13 @@ gmx_add_unit_test(UtilityUnitTests utility-test
                    logger.cpp
                    mutex.cpp
                    path.cpp
+                  physicalnodecommunicator.cpp
                    stringutil.cpp
                    textreader.cpp
                    textwriter.cpp
                    typetraits.cpp
                    )
+
+gmx_add_mpi_unit_test(UtilityMpiUnitTests utility-mpi-test 4
+                  physicalnodecommunicator-mpi.cpp
+                  )
diff --git a/src/gromacs/utility/tests/physicalnodecommunicator-mpi.cpp b/src/gromacs/utility/tests/physicalnodecommunicator-mpi.cpp

new file mode 100644 (file)

index 0000000..5260a4b
--- /dev/null
+++ b/src/gromacs/utility/tests/physicalnodecommunicator-mpi.cpp
@@ -0,0 +1,64 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include <array>
+
+#include <gtest/gtest.h>
+
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+#include "testutils/mpitest.h"
+
+namespace gmx
+{
+namespace
+{
+
+TEST(PhysicalNodeCommunicatorTest, CanConstruct)
+{
+    GMX_MPI_TEST(4);
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+}
+
+TEST(PhysicalNodeCommunicatorTest, CanCallBarrier)
+{
+    GMX_MPI_TEST(4);
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+    comm.barrier();
+}
+
+} // namespace
+} // namespace
diff --git a/src/gromacs/utility/tests/physicalnodecommunicator.cpp b/src/gromacs/utility/tests/physicalnodecommunicator.cpp

new file mode 100644 (file)

index 0000000..86ed2eb
--- /dev/null
+++ b/src/gromacs/utility/tests/physicalnodecommunicator.cpp
@@ -0,0 +1,62 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+#include <array>
+
+#include <gtest/gtest.h>
+
+#include "testutils/mpitest.h"
+
+namespace gmx
+{
+namespace
+{
+
+TEST(PhysicalNodeCommunicatorTest, CanConstruct)
+{
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+}
+
+TEST(PhysicalNodeCommunicatorTest, CanCallBarrier)
+{
+    PhysicalNodeCommunicator comm(MPI_COMM_WORLD, 0);
+    comm.barrier();
+}
+
+} // namespace
+} // namespace
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp

index 762ed733e8bc07d57de598e2c52f411b302bcb32..705b56bda6e811bc6e71c92ca248033ad8d6b38d 100644 (file)
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -108,6 +108,7 @@
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/topology/mtop_util.h"
  #include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/basenetwork.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
@@ -116,6 +117,7 @@
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
  #include "gromacs/utility/pleasecite.h"
  #include "gromacs/utility/programcontext.h"
  #include "gromacs/utility/smalloc.h"
@@ -155,7 +157,7 @@ void Mdrunner::reinitializeOnSpawnedThread()
  {
      threadMpiMdrunnerAccessBarrier();
  
-    cr  = reinitialize_commrec_for_this_thread(cr, ms);
+    cr  = reinitialize_commrec_for_this_thread(cr);
  
      GMX_RELEASE_ASSERT(!MASTER(cr), "reinitializeOnSpawnedThread should only be called on spawned threads");
  
@@ -214,7 +216,7 @@ t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const
      GMX_UNUSED_VALUE(mdrunner_start_fn);
  #endif
  
-    return reinitialize_commrec_for_this_thread(cr, ms);
+    return reinitialize_commrec_for_this_thread(cr);
  }
  
  }      // namespace
@@ -493,7 +495,16 @@ int Mdrunner::mdrunner()
      gmx::LoggerOwner logOwner(buildLogger(fplog, cr));
      gmx::MDLogger    mdlog(logOwner.logger());
  
-    hwinfo = gmx_detect_hardware(mdlog);
+    // TODO The thread-MPI master rank makes a working
+    // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
+    // after the threads have been launched. This works because no use
+    // is made of that communicator until after the execution paths
+    // have rejoined. But it is likely that we can improve the way
+    // this is expressed, e.g. by expressly running detection only the
+    // master rank for thread-MPI, rather than relying on the mutex
+    // and reference count.
+    PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
+    hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
  
      gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo);
  
@@ -625,8 +636,9 @@ int Mdrunner::mdrunner()
          // TODO Both master and spawned threads call dup_tfn and
          // reinitialize_commrec_for_this_thread. Find a way to express
          // this better.
+        physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash());
      }
-    /* END OF CAUTION: cr is now reliable */
+    // END OF CAUTION: cr and physicalNodeComm are now reliable
  
      if (PAR(cr))
      {
@@ -908,8 +920,6 @@ int Mdrunner::mdrunner()
          gmx_setup_nodecomm(fplog, cr);
      }
  
-    /* Initialize per-physical-node MPI process/thread ID and counters. */
-    gmx_init_intranode_counters(cr);
  #if GMX_MPI
      if (isMultiSim(ms))
      {
@@ -934,10 +944,12 @@ int Mdrunner::mdrunner()
      check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme);
  
      /* Check and update the number of OpenMP threads requested */
-    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, pmeRunMode, *mtop);
+    checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
+                                            pmeRunMode, *mtop);
  
      gmx_omp_nthreads_init(mdlog, cr,
                            hwinfo->nthreads_hw_avail,
+                          physicalNodeComm.size_,
                            hw_opt.nthreads_omp,
                            hw_opt.nthreads_omp_pme,
                            !thisRankHasDuty(cr, DUTY_PP),
@@ -1014,7 +1026,7 @@ int Mdrunner::mdrunner()
      {
          // Produce the task assignment for this rank.
          gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
-                                              mdlog, cr, ms, gpuTasksOnThisRank);
+                                              mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank);
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
  
@@ -1096,9 +1108,9 @@ int Mdrunner::mdrunner()
          numThreadsOnThisRank = nthreads_pme;
      }
  
-    checkHardwareOversubscription(numThreadsOnThisRank,
+    checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid,
                                    *hwinfo->hardwareTopology,
-                                  cr, ms, mdlog);
+                                  physicalNodeComm, mdlog);
  
      if (hw_opt.thread_affinity != threadaffOFF)
      {
@@ -1110,7 +1122,7 @@ int Mdrunner::mdrunner()
                                        &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
  
          int numThreadsOnThisNode, intraNodeThreadOffset;
-        analyzeThreadsOnThisNode(cr, ms, nullptr, numThreadsOnThisRank, &numThreadsOnThisNode,
+        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
                                   &intraNodeThreadOffset);
  
          /* Set the CPU affinity */
@@ -1384,7 +1396,7 @@ int Mdrunner::mdrunner()
      mdModules.reset(nullptr);   // destruct force providers here as they might also use the GPU
  
      /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
-    free_gpu_resources(fr, cr, ms);
+    free_gpu_resources(fr, physicalNodeComm);
      free_gpu(nonbondedDeviceInfo);
      free_gpu(pmeDeviceInfo);
author	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 25 Jan 2018 14:47:35 +0000 (15:47 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 14 Mar 2018 13:37:43 +0000 (14:37 +0100)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/ewald/tests/testhardwarecontexts.cpp		patch \| blob \| history
src/gromacs/gmxlib/network.cpp		patch \| blob \| history
src/gromacs/gmxlib/network.h		patch \| blob \| history
src/gromacs/hardware/detecthardware.cpp		patch \| blob \| history
src/gromacs/hardware/detecthardware.h		patch \| blob \| history
src/gromacs/mdlib/force.h		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/gmx_omp_nthreads.cpp		patch \| blob \| history
src/gromacs/mdlib/gmx_omp_nthreads.h		patch \| blob \| history
src/gromacs/mdrunutility/tests/threadaffinitytest.cpp		patch \| blob \| history
src/gromacs/mdrunutility/tests/threadaffinitytest.h		patch \| blob \| history
src/gromacs/mdrunutility/threadaffinity.cpp		patch \| blob \| history
src/gromacs/mdrunutility/threadaffinity.h		patch \| blob \| history
src/gromacs/mdtypes/commrec.h		patch \| blob \| history
src/gromacs/taskassignment/findallgputasks.cpp		patch \| blob \| history
src/gromacs/taskassignment/findallgputasks.h		patch \| blob \| history
src/gromacs/taskassignment/resourcedivision.cpp		patch \| blob \| history
src/gromacs/taskassignment/resourcedivision.h		patch \| blob \| history
src/gromacs/taskassignment/taskassignment.cpp		patch \| blob \| history
src/gromacs/taskassignment/taskassignment.h		patch \| blob \| history
src/gromacs/utility/gmxmpi.h		patch \| blob \| history
src/gromacs/utility/physicalnodecommunicator.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/utility/physicalnodecommunicator.h	[new file with mode: 0644]	patch \| blob
src/gromacs/utility/tests/CMakeLists.txt		patch \| blob \| history
src/gromacs/utility/tests/physicalnodecommunicator-mpi.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/utility/tests/physicalnodecommunicator.cpp	[new file with mode: 0644]	patch \| blob
src/programs/mdrun/runner.cpp		patch \| blob \| history