Fix multisim with parallel simulation

author Berk Hess <hess@kth.se>

Tue, 14 Jan 2020 13:51:38 +0000 (14:51 +0100)

committer Berk Hess <hess@kth.se>

Wed, 15 Jan 2020 12:40:57 +0000 (13:40 +0100)
author Berk Hess <hess@kth.se>
Tue, 14 Jan 2020 13:51:38 +0000 (14:51 +0100)
committer Berk Hess <hess@kth.se>
Wed, 15 Jan 2020 12:40:57 +0000 (13:40 +0100)
diff --git a/docs/release-notes/2020/2020.1.rst b/docs/release-notes/2020/2020.1.rst

index 9c3b1f108c8080f11be1e587460c7868a902d12a..9fc5281a42ccde4552c8799c0761ea5aad3ba020 100644 (file)
--- a/docs/release-notes/2020/2020.1.rst
+++ b/docs/release-notes/2020/2020.1.rst
@@ -16,6 +16,11 @@ in the :ref:`release-notes`.
  Fixes where mdrun could behave incorrectly
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  
+Fix fatal error with mdrun -multidir with more than 1 rank per simulation
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+:issue:`3297`
+
  Fixes for ``gmx`` tools
  ^^^^^^^^^^^^^^^^^^^^^^^
  
diff --git a/src/gromacs/gmxlib/network.cpp b/src/gromacs/gmxlib/network.cpp

index 64c7c51d8c45540349d5201629c606c405fb36a3..b0b9e0d986adf9c629c6d6c37b22068dfdad359e 100644 (file)
--- a/src/gromacs/gmxlib/network.cpp
+++ b/src/gromacs/gmxlib/network.cpp
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -527,13 +527,3 @@ void gmx_fatal_collective(int                    f_errno,
      gmx_fatal_mpi_va(f_errno, file, line, bMaster, bFinalize, fmt, ap);
      va_end(ap);
  }
-
-void simulationBarrier(const t_commrec* cr)
-{
-    if (PAR(cr))
-    {
-#if GMX_MPI
-        MPI_Barrier(cr->mpi_comm_mysim);
-#endif
-    }
-}
diff --git a/src/gromacs/gmxlib/network.h b/src/gromacs/gmxlib/network.h

index fd0196a2954dac96ff3ae49504cfe158768b8f05..99d29bfdb0a857b232b2e6284dbfe39312acae0b 100644 (file)
--- a/src/gromacs/gmxlib/network.h
+++ b/src/gromacs/gmxlib/network.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -123,7 +123,4 @@ const char* opt2fn_master(const char* opt, int nfile, const t_filenm fnm[], t_co
   * for all processes.
   */
  
-//! Make a barrier across all ranks of this simulation
-void simulationBarrier(const t_commrec* cr);
-
  #endif
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 31935fd77f6abd94f9d60aba9c388dd5c343a5cf..bae0c18e960081dcde47628192aaefd7b85e9b1c 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1143,9 +1143,9 @@ int Mdrunner::mdrunner()
      // Produce the task assignment for this rank.
      GpuTaskAssignmentsBuilder gpuTaskAssignmentsBuilder;
      GpuTaskAssignments        gpuTaskAssignments = gpuTaskAssignmentsBuilder.build(
-            gpuIdsToUse, userGpuTaskAssignment, *hwinfo, cr, ms, physicalNodeComm, nonbondedTarget,
-            pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded, useGpuForPme,
-            thisRankHasDuty(cr, DUTY_PP),
+            gpuIdsToUse, userGpuTaskAssignment, *hwinfo, communicator, physicalNodeComm,
+            nonbondedTarget, pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded,
+            useGpuForPme, thisRankHasDuty(cr, DUTY_PP),
              // TODO cr->duty & DUTY_PME should imply that a PME
              // algorithm is active, but currently does not.
              EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
diff --git a/src/gromacs/mdrunutility/multisim.cpp b/src/gromacs/mdrunutility/multisim.cpp

index debb0aeaaaf9bf8b531887e854111b863a0d8243..b1a3eb2a49a6f1abf1032dc1b2b96c025341954d 100644 (file)
--- a/src/gromacs/mdrunutility/multisim.cpp
+++ b/src/gromacs/mdrunutility/multisim.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -424,16 +424,3 @@ bool isMasterSimMasterRank(const gmx_multisim_t* ms, const bool isMaster)
  {
      return (isMaster && isMasterSim(ms));
  }
-
-void multiSimBarrier(const gmx_multisim_t* ms)
-{
-    if (isMultiSim(ms))
-    {
-#if GMX_MPI
-        if (ms->mpi_comm_masters != MPI_COMM_NULL)
-        {
-            MPI_Barrier(ms->mpi_comm_masters);
-        }
-#endif
-    }
-}
diff --git a/src/gromacs/mdrunutility/multisim.h b/src/gromacs/mdrunutility/multisim.h

index cde06c2ae25e61a0fb05a30792acfb69cafbc571..2a31292a913dc91bfc9a0c7487515b7c936ef779 100644 (file)
--- a/src/gromacs/mdrunutility/multisim.h
+++ b/src/gromacs/mdrunutility/multisim.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -136,7 +136,4 @@ bool isMasterSim(const gmx_multisim_t* ms);
   * This rank prints the remaining run time etc. */
  bool isMasterSimMasterRank(const gmx_multisim_t* ms, bool isMaster);
  
-//! Make a barrier across all multi-simulation master ranks
-void multiSimBarrier(const gmx_multisim_t* ms);
-
  #endif
diff --git a/src/gromacs/taskassignment/taskassignment.cpp b/src/gromacs/taskassignment/taskassignment.cpp

index 3899d5616e8f0f840a2949332185d0fd79247923..03ebb3e50aaacd50972838d17ed41e28168f2980 100644 (file)
--- a/src/gromacs/taskassignment/taskassignment.cpp
+++ b/src/gromacs/taskassignment/taskassignment.cpp
@@ -51,6 +51,8 @@
  
  #include "taskassignment.h"
  
+#include "config.h"
+
  #include <algorithm>
  #include <exception>
  #include <string>
@@ -192,27 +194,38 @@ size_t countGpuTasksOnThisNode(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode)
  
  /*! \brief Return on each rank the total count over all ranks of all
   * simulations. */
-int countOverAllRanks(const t_commrec* cr, const gmx_multisim_t* ms, const int countOnThisRank)
+int countOverAllRanks(MPI_Comm comm, int countOnThisRank)
  {
-    int countOverAllRanksValue = countOnThisRank;
-    if (PAR(cr))
+    int sum;
+#if GMX_MPI
+    int numRanks;
+    MPI_Comm_size(comm, &numRanks);
+    if (numRanks > 1)
      {
-        // Count over the ranks of this simulation.
-        gmx_sumi(1, &countOverAllRanksValue, cr);
+        MPI_Allreduce(&countOnThisRank, &sum, 1, MPI_INT, MPI_SUM, comm);
      }
-    if (isMultiSim(ms))
+    else
+#endif
      {
-        // Count over the ranks of all simulations.
-        gmx_sumi_sim(1, &countOverAllRanksValue, ms);
-        if (PAR(cr))
-        {
-            // Propagate the information from other simulations back
-            // to non-master ranks so they can all agree on future
-            // behavior.
-            gmx_bcast(sizeof(decltype(countOverAllRanksValue)), &countOverAllRanksValue, cr);
-        }
+        sum = countOnThisRank;
+    }
+
+    return sum;
+}
+
+/*! \brief Barrier over all rank in \p comm */
+void barrierOverAllRanks(MPI_Comm comm)
+{
+#if GMX_MPI
+    int numRanks;
+    MPI_Comm_size(comm, &numRanks);
+    if (numRanks > 1)
+    {
+        MPI_Barrier(comm);
      }
-    return countOverAllRanksValue;
+#else
+    GMX_UNUSED_VALUE(comm);
+#endif
  }
  
  } // namespace
@@ -222,8 +235,7 @@ GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;
  GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuIdsToUse,
                                                      const std::vector<int>& userGpuTaskAssignment,
                                                      const gmx_hw_info_t&    hardwareInfo,
-                                                    const t_commrec*        cr,
-                                                    const gmx_multisim_t*   ms,
+                                                    MPI_Comm                gromacsWorldComm,
                                                      const PhysicalNodeCommunicator& physicalNodeComm,
                                                      const TaskTarget                nonbondedTarget,
                                                      const TaskTarget                pmeTarget,
@@ -327,8 +339,8 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuI
      {
          exceptionPtr = std::current_exception();
      }
-    int countOfExceptionsOnThisRank   = int(bool(exceptionPtr));
-    int countOfExceptionsOverAllRanks = countOverAllRanks(cr, ms, countOfExceptionsOnThisRank);
+    int countOfExceptionsOnThisRank = int(bool(exceptionPtr));
+    int countOfExceptionsOverAllRanks = countOverAllRanks(gromacsWorldComm, countOfExceptionsOnThisRank);
  
      // Avoid all ranks spamming the error stream
      //
@@ -345,14 +357,12 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuI
          }
          GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
      }
-    // TODO This implements a global barrier so that MPI runtimes can
+    // TODO Global barrier so that MPI runtimes can
      // organize an orderly shutdown if one of the ranks has had to
      // issue a fatal error above. When we have MPI-aware error
      // handling and reporting, this should be improved (perhaps
      // centralized there).
-    simulationBarrier(cr);
-    multiSimBarrier(ms);
-    simulationBarrier(cr);
+    barrierOverAllRanks(gromacsWorldComm);
      if (countOfExceptionsOverAllRanks > 0)
      {
          gmx_fatal(FARGS,
diff --git a/src/gromacs/taskassignment/taskassignment.h b/src/gromacs/taskassignment/taskassignment.h

index 899666fbdd33bd7d8986ab1d22be0b7b64f21d0b..6ef380385d3906d0b873da3b32454d7454c60106 100644 (file)
--- a/src/gromacs/taskassignment/taskassignment.h
+++ b/src/gromacs/taskassignment/taskassignment.h
@@ -53,10 +53,10 @@
  #include <vector>
  
  #include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/gmxmpi.h"
  
  struct gmx_device_info_t;
  struct gmx_hw_info_t;
-struct gmx_multisim_t;
  struct t_commrec;
  
  enum class PmeRunMode;
@@ -143,8 +143,7 @@ public:
       * \param[in]  gpuIdsToUse            The compatible GPUs that the user permitted us to use.
       * \param[in]  userGpuTaskAssignment  The user-specified assignment of GPU tasks to device IDs.
       * \param[in]  hardwareInfo           The detected hardware
-     * \param[in]  cr                     Communication object.
-     * \param[in]  ms                     Multi-simulation handler.
+     * \param[in]  gromacsWorldComm       MPI communicator for all ranks in the current GROMACS run
       * \param[in]  physicalNodeComm       Communication object for this physical node.
       * \param[in]  nonbondedTarget        The user's choice for mdrun -nb for where to assign
       *                                    short-ranged nonbonded interaction tasks.
@@ -163,8 +162,7 @@ public:
      GpuTaskAssignments build(const std::vector<int>&         gpuIdsToUse,
                               const std::vector<int>&         userGpuTaskAssignment,
                               const gmx_hw_info_t&            hardwareInfo,
-                             const t_commrec*                cr,
-                             const gmx_multisim_t*           ms,
+                             MPI_Comm                        gromacsWorldComm,
                               const PhysicalNodeCommunicator& physicalNodeComm,
                               TaskTarget                      nonbondedTarget,
                               TaskTarget                      pmeTarget,
author	Berk Hess <hess@kth.se>
	Tue, 14 Jan 2020 13:51:38 +0000 (14:51 +0100)
committer	Berk Hess <hess@kth.se>
	Wed, 15 Jan 2020 12:40:57 +0000 (13:40 +0100)
docs/release-notes/2020/2020.1.rst		patch \| blob \| history
src/gromacs/gmxlib/network.cpp		patch \| blob \| history
src/gromacs/gmxlib/network.h		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history
src/gromacs/mdrunutility/multisim.cpp		patch \| blob \| history
src/gromacs/mdrunutility/multisim.h		patch \| blob \| history
src/gromacs/taskassignment/taskassignment.cpp		patch \| blob \| history
src/gromacs/taskassignment/taskassignment.h		patch \| blob \| history