2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements the multi-simulation support routines.
39 * \author Mark Abraham <mark.j.abraham@gmail.com>
40 * \ingroup module_mdrunutility
48 #include "gromacs/gmxlib/network.h"
49 #include "gromacs/mdtypes/commrec.h"
50 #include "gromacs/utility/exceptions.h"
51 #include "gromacs/utility/fatalerror.h"
52 #include "gromacs/utility/futil.h"
53 #include "gromacs/utility/gmxassert.h"
54 #include "gromacs/utility/logger.h"
55 #include "gromacs/utility/smalloc.h"
57 std::unique_ptr<gmx_multisim_t> buildMultiSimulation(MPI_Comm worldComm,
58 gmx::ArrayRef<const std::string> multidirs)
60 if (multidirs.empty())
65 if (!GMX_LIB_MPI && !multidirs.empty())
67 GMX_THROW(gmx::NotImplementedError(
68 "Multi-simulations are only supported when GROMACS has been "
69 "configured with a proper external MPI library."));
72 if (multidirs.size() == 1)
74 /* NOTE: It would be nice if this special case worked, but this requires checks/tests. */
75 GMX_THROW(gmx::NotImplementedError(
76 "To run mdrun in multi-simulation mode, more then one "
77 "actual simulation is required. The single simulation case is not supported."));
82 MPI_Comm_size(worldComm, &numRanks);
83 if (numRanks % multidirs.size() != 0)
85 auto message = gmx::formatString(
86 "The number of ranks (%d) is not a multiple of the number of simulations (%td)",
89 GMX_THROW(gmx::InconsistentInputError(message));
92 int numRanksPerSimulation = numRanks / multidirs.size();
93 int rankWithinWorldComm;
94 MPI_Comm_rank(worldComm, &rankWithinWorldComm);
99 "We have %td simulations, %d ranks per simulation, local simulation is %d\n",
101 numRanksPerSimulation,
102 rankWithinWorldComm / numRanksPerSimulation);
105 int numSimulations = multidirs.size();
106 // Create a communicator for the master ranks of each simulation
107 std::vector<int> ranksOfMasters(numSimulations);
108 for (int i = 0; i < numSimulations; i++)
110 ranksOfMasters[i] = i * numRanksPerSimulation;
112 MPI_Group worldGroup;
113 // No need to free worldGroup later, we didn't create it.
114 MPI_Comm_group(worldComm, &worldGroup);
116 MPI_Group mastersGroup = MPI_GROUP_NULL;
117 MPI_Group_incl(worldGroup, numSimulations, ranksOfMasters.data(), &mastersGroup);
118 MPI_Comm mastersComm = MPI_COMM_NULL;
119 MPI_Comm_create(worldComm, mastersGroup, &mastersComm);
120 if (mastersGroup != MPI_GROUP_NULL)
122 MPI_Group_free(&mastersGroup);
125 int simulationIndex = rankWithinWorldComm / numRanksPerSimulation;
126 MPI_Comm simulationComm = MPI_COMM_NULL;
127 MPI_Comm_split(worldComm, simulationIndex, rankWithinWorldComm, &simulationComm);
131 gmx_chdir(multidirs[simulationIndex].c_str());
133 catch (gmx::GromacsException& e)
135 e.prependContext("While changing directory for multi-simulation to " + multidirs[simulationIndex]);
138 return std::make_unique<gmx_multisim_t>(numSimulations, simulationIndex, mastersComm, simulationComm);
140 GMX_UNUSED_VALUE(worldComm);
145 gmx_multisim_t::gmx_multisim_t(int numSimulations, int simulationIndex, MPI_Comm mastersComm, MPI_Comm simulationComm) :
146 numSimulations_(numSimulations),
147 simulationIndex_(simulationIndex),
148 mastersComm_(mastersComm),
149 simulationComm_(simulationComm)
153 gmx_multisim_t::~gmx_multisim_t()
156 // TODO This would work better if the result of MPI_Comm_split was
157 // put into an RAII-style guard, such as gmx::unique_cptr.
158 if (mastersComm_ != MPI_COMM_NULL && mastersComm_ != MPI_COMM_WORLD)
160 MPI_Comm_free(&mastersComm_);
162 if (simulationComm_ != MPI_COMM_NULL && simulationComm_ != MPI_COMM_WORLD)
164 MPI_Comm_free(&simulationComm_);
170 static void gmx_sumd_comm(int nr, double r[], MPI_Comm mpi_comm)
172 # if MPI_IN_PLACE_EXISTS
173 MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_DOUBLE, MPI_SUM, mpi_comm);
175 /* this function is only used in code that is not performance critical,
176 (during setup, when comm_rec is not the appropriate communication
177 structure), so this isn't as bad as it looks. */
182 MPI_Allreduce(r, buf, nr, MPI_DOUBLE, MPI_SUM, mpi_comm);
183 for (i = 0; i < nr; i++)
193 static void gmx_sumf_comm(int nr, float r[], MPI_Comm mpi_comm)
195 # if MPI_IN_PLACE_EXISTS
196 MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_FLOAT, MPI_SUM, mpi_comm);
198 /* this function is only used in code that is not performance critical,
199 (during setup, when comm_rec is not the appropriate communication
200 structure), so this isn't as bad as it looks. */
205 MPI_Allreduce(r, buf, nr, MPI_FLOAT, MPI_SUM, mpi_comm);
206 for (i = 0; i < nr; i++)
215 void gmx_sumd_sim(int gmx_unused nr, double gmx_unused r[], const gmx_multisim_t gmx_unused* ms)
218 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumd_sim");
220 gmx_sumd_comm(nr, r, ms->mastersComm_);
224 void gmx_sumf_sim(int gmx_unused nr, float gmx_unused r[], const gmx_multisim_t gmx_unused* ms)
227 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumf_sim");
229 gmx_sumf_comm(nr, r, ms->mastersComm_);
233 void gmx_sumi_sim(int gmx_unused nr, int gmx_unused r[], const gmx_multisim_t gmx_unused* ms)
236 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumi_sim");
238 # if MPI_IN_PLACE_EXISTS
239 MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_INT, MPI_SUM, ms->mastersComm_);
241 /* this is thread-unsafe, but it will do for now: */
242 ms->intBuffer.resize(nr);
243 MPI_Allreduce(r, ms->intBuffer.data(), ms->intBuffer.size(), MPI_INT, MPI_SUM, ms->mastersComm_);
244 std::copy(std::begin(ms->intBuffer), std::end(ms->intBuffer), r);
249 void gmx_sumli_sim(int gmx_unused nr, int64_t gmx_unused r[], const gmx_multisim_t gmx_unused* ms)
252 GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumli_sim");
254 # if MPI_IN_PLACE_EXISTS
255 MPI_Allreduce(MPI_IN_PLACE, r, nr, MPI_INT64_T, MPI_SUM, ms->mastersComm_);
257 /* this is thread-unsafe, but it will do for now: */
258 ms->int64Buffer.resize(nr);
259 MPI_Allreduce(r, ms->int64Buffer.data(), ms->int64Buffer.size(), MPI_INT64_T, MPI_SUM, ms->mastersComm_);
260 std::copy(std::begin(ms->int64Buffer), std::end(ms->int64Buffer), r);
265 std::vector<int> gatherIntFromMultiSimulation(const gmx_multisim_t* ms, const int localValue)
267 std::vector<int> valuesFromAllRanks;
271 valuesFromAllRanks.resize(ms->numSimulations_);
272 valuesFromAllRanks[ms->simulationIndex_] = localValue;
273 gmx_sumi_sim(ms->numSimulations_, valuesFromAllRanks.data(), ms);
277 valuesFromAllRanks.emplace_back(localValue);
280 GMX_UNUSED_VALUE(ms);
281 valuesFromAllRanks.emplace_back(localValue);
283 return valuesFromAllRanks;
286 void check_multi_int(FILE* log, const gmx_multisim_t* ms, int val, const char* name, gmx_bool bQuiet)
289 gmx_bool bCompatible;
291 if (nullptr != log && !bQuiet)
293 fprintf(log, "Multi-checking %s ... ", name);
298 gmx_fatal(FARGS, "check_multi_int called with a NULL communication pointer");
301 snew(ibuf, ms->numSimulations_);
302 ibuf[ms->simulationIndex_] = val;
303 gmx_sumi_sim(ms->numSimulations_, ibuf, ms);
306 for (p = 1; p < ms->numSimulations_; p++)
308 bCompatible = bCompatible && (ibuf[p - 1] == ibuf[p]);
313 if (nullptr != log && !bQuiet)
315 fprintf(log, "OK\n");
322 fprintf(log, "\n%s is not equal for all subsystems\n", name);
323 for (p = 0; p < ms->numSimulations_; p++)
325 fprintf(log, " subsystem %d: %d\n", p, ibuf[p]);
328 gmx_fatal(FARGS, "The %d subsystems are not compatible\n", ms->numSimulations_);
334 void check_multi_int64(FILE* log, const gmx_multisim_t* ms, int64_t val, const char* name, gmx_bool bQuiet)
338 gmx_bool bCompatible;
340 if (nullptr != log && !bQuiet)
342 fprintf(log, "Multi-checking %s ... ", name);
347 gmx_fatal(FARGS, "check_multi_int called with a NULL communication pointer");
350 snew(ibuf, ms->numSimulations_);
351 ibuf[ms->simulationIndex_] = val;
352 gmx_sumli_sim(ms->numSimulations_, ibuf, ms);
355 for (p = 1; p < ms->numSimulations_; p++)
357 bCompatible = bCompatible && (ibuf[p - 1] == ibuf[p]);
362 if (nullptr != log && !bQuiet)
364 fprintf(log, "OK\n");
369 // TODO Part of this error message would also be good to go to
370 // stderr (from one rank of one sim only)
373 fprintf(log, "\n%s is not equal for all subsystems\n", name);
374 for (p = 0; p < ms->numSimulations_; p++)
377 /* first make the format string */
378 snprintf(strbuf, 255, " subsystem %%d: %s\n", "%" PRId64);
379 fprintf(log, strbuf, p, ibuf[p]);
382 gmx_fatal(FARGS, "The %d subsystems are not compatible\n", ms->numSimulations_);
388 bool findIsSimulationMasterRank(const gmx_multisim_t* ms, MPI_Comm communicator)
392 // Ranks of multi-simulations know whether they are a master
393 // rank. Ranks of non-multi simulation do not know until a
394 // t_commrec is available.
395 if ((ms != nullptr) && (ms->numSimulations_ > 1))
397 return ms->mastersComm_ != MPI_COMM_NULL;
403 MPI_Comm_rank(communicator, &rank);
408 else if (GMX_THREAD_MPI)
410 GMX_RELEASE_ASSERT(communicator == MPI_COMM_NULL || communicator == MPI_COMM_WORLD,
411 "Invalid communicator");
412 // Spawned threads have MPI_COMM_WORLD upon creation, so if
413 // the communicator is MPI_COMM_NULL this is not a spawned thread,
414 // ie is the master thread
415 return (communicator == MPI_COMM_NULL);
419 // No MPI means it must be the master (and only) rank.
424 bool isMasterSim(const gmx_multisim_t* ms)
426 return !isMultiSim(ms) || ms->simulationIndex_ == 0;
429 bool isMasterSimMasterRank(const gmx_multisim_t* ms, const bool isMaster)
431 return (isMaster && isMasterSim(ms));
434 static bool multisim_int_all_are_equal(const gmx_multisim_t* ms, int64_t value)
436 bool allValuesAreEqual = true;
439 GMX_RELEASE_ASSERT(ms, "Invalid use of multi-simulation pointer");
441 snew(buf, ms->numSimulations_);
442 /* send our value to all other master ranks, receive all of theirs */
443 buf[ms->simulationIndex_] = value;
444 gmx_sumli_sim(ms->numSimulations_, buf, ms);
446 for (int s = 0; s < ms->numSimulations_; s++)
450 allValuesAreEqual = false;
457 return allValuesAreEqual;
460 void logInitialMultisimStatus(const gmx_multisim_t* ms,
462 const gmx::MDLogger& mdlog,
463 const bool simulationsShareState,
465 const int initialStep)
467 if (!multisim_int_all_are_equal(ms, numSteps))
469 GMX_LOG(mdlog.warning)
471 "Note: The number of steps is not consistent across multi "
473 "but we are proceeding anyway!");
475 if (!multisim_int_all_are_equal(ms, initialStep))
477 if (simulationsShareState)
482 "The initial step is not consistent across multi simulations which "
485 gmx_barrier(cr->mpi_comm_mygroup);
489 GMX_LOG(mdlog.warning)
491 "Note: The initial step is not consistent across multi "
493 "but we are proceeding anyway!");