From 01b234357f0d68d561f46272ba204bd378220862 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 24 Sep 2021 19:24:23 +0000 Subject: [PATCH] Also use DD partitioning in serial --- src/gromacs/domdec/domdec.cpp | 30 ++++++++---- src/gromacs/domdec/domdec_network.cpp | 34 +++++++++---- src/gromacs/domdec/domdec_struct.h | 2 +- src/gromacs/domdec/localtopologychecker.cpp | 49 ++++++++++++------- src/gromacs/domdec/localtopologychecker.h | 4 +- src/gromacs/gmxlib/network.cpp | 30 ++++++++++-- src/gromacs/imd/imd.cpp | 4 +- src/gromacs/mdlib/constr.cpp | 2 +- src/gromacs/mdlib/sim_util.cpp | 5 +- src/gromacs/mdrun/minimize.cpp | 4 ++ src/gromacs/mdrun/runner.cpp | 9 +++- src/gromacs/mdrun/shellfc.cpp | 2 +- src/gromacs/mdtypes/commrec.h | 39 ++++++++------- src/gromacs/nbnxm/gridset.cpp | 3 +- .../taskassignment/resourcedivision.cpp | 2 +- 15 files changed, 145 insertions(+), 74 deletions(-) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index f4ddcc70ee..8a634b7c07 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -1063,7 +1063,7 @@ static void make_load_communicator(gmx_domdec_t* dd, int dim_ind, ivec loc) void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id) { #if GMX_MPI - MPI_Comm mpi_comm_pp_physicalnode = MPI_COMM_NULL; + gmx_domdec_t* dd = cr->dd; if (!thisRankHasDuty(cr, DUTY_PP) || gpu_id < 0) { @@ -1073,9 +1073,14 @@ void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id) return; } - const int physicalnode_id_hash = gmx_physicalnode_id_hash(); + if (cr->nnodes == 1) + { + dd->comm->nrank_gpu_shared = 1; - gmx_domdec_t* dd = cr->dd; + return; + } + + const int physicalnode_id_hash = gmx_physicalnode_id_hash(); if (debug) { @@ -1088,6 +1093,7 @@ void dd_setup_dlb_resource_sharing(const t_commrec* cr, int gpu_id) */ // TODO PhysicalNodeCommunicator could be extended/used to handle // the need for per-node per-group communicators. + MPI_Comm mpi_comm_pp_physicalnode; MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank, &mpi_comm_pp_physicalnode); MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank, &dd->comm->mpi_comm_gpu_shared); MPI_Comm_free(&mpi_comm_pp_physicalnode); @@ -1528,6 +1534,7 @@ static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog, getThisRankDuties(cr), dd_index(cartSetup.ntot, ddCellIndex), &cr->mpi_comm_mygroup); + MPI_Comm_size(cr->mpi_comm_mygroup, &cr->sizeOfMyGroupCommunicator); #else GMX_UNUSED_VALUE(ddCellIndex); #endif @@ -1559,6 +1566,7 @@ static CartesianRankSetup split_communicator(const gmx::MDLogger& mdlog, #if GMX_MPI /* Split the sim communicator into PP and PME only nodes */ MPI_Comm_split(cr->mpi_comm_mysim, getThisRankDuties(cr), cr->nodeid, &cr->mpi_comm_mygroup); + MPI_Comm_size(cr->mpi_comm_mygroup, &cr->sizeOfMyGroupCommunicator); MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid); #endif } @@ -1628,14 +1636,18 @@ static void setupGroupCommunication(const gmx::MDLogger& mdlog, if (thisRankHasDuty(cr, DUTY_PP)) { - /* Copy or make a new PP communicator */ + if (dd->nnodes > 1) + { + /* Copy or make a new PP communicator */ - /* We (possibly) reordered the nodes in split_communicator, - * so it is no longer required in make_pp_communicator. - */ - const bool useCartesianReorder = (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME); + /* We (possibly) reordered the nodes in split_communicator, + * so it is no longer required in make_pp_communicator. 
+ */ + const bool useCartesianReorder = + (ddSettings.useCartesianReorder && !cartSetup.bCartesianPP_PME); - make_pp_communicator(mdlog, dd, cr, useCartesianReorder); + make_pp_communicator(mdlog, dd, cr, useCartesianReorder); + } } else { diff --git a/src/gromacs/domdec/domdec_network.cpp b/src/gromacs/domdec/domdec_network.cpp index b5c01f85e0..ead1f5282e 100644 --- a/src/gromacs/domdec/domdec_network.cpp +++ b/src/gromacs/domdec/domdec_network.cpp @@ -280,9 +280,16 @@ void dd_gather(const gmx_domdec_t gmx_unused* dd, void gmx_unused* dest) { #if GMX_MPI - /* Some MPI implementions don't specify const */ - MPI_Gather(const_cast(src), nbytes, MPI_BYTE, dest, nbytes, MPI_BYTE, DDMASTERRANK(dd), dd->mpi_comm_all); + if (dd->nnodes > 1) + { + /* Some MPI implementions don't specify const */ + MPI_Gather(const_cast(src), nbytes, MPI_BYTE, dest, nbytes, MPI_BYTE, DDMASTERRANK(dd), dd->mpi_comm_all); + } + else #endif + { + memcpy(dest, src, nbytes); + } } void dd_scatterv(const gmx_domdec_t gmx_unused* dd, @@ -325,15 +332,22 @@ void dd_gatherv(const gmx_domdec_t gmx_unused* dd, void gmx_unused* rbuf) { #if GMX_MPI - int dum = 0; - - if (scount == 0) + if (dd->nnodes > 1) { - /* MPI does not allow NULL pointers */ - sbuf = &dum; + int dum; + + if (scount == 0) + { + /* MPI does not allow NULL pointers */ + sbuf = &dum; + } + /* Some MPI implementions don't specify const */ + MPI_Gatherv( + const_cast(sbuf), scount, MPI_BYTE, rbuf, rcounts, disps, MPI_BYTE, DDMASTERRANK(dd), dd->mpi_comm_all); } - /* Some MPI implementions don't specify const */ - MPI_Gatherv( - const_cast(sbuf), scount, MPI_BYTE, rbuf, rcounts, disps, MPI_BYTE, DDMASTERRANK(dd), dd->mpi_comm_all); + else #endif + { + memcpy(rbuf, sbuf, rcounts[0]); + } } diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h index 613a62e466..911f1e7333 100644 --- a/src/gromacs/domdec/domdec_struct.h +++ b/src/gromacs/domdec/domdec_struct.h @@ -170,7 +170,7 @@ struct gmx_domdec_t /* The communication setup within the communicator all * defined in dd->comm in domdec.c */ - int nnodes = 0; + int nnodes = 1; MPI_Comm mpi_comm_all = MPI_COMM_NULL; /* The local DD cell index and rank */ gmx::IVec ci = { 0, 0, 0 }; diff --git a/src/gromacs/domdec/localtopologychecker.cpp b/src/gromacs/domdec/localtopologychecker.cpp index 8db5e261be..9fabbc10bf 100644 --- a/src/gromacs/domdec/localtopologychecker.cpp +++ b/src/gromacs/domdec/localtopologychecker.cpp @@ -480,25 +480,36 @@ LocalTopologyChecker& LocalTopologyChecker::operator=(LocalTopologyChecker&& oth void LocalTopologyChecker::scheduleCheckOfLocalTopology(const int numBondedInteractionsToReduce) { - // Fill the reduction buffer with the value from this domain to reduce - impl_->reductionBuffer_[0] = double(numBondedInteractionsToReduce); - - // Pass the post-reduction callback to the ObservablesReducer via - // the callback it gave us for the purpose. - // - // Note that it's possible that the callbackAfterReduction is already - // outstanding, e.g. if repartitioning was triggered before - // observables were reduced. This could happen for example when - // replica exchange took place soon after a partition. If so, the - // callback will be called again. So long as there is no race - // between the calls to this function and the calls to - // ObservablesReducer for reduction, this will work correctly. It - // could be made safer e.g. with checks against duplicate - // callbacks, but there is no problem to solve. 
- // - // There is no need to check the return value from this callback, - // as it is not an error to request reduction at a future step. - impl_->callbackToRequireReduction_(ReductionRequirement::Eventually); + // When we have a single domain, we don't need to reduce and we algorithmically can not miss + // any interactions, so we can assert here. + if (!havePPDomainDecomposition(impl_->cr_)) + { + GMX_RELEASE_ASSERT(numBondedInteractionsToReduce == impl_->expectedNumGlobalBondedInteractions_, + "With a single domain the number of assigned bonded interactions should " + "always match the global number"); + } + else + { + // Fill the reduction buffer with the value from this domain to reduce + impl_->reductionBuffer_[0] = double(numBondedInteractionsToReduce); + + // Pass the post-reduction callback to the ObservablesReducer via + // the callback it gave us for the purpose. + // + // Note that it's possible that the callbackAfterReduction is already + // outstanding, e.g. if repartitioning was triggered before + // observables were reduced. This could happen for example when + // replica exchange took place soon after a partition. If so, the + // callback will be called again. So long as there is no race + // between the calls to this function and the calls to + // ObservablesReducer for reduction, this will work correctly. It + // could be made safer e.g. with checks against duplicate + // callbacks, but there is no problem to solve. + // + // There is no need to check the return value from this callback, + // as it is not an error to request reduction at a future step. + impl_->callbackToRequireReduction_(ReductionRequirement::Eventually); + } } } // namespace gmx diff --git a/src/gromacs/domdec/localtopologychecker.h b/src/gromacs/domdec/localtopologychecker.h index 253879021b..5ea7d634e2 100644 --- a/src/gromacs/domdec/localtopologychecker.h +++ b/src/gromacs/domdec/localtopologychecker.h @@ -106,7 +106,9 @@ public: /*! \brief Set that the local topology should be checked via * observables reduction whenever that reduction is required by - * another module. */ + * another module. In case of a single domain a direct assertion + * is performed instead. + */ void scheduleCheckOfLocalTopology(int numBondedInteractionsToReduce); private: diff --git a/src/gromacs/gmxlib/network.cpp b/src/gromacs/gmxlib/network.cpp index eec49f099a..c5edd4eb51 100644 --- a/src/gromacs/gmxlib/network.cpp +++ b/src/gromacs/gmxlib/network.cpp @@ -87,11 +87,12 @@ CommrecHandle init_commrec(MPI_Comm communicator) // For now, we want things to go horribly wrong if this is used too early... 
// TODO: Remove when communicators are removed from commrec (#2395) - cr->nnodes = -1; - cr->nodeid = -1; - cr->sim_nodeid = -1; - cr->mpi_comm_mysim = MPI_COMM_NULL; - cr->mpi_comm_mygroup = MPI_COMM_NULL; + cr->nnodes = -1; + cr->sizeOfMyGroupCommunicator = -1; + cr->nodeid = -1; + cr->sim_nodeid = -1; + cr->mpi_comm_mysim = MPI_COMM_NULL; + cr->mpi_comm_mygroup = MPI_COMM_NULL; // TODO cr->duty should not be initialized here cr->duty = (DUTY_PP | DUTY_PME); @@ -221,6 +222,10 @@ void gmx_setup_nodecomm(FILE gmx_unused* fplog, t_commrec* cr) void gmx_barrier(MPI_Comm gmx_unused communicator) { + if (communicator == MPI_COMM_NULL) + { + return; + } #if !GMX_MPI GMX_RELEASE_ASSERT(false, "Invalid call to gmx_barrier"); #else @@ -239,6 +244,11 @@ void gmx_bcast(int gmx_unused nbytes, void gmx_unused* b, MPI_Comm gmx_unused co void gmx_sumd(int gmx_unused nr, double gmx_unused r[], const t_commrec gmx_unused* cr) { + if (cr->sizeOfMyGroupCommunicator == 1) + { + return; + } + #if !GMX_MPI GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumd"); #else @@ -268,6 +278,11 @@ void gmx_sumd(int gmx_unused nr, double gmx_unused r[], const t_commrec gmx_unus void gmx_sumf(int gmx_unused nr, float gmx_unused r[], const t_commrec gmx_unused* cr) { + if (cr->sizeOfMyGroupCommunicator == 1) + { + return; + } + #if !GMX_MPI GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumf"); #else @@ -297,6 +312,11 @@ void gmx_sumf(int gmx_unused nr, float gmx_unused r[], const t_commrec gmx_unuse void gmx_sumi(int gmx_unused nr, int gmx_unused r[], const t_commrec gmx_unused* cr) { + if (cr->sizeOfMyGroupCommunicator == 1) + { + return; + } + #if !GMX_MPI GMX_RELEASE_ASSERT(false, "Invalid call to gmx_sumi"); #else diff --git a/src/gromacs/imd/imd.cpp b/src/gromacs/imd/imd.cpp index 7050b67956..7d0cef345c 100644 --- a/src/gromacs/imd/imd.cpp +++ b/src/gromacs/imd/imd.cpp @@ -1268,7 +1268,7 @@ void ImdSession::Impl::prepareForPositionAssembly(const t_commrec* cr, gmx::Arra } } - if (!PAR(cr)) + if (!DOMAINDECOMP(cr)) { nat_loc = nat; ind_loc = ind; @@ -1281,7 +1281,7 @@ void ImdSession::Impl::prepareForPositionAssembly(const t_commrec* cr, gmx::Arra } /* Communicate initial coordinates xa_old to all processes */ - if (PAR(cr)) + if (cr && havePPDomainDecomposition(cr)) { gmx_bcast(nat * sizeof(xa_old[0]), xa_old, cr->mpi_comm_mygroup); } diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp index 17fa1b19ff..255dfb77a3 100644 --- a/src/gromacs/mdlib/constr.cpp +++ b/src/gromacs/mdlib/constr.cpp @@ -489,7 +489,7 @@ bool Constraints::Impl::apply(bool bLog, /* Communicate the coordinates required for the non-local constraints * for LINCS and/or SETTLE. */ - if (cr->dd) + if (havePPDomainDecomposition(cr)) { dd_move_x_constraints(cr->dd, box, diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 9b7affec5e..429b91c6b1 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -2127,8 +2127,9 @@ void do_force(FILE* fplog, // With both nonbonded and PME offloaded a GPU on the same rank, we use // an alternating wait/reduction scheme. 
bool alternateGpuWait = - (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank - && simulationWork.useGpuNonbonded && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps); + (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank && simulationWork.useGpuNonbonded + && !simulationWork.havePpDomainDecomposition && !stepWork.useGpuFBufferOps); + if (alternateGpuWait) { alternatePmeNbGpuWaitReduce(fr->nbv.get(), diff --git a/src/gromacs/mdrun/minimize.cpp b/src/gromacs/mdrun/minimize.cpp index 07e9d1c7e5..27a7bf1fea 100644 --- a/src/gromacs/mdrun/minimize.cpp +++ b/src/gromacs/mdrun/minimize.cpp @@ -1975,6 +1975,10 @@ void LegacySimulator::do_lbfgs() "be available in a different form in a future version of GROMACS, " "e.g. gmx minimize and an .mdp option."); + if (DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "L_BFGS is currently not supported"); + } if (PAR(cr)) { gmx_fatal(FARGS, "L-BFGS minimization only supports a single rank"); diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 17db7f419b..d409b82136 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -974,12 +974,17 @@ int Mdrunner::mdrunner() "Linear acceleration has been removed in GROMACS 2022, and was broken for many years " "before that. Use GROMACS 4.5 or earlier if you need this feature."); + // Now we decide whether to use the domain decomposition machinery. + // Note that this does not necessarily imply actually using multiple domains. // Now the number of ranks is known to all ranks, and each knows // the inputrec read by the master rank. The ranks can now all run // the task-deciding functions and will agree on the result // without needing to communicate. + // The LBFGS minimizer, test-particle insertion, normal modes and shell dynamics don't support DD const bool useDomainDecomposition = - (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == IntegrationAlgorithm::NM)); + !(inputrec->eI == IntegrationAlgorithm::LBFGS || EI_TPI(inputrec->eI) + || inputrec->eI == IntegrationAlgorithm::NM + || gmx_mtop_particletype_count(mtop)[ParticleType::Shell] > 0); // Note that these variables describe only their own node. // @@ -1502,7 +1507,7 @@ int Mdrunner::mdrunner() if (deviceInfo != nullptr) { - if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP)) + if (runScheduleWork.simulationWork.havePpDomainDecomposition && thisRankHasDuty(cr, DUTY_PP)) { dd_setup_dlb_resource_sharing(cr, deviceId); } diff --git a/src/gromacs/mdrun/shellfc.cpp b/src/gromacs/mdrun/shellfc.cpp index 1284b62fb5..7f15b180bc 100644 --- a/src/gromacs/mdrun/shellfc.cpp +++ b/src/gromacs/mdrun/shellfc.cpp @@ -287,7 +287,7 @@ gmx_shellfc_t* init_shell_flexcon(FILE* fplog, "not supported in combination with shell particles.\nPlease make a new tpr file.", nstcalcenergy); } - if (usingDomainDecomposition) + if (nshell > 0 && usingDomainDecomposition) { gmx_fatal( FARGS, diff --git a/src/gromacs/mdtypes/commrec.h b/src/gromacs/mdtypes/commrec.h index 9213187567..0307de3d5f 100644 --- a/src/gromacs/mdtypes/commrec.h +++ b/src/gromacs/mdtypes/commrec.h @@ -77,11 +77,14 @@ struct t_commrec * All communication within some simulation should happen * in mpi_comm_mysim, or its subset mpi_comm_mygroup. */ - int sim_nodeid, nnodes, npmenodes; - - /* thread numbers: */ - /* Not used yet: int threadid, nthreads; */ - /* The nodeid in the PP/PME, PP or PME group */ + //! The rank-id in mpi_comm_mysim; + int sim_nodeid; + //! The number of ranks in mpi_comm_mysim + int nnodes; + //! 
The number of separate PME ranks, 0 when no separate PME ranks are used + int npmenodes; + + //! The rank-id in mpi_comm_mygroup; int nodeid; /* MPI communicators within a single simulation @@ -91,6 +94,9 @@ struct t_commrec a single simulation */ MPI_Comm mpi_comm_mygroup; /* subset of mpi_comm_mysim including only the ranks in the same group (PP or PME) */ + //! The number of ranks in mpi_comm_mygroup + int sizeOfMyGroupCommunicator; + //! The communicator used before DD was initialized MPI_Comm mpiDefaultCommunicator; int sizeOfDefaultCommunicator; @@ -152,21 +158,16 @@ inline bool thisRankHasDuty(const t_commrec* cr, int duty) //! The node id for the master #define MASTERRANK(cr) (0) -/*! \brief Do we decompose the work of this simulation? - * - * True if this simulation uses more than one PP rank, or if this simulation - * uses at least one PME-only rank. +/*! \brief Returns whether the domain decomposition machinery is active * - * PAR(cr) is true if this is true, but the converse does not apply (see docs - * of PAR(cr)). - * - * This is true if havePPDomainDecomposition is true, but the converse does not - * apply (see docs of havePpDomainDecomposition()). - * - * \todo As part of Issue #2395, replace calls to this with - * havePPDomainDecomposition or a call of some other/new function, as - * appropriate to each case. Then eliminate this macro. */ -#define DOMAINDECOMP(cr) (((cr)->dd != nullptr) && PAR(cr)) + * Note that when the return value is true, there are not necessarily + * multiple domains. The domain decomposition machinery is also active and + * reorders the atoms also with a single MPI rank, or 1 PP and 1 PME rank, + * with most integrators. Only a few special non-integrator "integrators" + * do not (yet) support the domain decomposition machinery and therefore + * this macro is still needed. + */ +#define DOMAINDECOMP(cr) ((cr)->dd != nullptr) /*! \brief Returns whether we have actual domain decomposition for the particle-particle interactions * diff --git a/src/gromacs/nbnxm/gridset.cpp b/src/gromacs/nbnxm/gridset.cpp index c3a1a7ec26..a301092e36 100644 --- a/src/gromacs/nbnxm/gridset.cpp +++ b/src/gromacs/nbnxm/gridset.cpp @@ -83,7 +83,8 @@ GridSet::DomainSetup::DomainSetup(const PbcType pbcType, const gmx_domdec_zones_t* ddZones) : pbcType(pbcType), doTestParticleInsertion(doTestParticleInsertion), - haveMultipleDomains(numDDCells != nullptr), + haveMultipleDomains(numDDCells != nullptr + && (*numDDCells)[XX] * (*numDDCells)[YY] * (*numDDCells)[ZZ] > 1), zones(ddZones) { for (int d = 0; d < DIM; d++) diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp index 42c5b64a06..2108d1e7bd 100644 --- a/src/gromacs/taskassignment/resourcedivision.cpp +++ b/src/gromacs/taskassignment/resourcedivision.cpp @@ -617,7 +617,7 @@ void check_resource_division_efficiency(const gmx_hw_info_t* hwinfo, nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu; } - if (DOMAINDECOMP(cr)) + if (cr && cr->nnodes > 1) { if (nth_omp_max < nthreads_omp_mpi_ok_min || nth_omp_max > nthreads_omp_mpi_ok_max) { -- 2.22.0
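
Reviewer note (not part of the patch): the recurring pattern in this change is to leave the collective-communication call sites as they are and add a single-rank fast path that degenerates to a local copy or a no-op, as in dd_gather(), dd_gatherv() and gmx_sumd() above. The standalone C++ sketch below only restates that pattern under simplified, hypothetical names (MinimalComm, gatherBytes); GMX_MPI is assumed to behave like the usual GROMACS build flag, and when it is not defined the sketch compiles to the serial branch only.

#include <cstring>

#if GMX_MPI
#    include <mpi.h>
#endif

// Hypothetical stand-in for the relevant parts of gmx_domdec_t / t_commrec.
struct MinimalComm
{
    int numRanks = 1; // plays the role of dd->nnodes / cr->sizeOfMyGroupCommunicator
#if GMX_MPI
    MPI_Comm comm = MPI_COMM_NULL;
#endif
};

// Gather nbytes from every rank into dest on the master rank. With a single
// rank the MPI call is skipped and the data is copied locally, so the same
// caller code works both in parallel and in serial.
static void gatherBytes(const MinimalComm& c, const void* src, int nbytes, void* dest)
{
#if GMX_MPI
    if (c.numRanks > 1)
    {
        /* Some MPI implementations don't accept const send buffers */
        MPI_Gather(const_cast<void*>(src), nbytes, MPI_BYTE, dest, nbytes, MPI_BYTE, 0, c.comm);
        return;
    }
#endif
    // Single rank: the "gather" is just a local copy, mirroring the memcpy
    // fallback added to dd_gather()/dd_gatherv() in the patch.
    std::memcpy(dest, src, nbytes);
}

int main()
{
    MinimalComm comm; // defaults to one rank, like gmx_domdec_t::nnodes = 1 in the patch
    int         src  = 42;
    int         dest = 0;
    gatherBytes(comm, &src, sizeof(src), &dest);
    return dest == 42 ? 0 : 1;
}

The same reasoning is why the patch changes the default of gmx_domdec_t::nnodes from 0 to 1 and adds sizeOfMyGroupCommunicator to t_commrec: the single-rank branches above key off those counts, so they must be valid even when no MPI communicators are ever split.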