/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 *
 * \brief Declares implementation functions and types for the domain
 * decomposition module.
 *
 * \author Berk Hess <hess@kth.se>
 * \ingroup module_domdec
 */

#ifndef GMX_DOMDEC_DOMDEC_INTERNAL_H
#define GMX_DOMDEC_DOMDEC_INTERNAL_H

#include "config.h"

#include <cstdint>

#include <array>
#include <memory>
#include <vector>

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/timing/cyclecounter.h"
#include "gromacs/topology/block.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/gmxassert.h"

struct BalanceRegion;

/*! \brief The send/receive setup for a single halo communication pulse */
struct gmx_domdec_ind_t
{
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that need to be communicated.
     */
    int              nsend[DD_MAXIZONE+2];
    int              nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    std::vector<int> index;
    /* The atom range for non-in-place communication */
    int              cell2at0[DD_MAXIZONE];
    int              cell2at1[DD_MAXIZONE];
};

/*! \brief The communication setup along one dimension of the domain decomposition */
struct gmx_domdec_comm_dim_t
{
    /* Returns the number of grid pulses (the number of domains in the halo along this dimension) */
    int numPulses() const
    {
        return static_cast<int>(ind.size());
    }

    int                           np_dlb;         /* For dlb, for use with edlbAUTO */
    std::vector<gmx_domdec_ind_t> ind;            /* The indices to communicate, one entry per pulse */
    bool                          receiveInPlace; /* Can we receive data in place? */
};

/*! \brief Load balancing data along a dim used on the master rank of that dim */
struct RowMaster
{
    /*! \brief The boundary limits for a cell along this dimension */
    struct Bounds
    {
        real cellFracLowerMax; /**< State var.: max lower bound., incl. neighbors */
        real cellFracUpperMin; /**< State var.: min upper bound., incl. neighbors */
        real boundMin;         /**< Temp. var.: lower limit for cell boundary */
        real boundMax;         /**< Temp. var.: upper limit for cell boundary */
    };

    std::vector<bool>   isCellMin;    /**< Temp. var.: is this cell size at the limit */
    std::vector<real>   cellFrac;     /**< State var.: cell boundaries, box relative */
    std::vector<real>   oldCellFrac;  /**< Temp. var.: old cell size */
    std::vector<Bounds> bounds;       /**< Cell bounds */
    bool                dlbIsLimited; /**< State var.: is DLB limited in this row */
    std::vector<real>   buf_ncd;      /**< Temp. var. */
};

/*! \brief Struct for managing cell sizes with DLB along a dimension */
struct DDCellsizesWithDlb
{
    /* Cell sizes for dynamic load balancing */
    std::unique_ptr<RowMaster> rowMaster;    /**< Cell row root struct, only available on the first rank in a row */
    std::vector<real>          fracRow;      /**< The cell sizes, in fractions, along a row, not available on the first rank in a row */
    real                       fracLower;    /**< The lower corner, in fractions, in triclinic space */
    real                       fracUpper;    /**< The upper corner, in fractions, in triclinic space */
    real                       fracLowerMax; /**< The maximum lower corner among all our neighbors */
    real                       fracUpperMin; /**< The minimum upper corner among all our neighbors */
};

/*! \brief Struct for compute load communication
 *
 * Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 */
typedef struct
{
    int    nload;    /**< The number of load recordings */
    float *load;     /**< Scan of the sum of load over dimensions */
    float  sum;      /**< The sum of the load over the ranks up to our current dimension */
    float  max;      /**< The maximum over the ranks contributing to \p sum */
    float  sum_m;    /**< Like \p sum, but takes the maximum when the load balancing is limited */
    float  cvol_min; /**< Minimum cell volume, relative to the box */
    float  mdf;      /**< The PP time during which PME can overlap */
    float  pme;      /**< The PME-only rank load */
    int    flags;    /**< Bit flags that tell if DLB was limited, per dimension */
} domdec_load_t;

/*! \brief Data needed to sort an atom to the desired location in the local state */
typedef struct
{
    int nsc;    /**< Neighborsearch grid cell index */
    int ind_gl; /**< Global atom/charge group index */
    int ind;    /**< Local atom/charge group index */
} gmx_cgsort_t;

/*! \brief Temporary buffers for sorting atoms */
struct gmx_domdec_sort_t
{
    std::vector<gmx_cgsort_t> sorted;     /**< Sorted array of indices */
    std::vector<gmx_cgsort_t> stationary; /**< Array of stationary atom/charge group indices */
    std::vector<gmx_cgsort_t> moved;      /**< Array of moved atom/charge group indices */
    std::vector<int>          intBuffer;  /**< Integer buffer for sorting */
};

/*! \brief Manages atom ranges and order for the local state atom vectors */
class DDAtomRanges
{
    public:
        /*! \brief The local state atom order
         *
         * This enum determines the order of the atoms in the local state.
         * Type::Home and Type::Zones should be first and second,
         * the others can be ordered as wanted.
         */
        enum class Type : int
        {
            Home,        /**< The home atoms */
            Zones,       /**< All zones in the eighth shell */
            Vsites,      /**< Atoms communicated for virtual sites */
            Constraints, /**< Atoms communicated for constraints */
            Number       /**< Not an atom range, but the number of ranges, present for convenience */
        };

        /*! \brief Returns the start atom index for range \p rangeType */
        int start(Type rangeType) const
        {
            if (rangeType == Type::Home)
            {
                return 0;
            }
            else
            {
                return end_[static_cast<int>(rangeType) - 1];
            }
        }

        /*! \brief Returns the end atom index for range \p rangeType */
        int end(Type rangeType) const
        {
            return end_[static_cast<int>(rangeType)];
        }

        /*! \brief Returns the number of home atoms */
        int numHomeAtoms() const
        {
            return end_[static_cast<int>(Type::Home)];
        }

        /*! \brief Returns the total number of atoms */
        int numAtomsTotal() const
        {
            return end_[static_cast<int>(Type::Number) - 1];
        }

        /*! \brief Sets the end index of range \p rangeType to \p end
         *
         * This should be called either with Type::Home or with a type
         * that is larger than that passed in the previous call to setEnd.
         * A release assertion for these conditions is present.
         */
        void setEnd(Type rangeType,
                    int  end)
        {
            GMX_RELEASE_ASSERT(rangeType == Type::Home || rangeType > lastTypeSet_, "Can only set either home or a larger type than the last one");

            for (int i = static_cast<int>(rangeType); i < static_cast<int>(Type::Number); i++)
            {
                end_[i] = end;
            }

            lastTypeSet_ = rangeType;
        }

    private:
        /*! \brief The list of end atom indices */
        std::array<int, static_cast<int>(Type::Number)> end_;
        Type                                            lastTypeSet_ = Type::Number;
};

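/* Illustrative usage sketch for DDAtomRanges, not part of the original
 * header: the function and all atom counts here are hypothetical. It shows
 * the required call order for setEnd(): Type::Home first, then strictly
 * increasing range types, as enforced by the release assertion above.
 */
static inline int exampleDDAtomRangesUsage()
{
    DDAtomRanges ranges;

    ranges.setEnd(DDAtomRanges::Type::Home, 1000);        /* 1000 home atoms */
    ranges.setEnd(DDAtomRanges::Type::Zones, 1500);       /* + 500 halo atoms */
    ranges.setEnd(DDAtomRanges::Type::Vsites, 1500);      /* no vsite atoms */
    ranges.setEnd(DDAtomRanges::Type::Constraints, 1600); /* + 100 constraint atoms */

    /* Now start(Type::Zones) == 1000 and end(Type::Zones) == 1500 */
    return ranges.numAtomsTotal();                        /* == 1600 */
}
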
/*! \brief Enum of dynamic load balancing states
 *
 * Allowed DLB states and transitions
 * - initialization at startup:
 *                             -> edlbsOffUser ("-dlb no")
 *                             -> edlbsOnUser  ("-dlb yes")
 *                             -> edlbsOffCanTurnOn ("-dlb auto")
 *
 * - in automatic mode (i.e. initial state edlbsOffCanTurnOn):
 *   edlbsOffCanTurnOn         -> edlbsOnCanTurnOff
 *   edlbsOffCanTurnOn         -> edlbsOffForever
 *   edlbsOffCanTurnOn         -> edlbsOffTemporarilyLocked
 *   edlbsOffTemporarilyLocked -> edlbsOffCanTurnOn
 *   edlbsOnCanTurnOff         -> edlbsOffCanTurnOn
 */
enum {
    edlbsOffUser,              /**< DLB is permanently off per user request */
    edlbsOffForever,           /**< DLB is off due to a runtime condition (not supported or causes performance loss) and will never be turned on */
    edlbsOffCanTurnOn,         /**< DLB is off and will turn on when imbalance is detected */
    edlbsOffTemporarilyLocked, /**< DLB is off and temporarily can't turn on */
    edlbsOnCanTurnOff,         /**< DLB is on and can turn off when it is slower */
    edlbsOnUser,               /**< DLB is permanently on per user request */
    edlbsNR                    /**< The number of DLB states */
};

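/* Minimal illustrative helper, not part of the original header and not the
 * library's own API: it shows how the states above split into "on" and "off"
 * groups. The function name is hypothetical.
 */
static inline bool exampleDlbIsOn(int dlbState)
{
    return dlbState == edlbsOnCanTurnOff || dlbState == edlbsOnUser;
}
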
/*! \brief The PME domain decomposition for one dimension */
typedef struct
{
    int      dim;       /**< The dimension */
    gmx_bool dim_match; /**< Tells if DD and PME dims match */
    int      nslab;     /**< The number of PME ranks/domains in this dimension */
    real    *slb_dim_f; /**< Cell sizes for determining the PME comm. with SLB */
    int     *pp_min;    /**< The minimum pp node location, size nslab */
    int     *pp_max;    /**< The maximum pp node location, size nslab */
    int      maxshift;  /**< The maximum shift for coordinate redistribution in PME */
} gmx_ddpme_t;

/*! \brief The DD zone limits, used with staggered grids */
typedef struct
{
    real min0; /* The minimum bottom of this zone */
    real max1; /* The maximum top of this zone */
    real min1; /* The minimum top of this zone */
    real mch0; /* The maximum bottom communication height for this zone */
    real mch1; /* The maximum top communication height for this zone */
    real p1_0; /* The bottom value of the first cell in this zone */
    real p1_1; /* The top value of the first cell in this zone */
} gmx_ddzone_t;

/*! \brief The number of reals in gmx_ddzone_t */
constexpr int c_ddzoneNumReals = 7;

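/* Illustrative sketch, not the library's actual packing routine: since
 * c_ddzoneNumReals matches the 7 real members of gmx_ddzone_t, a zone can be
 * flattened into a buffer of that size, e.g. for communication. The function
 * name is hypothetical.
 */
static inline void examplePackDdZone(const gmx_ddzone_t &zone,
                                     real                buffer[c_ddzoneNumReals])
{
    buffer[0] = zone.min0; /* zone extents */
    buffer[1] = zone.max1;
    buffer[2] = zone.min1;
    buffer[3] = zone.mch0; /* communication heights */
    buffer[4] = zone.mch1;
    buffer[5] = zone.p1_0; /* first-cell boundaries */
    buffer[6] = zone.p1_1;
}
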
/*! \brief Forward declaration */
template<typename T> class DDBufferAccess;

/*! \brief Temporary storage container that minimizes (re)allocation and clearing
 *
 * This is only the storage, actual access happens through DDBufferAccess.
 * All methods check if the buffer is (not) in use.
 */
template<typename T>
class DDBuffer
{
    private:
        /*! \brief Returns a buffer of size \p numElements, the elements are undefined */
        gmx::ArrayRef<T> resize(size_t numElements)
        {
            GMX_ASSERT(isInUse_, "Should only operate on acquired buffers");

            if (numElements > buffer_.size())
            {
                buffer_.resize(numElements);
            }

            return gmx::arrayRefFromArray(buffer_.data(), numElements);
        }

        /*! \brief Acquire the buffer for use with size set to \p numElements, the elements are undefined */
        gmx::ArrayRef<T> acquire(size_t numElements)
        {
            GMX_RELEASE_ASSERT(!isInUse_, "Should only request free buffers");
            isInUse_ = true;

            return resize(numElements);
        }

        /*! \brief Releases the buffer, buffer_ should not be used after this */
        void release()
        {
            GMX_RELEASE_ASSERT(isInUse_, "Should only release buffers in use");
            isInUse_ = false;
        }

        std::vector<T> buffer_;          /**< The actual memory buffer */
        bool           isInUse_ = false; /**< Flag that tells whether the buffer is in use */

        friend class DDBufferAccess<T>;
};

/*! \brief Class that manages access to a temporary memory buffer */
template<typename T>
class DDBufferAccess
{
    public:
        /*! \brief Constructor, returns a buffer of size \p numElements, element values are undefined
         *
         * \note The actual memory buffer \p ddBuffer can not be used to
         *       create other DDBufferAccess objects until the one created
         *       here is destroyed.
         */
        DDBufferAccess(DDBuffer<T> &ddBuffer,
                       size_t       numElements) :
            ddBuffer_(ddBuffer)
        {
            buffer = ddBuffer_.acquire(numElements);
        }

        /*! \brief Destructor, releases the buffer */
        ~DDBufferAccess()
        {
            ddBuffer_.release();
        }

        /*! \brief Resizes the buffer to \p numElements, new elements are undefined
         *
         * \note The buffer arrayref is updated after this call.
         */
        void resize(size_t numElements)
        {
            buffer = ddBuffer_.resize(numElements);
        }

    private:
        DDBuffer<T>      &ddBuffer_; /**< Reference to the storage class */
    public:
        gmx::ArrayRef<T>  buffer;    /**< The access to the memory buffer */
};

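/* Illustrative usage sketch, not part of the original header; the function
 * name is hypothetical. DDBufferAccess acquires the storage in its
 * constructor and releases it in its destructor (RAII), so the underlying
 * std::vector is reused across calls instead of being reallocated.
 */
static inline void exampleDdBufferUsage(DDBuffer<int> &storage)
{
    DDBufferAccess<int> access(storage, 100); /* acquires 100 elements */

    access.buffer[99] = 1; /* buffer is a gmx::ArrayRef of size 100 */
    access.resize(200);    /* grows the buffer; access.buffer is updated */
} /* the destructor releases storage for the next DDBufferAccess */
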
/*! \brief Temporary buffer for setting up communication over one pulse and all zones in the halo */
struct dd_comm_setup_work_t
{
    std::vector<int>       localAtomGroupBuffer; /**< The local atom group indices to send */
    std::vector<int>       atomGroupBuffer;      /**< Buffer for collecting the global atom group indices to send */
    std::vector<gmx::RVec> positionBuffer;       /**< Buffer for collecting the atom group positions to send */
    int                    nat;                  /**< The number of atoms contained in the atom groups to send */
    int                    nsend_zone;           /**< The number of atom groups to send for the last zone */
};

/*! \brief Struct for domain decomposition communication
 *
 * This struct contains most information about domain decomposition
 * communication setup, some communication buffers, some statistics
 * and also the setup for the communication between particle-particle
 * and PME only ranks.
 *
 * All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 * unless stated otherwise.
 */
struct gmx_domdec_comm_t
{
    /* PME and Cartesian communicator stuff */
    int         npmedecompdim;     /**< The number of decomposition dimensions for PME, 0: no PME */
    int         npmenodes;         /**< The number of ranks doing PME (PP/PME or only PME) */
    int         npmenodes_x;       /**< The number of PME ranks/domains along x */
    int         npmenodes_y;       /**< The number of PME ranks/domains along y */
    gmx_bool    bCartesianPP_PME;  /**< Use Cartesian communication between PP and PME ranks */
    ivec        ntot;              /**< Cartesian grid for combined PP+PME ranks */
    int         cartpmedim;        /**< The number of dimensions for the PME setup that are Cartesian */
    int        *pmenodes;          /**< The PME ranks, size npmenodes */
    int        *ddindex2simnodeid; /**< The Cartesian index to sim rank conversion, used with bCartesianPP_PME */
    gmx_ddpme_t ddpme[2];          /**< The 1D or 2D PME domain decomposition setup */

    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;      /**< Use a Cartesian communicator for PP */
    int     *ddindex2ddnodeid;  /**< The Cartesian index to DD rank conversion, used with bCartesianPP */

    /* The global charge group setup, used for reloading old states, during e.g. EM */
    t_block cgs_gl;             /**< The global charge groups, this defines the DD state (except for the DLB state) */

    /* Charge group / atom sorting */
    std::unique_ptr<gmx_domdec_sort_t> sort; /**< Data structure for cg/atom sorting */

    /* Are there charge groups? */
    gmx_bool bCGs;              /**< True when there are charge groups */

    gmx_bool bInterCGBondeds;   /**< Are there inter-cg bonded interactions? */
    gmx_bool bInterCGMultiBody; /**< Are there inter-cg multi-body interactions? */

    /* Data for the optional bonded interaction atom communication range */
    gmx_bool  bBondComm;        /**< Only communicate atoms beyond the non-bonded cut-off when they are involved in bonded interactions with non-local atoms */
    t_blocka *cglink;           /**< Links between cg's through bonded interactions */
    char     *bLocalCG;         /**< Local cg availability, TODO: remove when group scheme is removed */

    /* The DLB state, possible values are defined above */
    int      dlbState;
    /* With dlbState=edlbsOffCanTurnOn, should we check whether to turn DLB on at the next DD partitioning? */
    gmx_bool bCheckWhetherToTurnDlbOn;
    /* The first DD count since we are running without DLB */
    int      ddPartioningCountFirstDlbOff = 0;

    /* Cell sizes for static load balancing, first index cartesian */
    real **slb_frac;

    /* The width of the communicated boundaries */
    real     cutoff_mbody;     /**< Cut-off for multi-body interactions, also 2-body bonded when \p cutoff_mbody > \p cutoff */
    real     cutoff;           /**< Cut-off for non-bonded/2-body interactions */
    rvec     cellsize_min;     /**< The minimum guaranteed cell-size, Cartesian indexing */
    rvec     cellsize_min_dlb; /**< The minimum guaranteed cell-size with dlb=auto */
    real     cellsize_limit;   /**< The lower limit for the DD cell size with DLB */
    gmx_bool bVacDLBNoLimit;   /**< Effectively no NB cut-off limit with DLB for systems without PBC? */

    /** With PME load balancing we set limits on DLB */
    gmx_bool bPMELoadBalDLBLimits;
    /** DLB needs to take into account that we want to allow this maximum
     *  cut-off (for PME load balancing), this could limit cell boundaries.
     */
    real PMELoadBal_max_cutoff;

    ivec tric_dir; /**< tric_dir from \p gmx_domdec_box_t is only stored here because dd_get_ns_ranges needs it */
    rvec box0;     /**< Box lower corner, required with dims without pbc and -gcom */
    rvec box_size; /**< Box size, required with dims without pbc and -gcom */

    rvec cell_x0;  /**< The DD cell lower corner, in triclinic space */
    rvec cell_x1;  /**< The DD cell upper corner, in triclinic space */

    rvec old_cell_x0; /**< The old \p cell_x0, to check cg displacements */
    rvec old_cell_x1; /**< The old \p cell_x1, to check cg displacements */

    /** The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * the cell boundaries of neighboring cells for staggered grids when using
     * dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];    /**< Zone limits for dim 1 with staggered grids */
    gmx_ddzone_t zone_d2[2][2]; /**< Zone limits for dim 2 with staggered grids */

    /** The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /** The maximum number of cells to communicate with in one dimension */
    int                   maxpulse;

    /** Which cg distribution is stored on the master node,
     *  stored as DD partitioning call count.
     */
    int64_t master_cg_ddp_count;

    /** The number of cg's received from the direct neighbors */
    int zone_ncg1[DD_MAXZONE];

    /** The atom ranges in the local state */
    DDAtomRanges atomRanges;

    /** Array for signalling if atoms have moved to another domain */
    std::vector<int> movedBuffer;

    /** Communication int buffer for general use */
    DDBuffer<int> intBuffer;

    /** Communication rvec buffer for general use */
    DDBuffer<gmx::RVec> rvecBuffer;

    /* Temporary storage for thread parallel communication setup */
    std::vector<dd_comm_setup_work_t> dth; /**< Thread-local work data */

    /* Communication buffer only used with multiple grid pulses */
    DDBuffer<gmx::RVec> rvecBuffer2; /**< Another rvec comm. buffer */

    /* Communication buffers for local redistribution */
    std::array<std::vector<int>, DIM*2>       cggl_flag;  /**< Charge group flag comm. buffers */
    std::array<std::vector<gmx::RVec>, DIM*2> cgcm_state; /**< Charge group center comm. buffers */

    /* Cell sizes for dynamic load balancing */
    std::vector<DDCellsizesWithDlb> cellsizesWithDlb;

    /* Stuff for load communication */
    gmx_bool       bRecordLoad;      /**< Should we record the load */
    domdec_load_t *load;             /**< The recorded load data */
    int            nrank_gpu_shared; /**< The number of MPI ranks sharing the GPU our rank is using */
#if GMX_MPI
    MPI_Comm      *mpi_comm_load;       /**< The MPI load communicator */
    MPI_Comm       mpi_comm_gpu_shared; /**< The MPI load communicator for ranks sharing a GPU */
#endif

    /* Information for managing the dynamic load balancing */
    int dlb_scale_lim; /**< Maximum DLB scaling per load balancing step in percent */

    BalanceRegion *balanceRegion; /**< Struct for timing the force load balancing region */

    /* Cycle counters over nstlist steps */
    float cycl[ddCyclNr];     /**< Total cycles counted */
    int   cycl_n[ddCyclNr];   /**< The number of cycle recordings */
    float cycl_max[ddCyclNr]; /**< The maximum cycle count */

    /** Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
    int    eFlop;
    double flop;   /**< Total flops counted */
    int    flop_n; /**< The number of flop recordings */
    /** How many times did we have load measurements */
    int    n_load_have;
    /** How many times have we collected the load measurements */
    int    n_load_collect;

    /* Cycle count history for DLB checks */
    float   cyclesPerStepBeforeDLB;     /**< The averaged cycles per step over the last nstlist steps before turning on DLB */
    float   cyclesPerStepDlbExpAverage; /**< The running average of the cycles per step during DLB */
    bool    haveTurnedOffDlb;           /**< Have we turned off DLB (after turning DLB on)? */
    int64_t dlbSlowerPartitioningCount; /**< The DD step at which we last measured that DLB off was faster than DLB on, 0 if there was no such step */

    /* Statistics for atoms */
    double sum_nat[static_cast<int>(DDAtomRanges::Type::Number)]; /**< The atoms per range, summed over the steps */

    /* Statistics for calls and times */
    int    ndecomp;   /**< The number of partitioning calls */
    int    nload;     /**< The number of load recordings */
    double load_step; /**< Total MD step time */
    double load_sum;  /**< Total PP force time */
    double load_max;  /**< Max \p load_sum over the ranks */
    ivec   load_lim;  /**< Was load balancing limited, per DD dim */
    double load_mdf;  /**< Total time on PP done during PME overlap time */
    double load_pme;  /**< Total time on our PME-only rank */

    /** The last partition step */
    int64_t partition_step;

    /* Debugging */
    int nstDDDump;     /**< Step interval for dumping the local+non-local atoms to pdb */
    int nstDDDumpGrid; /**< Step interval for dumping the DD grid to pdb */
    int DD_debug;      /**< DD debug print level: 0, 1, 2 */
};

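/* Illustrative sketch, not part of the original header; the function name is
 * hypothetical. It shows how the halo communication setup above is laid out:
 * one gmx_domdec_comm_dim_t per decomposed dimension in cd[], each holding
 * one gmx_domdec_ind_t per pulse.
 */
static inline int exampleCountHaloPulses(const gmx_domdec_t *dd)
{
    int numPulsesTotal = 0;

    for (int d = 0; d < dd->ndim; d++)
    {
        numPulsesTotal += dd->comm->cd[d].numPulses();
    }

    return numPulsesTotal;
}
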
/*! \brief DD zone permutation
 *
 * Zone permutation from the Cartesian x-major/z-minor order to an order
 * that leads to consecutive charge groups for neighbor searching.
 * TODO: remove when the group scheme is removed
 */
static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };

/*! \brief DD zone reordering to Cartesian order
 *
 * Index to reorder the zones such that they end up in Cartesian order
 * with dimension index 0 major and dimension index 2 minor.
 */
static const int zone_reorder_cartesian[DD_MAXZONE] = { 0, 1, 3, 2, 5, 4, 6, 7 };

/* dd_zo and dd_zp3 are set up such that i zones with non-zero
 * components see only j zones with that component 0.
 */

/*! \brief Returns the DD cut-off distance for multi-body interactions */
real dd_cutoff_multibody(const gmx_domdec_t *dd);

/*! \brief Returns the DD cut-off distance for two-body interactions */
real dd_cutoff_twobody(const gmx_domdec_t *dd);

/*! \brief Returns the domain index given the number of domains and the domain coordinates
 *
 * This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
static inline int dd_index(const ivec numDomains,
                           const ivec domainCoordinates)
{
    return ((domainCoordinates[XX]*numDomains[YY] + domainCoordinates[YY])*numDomains[ZZ]) + domainCoordinates[ZZ];
}

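/* Worked example (illustrative, not part of the original header; the function
 * name is hypothetical): for a 4x3x2 domain grid, the domain at coordinates
 * (1, 2, 1) gets index ((1*3 + 2)*2) + 1 = 11, i.e. x-major, z-minor ordering.
 */
static inline int exampleDdIndex()
{
    const ivec numDomains        = { 4, 3, 2 };
    const ivec domainCoordinates = { 1, 2, 1 };

    return dd_index(numDomains, domainCoordinates); /* returns 11 */
}
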
/*! \brief Returns the size of the buffer to hold fractional cell boundaries for DD dimension index dimIndex */
static inline int ddCellFractionBufferSize(const gmx_domdec_t *dd,
                                           int                 dimIndex)
{
    return dd->nc[dd->dim[dimIndex]] + 1 + dimIndex*2 + 1 + dimIndex;
}

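/* Worked example (illustrative): with dd->nc[dd->dim[0]] = 8 domains along
 * the first DD dimension, ddCellFractionBufferSize(dd, 0) = 8 + 1 + 0 + 1 + 0
 * = 10 elements: the 8 + 1 fractional cell boundaries plus room for
 * additional per-dimension DLB state.
 */
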
/*! \brief Maximum number of ranks for using send/recv for state scattering and gathering
 *
 * Use separate MPI send and receive commands
 * when #ranks <= c_maxNumRanksUseSendRecvForScatterAndGather.
 * This saves memory (and some copying for small #ranks).
 * For high parallelization scatter and gather calls are used.
 */
static constexpr int c_maxNumRanksUseSendRecvForScatterAndGather = 4;

/*! \brief Make DD cells larger by this factor than the limit to avoid rounding issues */
static constexpr double DD_CELL_MARGIN = 1.0001;

/*! \brief Factor for checking DD cell size limitation during DLB, should be in between 1 and DD_CELL_MARGIN */
static constexpr double DD_CELL_MARGIN2 = 1.00005;

/*! \brief With pressure scaling, keep cell sizes 2% above the limit to allow for some scaling */
static constexpr double DD_PRES_SCALE_MARGIN = 1.02;

#endif