2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_nbnxm
46 #include "pairlist_tuning.h"
55 #include "gromacs/domdec/domdec.h"
56 #include "gromacs/hardware/cpuinfo.h"
57 #include "gromacs/math/vec.h"
58 #include "gromacs/mdlib/calc_verletbuf.h"
59 #include "gromacs/mdtypes/commrec.h"
60 #include "gromacs/mdtypes/inputrec.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/multipletimestepping.h"
63 #include "gromacs/mdtypes/state.h"
64 #include "gromacs/pbcutil/pbc.h"
65 #include "gromacs/topology/topology.h"
66 #include "gromacs/utility/cstringutil.h"
67 #include "gromacs/utility/fatalerror.h"
68 #include "gromacs/utility/gmxassert.h"
69 #include "gromacs/utility/logger.h"
70 #include "gromacs/utility/strconvert.h"
71 #include "gromacs/utility/stringutil.h"
73 #include "nbnxm_geometry.h"
74 #include "pairlistsets.h"
76 /*! \brief Returns if we can (heuristically) change nstlist and rlist
78 * \param [in] ir The input parameter record
80 static bool supportsDynamicPairlistGenerationInterval(const t_inputrec& ir)
82 return ir.cutoff_scheme == CutoffScheme::Verlet && EI_DYNAMICS(ir.eI)
83 && !(EI_MD(ir.eI) && ir.etc == TemperatureCoupling::No) && ir.verletbuf_tol > 0;
86 /*! \brief Cost of non-bonded kernels
88 * We determine the extra cost of the non-bonded kernels compared to
89 * a reference nstlist value of 10 (which is the default in grompp).
91 static const int nbnxnReferenceNstlist = 10;
92 //! The values to try when switching
93 const int nstlist_try[] = { 20, 25, 40, 50, 80, 100 };
94 //! Number of elements in the neighborsearch list trials.
95 #define NNSTL (sizeof(nstlist_try) / sizeof(nstlist_try[0]))
96 /* Increase nstlist until the size of the pair-list increased by
97 * \p c_nbnxnListSizeFactor??? or more, but never more than
98 * \p c_nbnxnListSizeFactor??? + \p c_nbnxnListSizeFactorMargin.
99 * Since we have dynamic pair list pruning, the force kernel cost depends
100 * only very weakly on nstlist. It depends strongly on nstlistPrune.
101 * Increasing nstlist mainly affects the cost of the pair search (down due
102 * to lower frequency, up due to larger list) and the list pruning kernel.
103 * We increase nstlist conservatively with regard to kernel performance.
104 * In serial the search cost is not high and thus we don't gain much by
105 * increasing nstlist a lot. In parallel the MPI and CPU-GPU communication
106 * volume as well as the communication buffer preparation and reduction time
107 * increase quickly with rlist and thus nslist. Therefore we should avoid
108 * large nstlist, even if that also reduces the domain decomposition cost.
109 * With GPUs we perform the dynamic pruning in a rolling fashion and this
110 * overlaps with the update on the CPU, which allows even larger nstlist.
112 // CPU: pair-search is a factor ~1.5 slower than the non-bonded kernel.
113 //! Target pair-list size increase ratio for CPU
114 static const float c_nbnxnListSizeFactorCpu = 1.25;
115 // Intel KNL: pair-search is a factor ~2-3 slower than the non-bonded kernel.
116 //! Target pair-list size increase ratio for Intel KNL
117 static const float c_nbnxnListSizeFactorIntelXeonPhi = 1.4;
118 // GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel.
119 //! Target pair-list size increase ratio for GPU
120 static const float c_nbnxnListSizeFactorGPU = 1.4;
121 //! Never increase the size of the pair-list more than the factor above plus this margin
122 static const float c_nbnxnListSizeFactorMargin = 0.1;
124 void increaseNstlist(FILE* fp,
128 const gmx_mtop_t* mtop,
130 bool useOrEmulateGpuForNonbondeds,
131 const gmx::CpuInfo& cpuinfo)
133 if (!EI_DYNAMICS(ir->eI))
135 /* Can only increase nstlist with dynamics */
139 size_t nstlist_ind = 0;
140 const char* nstl_gpu =
141 "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe "
142 "optimum depends on your CPU and GPU resources.\nYou might want to try several "
144 const char* nve_err = "Can not increase nstlist because an NVE ensemble is used";
145 const char* vbd_err =
146 "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
147 const char* box_err = "Can not increase nstlist because the box is too small";
148 const char* dd_err = "Can not increase nstlist because of domain decomposition limitations";
151 /* When most of the computation, and in particular the non-bondeds is only
152 * performed every ir->mtsFactor steps due to multiple time stepping,
153 * we scale all nstlist values by this factor.
155 const int mtsFactor = gmx::nonbondedMtsFactor(*ir);
157 if (nstlist_cmdline <= 0)
159 if (ir->nstlist <= mtsFactor)
161 /* The user probably set nstlist<=mtsFactor for a reason,
162 * don't mess with the settings, except when < mtsFactor.
164 ir->nstlist = mtsFactor;
169 /* With a GPU and fixed nstlist suggest tuning nstlist */
170 if (fp != nullptr && useOrEmulateGpuForNonbondeds && ir->nstlist < nstlist_try[0] * mtsFactor
171 && !supportsDynamicPairlistGenerationInterval(*ir))
173 fprintf(fp, nstl_gpu, ir->nstlist);
177 while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind] * mtsFactor)
181 if (nstlist_ind == NNSTL)
183 /* There are no larger nstlist value to try */
188 if (EI_MD(ir->eI) && ir->etc == TemperatureCoupling::No)
192 fprintf(stderr, "%s\n", nve_err);
196 fprintf(fp, "%s\n", nve_err);
202 if (ir->verletbuf_tol == 0 && useOrEmulateGpuForNonbondeds)
205 "You are using an old tpr file with a GPU, please generate a new tpr file with "
206 "an up to date version of grompp");
209 if (ir->verletbuf_tol < 0)
213 fprintf(stderr, "%s\n", vbd_err);
217 fprintf(fp, "%s\n", vbd_err);
223 GMX_RELEASE_ASSERT(supportsDynamicPairlistGenerationInterval(*ir),
224 "In all cases that do not support dynamic nstlist, we should have returned "
225 "with an appropriate message above");
227 const bool runningOnXeonPhi = (cpuinfo.brandString().find("Xeon Phi") != std::string::npos);
228 const float listfac_ok = useOrEmulateGpuForNonbondeds ? c_nbnxnListSizeFactorGPU
229 : runningOnXeonPhi ? c_nbnxnListSizeFactorIntelXeonPhi
230 : c_nbnxnListSizeFactorCpu;
231 float listfac_max = listfac_ok + c_nbnxnListSizeFactorMargin;
233 const int nstlist_orig = ir->nstlist;
234 if (nstlist_cmdline > 0)
238 sprintf(buf, "Getting nstlist=%d from command line option", nstlist_cmdline);
240 ir->nstlist = nstlist_cmdline;
243 ListSetupType listType =
244 (useOrEmulateGpuForNonbondeds ? ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported);
245 VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
247 /* Allow rlist to make the list a given factor larger than the list
248 * would be with the reference value for nstlist (10*mtsFactor).
250 int nstlist_prev = ir->nstlist;
251 ir->nstlist = nbnxnReferenceNstlist * mtsFactor;
252 const real rlistWithReferenceNstlist =
253 calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
254 ir->nstlist = nstlist_prev;
256 /* Determine the pair list size increase due to zero interactions */
257 real rlist_inc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j, mtop->natoms / det(box));
258 real rlist_ok = (rlistWithReferenceNstlist + rlist_inc) * std::cbrt(listfac_ok) - rlist_inc;
259 real rlist_max = (rlistWithReferenceNstlist + rlist_inc) * std::cbrt(listfac_max) - rlist_inc;
262 fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n", rlist_inc, rlist_ok, rlist_max);
265 nstlist_prev = nstlist_orig;
266 real rlist_prev = ir->rlist;
268 bool bBox = false, bDD = false, bCont = false;
271 if (nstlist_cmdline <= 0)
273 ir->nstlist = nstlist_try[nstlist_ind] * mtsFactor;
276 /* Set the pair-list buffer size in ir */
277 rlist_new = calcVerletBufferSize(
278 *mtop, det(box), *ir, ir->nstlist, ir->nstlist - mtsFactor, -1, listSetup);
280 /* Does rlist fit in the box? */
281 bBox = (gmx::square(rlist_new) < max_cutoff2(ir->pbcType, box));
283 if (bBox && haveDDAtomOrdering(*cr))
285 /* Currently (as of July 2020), the code in this if clause is never executed.
286 * increaseNstlist(...) is only called from prepare_verlet_scheme, which in turns
287 * gets called by the runner _before_ setting up DD. haveDDAtomOrdering(*cr) will
288 * therefore always be false here. See #3334.
290 /* Check if rlist fits in the domain decomposition */
291 if (inputrec2nboundeddim(ir) < DIM)
294 "Changing nstlist with domain decomposition and unbounded dimensions is "
295 "not implemented yet");
297 bDD = change_dd_cutoff(cr, box, gmx::ArrayRef<const gmx::RVec>(), rlist_new);
303 "nstlist %d rlist %.3f bBox %s bDD %s\n",
306 gmx::boolToString(bBox),
307 gmx::boolToString(bDD));
312 if (nstlist_cmdline <= 0)
314 if (bBox && bDD && rlist_new <= rlist_max)
316 /* Increase nstlist */
317 nstlist_prev = ir->nstlist;
318 rlist_prev = rlist_new;
319 bCont = (nstlist_ind + 1 < NNSTL && rlist_new < rlist_ok);
323 /* Stick with the previous nstlist */
324 ir->nstlist = nstlist_prev;
325 rlist_new = rlist_prev;
336 gmx_warning("%s", !bBox ? box_err : dd_err);
339 fprintf(fp, "\n%s\n", !bBox ? box_err : dd_err);
341 ir->nstlist = nstlist_orig;
343 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
346 "Changing nstlist from %d to %d, rlist from %g to %g",
353 fprintf(stderr, "%s\n\n", buf);
357 fprintf(fp, "%s\n\n", buf);
359 ir->rlist = rlist_new;
363 /*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
365 * Ideally we should auto-tune this value.
366 * Not considering overheads, 1 would be the ideal value. But 2 seems
367 * a reasonable compromise that reduces GPU kernel launch overheads and
368 * also avoids inefficiency on large GPUs when pruning small lists.
369 * Because with domain decomposition we alternate local/non-local pruning
370 * at even/odd steps, which gives a period of 2, this value currenly needs
371 * to be 2, which is indirectly asserted when the GPU pruning is dispatched
372 * during the force evaluation.
374 static const int c_nbnxnGpuRollingListPruningInterval = 2;
376 /*! \brief The minimum nstlist for dynamic pair list pruning.
378 * In most cases going lower than 4 will lead to a too high pruning cost.
379 * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
381 static const int c_nbnxnDynamicListPruningMinLifetime = 4;
383 /*! \brief Set the dynamic pairlist pruning parameters in \p ic
385 * \param[in] inputrec The input parameter record
386 * \param[in] mtop The global topology
387 * \param[in] box The unit cell
388 * \param[in] useGpuList Tells if we are using a GPU type pairlist
389 * \param[in] listSetup The nbnxn pair list setup
390 * \param[in] userSetNstlistPrune The user set ic->nstlistPrune (using an env.var.)
391 * \param[in] interactionConst The nonbonded interactions constants
392 * \param[in,out] listParams The list setup parameters
394 static void setDynamicPairlistPruningParameters(const t_inputrec& inputrec,
395 const gmx_mtop_t& mtop,
397 const bool useGpuList,
398 const VerletbufListSetup& listSetup,
399 const bool userSetNstlistPrune,
400 const interaction_const_t& interactionConst,
401 PairlistParams* listParams)
403 /* When applying multiple time stepping to the non-bonded forces,
404 * we only compute them every mtsFactor steps, so all parameters here
405 * should be a multiple of mtsFactor.
407 listParams->mtsFactor = gmx::nonbondedMtsFactor(inputrec);
409 const int mtsFactor = listParams->mtsFactor;
411 GMX_RELEASE_ASSERT(inputrec.nstlist % mtsFactor == 0,
412 "nstlist should be a multiple of mtsFactor");
414 listParams->lifetime = inputrec.nstlist - mtsFactor;
416 /* When nstlistPrune was set by the user, we need to execute one loop
417 * iteration to determine rlistInner.
418 * Otherwise we compute rlistInner and increase nstlist as long as
419 * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
421 const real interactionCutoff = std::max(interactionConst.rcoulomb, interactionConst.rvdw);
422 int tunedNstlistPrune = listParams->nstlistPrune;
425 /* Dynamic pruning on the GPU is performed on the list for
426 * the next step on the coordinates of the current step,
427 * so the list lifetime is nstlistPrune (not the usual nstlist-mtsFactor).
429 int listLifetime = tunedNstlistPrune - (useGpuList ? 0 : mtsFactor);
430 listParams->nstlistPrune = tunedNstlistPrune;
431 listParams->rlistInner = calcVerletBufferSize(
432 mtop, det(box), inputrec, tunedNstlistPrune, listLifetime, -1, listSetup);
434 /* On the GPU we apply the dynamic pruning in a rolling fashion
435 * every c_nbnxnGpuRollingListPruningInterval steps,
436 * so keep nstlistPrune a multiple of the interval.
438 tunedNstlistPrune += (useGpuList ? c_nbnxnGpuRollingListPruningInterval : 1) * mtsFactor;
439 } while (!userSetNstlistPrune && tunedNstlistPrune < inputrec.nstlist
440 && listParams->rlistInner == interactionCutoff);
442 if (userSetNstlistPrune)
444 listParams->useDynamicPruning = true;
448 /* Determine the pair list size increase due to zero interactions */
449 real rlistInc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j, mtop.natoms / det(box));
451 /* Dynamic pruning is only useful when the inner list is smaller than
452 * the outer. The factor 0.99 ensures at least 3% list size reduction.
454 * With dynamic pruning on the CPU we prune after updating,
455 * so nstlistPrune=nstlist-1 would add useless extra work.
456 * With the GPU there will probably be more overhead than gain
457 * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
458 * Note that in such cases the first sub-condition is likely also false.
460 listParams->useDynamicPruning =
461 (listParams->rlistInner + rlistInc < 0.99 * (listParams->rlistOuter + rlistInc)
462 && listParams->nstlistPrune < listParams->lifetime);
465 if (!listParams->useDynamicPruning)
467 /* These parameters should not be used, but set them to useful values */
468 listParams->nstlistPrune = -1;
469 listParams->rlistInner = listParams->rlistOuter;
473 /*! \brief Returns a string describing the setup of a single pair-list
475 * \param[in] listName Short name of the list, can be ""
476 * \param[in] nstList The list update interval in steps
477 * \param[in] nstListForSpacing Update interval for setting the number characters for printing \p nstList
478 * \param[in] rList List cut-off radius
479 * \param[in] interactionCutoff The interaction cut-off, use for printing the list buffer size
481 static std::string formatListSetup(const std::string& listName,
483 int nstListForSpacing,
485 real interactionCutoff)
487 std::string listSetup = " ";
488 if (!listName.empty())
490 listSetup += listName + " list: ";
492 listSetup += "updated every ";
493 // Make the shortest int format string that fits nstListForSpacing
494 std::string nstListFormat =
495 "%" + gmx::formatString("%zu", gmx::formatString("%d", nstListForSpacing).size()) + "d";
496 listSetup += gmx::formatString(nstListFormat.c_str(), nstList);
497 listSetup += gmx::formatString(
498 " steps, buffer %.3f nm, rlist %.3f nm\n", rList - interactionCutoff, rList);
503 void setupDynamicPairlistPruning(const gmx::MDLogger& mdlog,
504 const t_inputrec& inputrec,
505 const gmx_mtop_t& mtop,
507 const interaction_const_t& interactionConst,
508 PairlistParams* listParams)
510 GMX_RELEASE_ASSERT(listParams->rlistOuter > 0, "With the nbnxn setup rlist should be > 0");
512 /* Initialize the parameters to no dynamic list pruning */
513 listParams->useDynamicPruning = false;
515 const VerletbufListSetup ls = { IClusterSizePerListType[listParams->pairlistType],
516 JClusterSizePerListType[listParams->pairlistType] };
518 /* Currently emulation mode does not support dual pair-lists */
519 const bool useGpuList = sc_isGpuPairListType[listParams->pairlistType];
521 if (supportsDynamicPairlistGenerationInterval(inputrec) && getenv("GMX_DISABLE_DYNAMICPRUNING") == nullptr)
523 /* Note that nstlistPrune can have any value independently of nstlist.
524 * Actually applying rolling pruning is only useful when
525 * nstlistPrune < nstlist -1
527 char* env = getenv("GMX_NSTLIST_DYNAMICPRUNING");
528 bool userSetNstlistPrune = (env != nullptr);
530 if (userSetNstlistPrune)
533 listParams->nstlistPrune = strtol(env, &end, 10);
534 if (!end || (*end != 0)
535 || !(listParams->nstlistPrune > 0 && listParams->nstlistPrune < inputrec.nstlist))
538 "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s, should be > 0 "
545 static_assert(c_nbnxnDynamicListPruningMinLifetime % c_nbnxnGpuRollingListPruningInterval == 0,
546 "c_nbnxnDynamicListPruningMinLifetime sets the starting value for "
547 "nstlistPrune, which should be divisible by the rolling pruning interval "
548 "for efficiency reasons.");
550 // TODO: Use auto-tuning to determine nstlistPrune
551 listParams->nstlistPrune = c_nbnxnDynamicListPruningMinLifetime;
554 setDynamicPairlistPruningParameters(
555 inputrec, mtop, box, useGpuList, ls, userSetNstlistPrune, interactionConst, listParams);
557 if (listParams->useDynamicPruning && useGpuList)
559 /* Note that we can round down here. This makes the effective
560 * rolling pruning interval slightly shorter than nstlistTune,
561 * thus giving correct results, but a slightly lower efficiency.
563 GMX_RELEASE_ASSERT(listParams->nstlistPrune >= c_nbnxnGpuRollingListPruningInterval,
564 ("With dynamic list pruning on GPUs pruning frequency must be at "
565 "least as large as the rolling pruning interval ("
566 + std::to_string(c_nbnxnGpuRollingListPruningInterval) + ").")
568 listParams->numRollingPruningParts =
569 listParams->nstlistPrune / c_nbnxnGpuRollingListPruningInterval;
573 listParams->numRollingPruningParts = 1;
579 const real interactionCutoff = std::max(interactionConst.rcoulomb, interactionConst.rvdw);
580 if (listParams->useDynamicPruning)
582 mesg += gmx::formatString(
583 "Using a dual %dx%d pair-list setup updated with dynamic%s pruning:\n",
586 listParams->numRollingPruningParts > 1 ? ", rolling" : "");
587 mesg += formatListSetup(
588 "outer", inputrec.nstlist, inputrec.nstlist, listParams->rlistOuter, interactionCutoff);
589 mesg += formatListSetup(
590 "inner", listParams->nstlistPrune, inputrec.nstlist, listParams->rlistInner, interactionCutoff);
594 mesg += gmx::formatString("Using a %dx%d pair-list setup:\n", ls.cluster_size_i, ls.cluster_size_j);
595 mesg += formatListSetup(
596 "", inputrec.nstlist, inputrec.nstlist, listParams->rlistOuter, interactionCutoff);
598 if (supportsDynamicPairlistGenerationInterval(inputrec))
600 const VerletbufListSetup listSetup1x1 = { 1, 1 };
601 const real rlistOuter = calcVerletBufferSize(
602 mtop, det(box), inputrec, inputrec.nstlist, inputrec.nstlist - 1, -1, listSetup1x1);
603 real rlistInner = rlistOuter;
604 if (listParams->useDynamicPruning)
606 int listLifeTime = listParams->nstlistPrune - (useGpuList ? 0 : 1);
607 rlistInner = calcVerletBufferSize(
608 mtop, det(box), inputrec, listParams->nstlistPrune, listLifeTime, -1, listSetup1x1);
611 mesg += gmx::formatString(
612 "At tolerance %g kJ/mol/ps per atom, equivalent classical 1x1 list would be:\n",
613 inputrec.verletbuf_tol);
614 if (listParams->useDynamicPruning)
616 mesg += formatListSetup(
617 "outer", inputrec.nstlist, inputrec.nstlist, rlistOuter, interactionCutoff);
618 mesg += formatListSetup(
619 "inner", listParams->nstlistPrune, inputrec.nstlist, rlistInner, interactionCutoff);
623 mesg += formatListSetup("", inputrec.nstlist, inputrec.nstlist, rlistOuter, interactionCutoff);
627 GMX_LOG(mdlog.info).asParagraph().appendText(mesg);