2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup __module_nb_verlet
46 #include "nbnxn_tuning.h"
56 #include "gromacs/domdec/domdec.h"
57 #include "gromacs/hardware/cpuinfo.h"
58 #include "gromacs/math/vec.h"
59 #include "gromacs/mdlib/calc_verletbuf.h"
60 #include "gromacs/mdlib/nb_verlet.h"
61 #include "gromacs/mdlib/nbnxn_search.h"
62 #include "gromacs/mdtypes/commrec.h"
63 #include "gromacs/mdtypes/inputrec.h"
64 #include "gromacs/mdtypes/interaction_const.h"
65 #include "gromacs/mdtypes/state.h"
66 #include "gromacs/pbcutil/pbc.h"
67 #include "gromacs/topology/topology.h"
68 #include "gromacs/utility/cstringutil.h"
69 #include "gromacs/utility/fatalerror.h"
70 #include "gromacs/utility/gmxassert.h"
72 /*! \brief Returns if we can (heuristically) change nstlist and rlist
74 * \param [in] ir The input parameter record
76 static bool supportsDynamicPairlistGenerationInterval(const t_inputrec &ir)
79 ir.cutoff_scheme == ecutsVERLET &&
81 !(EI_MD(ir.eI) && ir.etc == etcNO) &&
85 /*! \brief Cost of non-bonded kernels
87 * We determine the extra cost of the non-bonded kernels compared to
88 * a reference nstlist value of 10 (which is the default in grompp).
90 static const int nbnxnReferenceNstlist = 10;
91 //! The values to try when switching
92 const int nstlist_try[] = { 20, 25, 40, 50, 80, 100 };
93 //! Number of elements in the neighborsearch list trials.
94 #define NNSTL sizeof(nstlist_try)/sizeof(nstlist_try[0])
95 /* Increase nstlist until the size of the pair-list increased by
96 * \p c_nbnxnListSizeFactor??? or more, but never more than
97 * \p c_nbnxnListSizeFactor??? + \p c_nbnxnListSizeFactorMargin.
98 * Since we have dynamic pair list pruning, the force kernel cost depends
99 * only very weakly on nstlist. It depends strongly on nstlistPrune.
100 * Increasing nstlist mainly affects the cost of the pair search (down due
101 * to lower frequency, up due to larger list) and the list pruning kernel.
102 * We increase nstlist conservatively with regard to kernel performance.
103 * In serial the search cost is not high and thus we don't gain much by
104 * increasing nstlist a lot. In parallel the MPI and CPU-GPU communication
105 * volume as well as the communication buffer preparation and reduction time
106 * increase quickly with rlist and thus nslist. Therefore we should avoid
107 * large nstlist, even if that also reduces the domain decomposition cost.
108 * With GPUs we perform the dynamic pruning in a rolling fashion and this
109 * overlaps with the update on the CPU, which allows even larger nstlist.
111 // CPU: pair-search is a factor ~1.5 slower than the non-bonded kernel.
112 //! Target pair-list size increase ratio for CPU
113 static const float c_nbnxnListSizeFactorCpu = 1.25;
114 // Intel KNL: pair-search is a factor ~2-3 slower than the non-bonded kernel.
115 //! Target pair-list size increase ratio for Intel KNL
116 static const float c_nbnxnListSizeFactorIntelXeonPhi = 1.4;
117 // GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel.
118 //! Target pair-list size increase ratio for hybrid CPU-GPU
119 static const float c_nbnxnListSizeFactorGPU = 1.4;
120 //! Never increase the size of the pair-list more than the factor above plus this margin
121 static const float c_nbnxnListSizeFactorMargin = 0.1;
123 void increaseNstlist(FILE *fp, t_commrec *cr,
124 t_inputrec *ir, int nstlist_cmdline,
125 const gmx_mtop_t *mtop, matrix box,
126 bool useOrEmulateGpuForNonbondeds,
127 const gmx::CpuInfo &cpuinfo)
129 float listfac_ok, listfac_max;
130 int nstlist_orig, nstlist_prev;
131 verletbuf_list_setup_t ls;
132 real rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
133 real rlist_new, rlist_prev;
134 size_t nstlist_ind = 0;
135 gmx_bool bBox, bDD, bCont;
136 const char *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
137 const char *nve_err = "Can not increase nstlist because an NVE ensemble is used";
138 const char *vbd_err = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
139 const char *box_err = "Can not increase nstlist because the box is too small";
140 const char *dd_err = "Can not increase nstlist because of domain decomposition limitations";
143 if (nstlist_cmdline <= 0)
145 if (ir->nstlist == 1)
147 /* The user probably set nstlist=1 for a reason,
148 * don't mess with the settings.
153 if (fp != nullptr && useOrEmulateGpuForNonbondeds && ir->nstlist < nstlist_try[0])
155 fprintf(fp, nstl_gpu, ir->nstlist);
158 while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
162 if (nstlist_ind == NNSTL)
164 /* There are no larger nstlist value to try */
169 if (EI_MD(ir->eI) && ir->etc == etcNO)
173 fprintf(stderr, "%s\n", nve_err);
177 fprintf(fp, "%s\n", nve_err);
183 if (ir->verletbuf_tol == 0 && useOrEmulateGpuForNonbondeds)
185 gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
188 if (ir->verletbuf_tol < 0)
192 fprintf(stderr, "%s\n", vbd_err);
196 fprintf(fp, "%s\n", vbd_err);
202 if (useOrEmulateGpuForNonbondeds)
204 listfac_ok = c_nbnxnListSizeFactorGPU;
206 else if (cpuinfo.brandString().find("Xeon Phi") != std::string::npos)
208 listfac_ok = c_nbnxnListSizeFactorIntelXeonPhi;
212 listfac_ok = c_nbnxnListSizeFactorCpu;
214 listfac_max = listfac_ok + c_nbnxnListSizeFactorMargin;
216 nstlist_orig = ir->nstlist;
217 if (nstlist_cmdline > 0)
221 sprintf(buf, "Getting nstlist=%d from command line option",
224 ir->nstlist = nstlist_cmdline;
227 verletbuf_get_list_setup(TRUE, useOrEmulateGpuForNonbondeds, &ls);
229 /* Allow rlist to make the list a given factor larger than the list
230 * would be with the reference value for nstlist (10).
232 nstlist_prev = ir->nstlist;
233 ir->nstlist = nbnxnReferenceNstlist;
234 calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
235 -1, &ls, nullptr, &rlistWithReferenceNstlist);
236 ir->nstlist = nstlist_prev;
238 /* Determine the pair list size increase due to zero interactions */
239 rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
240 mtop->natoms/det(box));
241 rlist_ok = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_ok) - rlist_inc;
242 rlist_max = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_max) - rlist_inc;
245 fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
246 rlist_inc, rlist_ok, rlist_max);
249 nstlist_prev = nstlist_orig;
250 rlist_prev = ir->rlist;
253 if (nstlist_cmdline <= 0)
255 ir->nstlist = nstlist_try[nstlist_ind];
258 /* Set the pair-list buffer size in ir */
259 calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
261 /* Does rlist fit in the box? */
262 bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
264 if (bBox && DOMAINDECOMP(cr))
266 /* Check if rlist fits in the domain decomposition */
267 if (inputrec2nboundeddim(ir) < DIM)
269 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
272 copy_mat(box, state_tmp.box);
273 bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
278 fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
279 ir->nstlist, rlist_new, bBox, bDD);
284 if (nstlist_cmdline <= 0)
286 if (bBox && bDD && rlist_new <= rlist_max)
288 /* Increase nstlist */
289 nstlist_prev = ir->nstlist;
290 rlist_prev = rlist_new;
291 bCont = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
295 /* Stick with the previous nstlist */
296 ir->nstlist = nstlist_prev;
297 rlist_new = rlist_prev;
309 gmx_warning(!bBox ? box_err : dd_err);
312 fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
314 ir->nstlist = nstlist_orig;
316 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
318 sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
319 nstlist_orig, ir->nstlist,
320 ir->rlist, rlist_new);
323 fprintf(stderr, "%s\n\n", buf);
327 fprintf(fp, "%s\n\n", buf);
329 ir->rlist = rlist_new;
333 /*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
335 * Ideally we should auto-tune this value.
336 * Not considering overheads, 1 would be the ideal value. But 2 seems
337 * a reasonable compromise that reduces GPU kernel launch overheads and
338 * also avoids inefficiency on large GPUs when pruning small lists.
339 * Because with domain decomposition we alternate local/non-local pruning
340 * at even/odd steps, which gives a period of 2, this value currenly needs
341 * to be 2, which is indirectly asserted when the GPU pruning is dispatched
342 * during the force evaluation.
344 static const int c_nbnxnGpuRollingListPruningInterval = 2;
346 /*! \brief The minimum nstlist for dynamic pair list pruning.
348 * In most cases going lower than 4 will lead to a too high pruning cost.
349 * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
351 static const int c_nbnxnDynamicListPruningMinLifetime = 4;
353 /*! \brief Set the dynamic pairlist pruning parameters in \p ic
355 * \param[in] ir The input parameter record
356 * \param[in] mtop The global topology
357 * \param[in] box The unit cell
358 * \param[in] useGpu Tells if we are using a GPU for non-bondeds
359 * \param[in] listSetup The nbnxn pair list setup
360 * \param[in] userSetNstlistPrune The user set ic->nstlistPrune (using an env.var.)
361 * \param[in] ic The nonbonded interactions constants
362 * \param[in,out] listParams The list setup parameters
365 setDynamicPairlistPruningParameters(const t_inputrec *ir,
366 const gmx_mtop_t *mtop,
369 const verletbuf_list_setup_t &listSetup,
370 bool userSetNstlistPrune,
371 const interaction_const_t *ic,
372 NbnxnListParameters *listParams)
374 /* When nstlistPrune was set by the user, we need to execute one loop
375 * iteration to determine rlistInner.
376 * Otherwise we compute rlistInner and increase nstlist as long as
377 * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
379 const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
380 int tunedNstlistPrune = listParams->nstlistPrune;
383 /* Dynamic pruning on the GPU is performed on the list for
384 * the next step on the coordinates of the current step,
385 * so the list lifetime is nstlistPrune (not the usual nstlist-1).
387 int listLifetime = tunedNstlistPrune - (useGpu ? 0 : 1);
388 listParams->nstlistPrune = tunedNstlistPrune;
389 calc_verlet_buffer_size(mtop, det(box), ir,
390 tunedNstlistPrune, listLifetime,
391 -1, &listSetup, NULL,
392 &listParams->rlistInner);
394 /* On the GPU we apply the dynamic pruning in a rolling fashion
395 * every c_nbnxnGpuRollingListPruningInterval steps,
396 * so keep nstlistPrune a multiple of the interval.
398 tunedNstlistPrune += useGpu ? c_nbnxnGpuRollingListPruningInterval : 1;
400 while (!userSetNstlistPrune &&
401 tunedNstlistPrune < ir->nstlist &&
402 listParams->rlistInner == interactionCutoff);
404 if (userSetNstlistPrune)
406 listParams->useDynamicPruning = true;
410 /* Determine the pair list size increase due to zero interactions */
411 real rlistInc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j,
412 mtop->natoms/det(box));
414 /* Dynamic pruning is only useful when the inner list is smaller than
415 * the outer. The factor 0.99 ensures at least 3% list size reduction.
417 * With dynamic pruning on the CPU we prune after updating,
418 * so nstlistPrune=nstlist-1 would add useless extra work.
419 * With the GPU there will probably be more overhead than gain
420 * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
421 * Note that in such cases the first sub-condition is likely also false.
423 listParams->useDynamicPruning =
424 (listParams->rlistInner + rlistInc < 0.99*(listParams->rlistOuter + rlistInc) &&
425 listParams->nstlistPrune < ir->nstlist - 1);
428 if (!listParams->useDynamicPruning)
430 /* These parameters should not be used, but set them to useful values */
431 listParams->nstlistPrune = -1;
432 listParams->rlistInner = listParams->rlistOuter;
436 void setupDynamicPairlistPruning(FILE *fplog,
437 const t_inputrec *ir,
438 const gmx_mtop_t *mtop,
441 const interaction_const_t *ic,
442 NbnxnListParameters *listParams)
444 GMX_RELEASE_ASSERT(listParams->rlistOuter > 0, "With the nbnxn setup rlist should be > 0");
446 /* Initialize the parameters to no dynamic list pruning */
447 listParams->useDynamicPruning = false;
449 if (supportsDynamicPairlistGenerationInterval(*ir) &&
450 getenv("GMX_DISABLE_DYNAMICPRUNING") == NULL)
452 verletbuf_list_setup_t ls;
453 verletbuf_get_list_setup(TRUE, TRUE, &ls);
455 /* Note that nstlistPrune can have any value independently of nstlist.
456 * Actually applying rolling pruning is only useful when
457 * nstlistPrune < nstlist -1
459 char *env = getenv("GMX_NSTLIST_DYNAMICPRUNING");
460 bool userSetNstlistPrune = (env != NULL);
462 if (userSetNstlistPrune)
465 listParams->nstlistPrune = strtol(env, &end, 10);
466 if (!end || (*end != 0) ||
467 !(listParams->nstlistPrune > 0 && listParams->nstlistPrune < ir->nstlist))
469 gmx_fatal(FARGS, "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s, should be > 0 and < nstlist", env);
474 static_assert(c_nbnxnDynamicListPruningMinLifetime % c_nbnxnGpuRollingListPruningInterval == 0,
475 "c_nbnxnDynamicListPruningMinLifetime sets the starting value for nstlistPrune, which should be divisible by the rolling pruning interval for efficiency reasons.");
477 // TODO: Use auto-tuning to determine nstlistPrune
478 listParams->nstlistPrune = c_nbnxnDynamicListPruningMinLifetime;
481 setDynamicPairlistPruningParameters(ir, mtop, box, useGpu, ls,
482 userSetNstlistPrune, ic,
485 if (listParams->useDynamicPruning && useGpu)
487 /* Note that we can round down here. This makes the effective
488 * rolling pruning interval slightly shorter than nstlistTune,
489 * thus giving correct results, but a slightly lower efficiency.
491 GMX_RELEASE_ASSERT(listParams->nstlistPrune >= c_nbnxnGpuRollingListPruningInterval,
492 ( "With dynamic list pruning on GPUs pruning frequency must be at least as large as the rolling pruning interval (" +
493 std::to_string(c_nbnxnGpuRollingListPruningInterval) +
495 listParams->numRollingParts = listParams->nstlistPrune/c_nbnxnGpuRollingListPruningInterval;
499 listParams->numRollingParts = 1;
502 if (fplog && listParams->useDynamicPruning)
504 const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
506 "Using a dual pair-list setup updated with dynamic%s pruning:\n"
507 " outer list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n"
508 " inner list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n",
509 listParams->numRollingParts > 1 ? ", rolling" : "",
510 ir->nstlist, listParams->rlistOuter - interactionCutoff, listParams->rlistOuter,
511 listParams->nstlistPrune, listParams->rlistInner - interactionCutoff, listParams->rlistInner);