src/gromacs/mdlib/nbnxn_tuning.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
  39  *
  40  * \author Berk Hess <hess@kth.se>
  41  * \ingroup __module_nb_verlet
  42  */
  43
  44 #include "gmxpre.h"
  45
  46 #include "nbnxn_tuning.h"
  47
  48 #include <assert.h>
  49 #include <stdlib.h>
  50
  51 #include <cmath>
  52
  53 #include <algorithm>
  54 #include <string>
  55
  56 #include "gromacs/domdec/domdec.h"
  57 #include "gromacs/hardware/cpuinfo.h"
  58 #include "gromacs/math/vec.h"
  59 #include "gromacs/mdlib/calc_verletbuf.h"
  60 #include "gromacs/mdlib/nb_verlet.h"
  61 #include "gromacs/mdlib/nbnxn_search.h"
  62 #include "gromacs/mdtypes/commrec.h"
  63 #include "gromacs/mdtypes/inputrec.h"
  64 #include "gromacs/mdtypes/interaction_const.h"
  65 #include "gromacs/mdtypes/state.h"
  66 #include "gromacs/pbcutil/pbc.h"
  67 #include "gromacs/topology/topology.h"
  68 #include "gromacs/utility/cstringutil.h"
  69 #include "gromacs/utility/fatalerror.h"
  70 #include "gromacs/utility/gmxassert.h"
  71
  72 /*! \brief Returns if we can (heuristically) change nstlist and rlist
  73  *
  74  * \param [in] ir  The input parameter record
  75  */
  76 static bool supportsDynamicPairlistGenerationInterval(const t_inputrec &ir)
  77 {
  78     return
  79         ir.cutoff_scheme == ecutsVERLET &&
  80         EI_DYNAMICS(ir.eI) &&
  81         !(EI_MD(ir.eI) && ir.etc == etcNO) &&
  82         ir.verletbuf_tol > 0;
  83 }
  84
  85 /*! \brief Cost of non-bonded kernels
  86  *
  87  * We determine the extra cost of the non-bonded kernels compared to
  88  * a reference nstlist value of 10 (which is the default in grompp).
  89  */
  90 static const int    nbnxnReferenceNstlist = 10;
  91 //! The values to try when switching
  92 const int           nstlist_try[] = { 20, 25, 40, 50, 80, 100 };
  93 //! Number of elements in the neighborsearch list trials.
  94 #define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
  95 /* Increase nstlist until the size of the pair-list increased by
  96  * \p c_nbnxnListSizeFactor??? or more, but never more than
  97  * \p c_nbnxnListSizeFactor??? + \p c_nbnxnListSizeFactorMargin.
  98  * Since we have dynamic pair list pruning, the force kernel cost depends
  99  * only very weakly on nstlist. It depends strongly on nstlistPrune.
 100  * Increasing nstlist mainly affects the cost of the pair search (down due
 101  * to lower frequency, up due to larger list) and the list pruning kernel.
 102  * We increase nstlist conservatively with regard to kernel performance.
 103  * In serial the search cost is not high and thus we don't gain much by
 104  * increasing nstlist a lot. In parallel the MPI and CPU-GPU communication
 105  * volume as well as the communication buffer preparation and reduction time
 106  * increase quickly with rlist and thus nslist. Therefore we should avoid
 107  * large nstlist, even if that also reduces the domain decomposition cost.
 108  * With GPUs we perform the dynamic pruning in a rolling fashion and this
 109  * overlaps with the update on the CPU, which allows even larger nstlist.
 110  */
 111 // CPU: pair-search is a factor ~1.5 slower than the non-bonded kernel.
 112 //! Target pair-list size increase ratio for CPU
 113 static const float c_nbnxnListSizeFactorCpu           = 1.25;
 114 // Intel KNL: pair-search is a factor ~2-3 slower than the non-bonded kernel.
 115 //! Target pair-list size increase ratio for Intel KNL
 116 static const float c_nbnxnListSizeFactorIntelXeonPhi  = 1.4;
 117 // GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel.
 118 //! Target pair-list size increase ratio for hybrid CPU-GPU
 119 static const float c_nbnxnListSizeFactorGPU           = 1.4;
 120 //! Never increase the size of the pair-list more than the factor above plus this margin
 121 static const float c_nbnxnListSizeFactorMargin        = 0.1;
 122
 123 void increaseNstlist(FILE *fp, t_commrec *cr,
 124                      t_inputrec *ir, int nstlist_cmdline,
 125                      const gmx_mtop_t *mtop, matrix box,
 126                      bool useOrEmulateGpuForNonbondeds,
 127                      const gmx::CpuInfo &cpuinfo)
 128 {
 129     float                  listfac_ok, listfac_max;
 130     int                    nstlist_orig, nstlist_prev;
 131     verletbuf_list_setup_t ls;
 132     real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
 133     real                   rlist_new, rlist_prev;
 134     size_t                 nstlist_ind = 0;
 135     gmx_bool               bBox, bDD, bCont;
 136     const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 137     const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
 138     const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
 139     const char            *box_err  = "Can not increase nstlist because the box is too small";
 140     const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
 141     char                   buf[STRLEN];
 142
 143     if (nstlist_cmdline <= 0)
 144     {
 145         if (ir->nstlist == 1)
 146         {
 147             /* The user probably set nstlist=1 for a reason,
 148              * don't mess with the settings.
 149              */
 150             return;
 151         }
 152
 153         if (fp != nullptr && useOrEmulateGpuForNonbondeds && ir->nstlist < nstlist_try[0])
 154         {
 155             fprintf(fp, nstl_gpu, ir->nstlist);
 156         }
 157         nstlist_ind = 0;
 158         while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
 159         {
 160             nstlist_ind++;
 161         }
 162         if (nstlist_ind == NNSTL)
 163         {
 164             /* There are no larger nstlist value to try */
 165             return;
 166         }
 167     }
 168
 169     if (EI_MD(ir->eI) && ir->etc == etcNO)
 170     {
 171         if (MASTER(cr))
 172         {
 173             fprintf(stderr, "%s\n", nve_err);
 174         }
 175         if (fp != nullptr)
 176         {
 177             fprintf(fp, "%s\n", nve_err);
 178         }
 179
 180         return;
 181     }
 182
 183     if (ir->verletbuf_tol == 0 && useOrEmulateGpuForNonbondeds)
 184     {
 185         gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 186     }
 187
 188     if (ir->verletbuf_tol < 0)
 189     {
 190         if (MASTER(cr))
 191         {
 192             fprintf(stderr, "%s\n", vbd_err);
 193         }
 194         if (fp != nullptr)
 195         {
 196             fprintf(fp, "%s\n", vbd_err);
 197         }
 198
 199         return;
 200     }
 201
 202     if (useOrEmulateGpuForNonbondeds)
 203     {
 204         listfac_ok  = c_nbnxnListSizeFactorGPU;
 205     }
 206     else if (cpuinfo.brandString().find("Xeon Phi") != std::string::npos)
 207     {
 208         listfac_ok  = c_nbnxnListSizeFactorIntelXeonPhi;
 209     }
 210     else
 211     {
 212         listfac_ok  = c_nbnxnListSizeFactorCpu;
 213     }
 214     listfac_max     = listfac_ok + c_nbnxnListSizeFactorMargin;
 215
 216     nstlist_orig    = ir->nstlist;
 217     if (nstlist_cmdline > 0)
 218     {
 219         if (fp)
 220         {
 221             sprintf(buf, "Getting nstlist=%d from command line option",
 222                     nstlist_cmdline);
 223         }
 224         ir->nstlist = nstlist_cmdline;
 225     }
 226
 227     verletbuf_get_list_setup(TRUE, useOrEmulateGpuForNonbondeds, &ls);
 228
 229     /* Allow rlist to make the list a given factor larger than the list
 230      * would be with the reference value for nstlist (10).
 231      */
 232     nstlist_prev = ir->nstlist;
 233     ir->nstlist  = nbnxnReferenceNstlist;
 234     calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
 235                             -1, &ls, nullptr, &rlistWithReferenceNstlist);
 236     ir->nstlist  = nstlist_prev;
 237
 238     /* Determine the pair list size increase due to zero interactions */
 239     rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
 240                                               mtop->natoms/det(box));
 241     rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_ok) - rlist_inc;
 242     rlist_max = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_max) - rlist_inc;
 243     if (debug)
 244     {
 245         fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
 246                 rlist_inc, rlist_ok, rlist_max);
 247     }
 248
 249     nstlist_prev = nstlist_orig;
 250     rlist_prev   = ir->rlist;
 251     do
 252     {
 253         if (nstlist_cmdline <= 0)
 254         {
 255             ir->nstlist = nstlist_try[nstlist_ind];
 256         }
 257
 258         /* Set the pair-list buffer size in ir */
 259         calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
 260
 261         /* Does rlist fit in the box? */
 262         bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
 263         bDD  = TRUE;
 264         if (bBox && DOMAINDECOMP(cr))
 265         {
 266             /* Check if rlist fits in the domain decomposition */
 267             if (inputrec2nboundeddim(ir) < DIM)
 268             {
 269                 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 270             }
 271             t_state state_tmp;
 272             copy_mat(box, state_tmp.box);
 273             bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
 274         }
 275
 276         if (debug)
 277         {
 278             fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
 279                     ir->nstlist, rlist_new, bBox, bDD);
 280         }
 281
 282         bCont = FALSE;
 283
 284         if (nstlist_cmdline <= 0)
 285         {
 286             if (bBox && bDD && rlist_new <= rlist_max)
 287             {
 288                 /* Increase nstlist */
 289                 nstlist_prev = ir->nstlist;
 290                 rlist_prev   = rlist_new;
 291                 bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
 292             }
 293             else
 294             {
 295                 /* Stick with the previous nstlist */
 296                 ir->nstlist = nstlist_prev;
 297                 rlist_new   = rlist_prev;
 298                 bBox        = TRUE;
 299                 bDD         = TRUE;
 300             }
 301         }
 302
 303         nstlist_ind++;
 304     }
 305     while (bCont);
 306
 307     if (!bBox || !bDD)
 308     {
 309         gmx_warning(!bBox ? box_err : dd_err);
 310         if (fp != nullptr)
 311         {
 312             fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
 313         }
 314         ir->nstlist = nstlist_orig;
 315     }
 316     else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 317     {
 318         sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
 319                 nstlist_orig, ir->nstlist,
 320                 ir->rlist, rlist_new);
 321         if (MASTER(cr))
 322         {
 323             fprintf(stderr, "%s\n\n", buf);
 324         }
 325         if (fp != nullptr)
 326         {
 327             fprintf(fp, "%s\n\n", buf);
 328         }
 329         ir->rlist     = rlist_new;
 330     }
 331 }
 332
 333 /*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
 334  *
 335  * Ideally we should auto-tune this value.
 336  * Not considering overheads, 1 would be the ideal value. But 2 seems
 337  * a reasonable compromise that reduces GPU kernel launch overheads and
 338  * also avoids inefficiency on large GPUs when pruning small lists.
 339  * Because with domain decomposition we alternate local/non-local pruning
 340  * at even/odd steps, which gives a period of 2, this value currenly needs
 341  * to be 2, which is indirectly asserted when the GPU pruning is dispatched
 342  * during the force evaluation.
 343  */
 344 static const int c_nbnxnGpuRollingListPruningInterval = 2;
 345
 346 /*! \brief The minimum nstlist for dynamic pair list pruning.
 347  *
 348  * In most cases going lower than 4 will lead to a too high pruning cost.
 349  * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
 350  */
 351 static const int c_nbnxnDynamicListPruningMinLifetime = 4;
 352
 353 /*! \brief Set the dynamic pairlist pruning parameters in \p ic
 354  *
 355  * \param[in]     ir          The input parameter record
 356  * \param[in]     mtop        The global topology
 357  * \param[in]     box         The unit cell
 358  * \param[in]     useGpu      Tells if we are using a GPU for non-bondeds
 359  * \param[in]     listSetup   The nbnxn pair list setup
 360  * \param[in]     userSetNstlistPrune  The user set ic->nstlistPrune (using an env.var.)
 361  * \param[in] ic              The nonbonded interactions constants
 362  * \param[in,out] listParams  The list setup parameters
 363  */
 364 static void
 365 setDynamicPairlistPruningParameters(const t_inputrec             *ir,
 366                                     const gmx_mtop_t             *mtop,
 367                                     matrix                        box,
 368                                     gmx_bool                      useGpu,
 369                                     const verletbuf_list_setup_t &listSetup,
 370                                     bool                          userSetNstlistPrune,
 371                                     const interaction_const_t    *ic,
 372                                     NbnxnListParameters          *listParams)
 373 {
 374     /* When nstlistPrune was set by the user, we need to execute one loop
 375      * iteration to determine rlistInner.
 376      * Otherwise we compute rlistInner and increase nstlist as long as
 377      * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
 378      */
 379     const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
 380     int        tunedNstlistPrune = listParams->nstlistPrune;
 381     do
 382     {
 383         /* Dynamic pruning on the GPU is performed on the list for
 384          * the next step on the coordinates of the current step,
 385          * so the list lifetime is nstlistPrune (not the usual nstlist-1).
 386          */
 387         int listLifetime         = tunedNstlistPrune - (useGpu ? 0 : 1);
 388         listParams->nstlistPrune = tunedNstlistPrune;
 389         calc_verlet_buffer_size(mtop, det(box), ir,
 390                                 tunedNstlistPrune, listLifetime,
 391                                 -1, &listSetup, NULL,
 392                                 &listParams->rlistInner);
 393
 394         /* On the GPU we apply the dynamic pruning in a rolling fashion
 395          * every c_nbnxnGpuRollingListPruningInterval steps,
 396          * so keep nstlistPrune a multiple of the interval.
 397          */
 398         tunedNstlistPrune += useGpu ? c_nbnxnGpuRollingListPruningInterval : 1;
 399     }
 400     while (!userSetNstlistPrune &&
 401            tunedNstlistPrune < ir->nstlist &&
 402            listParams->rlistInner == interactionCutoff);
 403
 404     if (userSetNstlistPrune)
 405     {
 406         listParams->useDynamicPruning = true;
 407     }
 408     else
 409     {
 410         /* Determine the pair list size increase due to zero interactions */
 411         real rlistInc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j,
 412                                                       mtop->natoms/det(box));
 413
 414         /* Dynamic pruning is only useful when the inner list is smaller than
 415          * the outer. The factor 0.99 ensures at least 3% list size reduction.
 416          *
 417          * With dynamic pruning on the CPU we prune after updating,
 418          * so nstlistPrune=nstlist-1 would add useless extra work.
 419          * With the GPU there will probably be more overhead than gain
 420          * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
 421          * Note that in such cases the first sub-condition is likely also false.
 422          */
 423         listParams->useDynamicPruning =
 424             (listParams->rlistInner + rlistInc < 0.99*(listParams->rlistOuter + rlistInc) &&
 425              listParams->nstlistPrune < ir->nstlist - 1);
 426     }
 427
 428     if (!listParams->useDynamicPruning)
 429     {
 430         /* These parameters should not be used, but set them to useful values */
 431         listParams->nstlistPrune  = -1;
 432         listParams->rlistInner    = listParams->rlistOuter;
 433     }
 434 }
 435
 436 void setupDynamicPairlistPruning(FILE                      *fplog,
 437                                  const t_inputrec          *ir,
 438                                  const gmx_mtop_t          *mtop,
 439                                  matrix                     box,
 440                                  bool                       useGpu,
 441                                  const interaction_const_t *ic,
 442                                  NbnxnListParameters       *listParams)
 443 {
 444     GMX_RELEASE_ASSERT(listParams->rlistOuter > 0, "With the nbnxn setup rlist should be > 0");
 445
 446     /* Initialize the parameters to no dynamic list pruning */
 447     listParams->useDynamicPruning = false;
 448
 449     if (supportsDynamicPairlistGenerationInterval(*ir) &&
 450         getenv("GMX_DISABLE_DYNAMICPRUNING") == NULL)
 451     {
 452         verletbuf_list_setup_t ls;
 453         verletbuf_get_list_setup(TRUE, TRUE, &ls);
 454
 455         /* Note that nstlistPrune can have any value independently of nstlist.
 456          * Actually applying rolling pruning is only useful when
 457          * nstlistPrune < nstlist -1
 458          */
 459         char *env                 = getenv("GMX_NSTLIST_DYNAMICPRUNING");
 460         bool  userSetNstlistPrune = (env != NULL);
 461
 462         if (userSetNstlistPrune)
 463         {
 464             char *end;
 465             listParams->nstlistPrune = strtol(env, &end, 10);
 466             if (!end || (*end != 0) ||
 467                 !(listParams->nstlistPrune > 0 && listParams->nstlistPrune < ir->nstlist))
 468             {
 469                 gmx_fatal(FARGS, "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s, should be > 0 and < nstlist", env);
 470             }
 471         }
 472         else
 473         {
 474             static_assert(c_nbnxnDynamicListPruningMinLifetime % c_nbnxnGpuRollingListPruningInterval == 0,
 475                           "c_nbnxnDynamicListPruningMinLifetime sets the starting value for nstlistPrune, which should be divisible by the rolling pruning interval for efficiency reasons.");
 476
 477             // TODO: Use auto-tuning to determine nstlistPrune
 478             listParams->nstlistPrune = c_nbnxnDynamicListPruningMinLifetime;
 479         }
 480
 481         setDynamicPairlistPruningParameters(ir, mtop, box, useGpu, ls,
 482                                             userSetNstlistPrune, ic,
 483                                             listParams);
 484
 485         if (listParams->useDynamicPruning && useGpu)
 486         {
 487             /* Note that we can round down here. This makes the effective
 488              * rolling pruning interval slightly shorter than nstlistTune,
 489              * thus giving correct results, but a slightly lower efficiency.
 490              */
 491             GMX_RELEASE_ASSERT(listParams->nstlistPrune >= c_nbnxnGpuRollingListPruningInterval,
 492                                ( "With dynamic list pruning on GPUs pruning frequency must be at least as large as the rolling pruning interval (" +
 493                                  std::to_string(c_nbnxnGpuRollingListPruningInterval) +
 494                                  ").").c_str() );
 495             listParams->numRollingParts = listParams->nstlistPrune/c_nbnxnGpuRollingListPruningInterval;
 496         }
 497         else
 498         {
 499             listParams->numRollingParts = 1;
 500         }
 501
 502         if (fplog && listParams->useDynamicPruning)
 503         {
 504             const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
 505             fprintf(fplog,
 506                     "Using a dual pair-list setup updated with dynamic%s pruning:\n"
 507                     "  outer list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n"
 508                     "  inner list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n",
 509                     listParams->numRollingParts > 1 ? ", rolling" : "",
 510                     ir->nstlist, listParams->rlistOuter - interactionCutoff, listParams->rlistOuter,
 511                     listParams->nstlistPrune, listParams->rlistInner - interactionCutoff, listParams->rlistInner);
 512         }
 513     }
 514 }