``MDRUN``
the :ref:`gmx mdrun` command used by :ref:`gmx tune_pme`.
-``GMX_NSTLIST``
- sets the default value for :mdp:`nstlist`, preventing it from being tuned during
- :ref:`gmx mdrun` startup when using the Verlet cutoff scheme.
+``GMX_DISABLE DYNAMICPRUNING``
+ disables dynamic pair-list pruning. Note that :ref:`gmx mdrun` will
+ still tune nstlist to the optimal value with dynamic pruning. Thus
+ for good performance the -nstlist option should be used.
+
+``GMX_NSTLIST_DYNAMICPRUNING``
+ overrides the dynamic pair-list pruning interval chosen heuristically
+ by mdrun. Values should be between 1 and :mdp:`nstlist - 1`.
``GMX_USE_TREEREDUCE``
use tree reduction for nbnxn force reduction. Potentially faster for large number of
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_pairlist.h"
#include "gromacs/mdlib/sim_util.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/inputrec.h"
/*! \brief Parameters and settings for one PP-PME setup */
struct pme_setup_t {
real rcut_coulomb; /**< Coulomb cut-off */
- real rlist; /**< pair-list cut-off */
+ real rlistOuter; /**< cut-off for the outer pair-list */
+ real rlistInner; /**< cut-off for the inner pair-list */
real spacing; /**< (largest) PME grid spacing */
ivec grid; /**< the PME grid dimensions */
real grid_efficiency; /**< ineffiency factor for non-uniform grids <= 1 */
real cut_spacing; /**< the minimum cutoff / PME grid spacing ratio */
real rcut_vdw; /**< Vdw cutoff (does not change) */
real rcut_coulomb_start; /**< Initial electrostatics cutoff */
- real rbuf_coulomb; /**< the pairlist buffer size */
- real rbuf_vdw; /**< the pairlist buffer size */
+ real rbufOuter_coulomb; /**< the outer pairlist buffer size */
+ real rbufOuter_vdw; /**< the outer pairlist buffer size */
+ real rbufInner_coulomb; /**< the inner pairlist buffer size */
+ real rbufInner_vdw; /**< the inner pairlist buffer size */
matrix box_start; /**< the initial simulation box */
int n; /**< the count of setup as well as the allocation size */
pme_setup_t *setup; /**< the PME+cutoff setups */
const t_inputrec *ir,
matrix box,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
gmx_pme_t *pmedata,
gmx_bool bUseGPU,
gmx_bool *bPrinting)
snew(pme_lb, 1);
- pme_lb->bSepPMERanks = !(cr->duty & DUTY_PME);
+ pme_lb->bSepPMERanks = !(cr->duty & DUTY_PME);
/* Initially we turn on balancing directly on based on PP/PME imbalance */
- pme_lb->bTriggerOnDLB = FALSE;
+ pme_lb->bTriggerOnDLB = FALSE;
/* Any number of stages >= 2 is supported */
- pme_lb->nstage = 2;
+ pme_lb->nstage = 2;
- pme_lb->cutoff_scheme = ir->cutoff_scheme;
+ pme_lb->cutoff_scheme = ir->cutoff_scheme;
- pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
- pme_lb->rbuf_vdw = ic->rlist - ic->rvdw;
+ pme_lb->rbufOuter_coulomb = listParams->rlistOuter - ic->rcoulomb;
+ pme_lb->rbufOuter_vdw = listParams->rlistOuter - ic->rvdw;
+ pme_lb->rbufInner_coulomb = listParams->rlistInner - ic->rcoulomb;
+ pme_lb->rbufInner_vdw = listParams->rlistInner - ic->rvdw;
copy_mat(box, pme_lb->box_start);
if (ir->ePBC == epbcXY && ir->nwall == 2)
pme_lb->cur = 0;
pme_lb->setup[0].rcut_coulomb = ic->rcoulomb;
- pme_lb->setup[0].rlist = ic->rlist;
+ pme_lb->setup[0].rlistOuter = listParams->rlistOuter;
+ pme_lb->setup[0].rlistInner = listParams->rlistInner;
pme_lb->setup[0].grid[XX] = ir->nkx;
pme_lb->setup[0].grid[YY] = ir->nky;
pme_lb->setup[0].grid[ZZ] = ir->nkz;
if (pme_lb->cutoff_scheme == ecutsVERLET)
{
/* Never decrease the Coulomb and VdW list buffers */
- set->rlist = std::max(set->rcut_coulomb + pme_lb->rbuf_coulomb,
- pme_lb->rcut_vdw + pme_lb->rbuf_vdw);
+ set->rlistOuter = std::max(set->rcut_coulomb + pme_lb->rbufOuter_coulomb,
+ pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw);
+ set->rlistInner = std::max(set->rcut_coulomb + pme_lb->rbufInner_coulomb,
+ pme_lb->rcut_vdw + pme_lb->rbufInner_vdw);
}
else
{
- tmpr_coulomb = set->rcut_coulomb + pme_lb->rbuf_coulomb;
- tmpr_vdw = pme_lb->rcut_vdw + pme_lb->rbuf_vdw;
- set->rlist = std::min(tmpr_coulomb, tmpr_vdw);
+ tmpr_coulomb = set->rcut_coulomb + pme_lb->rbufOuter_coulomb;
+ tmpr_vdw = pme_lb->rcut_vdw + pme_lb->rbufOuter_vdw;
+ set->rlistOuter = std::min(tmpr_coulomb, tmpr_vdw);
+ set->rlistInner = set->rlistOuter;
}
- set->spacing = sp;
+ set->spacing = sp;
/* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
set->grid_efficiency = 1;
for (d = 0; d < DIM; d++)
* better overal performance can be obtained with a slightly
* shorter cut-off and better DD load balancing.
*/
- set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlist);
+ set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistOuter);
}
}
cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
if (OK && ir->ePBC != epbcNONE)
{
- OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlist)
+ OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlistOuter)
<= max_cutoff2(ir->ePBC, state->box));
if (!OK)
{
if (DOMAINDECOMP(cr))
{
OK = change_dd_cutoff(cr, state, ir,
- pme_lb->setup[pme_lb->cur].rlist);
+ pme_lb->setup[pme_lb->cur].rlistOuter);
if (!OK)
{
/* Failed: do not use this setup */
if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
{
- OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlist);
+ OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistOuter);
if (!OK)
{
/* For some reason the chosen cut-off is incompatible with DD.
set = &pme_lb->setup[pme_lb->cur];
- ic->rcoulomb = set->rcut_coulomb;
- ic->rlist = set->rlist;
- ic->ewaldcoeff_q = set->ewaldcoeff_q;
+ NbnxnListParameters *listParams = nbv->listParams.get();
+
+ ic->rcoulomb = set->rcut_coulomb;
+ listParams->rlistOuter = set->rlistOuter;
+ listParams->rlistInner = set->rlistInner;
+ ic->ewaldcoeff_q = set->ewaldcoeff_q;
/* TODO: centralize the code that sets the potentials shifts */
if (ic->coulomb_modifier == eintmodPOTSHIFT)
{
/* We always re-initialize the tables whether they are used or not */
init_interaction_const_tables(nullptr, ic, rtab);
- nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
+ nbnxn_gpu_pme_loadbal_update_param(nbv, ic, listParams);
/* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
* also sharing texture references. To keep the code simple, we don't
* This also ensures that we won't disable the currently
* optimal setting during a second round of PME balancing.
*/
- set_dd_dlb_max_cutoff(cr, fr->ic->rlist);
+ set_dd_dlb_max_cutoff(cr, fr->nbv->listParams->rlistOuter);
}
}
/* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q;
fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
- fr->rlist = fr->ic->rlist;
+ fr->rlist = fr->nbv->listParams->rlistOuter;
fr->rcoulomb = fr->ic->rcoulomb;
fr->rvdw = fr->ic->rvdw;
fprintf(fplog,
" %-7s %6.3f nm %6.3f nm %3d %3d %3d %5.3f nm %5.3f nm\n",
name,
- setup->rcut_coulomb, setup->rlist,
+ setup->rcut_coulomb, setup->rlistInner,
setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
setup->spacing, 1/setup->ewaldcoeff_q);
}
double pp_ratio, grid_ratio;
real pp_ratio_temporary;
- pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlist / pme_lb->setup[0].rlist;
+ pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlistInner / pme_lb->setup[0].rlistInner;
pp_ratio = gmx::power3(pp_ratio_temporary);
grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
(double)pme_grid_points(&pme_lb->setup[0]);
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/timing/wallcycle.h"
+struct NbnxnListParameters;
struct t_commrec;
struct t_inputrec;
class t_state;
const t_inputrec *ir,
matrix box,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
gmx_pme_t *pmedata,
gmx_bool bUseGPU,
gmx_bool *bPrinting);
/* Calculate the buffer size for simple atom vs atoms list */
ls.cluster_size_i = 1;
ls.cluster_size_j = 1;
- calc_verlet_buffer_size(mtop, det(box), ir, buffer_temp,
- &ls, &n_nonlin_vsite, &rlist_1x1);
+ calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+ buffer_temp, &ls, &n_nonlin_vsite, &rlist_1x1);
/* Set the pair-list buffer size in ir */
verletbuf_get_list_setup(FALSE, FALSE, &ls);
- calc_verlet_buffer_size(mtop, det(box), ir, buffer_temp,
- &ls, &n_nonlin_vsite, &ir->rlist);
+ calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+ buffer_temp, &ls, &n_nonlin_vsite, &ir->rlist);
if (n_nonlin_vsite > 0)
{
void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
const t_inputrec *ir,
+ int nstlist,
+ int list_lifetime,
real reference_temperature,
const verletbuf_list_setup_t *list_setup,
int *n_nonlin_vsite,
}
/* Determine the variance of the atomic displacement
- * over nstlist-1 steps: kT_fac
+ * over list_lifetime steps: kT_fac
* For inertial dynamics (not Brownian dynamics) the mass factor
* is not included in kT_fac, it is added later.
*/
* should be negligible (unless nstlist is extremely large, which
* you wouldn't do anyhow).
*/
- kT_fac = 2*BOLTZ*reference_temperature*(ir->nstlist-1)*ir->delta_t;
+ kT_fac = 2*BOLTZ*reference_temperature*list_lifetime*ir->delta_t;
if (ir->bd_fric > 0)
{
/* This is directly sigma^2 of the displacement */
}
else
{
- kT_fac = BOLTZ*reference_temperature*gmx::square((ir->nstlist-1)*ir->delta_t);
+ kT_fac = BOLTZ*reference_temperature*gmx::square(list_lifetime*ir->delta_t);
}
mass_min = att[0].prop.mass;
drift *= nb_clust_frac_pairs_not_in_list_at_cutoff;
/* Convert the drift to drift per unit time per atom */
- drift /= ir->nstlist*ir->delta_t*mtop->natoms;
+ drift /= nstlist*ir->delta_t*mtop->natoms;
if (debug)
{
/* Calculate the non-bonded pair-list buffer size for the Verlet list
* based on the particle masses, temperature, LJ types, charges
* and constraints as well as the non-bonded force behavior at the cut-off.
+ * The pair list update frequency and the list lifetime, which is nstlist-1
+ * for normal pair-list buffering, are passed separately, as in some cases
+ * we want an estimate for different values than the ones set in the inputrec.
* If reference_temperature < 0, the maximum coupling temperature will be used.
* The target is a maximum energy drift of ir->verletbuf_tol.
* Returns the number of non-linear virtual sites. For these it's difficult
*/
void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol,
const t_inputrec *ir,
+ int nstlist,
+ int list_lifetime,
real reference_temperature,
const verletbuf_list_setup_t *list_setup,
int *n_nonlin_vsite,
#include "gromacs/utility/strconvert.h"
#include "nbnxn_gpu_jit_support.h"
+#include "nbnxn_tuning.h"
const char *egrp_nm[egNR+1] = {
"Coul-SR", "LJ-SR", "Buck-SR",
snew_aligned(ic->tabq_coul_F, 16, 32);
snew_aligned(ic->tabq_coul_V, 16, 32);
- ic->rlist = fr->rlist;
-
/* Lennard-Jones */
ic->vdwtype = fr->vdwtype;
ic->vdw_modifier = fr->vdw_modifier;
const t_forcerec *fr,
const t_commrec *cr,
const char *nbpu_opt,
- gmx_device_info_t *deviceInfo)
+ gmx_device_info_t *deviceInfo,
+ const gmx_mtop_t *mtop,
+ matrix box)
{
nonbonded_verlet_t *nbv;
int i;
nbnxn_alloc_t *nb_alloc;
nbnxn_free_t *nb_free;
- snew(nbv, 1);
+ nbv = new nonbonded_verlet_t();
nbv->emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
nbv->bUseGPU = deviceInfo != nullptr;
}
}
+ nbv->listParams = std::unique_ptr<NbnxnListParameters>(new NbnxnListParameters(ir->rlist));
+ setupDynamicPairlistPruning(fp, ir, mtop, box, nbv->bUseGPU, fr->ic,
+ nbv->listParams.get());
+
nbnxn_init_search(&nbv->nbs,
DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr,
DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : nullptr,
nbnxn_gpu_init(&nbv->gpu_nbv,
deviceInfo,
fr->ic,
+ nbv->listParams.get(),
nbv->grp,
cr->nodeid,
(nbv->ngrp > 1) && !bHybridGPURun);
GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
}
- init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt, deviceInfo);
+ init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
+ cr, nbpu_opt, deviceInfo,
+ mtop, box);
}
if (ir->eDispCorr != edispcNO)
* which in the code is referred to as the NxN algorithms ("nbnxn_" prefix);
* for details of the algorithm see DOI:10.1016/j.cpc.2013.06.003.
*
+ * Algorithmically, the non-bonded computation has two different modes:
+ * A "classical" mode: generate a list every nstlist steps containing at least
+ * all atom pairs up to a distance of rlistOuter and compute pair interactions
+ * for all pairs that are within the interaction cut-off.
+ * A "dynamic pruning" mode: generate an "outer-list" up to cut-off rlistOuter
+ * every nstlist steps and prune the outer-list using a cut-off of rlistInner
+ * every nstlistPrune steps to obtain a, smaller, "inner-list". This
+ * results in fewer interaction computations and allows for a larger nstlist.
+ * On a GPU, this dynamic pruning is performed in a rolling fashion, pruning
+ * only a sub-part of the list each (second) step. This way it can often
+ * overlap with integration and constraints on the CPU.
+ * Currently a simple heuristic determines which mode will be used.
+ *
* TODO: add a summary list and brief descriptions of the different submodules:
* search, CPU kernels, GPU glue code + kernels.
*
#ifndef NB_VERLET_H
#define NB_VERLET_H
+#include <memory>
+
#include "gromacs/mdlib/nbnxn_gpu_types.h"
#include "gromacs/mdlib/nbnxn_pairlist.h"
/*! \libinternal
* \brief Top-level non-bonded data structure for the Verlet-type cut-off scheme. */
typedef struct nonbonded_verlet_t {
- nbnxn_search_t nbs; /**< n vs n atom pair searching data */
- int ngrp; /**< number of interaction groups */
- nonbonded_verlet_group_t grp[2]; /**< local and non-local interaction group */
-
- gmx_bool bUseGPU; /**< TRUE when non-bonded interactions are computed on a physical GPU */
- bool emulateGpu; /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
- gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */
- int min_ci_balanced; /**< pair list balancing parameter
- used for the 8x8x8 GPU kernels */
+ std::unique_ptr<NbnxnListParameters> listParams; /**< Parameters for the search and list pruning setup */
+ nbnxn_search_t nbs; /**< n vs n atom pair searching data */
+ int ngrp; /**< number of interaction groups */
+ nonbonded_verlet_group_t grp[2]; /**< local and non-local interaction group */
+
+ gmx_bool bUseGPU; /**< TRUE when non-bonded interactions are computed on a physical GPU */
+ bool emulateGpu; /**< true when non-bonded interactions are computed on the CPU using GPU-style pair lists */
+ gmx_nbnxn_gpu_t *gpu_nbv; /**< pointer to GPU nb verlet data */
+ int min_ci_balanced; /**< pair list balancing parameter
+ used for the 8x8x8 GPU kernels */
} nonbonded_verlet_t;
/*! \brief Getter for bUseGPU */
/*! Copies all parameters related to the cut-off from ic to nbp */
static void set_cutoff_parameters(cu_nbparam_t *nbp,
- const interaction_const_t *ic)
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams)
{
nbp->ewald_beta = ic->ewaldcoeff_q;
nbp->sh_ewald = ic->sh_ewald;
nbp->c_rf = ic->c_rf;
nbp->rvdw_sq = ic->rvdw * ic->rvdw;
nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
- nbp->rlist_sq = ic->rlist * ic->rlist;
+ nbp->rlist_sq = listParams->rlistOuter * listParams->rlistOuter;
nbp->sh_lj_ewald = ic->sh_lj_ewald;
nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
/*! Initializes the nonbonded parameter data structure. */
static void init_nbparam(cu_nbparam_t *nbp,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
const nbnxn_atomdata_t *nbat,
const gmx_device_info_t *dev_info)
{
ntypes = nbat->ntype;
- set_cutoff_parameters(nbp, ic);
+ set_cutoff_parameters(nbp, ic, listParams);
/* The kernel code supports LJ combination rules (geometric and LB) for
* all kernel types, but we only generate useful combination rule kernels.
/*! Re-generate the GPU Ewald force table, resets rlist, and update the
* electrostatic type switching to twin cut-off (or back) if needed. */
void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
- const interaction_const_t *ic)
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams)
{
if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
{
gmx_nbnxn_cuda_t *nb = nbv->gpu_nbv;
cu_nbparam_t *nbp = nb->nbparam;
- set_cutoff_parameters(nbp, ic);
+ set_cutoff_parameters(nbp, ic, listParams);
nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
nb->dev_info);
/*! Initializes simulation constant data. */
static void nbnxn_cuda_init_const(gmx_nbnxn_cuda_t *nb,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
const nonbonded_verlet_group_t *nbv_group)
{
init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
- init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+ init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_info);
/* clear energy and shift force outputs */
nbnxn_cuda_clear_e_fshift(nb);
void nbnxn_gpu_init(gmx_nbnxn_cuda_t **p_nb,
const gmx_device_info_t *deviceInfo,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
nonbonded_verlet_group_t *nbv_grp,
int /*rank*/,
gmx_bool bLocalAndNonlocal)
/* pick L1 cache configuration */
nbnxn_cuda_set_cacheconfig(nb->dev_info);
- nbnxn_cuda_init_const(nb, ic, nbv_grp);
+ nbnxn_cuda_init_const(nb, ic, listParams, nbv_grp);
*p_nb = nb;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int gmx_unused flags,
int gmx_unused iloc) GPU_FUNC_TERM
+/*! \brief
+ * Launch asynchronously the nonbonded prune-only kernel.
+ *
+ * The local and non-local list pruning are launched in their separate streams.
+ *
+ * Notes for future scheduling tuning:
+ * Currently we schedule the dynamic pruning between two MD steps *after* both local and
+ * nonlocal force D2H transfers completed. We could launch already after the cpyback
+ * is launched, but we want to avoid prune kernels (especially in the non-local
+ * high prio-stream) competing with nonbonded work.
+ *
+ * However, this is not ideal as this schedule does not expose the available
+ * concurrency. The dynamic pruning kernel:
+ * - should be allowed to overlap with any task other than force compute, including
+ * transfers (F D2H and the next step's x H2D as well as force clearing).
+ * - we'd prefer to avoid competition with non-bonded force kernels belonging
+ * to the same rank and ideally other ranks too.
+ *
+ * In the most general case, the former would require scheduling pruning in a separate
+ * stream and adding additional event sync points to ensure that force kernels read
+ * consistent pair list data. This would lead to some overhead (due to extra
+ * cudaStreamWaitEvent calls, 3-5 us/call) which we might be able to live with.
+ * The gains from additional overlap might not be significant as long as
+ * update+constraints anyway takes longer than pruning, but there will still
+ * be use-cases where more overlap may help (e.g. multiple ranks per GPU,
+ * no/hbonds only constraints).
+ * The above second point is harder to address given that multiple ranks will often
+ * share a GPU. Ranks that complete their nonbondeds sooner can schedule pruning earlier
+ * and without a third priority level it is difficult to avoid some interference of
+ * prune kernels with force tasks (in particular preemption of low-prio local force task).
+ *
+ * \param [inout] nb GPU nonbonded data.
+ * \param [in] iloc Interaction locality flag.
+ * \param [in] numParts Number of parts the pair list is split into in the rolling kernel.
+ */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused iloc,
+ int gmx_unused numParts) GPU_FUNC_TERM
+
/*! \brief
* Launch asynchronously the download of nonbonded forces from the GPU
* (and energies/shift forces if required).
struct nonbonded_verlet_group_t;
struct nbnxn_pairlist_t;
struct nbnxn_atomdata_t;
+struct NbnxnListParameters;
struct gmx_wallclock_gpu_t;
struct gmx_gpu_info_t;
void nbnxn_gpu_init(gmx_nbnxn_gpu_t gmx_unused **p_nb,
const gmx_device_info_t gmx_unused *deviceInfo,
const interaction_const_t gmx_unused *ic,
+ const NbnxnListParameters gmx_unused *listParams,
nonbonded_verlet_group_t gmx_unused *nbv_grp,
int gmx_unused rank,
/* true if both local and non-local are done on GPU */
*/
GPU_FUNC_QUALIFIER
void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
- const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+ const interaction_const_t gmx_unused *ic,
+ const NbnxnListParameters gmx_unused *listParams) GPU_FUNC_TERM
/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
GPU_FUNC_QUALIFIER
* \param[in,out] vCoulomb Coulomb energy output buffer
*/
template <int unrollj> static void
-reduceGroupEnergySimdBuffers(int numGroups, int numGroups_2log,
- const real *vVdwSimd, const real *vCoulombSimd,
- real * gmx_restrict vVdw,
- real * gmx_restrict vCoulomb)
+reduceGroupEnergySimdBuffers(int numGroups,
+ int numGroups_2log,
+ const real * gmx_restrict vVdwSimd,
+ const real * gmx_restrict vCoulombSimd,
+ real * gmx_restrict vVdw,
+ real * gmx_restrict vCoulomb)
{
// cppcheck-suppress duplicateExpression
const int unrollj_half = unrollj/2;
int nnbl = nbvg->nbl_lists.nnbl;
nbnxn_pairlist_t **nbl = nbvg->nbl_lists.nbl;
+ GMX_ASSERT(nbl[0]->nci >= 0, "nci<0, which signals an invalid pair-list");
+
// cppcheck-suppress unreadVariable
int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
#pragma omp parallel for schedule(static) num_threads(nthreads)
/*! \brief Copies all parameters related to the cut-off from ic to nbp
*/
static void set_cutoff_parameters(cl_nbparam_t *nbp,
- const interaction_const_t *ic)
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams)
{
nbp->ewald_beta = ic->ewaldcoeff_q;
nbp->sh_ewald = ic->sh_ewald;
nbp->c_rf = ic->c_rf;
nbp->rvdw_sq = ic->rvdw * ic->rvdw;
nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
- nbp->rlist_sq = ic->rlist * ic->rlist;
+ nbp->rlist_sq = listParams->rlistOuter * listParams->rlistOuter;
nbp->sh_lj_ewald = ic->sh_lj_ewald;
nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
*/
static void init_nbparam(cl_nbparam_t *nbp,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
const nbnxn_atomdata_t *nbat,
const gmx_device_runtime_data_t *runData)
{
ntypes = nbat->ntype;
- set_cutoff_parameters(nbp, ic);
+ set_cutoff_parameters(nbp, ic, listParams);
map_interaction_types_to_gpu_kernel_flavors(ic,
nbat->comb_rule,
//! This function is documented in the header file
void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv,
- const interaction_const_t *ic)
+ const interaction_const_t *ic,
+ const NbnxnListParameters *listParams)
{
if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
{
gmx_nbnxn_ocl_t *nb = nbv->gpu_nbv;
cl_nbparam_t *nbp = nb->nbparam;
- set_cutoff_parameters(nbp, ic);
+ set_cutoff_parameters(nbp, ic, listParams);
nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
*/
static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t *nb,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
const nonbonded_verlet_group_t *nbv_group)
{
init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
- init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_rundata);
+ init_nbparam(nb->nbparam, ic, listParams, nbv_group[0].nbat, nb->dev_rundata);
}
void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb,
const gmx_device_info_t *deviceInfo,
const interaction_const_t *ic,
+ const NbnxnListParameters *listParams,
nonbonded_verlet_group_t *nbv_grp,
int rank,
gmx_bool bLocalAndNonlocal)
init_timings(nb->timings);
}
- nbnxn_ocl_init_const(nb, ic, nbv_grp);
+ nbnxn_ocl_init_const(nb, ic, listParams, nbv_grp);
/* Enable LJ param manual prefetch for AMD or if we request through env. var.
* TODO: decide about NVIDIA
struct tMPI_Atomic;
+/*! \cond INTERNAL */
+
+/*! \brief The setup for generating and pruning the nbnxn pair list.
+ *
+ * Without dynamic pruning rlistOuter=rlistInner.
+ */
+struct NbnxnListParameters
+{
+ /*! \brief Constructor producing a struct with dynamic pruning disabled
+ */
+ NbnxnListParameters(real rlist) :
+ useDynamicPruning(false),
+ nstlistPrune(-1),
+ rlistOuter(rlist),
+ rlistInner(rlist),
+ numRollingParts(1)
+ {
+ }
+
+ bool useDynamicPruning; //!< Are we using dynamic pair-list pruning
+ int nstlistPrune; //!< Pair-list dynamic pruning interval
+ real rlistOuter; //!< Cut-off of the larger, outer pair-list
+ real rlistInner; //!< Cut-off of the smaller, inner pair-list
+ int numRollingParts; //!< The number parts to divide the pair-list into for rolling pruning, a value of 1 gives no rolling pruning
+};
+
+/*! \endcond */
+
/* With GPU kernels the i and j cluster size is 8 atoms */
static const int c_nbnxnGpuClusterSize = 8;
int na_sc; /* The number of atoms per super cluster */
real rlist; /* The radius for constructing the list */
int nci; /* The number of i-clusters in the list */
+ int nciOuter; /* The number of i-clusters in the outer, unpruned list, -1 when invalid */
nbnxn_ci_t *ci; /* The i-cluster list, size nci */
- int ci_nalloc; /* The allocation size of ci */
+ nbnxn_ci_t *ciOuter; /* The outer, unpruned i-cluster list, size nciOuter(=-1 when invalid) */
+ int ci_nalloc; /* The allocation size of ci/ciOuter */
int nsci; /* The number of i-super-clusters in the list */
nbnxn_sci_t *sci; /* The i-super-cluster list */
int sci_nalloc; /* The allocation size of sci */
int ncj; /* The number of j-clusters in the list */
nbnxn_cj_t *cj; /* The j-cluster list, size ncj */
- int cj_nalloc; /* The allocation size of cj */
+ nbnxn_cj_t *cjOuter; /* The outer, unpruned j-cluster list, size ncj */
+ int cj_nalloc; /* The allocation size of cj/cj0 */
int ncjInUse; /* The number of j-clusters that are used by ci entries in this list, will be <= ncj */
int ncj4; /* The total number of 4*j clusters */
} nbnxn_pairlist_t;
typedef struct {
- int nnbl; /* number of lists */
- nbnxn_pairlist_t **nbl; /* lists */
- nbnxn_pairlist_t **nbl_work; /* work space for rebalancing lists */
- gmx_bool bCombined; /* TRUE if lists get combined into one (the 1st) */
- gmx_bool bSimple; /* TRUE if the list of of type "simple"
- (na_sc=na_s, no super-clusters used) */
- int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
- int natpair_lj; /* Total number of atom pairs for LJ kernel */
- int natpair_q; /* Total number of atom pairs for Q kernel */
- t_nblist **nbl_fep;
+ int nnbl; /* number of lists */
+ nbnxn_pairlist_t **nbl; /* lists */
+ nbnxn_pairlist_t **nbl_work; /* work space for rebalancing lists */
+ gmx_bool bCombined; /* TRUE if lists get combined into one (the 1st) */
+ gmx_bool bSimple; /* TRUE if the list of of type "simple"
+ (na_sc=na_s, no super-clusters used) */
+ int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
+ int natpair_lj; /* Total number of atom pairs for LJ kernel */
+ int natpair_q; /* Total number of atom pairs for Q kernel */
+ t_nblist **nbl_fep; /* List of free-energy atom pair interactions */
+ gmx_int64_t outerListCreationStep; /* Step at which the outer list was created */
} nbnxn_pairlist_set_t;
enum {
nbl->ncj*sizeof(*nbl->cj),
nbl->cj_nalloc*sizeof(*nbl->cj),
nbl->alloc, nbl->free);
+
+ nbnxn_realloc_void((void **)&nbl->cjOuter,
+ nbl->ncj*sizeof(*nbl->cjOuter),
+ nbl->cj_nalloc*sizeof(*nbl->cjOuter),
+ nbl->alloc, nbl->free);
}
}
nbl->nci*sizeof(*nbl->ci),
nbl->ci_nalloc*sizeof(*nbl->ci),
nbl->alloc, nbl->free);
+
+ nbnxn_realloc_void((void **)&nbl->ciOuter,
+ nbl->nci*sizeof(*nbl->ciOuter),
+ nbl->ci_nalloc*sizeof(*nbl->ciOuter),
+ nbl->alloc, nbl->free);
}
/* Reallocate the super-cell sci list for at least n entries */
nbl->ncjInUse = 0;
nbl->ncj4 = 0;
nbl->nci_tot = 0;
+ nbl->nciOuter = -1;
nbl->nexcl = 1;
nbl->work->ncj_noq = 0;
balance_fep_lists(nbs, nbl_list);
}
+ /* This is a fresh list, so not pruned, stored using ci and nci.
+ * ciOuter and nciOuter are invalid at this point.
+ */
+ GMX_ASSERT(nbl_list->nbl[0]->nciOuter == -1, "nciOuter should have been set to -1 to signal that it is invalid");
+
/* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
if (LOCAL_I(iloc))
{
}
}
}
+
+void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet)
+{
+ /* TODO: Restructure the lists so we have actual outer and inner
+ * list objects so we can set a single pointer instead of
+ * swapping several pointers.
+ */
+
+ for (int i = 0; i < listSet->nnbl; i++)
+ {
+ /* The search produced a list in ci/cj.
+ * Swap the list pointers so we get the outer list is ciOuter,cjOuter
+ * and we can prune that to get an inner list in ci/cj.
+ */
+ nbnxn_pairlist_t *list = listSet->nbl[i];
+ list->nciOuter = list->nci;
+
+ nbnxn_ci_t *ciTmp = list->ciOuter;
+ list->ciOuter = list->ci;
+ list->ci = ciTmp;
+
+ nbnxn_cj_t *cjTmp = list->cjOuter;
+ list->cjOuter = list->cj;
+ list->cj = cjTmp;
+
+ /* Signal that this inner list is currently invalid */
+ list->nci = -1;
+ }
+}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int nb_kernel_type,
t_nrnb *nrnb);
+/*! \brief Prepare the list-set produced by the search for dynamic pruning
+ *
+ * \param[in,out] listSet The list-set to prepare for dynamic pruning.
+ */
+void nbnxnPrepareListForDynamicPruning(nbnxn_pairlist_set_t *listSet);
+
#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *
+ * \brief Implements functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup __module_nb_verlet
+ */
+
+#include "gmxpre.h"
+
+#include "nbnxn_tuning.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include <cmath>
+
+#include <algorithm>
+
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+/*! \brief Returns if we can (heuristically) change nstlist and rlist
+ *
+ * \param [in] ir The input parameter record
+ */
+static bool supportsDynamicPairlistGenerationInterval(const t_inputrec &ir)
+{
+ return
+ ir.cutoff_scheme == ecutsVERLET &&
+ EI_DYNAMICS(ir.eI) &&
+ !(EI_MD(ir.eI) && ir.etc == etcNO) &&
+ ir.verletbuf_tol > 0;
+}
+
+/*! \brief The interval in steps at which we perform dynamic, rolling pruning on a GPU.
+ *
+ * Ideally we should auto-tune this value.
+ * Not considering overheads, 1 would be the ideal value. But 2 seems
+ * a reasonable compromise that reduces GPU kernel launch overheads and
+ * also avoids inefficiency on large GPUs when pruning small lists.
+ * Because with domain decomposition we alternate local/non-local pruning
+ * at even/odd steps, which gives a period of 2, this value currenly needs
+ * to be 2, which is indirectly asserted when the GPU pruning is dispatched
+ * during the force evaluation.
+ */
+static const int c_nbnxnGpuRollingListPruningInterval = 2;
+
+/*! \brief The minimum nstlist for dynamic pair list pruning.
+ *
+ * In most cases going lower than 4 will lead to a too high pruning cost.
+ * This value should be a multiple of \p c_nbnxnGpuRollingListPruningInterval
+ */
+static const int c_nbnxnDynamicListPruningMinLifetime = 4;
+
+/*! \brief Set the dynamic pairlist pruning parameters in \p ic
+ *
+ * \param[in] ir The input parameter record
+ * \param[in] mtop The global topology
+ * \param[in] box The unit cell
+ * \param[in] useGpu Tells if we are using a GPU for non-bondeds
+ * \param[in] listSetup The nbnxn pair list setup
+ * \param[in] userSetNstlistPrune The user set ic->nstlistPrune (using an env.var.)
+ * \param[in] ic The nonbonded interactions constants
+ * \param[in,out] listParams The list setup parameters
+ */
+static void
+setDynamicPairlistPruningParameters(const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ matrix box,
+ gmx_bool useGpu,
+ const verletbuf_list_setup_t &listSetup,
+ bool userSetNstlistPrune,
+ const interaction_const_t *ic,
+ NbnxnListParameters *listParams)
+{
+ /* When nstlistPrune was set by the user, we need to execute one loop
+ * iteration to determine rlistInner.
+ * Otherwise we compute rlistInner and increase nstlist as long as
+ * we have a pairlist buffer of length 0 (i.e. rlistInner == cutoff).
+ */
+ const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
+ int tunedNstlistPrune = listParams->nstlistPrune;
+ do
+ {
+ /* Dynamic pruning on the GPU is performed on the list for
+ * the next step on the coordinates of the current step,
+ * so the list lifetime is nstlistPrune (not the usual nstlist-1).
+ */
+ int listLifetime = tunedNstlistPrune - (useGpu ? 0 : 1);
+ listParams->nstlistPrune = tunedNstlistPrune;
+ calc_verlet_buffer_size(mtop, det(box), ir,
+ tunedNstlistPrune, listLifetime,
+ -1, &listSetup, NULL,
+ &listParams->rlistInner);
+
+ /* On the GPU we apply the dynamic pruning in a rolling fashion
+ * every c_nbnxnGpuRollingListPruningInterval steps,
+ * so keep nstlistPrune a multiple of the interval.
+ */
+ tunedNstlistPrune += useGpu ? c_nbnxnGpuRollingListPruningInterval : 1;
+ }
+ while (!userSetNstlistPrune &&
+ tunedNstlistPrune < ir->nstlist &&
+ listParams->rlistInner == interactionCutoff);
+
+ if (userSetNstlistPrune)
+ {
+ listParams->useDynamicPruning = true;
+ }
+ else
+ {
+ /* Determine the pair list size increase due to zero interactions */
+ real rlistInc = nbnxn_get_rlist_effective_inc(listSetup.cluster_size_j,
+ mtop->natoms/det(box));
+
+ /* Dynamic pruning is only useful when the inner list is smaller than
+ * the outer. The factor 0.99 ensures at least 3% list size reduction.
+ *
+ * With dynamic pruning on the CPU we prune after updating,
+ * so nstlistPrune=nstlist-1 would add useless extra work.
+ * With the GPU there will probably be more overhead than gain
+ * with nstlistPrune=nstlist-1, so we disable dynamic pruning.
+ * Note that in such cases the first sub-condition is likely also false.
+ */
+ listParams->useDynamicPruning =
+ (listParams->rlistInner + rlistInc < 0.99*(listParams->rlistOuter + rlistInc) &&
+ listParams->nstlistPrune < ir->nstlist - 1);
+ }
+
+ if (!listParams->useDynamicPruning)
+ {
+ /* These parameters should not be used, but set them to useful values */
+ listParams->nstlistPrune = -1;
+ listParams->rlistInner = listParams->rlistOuter;
+ }
+}
+
+void setupDynamicPairlistPruning(FILE *fplog,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ matrix box,
+ bool useGpu,
+ const interaction_const_t *ic,
+ NbnxnListParameters *listParams)
+{
+ GMX_RELEASE_ASSERT(listParams->rlistOuter > 0, "With the nbnxn setup rlist should be > 0");
+
+ /* Initialize the parameters to no dynamic list pruning */
+ listParams->useDynamicPruning = false;
+
+ if (supportsDynamicPairlistGenerationInterval(*ir) &&
+ getenv("GMX_DISABLE_DYNAMICPRUNING") == NULL)
+ {
+ verletbuf_list_setup_t ls;
+ verletbuf_get_list_setup(TRUE, TRUE, &ls);
+
+ /* Note that nstlistPrune can have any value independently of nstlist.
+ * Actually applying rolling pruning is only useful when
+ * nstlistPrune < nstlist -1
+ */
+ char *env = getenv("GMX_NSTLIST_DYNAMICPRUNING");
+ bool userSetNstlistPrune = (env != NULL);
+
+ if (userSetNstlistPrune)
+ {
+ char *end;
+ listParams->nstlistPrune = strtol(env, &end, 10);
+ if (!end || (*end != 0) ||
+ !(listParams->nstlistPrune > 0 && listParams->nstlistPrune < ir->nstlist))
+ {
+ gmx_fatal(FARGS, "Invalid value passed in GMX_NSTLIST_DYNAMICPRUNING=%s, should be > 0 and < nstlist", env);
+ }
+ }
+ else
+ {
+ static_assert(c_nbnxnDynamicListPruningMinLifetime % c_nbnxnGpuRollingListPruningInterval == 0,
+ "c_nbnxnDynamicListPruningMinLifetime sets the starting value for nstlistPrune, which should be divisible by the rolling pruning interval for efficiency reasons.");
+
+ // TODO: Use auto-tuning to determine nstlistPrune
+ listParams->nstlistPrune = c_nbnxnDynamicListPruningMinLifetime;
+ }
+
+ setDynamicPairlistPruningParameters(ir, mtop, box, useGpu, ls,
+ userSetNstlistPrune, ic,
+ listParams);
+
+ // Dynamic pruning disabled until the kernels are present
+ listParams->useDynamicPruning = false;
+
+ if (listParams->useDynamicPruning && useGpu)
+ {
+ /* Note that we can round down here. This makes the effective
+ * rolling pruning interval slightly shorter than nstlistTune,
+ * thus giving correct results, but a slightly lower efficiency.
+ */
+ listParams->numRollingParts = listParams->nstlistPrune/c_nbnxnGpuRollingListPruningInterval;
+ }
+ else
+ {
+ listParams->numRollingParts = 1;
+ }
+
+ if (fplog && listParams->useDynamicPruning)
+ {
+ const real interactionCutoff = std::max(ic->rcoulomb, ic->rvdw);
+ fprintf(fplog,
+ "Using a dual pair-list setup updated with dynamic%s pruning:\n"
+ " outer list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n"
+ " inner list: updated every %3d steps, buffer %.3f nm, rlist %.3f nm\n",
+ listParams->numRollingParts > 1 ? ", rolling" : "",
+ ir->nstlist, listParams->rlistOuter - interactionCutoff, listParams->rlistOuter,
+ listParams->nstlistPrune, listParams->rlistInner - interactionCutoff, listParams->rlistInner);
+ }
+ }
+}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ *
+ * \brief Declares functions for tuning adjustable parameters for the nbnxn non-bonded search and interaction kernels
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup __module_nb_verlet
+ */
+
+#ifndef NBNXN_TUNING_H
+#define NBNXN_TUNING_H
+
+#include <stdio.h>
+
+#include "gromacs/math/vectypes.h"
+
+struct gmx_mtop_t;
+struct interaction_const_t;
+struct NbnxnListParameters;
+struct t_inputrec;
+
+/*! \brief Set up the dynamic pairlist pruning
+ *
+ * \param[in,out] fplog Log file
+ * \param[in] ir The input parameter record
+ * \param[in] mtop The global topology
+ * \param[in] box The unit cell
+ * \param[in] useGpu Tells if we are using a GPU for non-bondeds
+ * \param[in] ic The nonbonded interactions constants
+ * \param[in,out] listParams The list setup parameters
+ */
+void setupDynamicPairlistPruning(FILE *fplog,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ matrix box,
+ bool useGpu,
+ const interaction_const_t *ic,
+ NbnxnListParameters *listParams);
+
+#endif /* NBNXN_TUNING_H */
gmx_enerdata_t *enerd,
int flags, int ilocality,
int clearF,
+ gmx_int64_t step,
t_nrnb *nrnb,
gmx_wallcycle_t wcycle)
{
- int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
- nonbonded_verlet_group_t *nbvg;
- gmx_bool bUsingGpuKernels;
-
if (!(flags & GMX_FORCE_NONBONDED))
{
/* skip non-bonded calculation */
return;
}
- nbvg = &fr->nbv->grp[ilocality];
+ nonbonded_verlet_t *nbv = fr->nbv;
+ nonbonded_verlet_group_t *nbvg = &nbv->grp[ilocality];
/* GPU kernel launch overhead is already timed separately */
if (fr->cutoff_scheme != ecutsVERLET)
gmx_incons("Invalid cut-off scheme passed!");
}
- bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
+ bool bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
if (!bUsingGpuKernels)
{
+ /* When dynamic pair-list pruning is requested, we need to prune
+ * at nstlistPrune steps.
+ */
+ if (nbv->listParams->useDynamicPruning &&
+ (step - nbvg->nbl_lists.outerListCreationStep) % nbv->listParams->nstlistPrune == 0)
+ {
+ /* Prune the pair-list beyond fr->ic->rlistPrune using
+ * the current coordinates of the atoms.
+ */
+ wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+ GMX_RELEASE_ASSERT(false, "The CPU prune kernel will be called here");
+ wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+ }
+
wallcycle_sub_start(wcycle, ewcsNONBONDED);
}
+
switch (nbvg->kernel_type)
{
case nbnxnk4x4_PlainC:
break;
case nbnxnk8x8x8_GPU:
- nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
+ nbnxn_gpu_launch_kernel(nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
break;
case nbnxnk8x8x8_PlainC:
break;
default:
- gmx_incons("Invalid nonbonded kernel type passed!");
+ GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
}
if (!bUsingGpuKernels)
wallcycle_sub_stop(wcycle, ewcsNONBONDED);
}
+ int enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
}
else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
- (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
+ (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(nbv->gpu_nbv)))
{
enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
}
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintLocal].nbat,
&top->excls,
- ic->rlist,
+ nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
&nbv->grp[eintLocal].nbl_lists,
eintLocal,
nbv->grp[eintLocal].kernel_type,
nrnb);
+ nbv->grp[eintLocal].nbl_lists.outerListCreationStep = step;
+ if (nbv->listParams->useDynamicPruning && !bUseGPU)
+ {
+ nbnxnPrepareListForDynamicPruning(&nbv->grp[eintLocal].nbl_lists);
+ }
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
if (bUseGPU)
wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
/* launch local nonbonded F on GPU */
do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
- nrnb, wcycle);
+ step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
nbnxn_make_pairlist(nbv->nbs, nbv->grp[eintNonlocal].nbat,
&top->excls,
- ic->rlist,
+ nbv->listParams->rlistOuter,
nbv->min_ci_balanced,
&nbv->grp[eintNonlocal].nbl_lists,
eintNonlocal,
nbv->grp[eintNonlocal].kernel_type,
nrnb);
-
+ nbv->grp[eintNonlocal].nbl_lists.outerListCreationStep = step;
+ if (nbv->listParams->useDynamicPruning && !bUseGPU)
+ {
+ nbnxnPrepareListForDynamicPruning(&nbv->grp[eintNonlocal].nbl_lists);
+ }
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
/* launch non-local nonbonded F on GPU */
do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
- nrnb, wcycle);
+ step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
}
if (!bUseOrEmulGPU)
{
- /* Maybe we should move this into do_force_lowlevel */
do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
- nrnb, wcycle);
+ step, nrnb, wcycle);
}
if (fr->efep != efepNO)
{
do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
bDiffKernels ? enbvClearFYes : enbvClearFNo,
- nrnb, wcycle);
+ step, nrnb, wcycle);
}
if (!bUseOrEmulGPU)
{
wallcycle_start_nocount(wcycle, ewcFORCE);
do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
- nrnb, wcycle);
+ step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
+
+ /* Is dynamic pair-list pruning activated? */
+ if (nbv->listParams->useDynamicPruning)
+ {
+ /* We should not launch the rolling pruning kernel at a search
+ * step or just before search steps, since that's useless.
+ * Without domain decomposition we prune at even steps.
+ * With domain decomposition we alternate local and non-local
+ * pruning at even and odd steps.
+ */
+ int numRollingParts = nbv->listParams->numRollingParts;
+ GMX_ASSERT(numRollingParts == nbv->listParams->nstlistPrune/2, "Since we alternate local/non-local at even/odd steps, we need numRollingParts<=nstlistPrune/2 for correctness and == for efficiency");
+ int stepWithCurrentList = step - nbv->grp[eintLocal].nbl_lists.outerListCreationStep;
+ bool stepIsEven = ((stepWithCurrentList & 1) == 0);
+ if (stepWithCurrentList > 0 &&
+ stepWithCurrentList < inputrec->nstlist - 1 &&
+ (stepIsEven || DOMAINDECOMP(cr)))
+ {
+ GMX_RELEASE_ASSERT(false, "The GPU prune kernel will be called here");
+ }
+ }
wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
}
else
wallcycle_start_nocount(wcycle, ewcFORCE);
do_nb_verlet(fr, ic, enerd, flags, eintLocal,
DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
- nrnb, wcycle);
+ step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
/* Coulomb */
real rcoulomb;
- /* Cut-off */
- real rlist;
-
/* PME/Ewald */
real ewaldcoeff_q;
real ewaldcoeff_lj;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */
struct gmx_wallclock_gpu_t
{
- struct gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
- versions of the nonbonded kernels: force-only,
- force+energy, force+pruning, and force+energy+pruning */
- double nb_h2d_t; /**< host to device transfer time in nb calculation */
- double nb_d2h_t; /**< device to host transfer time in nb calculation */
- int nb_c; /**< total call count of the nonbonded gpu operations */
- double pl_h2d_t; /**< pair search step host to device transfer time */
- int pl_h2d_c; /**< pair search step host to device transfer call count */
+ gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
+ versions of the nonbonded kernels: force-only,
+ force+energy, force+pruning, and force+energy+pruning */
+ gmx_nbnxn_kernel_timing_data_t pruneTime; /**< table containing the timings of the 1st pass prune-only kernels */
+ gmx_nbnxn_kernel_timing_data_t dynamicPruneTime; /**< table containing the timings of dynamic prune-only kernels */
+ double nb_h2d_t; /**< host to device transfer time in nb calculation */
+ double nb_d2h_t; /**< device to host transfer time in nb calculation */
+ int nb_c; /**< total call count of the nonbonded gpu operations */
+ double pl_h2d_t; /**< pair search step host to device transfer time */
+ int pl_h2d_c; /**< pair search step host to device transfer call count */
};
#ifdef __cplusplus
"Bonded-FEP F",
"Restraints F",
"Listed buffer ops.",
+ "Nonbonded pruning",
"Nonbonded F",
"Ewald F correction",
"NB X buffer ops.",
tot_k += gpu_t->ktime[i][j].t;
}
}
- tot_gpu += tot_k;
+ tot_gpu += tot_k + gpu_t->pruneTime.t;
tot_cpu_overlap = wc->wcc[ewcFORCE].c;
if (wc->wcc[ewcPMEMESH].n > 0)
}
}
}
-
+ if (gpu_t->pruneTime.c)
+ {
+ print_gputimes(fplog, "Pruning kernel", gpu_t->pruneTime.c, gpu_t->pruneTime.t, tot_gpu);
+ }
print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
fprintf(fplog, "%s\n", hline);
print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
fprintf(fplog, "%s\n", hline);
+ if (gpu_t->dynamicPruneTime.c)
+ {
+ /* We print the dynamic pruning kernel timings after a separator
+ * and avoid adding it to tot_gpu as this is not in the force
+ * overlap. We print the fraction as relative to the rest.
+ */
+ print_gputimes(fplog, "*Dynamic pruning", gpu_t->dynamicPruneTime.c, gpu_t->dynamicPruneTime.t, tot_gpu);
+ fprintf(fplog, "%s\n", hline);
+ }
gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2017, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
ewcsLISTED_FEP,
ewcsRESTRAINTS,
ewcsLISTED_BUF_OPS,
+ ewcsNONBONDED_PRUNING,
ewcsNONBONDED,
ewcsEWALD_CORRECTION,
ewcsNB_X_BUF_OPS,
if (bPMETune)
{
pme_loadbal_init(&pme_loadbal, cr, mdlog, ir, state->box,
- fr->ic, fr->pmedata, use_GPU(fr->nbv),
+ fr->ic, fr->nbv->listParams.get(), fr->pmedata, use_GPU(fr->nbv),
&bPMETunePrinting);
}
*/
nstlist_prev = ir->nstlist;
ir->nstlist = nbnxnReferenceNstlist;
- calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr,
- &rlistWithReferenceNstlist);
+ calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1,
+ -1, &ls, nullptr, &rlistWithReferenceNstlist);
ir->nstlist = nstlist_prev;
/* Determine the pair list size increase due to zero interactions */
}
/* Set the pair-list buffer size in ir */
- calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr, &rlist_new);
+ calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
/* Does rlist fit in the box? */
bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
*/
verletbuf_get_list_setup(true, makeGpuPairList, &ls);
- calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, nullptr, &rlist_new);
+ calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &ls, nullptr, &rlist_new);
if (rlist_new != ir->rlist)
{