/*! \brief Zero thread-local output buffers */
static void
-zero_thread_output(struct bonded_threading_t *bt, int thread)
+zero_thread_output(bonded_threading_t *bt, int thread)
{
if (!bt->haveBondeds)
{
/*! \brief Reduce thread-local force buffers */
static void
reduce_thread_forces(int n, rvec *f,
- struct bonded_threading_t *bt,
+ bonded_threading_t *bt,
int nthreads)
{
if (nthreads > MAX_BONDED_THREADS)
static void
reduce_thread_output(int n, rvec *f, rvec *fshift,
real *ener, gmx_grppairener_t *grpp, real *dvdl,
- struct bonded_threading_t *bt,
+ bonded_threading_t *bt,
gmx_bool bCalcEnerVir,
gmx_bool bDHDL)
{
static real
calc_one_bond(int thread,
int ftype, const t_idef *idef,
+ const bonded_threading_t &bondedThreading,
const rvec x[], rvec4 f[], rvec fshift[],
const t_forcerec *fr,
const t_pbc *pbc, const t_graph *g,
nbonds = idef->il[ftype].nr/nat1;
iatoms = idef->il[ftype].iatoms;
- GMX_ASSERT(idef->il_thread_division[ftype*(idef->nthreads + 1) + idef->nthreads] == idef->il[ftype].nr, "The thread division should match the topology");
+ GMX_ASSERT(bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + bondedThreading.nthreads] == idef->il[ftype].nr, "The thread division should match the topology");
- nb0 = idef->il_thread_division[ftype*(idef->nthreads+1)+thread];
- nbn = idef->il_thread_division[ftype*(idef->nthreads+1)+thread+1] - nb0;
+ nb0 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads+1)+thread];
+ nbn = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads+1)+thread+1] - nb0;
if (!isPairInteraction(ftype))
{
gmx_bool bCalcEnerVir,
int *global_atom_index)
{
- struct bonded_threading_t *bt = fr->bonded_threading;
+ bonded_threading_t *bt = fr->bondedThreading;
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
for (int thread = 0; thread < bt->nthreads; thread++)
{
if (idef->il[ftype].nr > 0 && ftype_is_bonded_potential(ftype))
{
- v = calc_one_bond(thread, ftype, idef, x,
+ v = calc_one_bond(thread, ftype, idef,
+ *fr->bondedThreading, x,
ft, fshift, fr, pbc_null, g, grpp,
nrnb, lambda, dvdlt,
md, fcd, bCalcEnerVir,
t_fcdata *fcd, int *global_atom_index,
int force_flags)
{
- struct bonded_threading_t *bt;
gmx_bool bCalcEnerVir;
const t_pbc *pbc_null;
-
- bt = fr->bonded_threading;
-
- assert(bt->nthreads == idef->nthreads);
+ bonded_threading_t *bt = fr->bondedThreading;
bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
t_fcdata *fcd,
int *global_atom_index)
{
- real v;
- real dvdl_dum[efptNR] = {0};
- rvec4 *f;
- rvec *fshift;
- const t_pbc *pbc_null;
- t_idef idef_fe;
+ real v;
+ real dvdl_dum[efptNR] = {0};
+ rvec4 *f;
+ rvec *fshift;
+ const t_pbc *pbc_null;
+ t_idef idef_fe;
+ bonded_threading_t bondedThreading;
if (fr->bMolPBC)
{
}
/* Copy the whole idef, so we can modify the contents locally */
- idef_fe = *idef;
- idef_fe.nthreads = 1;
- snew(idef_fe.il_thread_division, F_NRE*(idef_fe.nthreads+1));
+ idef_fe = *idef;
+ bondedThreading.nthreads = 1;
+ snew(bondedThreading.il_thread_division, F_NRE*(bondedThreading.nthreads+1));
/* We already have the forces, so we use temp buffers here */
snew(f, fr->natoms_force);
ilist_fe.nr_nonperturbed = 0;
ilist_fe.nr = ilist.nr - ilist.nr_nonperturbed;
/* Set the work range of thread 0 to the perturbed bondeds */
- idef_fe.il_thread_division[ftype*2 + 0] = 0;
- idef_fe.il_thread_division[ftype*2 + 1] = ilist_fe.nr;
+ bondedThreading.il_thread_division[ftype*2 + 0] = 0;
+ bondedThreading.il_thread_division[ftype*2 + 1] = ilist_fe.nr;
if (ilist_fe.nr > 0)
{
- v = calc_one_bond(0, ftype, &idef_fe,
+ v = calc_one_bond(0, ftype, &idef_fe, bondedThreading,
x, f, fshift, fr, pbc_null, g,
grpp, nrnb, lambda, dvdl_dum,
md, fcd, TRUE,
sfree(fshift);
sfree(f);
- sfree(idef_fe.il_thread_division);
+ sfree(bondedThreading.il_thread_division);
}
void
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
/* There are two different ways to distribute the bonded force calculation
* over the threads. We dedice which to use based on the number of threads.
*/
- int bonded_max_nthread_uniform; /**< Maximum thread count for uniform distribution of bondeds over threads */
+ int max_nthread_uniform; /**< Maximum thread count for uniform distribution of bondeds over threads */
+
+ int *il_thread_division; /**< Stores the division of work in the t_list over threads.
+ *
+ * The division of the normal bonded interactions of threads.
+ * il_thread_division[ftype*(nthreads+1)+t] contains an index
+ * into il[ftype].iatoms; thread th operates on t=th to t=th+1. */
+ int il_thread_division_nalloc; /**< Allocation size of the work division, at least F_NRE*(nthreads+1). */
};
/*! \brief struct for passing all data required for a function type */
typedef struct {
- int ftype; /**< the function type index */
- t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */
- int nat; /**< nr of atoms involved in a single ftype interaction */
+ int ftype; /**< the function type index */
+ const t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */
+ int nat; /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
* equal load and different threads avoid touching the same atoms as much
* as possible.
*/
-static void divide_bondeds_by_locality(int ntype,
- const ilist_data_t *ild,
- int nthread,
- t_idef *idef)
+static void divide_bondeds_by_locality(bonded_threading_t *bt,
+ int ntype,
+ const ilist_data_t *ild)
{
int nat_tot, nat_sum;
int ind[F_NRE]; /* index into the ild[].il->iatoms */
}
nat_sum = 0;
- /* Loop over the end bounds of the nthread threads to determine
- * which interactions threads 0 to nthread shall calculate.
+ /* Loop over the end bounds of the nthreads threads to determine
+ * which interactions threads 0 to nthreads shall calculate.
*
* NOTE: The cost of these combined loops is #interactions*ntype.
* This code is running single threaded (difficult to parallelize
* be negligble. At high thread count many other parts of the code
* scale the same way, so it's (currently) not worth improving this.
*/
- for (t = 1; t <= nthread; t++)
+ for (t = 1; t <= bt->nthreads; t++)
{
int nat_thread;
* uniformly. Proper and RB dihedrals are often distributed
* non-uniformly, but their cost is roughly equal.
*/
- nat_thread = (nat_tot*t)/nthread;
+ nat_thread = (nat_tot*t)/bt->nthreads;
while (nat_sum < nat_thread)
{
/* Store the bonded end boundaries (at index t) for thread t-1 */
for (f = 0; f < ntype; f++)
{
- idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
+ bt->il_thread_division[ild[f].ftype*(bt->nthreads + 1) + t] = ind[f];
}
}
}
//! Divides bonded interactions over threads
-static void divide_bondeds_over_threads(t_idef *idef,
- int nthread,
- int max_nthread_uniform,
- bool *haveBondeds)
+static void divide_bondeds_over_threads(bonded_threading_t *bt,
+ const t_idef *idef)
{
ilist_data_t ild[F_NRE];
int ntype;
int f;
- assert(nthread > 0);
+ assert(bt->nthreads > 0);
- idef->nthreads = nthread;
-
- if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
+ if (F_NRE*(bt->nthreads + 1) > bt->il_thread_division_nalloc)
{
- idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
- srenew(idef->il_thread_division, idef->il_thread_division_nalloc);
+ bt->il_thread_division_nalloc = F_NRE*(bt->nthreads + 1);
+ srenew(bt->il_thread_division, bt->il_thread_division_nalloc);
}
- *haveBondeds = false;
- ntype = 0;
+ bt->haveBondeds = false;
+ ntype = 0;
for (f = 0; f < F_NRE; f++)
{
if (!ftype_is_bonded_potential(f))
if (idef->il[f].nr > 0)
{
- *haveBondeds = true;
+ bt->haveBondeds = true;
}
if (idef->il[f].nr == 0)
{
/* No interactions, avoid all the integer math below */
int t;
- for (t = 0; t <= nthread; t++)
+ for (t = 0; t <= bt->nthreads; t++)
{
- idef->il_thread_division[f*(nthread + 1) + t] = 0;
+ bt->il_thread_division[f*(bt->nthreads + 1) + t] = 0;
}
}
- else if (nthread <= max_nthread_uniform || f == F_DISRES)
+ else if (bt->nthreads <= bt->max_nthread_uniform || f == F_DISRES)
{
/* On up to 4 threads, load balancing the bonded work
* is more important than minimizing the reduction cost.
/* nat1 = 1 + #atoms(ftype) which is the stride use for iatoms */
nat1 = 1 + NRAL(f);
- for (t = 0; t <= nthread; t++)
+ for (t = 0; t <= bt->nthreads; t++)
{
int nr_t;
/* Divide equally over the threads */
- nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
+ nr_t = (((idef->il[f].nr/nat1)*t)/bt->nthreads)*nat1;
if (f == F_DISRES)
{
}
}
- idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
+ bt->il_thread_division[f*(bt->nthreads + 1) + t] = nr_t;
}
}
else
ild[ntype].nat = nat;
/* The first index for the thread division is always 0 */
- idef->il_thread_division[f*(nthread + 1)] = 0;
+ bt->il_thread_division[f*(bt->nthreads + 1)] = 0;
ntype++;
}
if (ntype > 0)
{
- divide_bondeds_by_locality(ntype, ild, nthread, idef);
+ divide_bondeds_by_locality(bt, ntype, ild);
}
if (debug)
int t;
fprintf(debug, "%16s", interaction_function[f].name);
- for (t = 0; t < nthread; t++)
+ for (t = 0; t < bt->nthreads; t++)
{
fprintf(debug, " %4d",
- (idef->il_thread_division[f*(nthread + 1) + t + 1] -
- idef->il_thread_division[f*(nthread + 1) + t])/
+ (bt->il_thread_division[f*(bt->nthreads + 1) + t + 1] -
+ bt->il_thread_division[f*(bt->nthreads + 1) + t])/
(1 + NRAL(f)));
}
fprintf(debug, "\n");
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
-calc_bonded_reduction_mask(int natoms,
- f_thread_t *f_thread,
- const t_idef *idef,
- int thread, int nthread)
+calc_bonded_reduction_mask(int natoms,
+ f_thread_t *f_thread,
+ const t_idef *idef,
+ int thread,
+ const bonded_threading_t &bondedThreading)
{
static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");
- if (nthread > BITMASK_SIZE)
+ if (bondedThreading.nthreads > BITMASK_SIZE)
{
#pragma omp master
gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
- nthread, GMX_OPENMP_MAX_THREADS);
+ bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
}
- GMX_ASSERT(nthread <= BITMASK_SIZE, "We need at least nthread bits in the mask");
+ GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
{
int nat1 = interaction_function[ftype].nratoms + 1;
- int nb0 = idef->il_thread_division[ftype*(nthread + 1) + thread];
- int nb1 = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];
+ int nb0 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread];
+ int nb1 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread + 1];
for (int i = nb0; i < nb1; i += nat1)
{
}
}
-void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
+void setup_bonded_threading(bonded_threading_t *bt,
+ int numAtoms,
+ const t_idef *idef)
{
- bonded_threading_t *bt = fr->bonded_threading;
int ctot = 0;
assert(bt->nthreads >= 1);
/* Divide the bonded interaction over the threads */
- divide_bondeds_over_threads(idef,
- bt->nthreads,
- bt->bonded_max_nthread_uniform,
- &bt->haveBondeds);
+ divide_bondeds_over_threads(bt, idef);
if (!bt->haveBondeds)
{
{
try
{
- calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
- idef, t, bt->nthreads);
+ calc_bonded_reduction_mask(numAtoms, &bt->f_t[t],
+ idef, t, *bt);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
/* Reduce the masks over the threads and determine which blocks
* we need to reduce over.
*/
- int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
+ int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;
if (nblock_tot > bt->block_nalloc)
{
bt->block_nalloc = over_alloc_large(nblock_tot);
fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
reduction_block_size, bt->nblock_used);
fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
- ctot*reduction_block_size/(double)fr->natoms_force,
+ ctot*reduction_block_size/(double)numAtoms,
ctot/(double)bt->nblock_used);
}
}
-void tear_down_bonded_threading(bonded_threading_t *bt,
- t_idef *idef)
+void tear_down_bonded_threading(bonded_threading_t *bt)
{
for (int th = 0; th < bt->nthreads; th++)
{
}
}
sfree(bt->f_t);
+ sfree(bt->il_thread_division);
sfree(bt);
- sfree(idef->il_thread_division);
}
void init_bonded_threading(FILE *fplog, int nenergrp,
if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
{
- sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
+ sscanf(ptr, "%d", &bt->max_nthread_uniform);
if (fplog != nullptr)
{
fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
- bt->bonded_max_nthread_uniform);
+ bt->max_nthread_uniform);
}
}
else
{
- bt->bonded_max_nthread_uniform = max_nthread_uniform;
+ bt->max_nthread_uniform = max_nthread_uniform;
}
*bt_ptr = bt;
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/topology/idef.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-
/*! \brief Divide the listed interactions over the threads
*
* Uses fr->nthreads for the number of threads, and sets up the
* bonded setup changes; i.e. at start-up without domain decomposition
* and at DD.
*/
-void setup_bonded_threading(t_forcerec *fr, t_idef *idef);
+void setup_bonded_threading(bonded_threading_t *bt,
+ int numAtoms,
+ const t_idef *idef);
//! Destructor.
-void tear_down_bonded_threading(bonded_threading_t *bt,
- t_idef *idef);
+void tear_down_bonded_threading(bonded_threading_t *bt);
/*! \brief Initialize the bonded threading data structures
*
void init_bonded_threading(FILE *fplog, int nenergrp,
struct bonded_threading_t **bt_ptr);
-#ifdef __cplusplus
-}
-#endif
-
#endif
/* Initialize the thread working data for bonded interactions */
init_bonded_threading(fp, mtop->groups.grps[egcENER].nr,
- &fr->bonded_threading);
+ &fr->bondedThreading);
fr->nthread_ewc = gmx_omp_nthreads_get(emntBonded);
snew(fr->ewc_t, fr->nthread_ewc);
sfree(fr->nblists);
done_ns(fr->ns, numEnergyGroups);
sfree(fr->ewc_t);
+ tear_down_bonded_threading(fr->bondedThreading);
+ fr->bondedThreading = nullptr;
sfree(fr);
}
make_local_shells(cr, mdatoms, shellfc);
}
- setup_bonded_threading(fr, &top->idef);
+ setup_bonded_threading(fr->bondedThreading,
+ fr->natoms_force,
+ &top->idef);
gmx_pme_reinit_atoms(fr->pmedata, numHomeAtoms, mdatoms->chargeA);
/* This handles the PP+PME rank case where fr->pmedata is valid.
* TODO: this also does not account for TPI.
*/
}
-
-void mdAlgorithmsTearDownAtomData(bonded_threading_t *bt,
- gmx_localtop_t *top)
-{
- tear_down_bonded_threading(bt, &top->idef);
-}
gmx_vsite_t *vsite,
gmx_shellfc_t *shellfc);
-/*! \brief Clean up after MD algorithms.
- *
- * \param[out] bt Manages the bonded threading.
- * \param[out] top The local topology
- */
-void mdAlgorithmsTearDownAtomData(bonded_threading_t *bt,
- gmx_localtop_t *top);
-
#endif
destroy_enerdata(enerd);
sfree(enerd);
- mdAlgorithmsTearDownAtomData(fr->bonded_threading, top);
- fr->bonded_threading = nullptr;
sfree(top);
}
real userreal4;
/* Pointer to struct for managing threading of bonded force calculation */
- struct bonded_threading_t *bonded_threading;
+ struct bonded_threading_t *bondedThreading;
/* Ewald correction thread local virial and energy data */
int nthread_ewc;
t_ilist il[F_NRE];
int ilsort;
- int nthreads;
- int *il_thread_division;
- int il_thread_division_nalloc;
} t_idef;
/*
* such as LJ and COUL will have 0 entries.
* int ilsort
* The state of the sorting of il, values are provided above.
- * int nthreads
- * The number of threads used to set il_thread_division.
- * int *il_thread_division
- * The division of the normal bonded interactions of threads.
- * il_thread_division[ftype*(nthreads+1)+t] contains an index
- * into il[ftype].iatoms; thread th operates on t=th to t=th+1.
- * int il_thread_division_nalloc
- * The allocated size of il_thread_division,
- * should be at least F_NRE*(nthreads+1).
*/
void pr_iparams(FILE *fp, t_functype ftype, const t_iparams *iparams);