/*! \brief struct for passing all data required for a function type */
typedef struct {
- int ftype; /**< the function type index */
- t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */
- int nat; /**< nr of atoms involved in a single ftype interaction */
+ int ftype; /**< the function type index */
+ const t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */
+ int nat; /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
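/* Illustrative sketch (editor's aside, not part of this patch): an ild[] entry is
 * filled once per bonded function type, roughly as below. The helper name
 * fillIlistData is hypothetical; the assignments only mirror what the struct
 * documents and what is visible further down (ild[ntype].nat = nat).
 */
static void fillIlistData(ilist_data_t *entry, int ftype, const t_idef *idef)
{
    entry->ftype = ftype;             /* the function type index */
    entry->il    = &idef->il[ftype];  /* t_ilist entry for this ftype */
    entry->nat   = NRAL(ftype);       /* atoms per interaction of this type */
}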
/*! \brief Divides listed interactions over threads
 *
 * The division aims at an equal load per thread, while different threads
 * avoid touching the same atoms as much as possible.
 */
-static void divide_bondeds_by_locality(int ntype,
- const ilist_data_t *ild,
- int nthread,
- t_idef *idef)
+static void divide_bondeds_by_locality(bonded_threading_t *bt,
+ int ntype,
+ const ilist_data_t *ild)
{
int nat_tot, nat_sum;
int ind[F_NRE]; /* index into the ild[].il->iatoms */
}
nat_sum = 0;
- /* Loop over the end bounds of the nthread threads to determine
- * which interactions threads 0 to nthread shall calculate.
+ /* Loop over the end bounds of the nthreads threads to determine
+ * which interactions threads 0 to nthreads shall calculate.
*
* NOTE: The cost of these combined loops is #interactions*ntype.
* This code is running single threaded (difficult to parallelize
* over threads), but its cost should be negligible. At high
* thread count many other parts of the code scale the same way,
* so it's (currently) not worth improving this.
*/
- for (t = 1; t <= nthread; t++)
+ for (t = 1; t <= bt->nthreads; t++)
{
int nat_thread;
* uniformly. Proper and RB dihedrals are often distributed
* non-uniformly, but their cost is roughly equal.
*/
- nat_thread = (nat_tot*t)/nthread;
+ nat_thread = (nat_tot*t)/bt->nthreads;
while (nat_sum < nat_thread)
{
/* Store the bonded end boundaries (at index t) for thread t-1 */
for (f = 0; f < ntype; f++)
{
- idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
+ bt->il_thread_division[ild[f].ftype*(bt->nthreads + 1) + t] = ind[f];
}
}
}
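/* Illustrative sketch (editor's aside, not part of this patch): the boundaries
 * stored above use the layout il_thread_division[ftype*(nthreads + 1) + t], so
 * entries t and t+1 delimit the iatoms slice that thread t works on. The helper
 * name getThreadIatomsRange is hypothetical and only restates that indexing.
 */
static void getThreadIatomsRange(const bonded_threading_t *bt,
                                 int ftype, int thread,
                                 int *start, int *end)
{
    *start = bt->il_thread_division[ftype*(bt->nthreads + 1) + thread];
    *end   = bt->il_thread_division[ftype*(bt->nthreads + 1) + thread + 1];
}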
//! Divides bonded interactions over threads
-static void divide_bondeds_over_threads(t_idef *idef,
- int nthread,
- int max_nthread_uniform,
- bool *haveBondeds)
+static void divide_bondeds_over_threads(bonded_threading_t *bt,
+ const t_idef *idef)
{
ilist_data_t ild[F_NRE];
int ntype;
int f;
- assert(nthread > 0);
+ assert(bt->nthreads > 0);
- idef->nthreads = nthread;
-
- if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
+ if (F_NRE*(bt->nthreads + 1) > bt->il_thread_division_nalloc)
{
- idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
- srenew(idef->il_thread_division, idef->il_thread_division_nalloc);
+ bt->il_thread_division_nalloc = F_NRE*(bt->nthreads + 1);
+ srenew(bt->il_thread_division, bt->il_thread_division_nalloc);
}
- *haveBondeds = false;
- ntype = 0;
+ bt->haveBondeds = false;
+ ntype = 0;
for (f = 0; f < F_NRE; f++)
{
if (!ftype_is_bonded_potential(f))
if (idef->il[f].nr > 0)
{
- *haveBondeds = true;
+ bt->haveBondeds = true;
}
if (idef->il[f].nr == 0)
{
/* No interactions, avoid all the integer math below */
int t;
- for (t = 0; t <= nthread; t++)
+ for (t = 0; t <= bt->nthreads; t++)
{
- idef->il_thread_division[f*(nthread + 1) + t] = 0;
+ bt->il_thread_division[f*(bt->nthreads + 1) + t] = 0;
}
}
- else if (nthread <= max_nthread_uniform || f == F_DISRES)
+ else if (bt->nthreads <= bt->max_nthread_uniform || f == F_DISRES)
{
/* On up to 4 threads, load balancing the bonded work
* is more important than minimizing the reduction cost.
/* nat1 = 1 + #atoms(ftype), which is the stride used for iatoms */
nat1 = 1 + NRAL(f);
- for (t = 0; t <= nthread; t++)
+ for (t = 0; t <= bt->nthreads; t++)
{
int nr_t;
/* Divide equally over the threads */
- nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
+ nr_t = (((idef->il[f].nr/nat1)*t)/bt->nthreads)*nat1;
if (f == F_DISRES)
{
}
}
- idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
+ bt->il_thread_division[f*(bt->nthreads + 1) + t] = nr_t;
}
}
else
ild[ntype].nat = nat;
/* The first index for the thread division is always 0 */
- idef->il_thread_division[f*(nthread + 1)] = 0;
+ bt->il_thread_division[f*(bt->nthreads + 1)] = 0;
ntype++;
}
if (ntype > 0)
{
- divide_bondeds_by_locality(ntype, ild, nthread, idef);
+ divide_bondeds_by_locality(bt, ntype, ild);
}
if (debug)
int t;
fprintf(debug, "%16s", interaction_function[f].name);
- for (t = 0; t < nthread; t++)
+ for (t = 0; t < bt->nthreads; t++)
{
fprintf(debug, " %4d",
- (idef->il_thread_division[f*(nthread + 1) + t + 1] -
- idef->il_thread_division[f*(nthread + 1) + t])/
+ (bt->il_thread_division[f*(bt->nthreads + 1) + t + 1] -
+ bt->il_thread_division[f*(bt->nthreads + 1) + t])/
(1 + NRAL(f)));
}
fprintf(debug, "\n");
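/* Illustrative sketch (editor's aside, not part of this patch): the uniform
 * division above rounds each boundary down to a multiple of the iatoms stride
 * nat1, so no single interaction is split across threads. uniformBoundary is a
 * hypothetical helper that only restates the expression used for nr_t.
 */
static int uniformBoundary(int nr, int nat1, int t, int nthreads)
{
    /* e.g. nr = 100, nat1 = 4, nthreads = 4 gives boundaries 0, 24, 48, 72, 100 */
    return ((nr/nat1)*t/nthreads)*nat1;
}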
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
-calc_bonded_reduction_mask(int natoms,
- f_thread_t *f_thread,
- const t_idef *idef,
- int thread, int nthread)
+calc_bonded_reduction_mask(int natoms,
+ f_thread_t *f_thread,
+ const t_idef *idef,
+ int thread,
+ const bonded_threading_t &bondedThreading)
{
static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");
- if (nthread > BITMASK_SIZE)
+ if (bondedThreading.nthreads > BITMASK_SIZE)
{
#pragma omp master
gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
- nthread, GMX_OPENMP_MAX_THREADS);
+ bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
#pragma omp barrier
}
- GMX_ASSERT(nthread <= BITMASK_SIZE, "We need at least nthread bits in the mask");
+ GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
{
int nat1 = interaction_function[ftype].nratoms + 1;
- int nb0 = idef->il_thread_division[ftype*(nthread + 1) + thread];
- int nb1 = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];
+ int nb0 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread];
+ int nb1 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread + 1];
for (int i = nb0; i < nb1; i += nat1)
{
}
}
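/* Illustrative sketch (editor's aside, not part of this patch): the reduction
 * mask works on blocks of reduction_block_size atoms, so the block an atom
 * belongs to is its index shifted down by reduction_block_bits, exactly as in
 * the nblock computation above. blockIndex is a hypothetical helper.
 */
static int blockIndex(int atomIndex)
{
    return atomIndex >> reduction_block_bits; /* block containing atomIndex */
}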
-void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
+void setup_bonded_threading(bonded_threading_t *bt,
+ int numAtoms,
+ const t_idef *idef)
{
- bonded_threading_t *bt = fr->bonded_threading;
int ctot = 0;
assert(bt->nthreads >= 1);
/* Divide the bonded interactions over the threads */
- divide_bondeds_over_threads(idef,
- bt->nthreads,
- bt->bonded_max_nthread_uniform,
- &bt->haveBondeds);
+ divide_bondeds_over_threads(bt, idef);
if (!bt->haveBondeds)
{
{
try
{
- calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
- idef, t, bt->nthreads);
+ calc_bonded_reduction_mask(numAtoms, &bt->f_t[t],
+ idef, t, *bt);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
/* Reduce the masks over the threads and determine which blocks
* we need to reduce over.
*/
- int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
+ int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;
if (nblock_tot > bt->block_nalloc)
{
bt->block_nalloc = over_alloc_large(nblock_tot);
fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
reduction_block_size, bt->nblock_used);
fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
- ctot*reduction_block_size/(double)fr->natoms_force,
+ ctot*reduction_block_size/(double)numAtoms,
ctot/(double)bt->nblock_used);
}
}
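/* Illustrative usage sketch (editor's aside, not part of this patch): with the
 * new signatures the caller owns the bonded_threading_t and passes the atom
 * count explicitly instead of a whole t_forcerec. Based on the call sites
 * removed above; fr->bonded_threading and fr->natoms_force are taken from the
 * old code and may differ at other call sites.
 */
static void rebuildBondedThreading(t_forcerec *fr, const t_idef *idef)
{
    /* Re-divide bonded work and rebuild the force-reduction masks */
    setup_bonded_threading(fr->bonded_threading, fr->natoms_force, idef);
}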
-void tear_down_bonded_threading(bonded_threading_t *bt,
- t_idef *idef)
+void tear_down_bonded_threading(bonded_threading_t *bt)
{
for (int th = 0; th < bt->nthreads; th++)
{
}
}
sfree(bt->f_t);
+ sfree(bt->il_thread_division);
sfree(bt);
- sfree(idef->il_thread_division);
}
void init_bonded_threading(FILE *fplog, int nenergrp,
if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
{
- sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
+ sscanf(ptr, "%d", &bt->max_nthread_uniform);
if (fplog != nullptr)
{
fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
- bt->bonded_max_nthread_uniform);
+ bt->max_nthread_uniform);
}
}
else
{
- bt->bonded_max_nthread_uniform = max_nthread_uniform;
+ bt->max_nthread_uniform = max_nthread_uniform;
}
*bt_ptr = bt;
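/* Illustrative sketch (editor's aside, not part of this patch): the sscanf above
 * does not check whether parsing GMX_BONDED_NTHREAD_UNIFORM succeeded. A stricter
 * variant could validate the override first; parseMaxThreadsUniform is a
 * hypothetical helper, not part of this change.
 */
static int parseMaxThreadsUniform(const char *env, int defaultValue)
{
    int value;
    if (env != nullptr && sscanf(env, "%d", &value) == 1 && value > 0)
    {
        return value;
    }
    return defaultValue;
}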