/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief This file defines functions for managing threading of listed
 * interactions.
 *
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \ingroup module_listed_forces
 */
#include "manage_threading.h"

#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/smalloc.h"

#include "listed_internal.h"
#include "utilities.h"
/*! \brief struct for passing all data required for a function type */
typedef struct {
    const t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
    int            ftype; /**< the function type index */
    int            nat;   /**< nr of atoms involved in a single ftype interaction */
} ilist_data_t;
/*! \brief Divides listed interactions over threads
 *
 * This routine attempts to divide all interactions of the ntype bonded
 * types stored in ild over the threads such that each thread has roughly
 * equal load and different threads avoid touching the same atoms as much
 * as possible.
 */
static void divide_bondeds_by_locality(bonded_threading_t *bt,
                                       int                 ntype,
                                       const ilist_data_t *ild)
    int ind[F_NRE];    /* index into the ild[].il->iatoms */
    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */

    assert(ntype <= F_NRE);

    nat_tot = 0;
    for (f = 0; f < ntype; f++)
        /* Sum #bondeds*#atoms_per_bond over all bonded types */
        nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;

        /* The start bound for thread 0 is 0 for all interactions */
        ind[f] = 0;

        /* Initialize the next atom index array */
        assert(ild[f].il->nr > 0);
        at_ind[f] = ild[f].il->iatoms[1];
    /* Loop over the end bounds of the nthreads threads to determine
     * which interactions threads 0 to nthreads-1 shall calculate.
     *
     * NOTE: The cost of these combined loops is #interactions*ntype.
     * This code is running single threaded (difficult to parallelize
     * over threads). So the relative cost of this function increases
     * linearly with the number of threads. Since the inner-most loop
     * is cheap and this is done only at DD repartitioning, the cost should
     * be negligible. At high thread count many other parts of the code
     * scale the same way, so it's (currently) not worth improving this.
     */
    for (t = 1; t <= bt->nthreads; t++)
        /* Here we assume that the computational cost is proportional
         * to the number of atoms in the interaction. This is a rough
         * measure, but roughly correct. Usually there are very few
         * interactions anyhow and they are distributed relatively
         * uniformly. Proper and RB dihedrals are often distributed
         * non-uniformly, but their cost is roughly equal.
         */
        nat_thread = (nat_tot*t)/bt->nthreads;
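
        /* Example (illustrative): with nat_tot = 120 atom-weighted interactions
         * and 3 threads, the target end bounds are nat_thread = 40, 80 and 120,
         * so each thread receives roughly a third of the bonded work.
         */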
        while (nat_sum < nat_thread)
            /* To divide bonds based on atom order, we compare
             * the index of the first atom in the bonded interaction.
             * This works well, since the domain decomposition generates
             * bondeds in order of the atoms by looking up interactions
             * which are linked to the first atom in each interaction.
             * It usually also works well without DD, since then the atoms
             * in bonded interactions are usually in increasing order.
             * If they are not assigned in increasing order, the balancing
             * is still good, but the memory access and reduction cost will
             * be higher.
             */
            /* Find out which of the types has the lowest atom index */
            f_min = 0;
            for (f = 1; f < ntype; f++)
                if (at_ind[f] < at_ind[f_min])
                    f_min = f;
            assert(f_min >= 0 && f_min < ntype);

            /* Assign the interaction with the lowest atom index (of type
             * index f_min) to thread t-1 by increasing ind.
             */
            ind[f_min] += ild[f_min].nat + 1;
            nat_sum    += ild[f_min].nat;
            /* Update the first unassigned atom index for this type */
            if (ind[f_min] < ild[f_min].il->nr)
                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
            else
                /* We have assigned all interactions of this type.
                 * Setting at_ind to INT_MAX ensures this type will not be
                 * chosen in the for loop above in later iterations.
                 */
                at_ind[f_min] = INT_MAX;
        /* Store the bonded end boundaries (at index t) for thread t-1 */
        for (f = 0; f < ntype; f++)
            bt->il_thread_division[ild[f].ftype*(bt->nthreads + 1) + t] = ind[f];

    for (f = 0; f < ntype; f++)
        assert(ind[f] == ild[f].il->nr);
//! Return whether function type \p ftype in \p idef has perturbed interactions
static bool ftypeHasPerturbedEntries(const t_idef &idef,
                                     int           ftype)
    GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
               "Perturbed interactions should be sorted here");
    const t_ilist &ilist = idef.il[ftype];
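    /* With free-energy sorting (ilsortFE_SORTED), perturbed interactions are
     * stored at the end of the list, after the first nr_nonperturbed entries.
     */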
    return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
//! Divides bonded interactions over threads and GPU
static void divide_bondeds_over_threads(bonded_threading_t *bt,
                                        bool                useGpuForBondeds,
                                        const t_idef       &idef)
    ilist_data_t ild[F_NRE];

    assert(bt->nthreads > 0);
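    /* The division array stores, per function type, nthreads+1 boundaries
     * into the iatoms list: a start index plus one end index per thread.
     */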
    if (F_NRE*(bt->nthreads + 1) > bt->il_thread_division_nalloc)
        bt->il_thread_division_nalloc = F_NRE*(bt->nthreads + 1);
        srenew(bt->il_thread_division, bt->il_thread_division_nalloc);
    bt->haveBondeds      = false;
    int    ntype         = 0;
    size_t ftypeGpuIndex = 0;
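    /* Loop over all function types; ftypeGpuIndex tracks our position in the
     * list of types that the GPU bonded module can handle.
     */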
    for (int ftype = 0; ftype < F_NRE; ftype++)
        if (!ftype_is_bonded_potential(ftype))
            continue;

        const t_ilist &il                     = idef.il[ftype];
        int            nrToAssignToCpuThreads = il.nr;

        if (useGpuForBondeds &&
            ftypeGpuIndex < gmx::ftypesOnGpu.size() &&
            gmx::ftypesOnGpu[ftypeGpuIndex] == ftype)
            /* Perturbation is not implemented in the GPU bonded kernels.
             * Instead of doing everything on the CPU, we could do only
             * the actually perturbed interactions there.
             */
            if (!ftypeHasPerturbedEntries(idef, ftype))
                /* We will assign this interaction type to the GPU */
                nrToAssignToCpuThreads = 0;
        if (nrToAssignToCpuThreads > 0)
            bt->haveBondeds = true;

        if (nrToAssignToCpuThreads == 0)
            /* No interactions, avoid all the integer math below */
            for (int t = 0; t <= bt->nthreads; t++)
                bt->il_thread_division[ftype*(bt->nthreads + 1) + t] = 0;
        else if (bt->nthreads <= bt->max_nthread_uniform || ftype == F_DISRES)
            /* On up to 4 threads, load balancing the bonded work
             * is more important than minimizing the reduction cost.
             */
            const int stride = 1 + NRAL(ftype);
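
            /* Each interaction in iatoms occupies one parameter-type index plus
             * NRAL(ftype) atom indices; give each thread a contiguous,
             * stride-aligned slice of the list.
             */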
            for (int t = 0; t <= bt->nthreads; t++)
                /* Divide equally over the threads */
                int nr_t = (((nrToAssignToCpuThreads/stride)*t)/bt->nthreads)*stride;
                if (ftype == F_DISRES)
                    /* Ensure that distance restraint pairs with the same label
                     * end up on the same thread.
                     */
                    while (nr_t > 0 && nr_t < nrToAssignToCpuThreads &&
                           idef.iparams[il.iatoms[nr_t]].disres.label ==
                           idef.iparams[il.iatoms[nr_t - stride]].disres.label)
                        nr_t += stride;

                bt->il_thread_division[ftype*(bt->nthreads + 1) + t] = nr_t;
        else
            /* Add this ftype to the list to be distributed */
            int nat          = NRAL(ftype);
            ild[ntype].ftype = ftype;
            ild[ntype].il    = &il;
            ild[ntype].nat   = nat;

            /* The first index for the thread division is always 0 */
            bt->il_thread_division[ftype*(bt->nthreads + 1)] = 0;

            ntype++;
    divide_bondeds_by_locality(bt, ntype, ild);
    fprintf(debug, "Division of bondeds over threads:\n");
    for (f = 0; f < F_NRE; f++)
        if (ftype_is_bonded_potential(f) && idef.il[f].nr > 0)
            fprintf(debug, "%16s", interaction_function[f].name);
            for (t = 0; t < bt->nthreads; t++)
                fprintf(debug, " %4d",
                        (bt->il_thread_division[f*(bt->nthreads + 1) + t + 1] -
                         bt->il_thread_division[f*(bt->nthreads + 1) + t])/
                        (1 + NRAL(f)));
            fprintf(debug, "\n");
//! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
static void
calc_bonded_reduction_mask(int                       natoms,
                           f_thread_t               *f_thread,
                           const t_idef             &idef,
                           int                       thread,
                           const bonded_threading_t &bondedThreading)
    static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");

    if (bondedThreading.nthreads > BITMASK_SIZE)
        gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
                  bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);

    GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
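
    /* Each reduction block covers reduction_block_size atoms; round up so the
     * final partial block is counted.
     */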
    int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;

    if (nblock > f_thread->block_nalloc)
        f_thread->block_nalloc = over_alloc_large(nblock);
        srenew(f_thread->mask, f_thread->block_nalloc);
        srenew(f_thread->block_index, f_thread->block_nalloc);
        sfree_aligned(f_thread->f);
        snew_aligned(f_thread->f, f_thread->block_nalloc*reduction_block_size, 128);

    gmx_bitmask_t *mask = f_thread->mask;

    for (int b = 0; b < nblock; b++)
        bitmask_clear(&mask[b]);

    for (int ftype = 0; ftype < F_NRE; ftype++)
        if (ftype_is_bonded_potential(ftype))
            int nb = idef.il[ftype].nr;

            int nat1 = interaction_function[ftype].nratoms + 1;

            int nb0 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread];
            int nb1 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread + 1];
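
            /* Flag every force-buffer block that contains an atom of an
             * interaction assigned to this thread.
             */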
            for (int i = nb0; i < nb1; i += nat1)
                for (int a = 1; a < nat1; a++)
                    bitmask_set_bit(&mask[idef.il[ftype].iatoms[i+a] >> reduction_block_bits], thread);

    /* Make an index of the blocks our thread touches, so we can do fast
     * force buffer clearing.
     */
    f_thread->nblock_used = 0;
    for (int b = 0; b < nblock; b++)
        if (bitmask_is_set(mask[b], thread))
            f_thread->block_index[f_thread->nblock_used++] = b;
void setup_bonded_threading(bonded_threading_t *bt,
                            int                 numAtoms,
                            bool                useGpuForBondeds,
                            const t_idef       &idef)
    assert(bt->nthreads >= 1);
    /* Divide the bonded interactions over the threads */
    divide_bondeds_over_threads(bt, useGpuForBondeds, idef);
    if (!bt->haveBondeds)
        /* We don't have bondeds, so there is nothing to reduce */
        return;
    /* Determine to which blocks each thread's bonded force calculation
     * contributes. Store this as a mask for each thread.
     */
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
        calc_bonded_reduction_mask(numAtoms, &bt->f_t[t],
                                   idef, t, *bt);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;
    if (nblock_tot > bt->block_nalloc)
        bt->block_nalloc = over_alloc_large(nblock_tot);
        srenew(bt->block_index, bt->block_nalloc);
        srenew(bt->mask, bt->block_nalloc);
    for (int b = 0; b < nblock_tot; b++)
        gmx_bitmask_t *mask = &bt->mask[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(mask);
        for (int t = 0; t < bt->nthreads; t++)
            bitmask_union(mask, bt->f_t[t].mask[b]);

        if (!bitmask_is_zero(*mask))
            bt->block_index[bt->nblock_used++] = b;
        for (int t = 0; t < bt->nthreads; t++)
            if (bitmask_is_set(*mask, t))
                c++;

        fprintf(debug, "block %d flags %s count %d\n",
                b, to_hex_string(*mask).c_str(), c);
    fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
            reduction_block_size, bt->nblock_used);
    fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
            ctot*reduction_block_size/static_cast<double>(numAtoms),
            ctot/static_cast<double>(bt->nblock_used));
void tear_down_bonded_threading(bonded_threading_t *bt)
    for (int th = 0; th < bt->nthreads; th++)
        sfree(bt->f_t[th].mask);
        sfree(bt->f_t[th].fshift);
        sfree(bt->f_t[th].block_index);
        sfree_aligned(bt->f_t[th].f);
        for (int i = 0; i < egNR; i++)
            sfree(bt->f_t[th].grpp.ener[i]);

    sfree(bt->il_thread_division);
void init_bonded_threading(FILE *fplog, int nenergrp,
                           struct bonded_threading_t **bt_ptr)
    bonded_threading_t *bt;

    snew(bt, 1);
    /* These thread local data structures are used for bondeds only.
     *
     * Note that we also use these structures when running single-threaded.
     * This is because the bonded force buffer uses type rvec4, whereas
     * the normal force buffer uses type rvec. This leads to a little
     * reduction overhead, but the speed gain in the bonded calculations
     * of doing transposeScatterIncr/DecrU with alignment 4 instead of 3
     * is much larger than the reduction overhead.
     */
    bt->nthreads = gmx_omp_nthreads_get(emntBonded);
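
    /* Allocate one force/energy accumulation struct per bonded thread; each
     * thread initializes its own entry in the parallel loop below.
     */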
    snew(bt->f_t, bt->nthreads);
#pragma omp parallel for num_threads(bt->nthreads) schedule(static)
    for (int t = 0; t < bt->nthreads; t++)
        /* Note that thread 0 uses the global fshift and energy arrays,
         * but to keep the code simple, we initialize all data here.
         */
        bt->f_t[t].f        = nullptr;
        bt->f_t[t].f_nalloc = 0;
        snew(bt->f_t[t].fshift, SHIFTS);
        bt->f_t[t].grpp.nener = nenergrp*nenergrp;
        for (int i = 0; i < egNR; i++)
            snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    bt->block_index  = nullptr;
    bt->block_nalloc = 0;
    /* The optimal value after which to switch from uniform to localized
     * bonded interaction distribution is 3, 4 or 5 depending on the system
     * and hardware.
     */
    const int max_nthread_uniform = 4;
    char *ptr;

    if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
        sscanf(ptr, "%d", &bt->max_nthread_uniform);
        if (fplog != nullptr)
            fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
                    bt->max_nthread_uniform);

    bt->max_nthread_uniform = max_nthread_uniform;