2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief This file defines functions for managing threading of listed
41 * \author Mark Abraham <mark.j.abraham@gmail.com>
42 * \ingroup module_listed-forces
46 #include "manage-threading.h"
54 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
55 #include "gromacs/listed-forces/listed-forces.h"
56 #include "gromacs/pbcutil/ishift.h"
57 #include "gromacs/utility/fatalerror.h"
58 #include "gromacs/utility/smalloc.h"
59 #include "gromacs/utility/stringutil.h"
61 #include "listed-internal.h"
63 /*! \brief struct for passing all data required for a function type */
65 int ftype; /**< the function type index */
66 t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */
67 int nat; /**< nr of atoms involved in a single ftype interaction */
70 /*! \brief Divides listed interactions over threads
72 * This routine attempts to divide all interactions of the ntype bondeds
73 * types stored in ild over the threads such that each thread has roughly
74 * equal load and different threads avoid touching the same atoms as much
77 static void divide_bondeds_by_locality(int ntype,
78 const ilist_data_t *ild,
83 int ind[F_NRE]; /* index into the ild[].il->iatoms */
84 int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
87 assert(ntype <= F_NRE);
90 for (f = 0; f < ntype; f++)
92 /* Sum #bondeds*#atoms_per_bond over all bonded types */
93 nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
94 /* The start bound for thread 0 is 0 for all interactions */
96 /* Initialize the next atom index array */
97 assert(ild[f].il->nr > 0);
98 at_ind[f] = ild[f].il->iatoms[1];
102 /* Loop over the end bounds of the nthread threads to determine
103 * which interactions threads 0 to nthread shall calculate.
105 * NOTE: The cost of these combined loops is #interactions*ntype.
106 * This code is running single threaded (difficult to parallelize
107 * over threads). So the relative cost of this function increases
108 * linearly with the number of threads. Since the inner-most loop
109 * is cheap and this is done only at DD repartitioning, the cost should
110 * be negligble. At high thread count many other parts of the code
111 * scale the same way, so it's (currently) not worth improving this.
113 for (t = 1; t <= nthread; t++)
117 /* Here we assume that the computational cost is proportional
118 * to the number of atoms in the interaction. This is a rough
119 * measure, but roughly correct. Usually there are very few
120 * interactions anyhow and there are distributed relatively
121 * uniformly. Proper and RB dihedrals are often distributed
122 * non-uniformly, but their cost is roughly equal.
124 nat_thread = (nat_tot*t)/nthread;
126 while (nat_sum < nat_thread)
128 /* To divide bonds based on atom order, we compare
129 * the index of the first atom in the bonded interaction.
130 * This works well, since the domain decomposition generates
131 * bondeds in order of the atoms by looking up interactions
132 * which are linked to the first atom in each interaction.
133 * It usually also works well without DD, since than the atoms
134 * in bonded interactions are usually in increasing order.
135 * If they are not assigned in increasing order, the balancing
136 * is still good, but the memory access and reduction cost will
141 /* Find out which of the types has the lowest atom index */
143 for (f = 1; f < ntype; f++)
145 if (at_ind[f] < at_ind[f_min])
150 assert(f_min >= 0 && f_min < ntype);
152 /* Assign the interaction with the lowest atom index (of type
153 * index f_min) to thread t-1 by increasing ind.
155 ind[f_min] += ild[f_min].nat + 1;
156 nat_sum += ild[f_min].nat;
158 /* Update the first unassigned atom index for this type */
159 if (ind[f_min] < ild[f_min].il->nr)
161 at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
165 /* We have assigned all interactions of this type.
166 * Setting at_ind to INT_MAX ensures this type will not be
167 * chosen in the for loop above during next iterations.
169 at_ind[f_min] = INT_MAX;
173 /* Store the bonded end boundaries (at index t) for thread t-1 */
174 for (f = 0; f < ntype; f++)
176 idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
180 for (f = 0; f < ntype; f++)
182 assert(ind[f] == ild[f].il->nr);
186 //! Divides bonded interactions over threads
187 static void divide_bondeds_over_threads(t_idef *idef,
189 int max_nthread_uniform)
191 ilist_data_t ild[F_NRE];
197 idef->nthreads = nthread;
199 if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
201 idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
202 snew(idef->il_thread_division, idef->il_thread_division_nalloc);
206 for (f = 0; f < F_NRE; f++)
208 if (!ftype_is_bonded_potential(f))
213 if (idef->il[f].nr == 0)
215 /* No interactions, avoid all the integer math below */
217 for (t = 0; t <= nthread; t++)
219 idef->il_thread_division[f*(nthread + 1) + t] = 0;
222 else if (nthread <= max_nthread_uniform || f == F_DISRES)
224 /* On up to 4 threads, load balancing the bonded work
225 * is more important than minimizing the reduction cost.
229 /* nat1 = 1 + #atoms(ftype) which is the stride use for iatoms */
232 for (t = 0; t <= nthread; t++)
236 /* Divide equally over the threads */
237 nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
241 /* Ensure that distance restraint pairs with the same label
242 * end up on the same thread.
244 while (nr_t > 0 && nr_t < idef->il[f].nr &&
245 idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
246 idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
252 idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
257 /* Add this ftype to the list to be distributed */
261 ild[ntype].ftype = f;
262 ild[ntype].il = &idef->il[f];
263 ild[ntype].nat = nat;
265 /* The first index for the thread division is always 0 */
266 idef->il_thread_division[f*(nthread + 1)] = 0;
274 divide_bondeds_by_locality(ntype, ild, nthread, idef);
281 fprintf(debug, "Division of bondeds over threads:\n");
282 for (f = 0; f < F_NRE; f++)
284 if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
288 fprintf(debug, "%16s", interaction_function[f].name);
289 for (t = 0; t < nthread; t++)
291 fprintf(debug, " %4d",
292 (idef->il_thread_division[f*(nthread + 1) + t + 1] -
293 idef->il_thread_division[f*(nthread + 1) + t])/
296 fprintf(debug, "\n");
302 //! Construct a reduction mask for which interaction was computed on which thread
304 calc_bonded_reduction_mask(gmx_bitmask_t *mask,
309 int ftype, nb, nat1, nb0, nb1, i, a;
312 for (ftype = 0; ftype < F_NRE; ftype++)
314 if (ftype_is_bonded_potential(ftype))
316 nb = idef->il[ftype].nr;
319 nat1 = interaction_function[ftype].nratoms + 1;
321 /* Divide this interaction equally over the threads.
322 * This is not stored: should match division in calc_bonds.
324 nb0 = idef->il_thread_division[ftype*(nt+1)+t];
325 nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
327 for (i = nb0; i < nb1; i += nat1)
329 for (a = 1; a < nat1; a++)
331 bitmask_set_bit(mask, idef->il[ftype].iatoms[i+a]>>shift);
340 /*! \brief We divide the force array in a maximum of BITMASK_SIZE (default 32) blocks.
341 * Minimum force block reduction size is thus 2^6=64.
343 const int maxBlockBits = BITMASK_SIZE;
345 void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
347 bonded_threading_t *bt;
351 bt = fr->bonded_threading;
353 assert(bt->nthreads >= 1);
355 /* Divide the bonded interaction over the threads */
356 divide_bondeds_over_threads(idef,
358 bt->bonded_max_nthread_uniform);
360 if (bt->nthreads == 1)
368 while (fr->natoms_force > (int)(maxBlockBits*(1U<<bt->red_ashift)))
374 fprintf(debug, "bonded force buffer block atom shift %d bits\n",
378 /* Determine to which blocks each thread's bonded force calculation
379 * contributes. Store this is a mask for each thread.
381 #pragma omp parallel for num_threads(bt->nthreads) schedule(static)
382 for (t = 1; t < bt->nthreads; t++)
384 calc_bonded_reduction_mask(&bt->f_t[t].red_mask,
385 idef, bt->red_ashift, t, bt->nthreads);
388 /* Determine the maximum number of blocks we need to reduce over */
391 for (t = 0; t < bt->nthreads; t++)
394 for (b = 0; b < maxBlockBits; b++)
396 if (bitmask_is_set(bt->f_t[t].red_mask, b))
398 bt->red_nblock = std::max(bt->red_nblock, b+1);
404 #if BITMASK_SIZE <= 64 //move into bitmask when it is C++
405 std::string flags = gmx::formatString("%x", bt->f_t[t].red_mask);
407 std::string flags = gmx::formatAndJoin(bt->f_t[t].red_mask,
408 bt->f_t[t].red_mask+BITMASK_ALEN,
409 "", gmx::StringFormatter("%x"));
411 fprintf(debug, "thread %d flags %s count %d\n",
412 t, flags.c_str(), c);
418 fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
419 bt->red_nblock, 1<<bt->red_ashift);
420 fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
421 ctot*(1<<bt->red_ashift)/(double)fr->natoms_force,
422 ctot*(1<<bt->red_ashift)/(double)(fr->natoms_force*bt->nthreads));
426 void init_bonded_threading(FILE *fplog, int nenergrp,
427 struct bonded_threading_t **bt_ptr)
429 bonded_threading_t *bt;
433 /* These thread local data structures are used for bondeds only */
434 bt->nthreads = gmx_omp_nthreads_get(emntBonded);
436 if (bt->nthreads > 1)
440 snew(bt->f_t, bt->nthreads);
441 #pragma omp parallel for num_threads(bt->nthreads) schedule(static)
442 for (t = 0; t < bt->nthreads; t++)
444 /* Thread 0 uses the global force and energy arrays */
450 bt->f_t[t].f_nalloc = 0;
451 snew(bt->f_t[t].fshift, SHIFTS);
452 bt->f_t[t].grpp.nener = nenergrp*nenergrp;
453 for (i = 0; i < egNR; i++)
455 snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
460 /* The optimal value after which to switch from uniform to localized
461 * bonded interaction distribution is 3, 4 or 5 depending on the system
464 const int max_nthread_uniform = 4;
467 if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
469 sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
472 fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
473 bt->bonded_max_nthread_uniform);
478 bt->bonded_max_nthread_uniform = max_nthread_uniform;