2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 * \brief This file defines functions for managing threading of listed
42 * \author Mark Abraham <mark.j.abraham@gmail.com>
43 * \author Berk Hess <hess@kth.se>
44 * \ingroup module_listed_forces
48 #include "manage_threading.h"
60 #include "gromacs/listed_forces/listed_forces_gpu.h"
61 #include "gromacs/pbcutil/ishift.h"
62 #include "gromacs/topology/ifunc.h"
63 #include "gromacs/utility/exceptions.h"
64 #include "gromacs/utility/fatalerror.h"
65 #include "gromacs/utility/gmxassert.h"
67 #include "listed_internal.h"
68 #include "utilities.h"
70 /*! \brief struct for passing all data required for a function type */
73 const InteractionList* il; /**< pointer to t_ilist entry corresponding to ftype */
74 int ftype; /**< the function type index */
75 int nat; /**< nr of atoms involved in a single ftype interaction */
78 /*! \brief Divides listed interactions over threads
80 * This routine attempts to divide all interactions of the numType bondeds
81 * types stored in ild over the threads such that each thread has roughly
82 * equal load and different threads avoid touching the same atoms as much
85 static void divide_bondeds_by_locality(bonded_threading_t* bt, int numType, const ilist_data_t* ild)
88 int ind[F_NRE]; /* index into the ild[].il->iatoms */
89 int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
92 assert(numType <= F_NRE);
95 for (f = 0; f < numType; f++)
97 /* Sum #bondeds*#atoms_per_bond over all bonded types */
98 nat_tot += ild[f].il->size() / (ild[f].nat + 1) * ild[f].nat;
99 /* The start bound for thread 0 is 0 for all interactions */
101 /* Initialize the next atom index array */
102 assert(!ild[f].il->empty());
103 at_ind[f] = ild[f].il->iatoms[1];
107 /* Loop over the end bounds of the nthreads threads to determine
108 * which interactions threads 0 to nthreads shall calculate.
110 * NOTE: The cost of these combined loops is #interactions*numType.
111 * This code is running single threaded (difficult to parallelize
112 * over threads). So the relative cost of this function increases
113 * linearly with the number of threads. Since the inner-most loop
114 * is cheap and this is done only at DD repartitioning, the cost should
115 * be negligble. At high thread count many other parts of the code
116 * scale the same way, so it's (currently) not worth improving this.
118 for (t = 1; t <= bt->nthreads; t++)
122 /* Here we assume that the computational cost is proportional
123 * to the number of atoms in the interaction. This is a rough
124 * measure, but roughly correct. Usually there are very few
125 * interactions anyhow and there are distributed relatively
126 * uniformly. Proper and RB dihedrals are often distributed
127 * non-uniformly, but their cost is roughly equal.
129 nat_thread = (nat_tot * t) / bt->nthreads;
131 while (nat_sum < nat_thread)
133 /* To divide bonds based on atom order, we compare
134 * the index of the first atom in the bonded interaction.
135 * This works well, since the domain decomposition generates
136 * bondeds in order of the atoms by looking up interactions
137 * which are linked to the first atom in each interaction.
138 * It usually also works well without DD, since than the atoms
139 * in bonded interactions are usually in increasing order.
140 * If they are not assigned in increasing order, the balancing
141 * is still good, but the memory access and reduction cost will
146 /* Find out which of the types has the lowest atom index */
148 for (f = 1; f < numType; f++)
150 if (at_ind[f] < at_ind[f_min])
155 assert(f_min >= 0 && f_min < numType);
157 /* Assign the interaction with the lowest atom index (of type
158 * index f_min) to thread t-1 by increasing ind.
160 ind[f_min] += ild[f_min].nat + 1;
161 nat_sum += ild[f_min].nat;
163 /* Update the first unassigned atom index for this type */
164 if (ind[f_min] < ild[f_min].il->size())
166 at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
170 /* We have assigned all interactions of this type.
171 * Setting at_ind to INT_MAX ensures this type will not be
172 * chosen in the for loop above during next iterations.
174 at_ind[f_min] = INT_MAX;
178 /* Store the bonded end boundaries (at index t) for thread t-1 */
179 for (f = 0; f < numType; f++)
181 bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
185 for (f = 0; f < numType; f++)
187 assert(ind[f] == ild[f].il->size());
191 //! Return whether function type \p ftype in \p idef has perturbed interactions
192 static bool ftypeHasPerturbedEntries(const InteractionDefinitions& idef, int ftype)
194 GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
195 "Perturbed interations should be sorted here");
197 const InteractionList& ilist = idef.il[ftype];
199 return (idef.ilsort != ilsortNO_FE && idef.numNonperturbedInteractions[ftype] != ilist.size());
202 //! Divides bonded interactions over threads and GPU
203 static void divide_bondeds_over_threads(bonded_threading_t* bt,
204 bool useGpuForBondeds,
205 const InteractionDefinitions& idef)
207 ilist_data_t ild[F_NRE];
209 GMX_ASSERT(bt->nthreads > 0, "Must have positive number of threads");
210 const int numThreads = bt->nthreads;
212 gmx::ArrayRef<const t_iparams> iparams = idef.iparams;
214 bt->haveBondeds = false;
216 size_t fTypeGpuIndex = 0;
217 for (int fType = 0; fType < F_NRE; fType++)
219 if (!ftype_is_bonded_potential(fType))
224 const InteractionList& il = idef.il[fType];
225 int nrToAssignToCpuThreads = il.size();
227 if (useGpuForBondeds && fTypeGpuIndex < gmx::fTypesOnGpu.size()
228 && gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
232 /* Perturbation is not implemented in the GPU bonded kernels.
233 * But instead of doing all on the CPU, we could do only
234 * the actually perturbed interactions on the CPU.
236 if (!ftypeHasPerturbedEntries(idef, fType))
238 /* We will assign this interaction type to the GPU */
239 nrToAssignToCpuThreads = 0;
243 if (nrToAssignToCpuThreads > 0)
245 bt->haveBondeds = true;
248 if (nrToAssignToCpuThreads == 0)
250 /* No interactions, avoid all the integer math below */
251 for (int t = 0; t <= numThreads; t++)
253 bt->workDivision.setBound(fType, t, 0);
256 else if (numThreads <= bt->max_nthread_uniform || fType == F_DISRES)
258 /* On up to 4 threads, load balancing the bonded work
259 * is more important than minimizing the reduction cost.
262 const int stride = 1 + NRAL(fType);
264 for (int t = 0; t <= numThreads; t++)
266 /* Divide equally over the threads */
267 int nr_t = (((nrToAssignToCpuThreads / stride) * t) / numThreads) * stride;
269 if (fType == F_DISRES)
271 /* Ensure that distance restraint pairs with the same label
272 * end up on the same thread.
274 while (nr_t > 0 && nr_t < nrToAssignToCpuThreads
275 && iparams[il.iatoms[nr_t]].disres.label
276 == iparams[il.iatoms[nr_t - stride]].disres.label)
282 bt->workDivision.setBound(fType, t, nr_t);
287 /* Add this fType to the list to be distributed */
288 int nat = NRAL(fType);
289 ild[numType].ftype = fType;
290 ild[numType].il = &il;
291 ild[numType].nat = nat;
293 /* The first index for the thread division is always 0 */
294 bt->workDivision.setBound(fType, 0, 0);
302 divide_bondeds_by_locality(bt, numType, ild);
309 fprintf(debug, "Division of bondeds over threads:\n");
310 for (f = 0; f < F_NRE; f++)
312 if (ftype_is_bonded_potential(f) && !idef.il[f].empty())
316 fprintf(debug, "%16s", interaction_function[f].name);
317 for (t = 0; t < numThreads; t++)
321 (bt->workDivision.bound(f, t + 1) - bt->workDivision.bound(f, t))
324 fprintf(debug, "\n");
330 //! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
331 static void calc_bonded_reduction_mask(int natoms,
332 gmx::ThreadForceBuffer<rvec4>* f_thread,
333 const InteractionDefinitions& idef,
335 const bonded_threading_t& bondedThreading)
337 static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS,
338 "For the error message below we assume these two are equal.");
340 if (bondedThreading.nthreads > BITMASK_SIZE)
343 "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS "
344 "(%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger "
345 "value for GMX_OPENMP_MAX_THREADS passed to CMake.",
346 bondedThreading.nthreads,
347 GMX_OPENMP_MAX_THREADS);
349 GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE,
350 "We need at least nthreads bits in the mask");
353 f_thread->resizeBufferAndClearMask(natoms);
355 for (int ftype = 0; ftype < F_NRE; ftype++)
357 if (ftype_is_bonded_potential(ftype))
359 int nb = idef.il[ftype].size();
362 int nat1 = interaction_function[ftype].nratoms + 1;
364 int nb0 = bondedThreading.workDivision.bound(ftype, thread);
365 int nb1 = bondedThreading.workDivision.bound(ftype, thread + 1);
367 for (int i = nb0; i < nb1; i += nat1)
369 for (int a = 1; a < nat1; a++)
371 f_thread->addAtomToMask(idef.il[ftype].iatoms[i + a]);
378 f_thread->processMask();
381 void setup_bonded_threading(bonded_threading_t* bt,
383 bool useGpuForBondeds,
384 const InteractionDefinitions& idef)
386 assert(bt->nthreads >= 1);
388 /* Divide the bonded interaction over the threads */
389 divide_bondeds_over_threads(bt, useGpuForBondeds, idef);
391 if (!bt->haveBondeds)
393 /* We don't have bondeds, so there is nothing to reduce */
397 /* Determine to which blocks each thread's bonded force calculation
398 * contributes. Store this as a mask for each thread.
400 #pragma omp parallel for num_threads(bt->nthreads) schedule(static)
401 for (int t = 0; t < bt->nthreads; t++)
405 calc_bonded_reduction_mask(
406 numAtomsForce, &bt->threadedForceBuffer.threadForceBuffer(t), idef, t, *bt);
408 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
411 bt->threadedForceBuffer.setupReduction();
414 bonded_threading_t::bonded_threading_t(const int numThreads, const int numEnergyGroups, FILE* fplog) :
415 nthreads(numThreads),
416 threadedForceBuffer(numThreads, numEnergyGroups),
418 workDivision(nthreads),
419 foreignLambdaWorkDivision(1)
421 /* Note that we also use threadedForceBuffer when running single-threaded.
422 * This is because the bonded force buffer uses type rvec4, whereas
423 * the normal force buffer is uses type rvec. This leads to a little
424 * reduction overhead, but the speed gain in the bonded calculations
425 * of doing transposeScatterIncr/DecrU with aligment 4 instead of 3
426 * is much larger than the reduction overhead.
429 /* The optimal value after which to switch from uniform to localized
430 * bonded interaction distribution is 3, 4 or 5 depending on the system
433 const int max_nthread_uniform_default = 4;
436 if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
438 sscanf(ptr, "%d", &max_nthread_uniform);
439 if (fplog != nullptr)
442 "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
443 max_nthread_uniform);
448 max_nthread_uniform = max_nthread_uniform_default;