From fdf3b7d136c5638244bd6eadc175a5731682e180 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Fri, 14 Nov 2014 15:22:05 +0100 Subject: [PATCH] Improve threading of bondeds To reduce the cost of the force reduction, with more than 4 OpenMP threads the bondeds are now distributed over the threads using atom indices instead of uniformly. Moved the bonded thread data initialization to manage-threading.cpp. Change-Id: Ica5d6fcfb6db1d85058ef9825d0d515a205ee257 --- src/gromacs/legacyheaders/types/forcerec.h | 3 + .../listed-forces/manage-threading.cpp | 318 +++++++++++++++--- src/gromacs/listed-forces/manage-threading.h | 5 +- src/gromacs/mdlib/forcerec.cpp | 29 +- 4 files changed, 288 insertions(+), 67 deletions(-) diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h index f2bd6f0180..33c44a6442 100644 --- a/src/gromacs/legacyheaders/types/forcerec.h +++ b/src/gromacs/legacyheaders/types/forcerec.h @@ -475,6 +475,9 @@ typedef struct { int red_nblock; f_thread_t *f_t; + /* Maximum thread count for uniform distribution of bondeds over threads */ + int bonded_max_nthread_uniform; + /* Exclusion load distribution over the threads */ int *excl_load; } t_forcerec; diff --git a/src/gromacs/listed-forces/manage-threading.cpp b/src/gromacs/listed-forces/manage-threading.cpp index 01bed0bffa..411be9a682 100644 --- a/src/gromacs/listed-forces/manage-threading.cpp +++ b/src/gromacs/listed-forces/manage-threading.cpp @@ -46,66 +46,252 @@ #include "manage-threading.h" #include +#include #include +#include "gromacs/legacyheaders/gmx_omp_nthreads.h" #include "gromacs/listed-forces/listed-forces.h" #include "gromacs/mdlib/forcerec-threading.h" +#include "gromacs/pbcutil/ishift.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/stringutil.h" -//! Divides listed interactions over threads -static void divide_bondeds_over_threads(t_idef *idef, int nthreads) +/*! \brief struct for passing all data required for a function type */ +typedef struct { + int ftype; /**< the function type index */ + t_ilist *il; /**< pointer to t_ilist entry corresponding to ftype */ + int nat; /**< nr of atoms involved in a single ftype interaction */ +} ilist_data_t; + +/*! \brief Divides listed interactions over threads + * + * This routine attempts to divide all interactions of the ntype bondeds + * types stored in ild over the threads such that each thread has roughly + * equal load and different threads avoid touching the same atoms as much + * as possible. + */ +static void divide_bondeds_by_locality(int ntype, + const ilist_data_t *ild, + int nthread, + t_idef *idef) { - int ftype; - int nat1; - int t; - int il_nr_thread; + int nat_tot, nat_sum; + int ind[F_NRE]; /* index into the ild[].il->iatoms */ + int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */ + int f, t; - idef->nthreads = nthreads; + assert(ntype <= F_NRE); - if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc) + nat_tot = 0; + for (f = 0; f < ntype; f++) { - idef->il_thread_division_nalloc = F_NRE*(nthreads+1); - snew(idef->il_thread_division, idef->il_thread_division_nalloc); + /* Sum #bondeds*#atoms_per_bond over all bonded types */ + nat_tot += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat; + /* The start bound for thread 0 is 0 for all interactions */ + ind[f] = 0; + /* Initialize the next atom index array */ + assert(ild[f].il->nr > 0); + at_ind[f] = ild[f].il->iatoms[1]; } - for (ftype = 0; ftype < F_NRE; ftype++) + nat_sum = 0; + /* Loop over the end bounds of the nthread threads to determine + * which interactions threads 0 to nthread shall calculate. + * + * NOTE: The cost of these combined loops is #interactions*ntype. + * This code is running single threaded (difficult to parallelize + * over threads). So the relative cost of this function increases + * linearly with the number of threads. Since the inner-most loop + * is cheap and this is done only at DD repartitioning, the cost should + * be negligble. At high thread count many other parts of the code + * scale the same way, so it's (currently) not worth improving this. + */ + for (t = 1; t <= nthread; t++) { - if (ftype_is_bonded_potential(ftype)) + int nat_thread; + + /* Here we assume that the computational cost is proportional + * to the number of atoms in the interaction. This is a rough + * measure, but roughly correct. Usually there are very few + * interactions anyhow and there are distributed relatively + * uniformly. Proper and RB dihedrals are often distributed + * non-uniformly, but their cost is roughly equal. + */ + nat_thread = (nat_tot*t)/nthread; + + while (nat_sum < nat_thread) { - nat1 = interaction_function[ftype].nratoms + 1; + /* To divide bonds based on atom order, we compare + * the index of the first atom in the bonded interaction. + * This works well, since the domain decomposition generates + * bondeds in order of the atoms by looking up interactions + * which are linked to the first atom in each interaction. + * It usually also works well without DD, since than the atoms + * in bonded interactions are usually in increasing order. + * If they are not assigned in increasing order, the balancing + * is still good, but the memory access and reduction cost will + * be higher. + */ + int f_min; - for (t = 0; t <= nthreads; t++) + /* Find out which of the types has the lowest atom index */ + f_min = 0; + for (f = 1; f < ntype; f++) { - /* Divide the interactions equally over the threads. - * When the different types of bonded interactions - * are distributed roughly equally over the threads, - * this should lead to well localized output into - * the force buffer on each thread. - * If this is not the case, a more advanced scheme - * (not implemented yet) will do better. - */ - il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1; - - /* Ensure that distance restraint pairs with the same label - * end up on the same thread. - * This is slighlty tricky code, since the next for iteration - * may have an initial il_nr_thread lower than the final value - * in the previous iteration, but this will anyhow be increased - * to the approriate value again by this while loop. + if (at_ind[f] < at_ind[f_min]) + { + f_min = f; + } + } + assert(f_min >= 0 && f_min < ntype); + + /* Assign the interaction with the lowest atom index (of type + * index f_min) to thread t-1 by increasing ind. + */ + ind[f_min] += ild[f_min].nat + 1; + nat_sum += ild[f_min].nat; + + /* Update the first unassigned atom index for this type */ + if (ind[f_min] < ild[f_min].il->nr) + { + at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1]; + } + else + { + /* We have assigned all interactions of this type. + * Setting at_ind to INT_MAX ensures this type will not be + * chosen in the for loop above during next iterations. */ - while (ftype == F_DISRES && - il_nr_thread > 0 && - il_nr_thread < idef->il[ftype].nr && - idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label == - idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label) + at_ind[f_min] = INT_MAX; + } + } + + /* Store the bonded end boundaries (at index t) for thread t-1 */ + for (f = 0; f < ntype; f++) + { + idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f]; + } + } + + for (f = 0; f < ntype; f++) + { + assert(ind[f] == ild[f].il->nr); + } +} + +//! Divides bonded interactions over threads +static void divide_bondeds_over_threads(t_idef *idef, + int nthread, + int max_nthread_uniform) +{ + ilist_data_t ild[F_NRE]; + int ntype; + int f; + + assert(nthread > 0); + + idef->nthreads = nthread; + + if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc) + { + idef->il_thread_division_nalloc = F_NRE*(nthread + 1); + snew(idef->il_thread_division, idef->il_thread_division_nalloc); + } + + ntype = 0; + for (f = 0; f < F_NRE; f++) + { + if (!ftype_is_bonded_potential(f)) + { + continue; + } + + if (idef->il[f].nr == 0) + { + /* No interactions, avoid all the integer math below */ + int t; + for (t = 0; t <= nthread; t++) + { + idef->il_thread_division[f*(nthread + 1) + t] = 0; + } + } + else if (nthread <= max_nthread_uniform || f == F_DISRES) + { + /* On up to 4 threads, load balancing the bonded work + * is more important than minimizing the reduction cost. + */ + int nat1, t; + + /* nat1 = 1 + #atoms(ftype) which is the stride use for iatoms */ + nat1 = 1 + NRAL(f); + + for (t = 0; t <= nthread; t++) + { + int nr_t; + + /* Divide equally over the threads */ + nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1; + + if (f == F_DISRES) { - il_nr_thread += nat1; + /* Ensure that distance restraint pairs with the same label + * end up on the same thread. + */ + while (nr_t > 0 && nr_t < idef->il[f].nr && + idef->iparams[idef->il[f].iatoms[nr_t]].disres.label == + idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label) + { + nr_t += nat1; + } } - idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread; + idef->il_thread_division[f*(nthread + 1) + t] = nr_t; + } + } + else + { + /* Add this ftype to the list to be distributed */ + int nat; + + nat = NRAL(f); + ild[ntype].ftype = f; + ild[ntype].il = &idef->il[f]; + ild[ntype].nat = nat; + + /* The first index for the thread division is always 0 */ + idef->il_thread_division[f*(nthread + 1)] = 0; + + ntype++; + } + } + + if (ntype > 0) + { + divide_bondeds_by_locality(ntype, ild, nthread, idef); + } + + if (debug) + { + int f; + + fprintf(debug, "Division of bondeds over threads:\n"); + for (f = 0; f < F_NRE; f++) + { + if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0) + { + int t; + + fprintf(debug, "%16s", interaction_function[f].name); + for (t = 0; t < nthread; t++) + { + fprintf(debug, " %4d", + (idef->il_thread_division[f*(nthread + 1) + t + 1] - + idef->il_thread_division[f*(nthread + 1) + t])/ + (1 + NRAL(f))); + } + fprintf(debug, "\n"); } } } @@ -154,7 +340,7 @@ calc_bonded_reduction_mask(gmx_bitmask_t *mask, */ const int maxBlockBits = BITMASK_SIZE; -void setup_bonded_threading(t_forcerec *fr, t_idef *idef) +void setup_bonded_threading(t_forcerec *fr, t_idef *idef) { int t; int ctot, c, b; @@ -162,7 +348,9 @@ void setup_bonded_threading(t_forcerec *fr, t_idef *idef) assert(fr->nthreads >= 1); /* Divide the bonded interaction over the threads */ - divide_bondeds_over_threads(idef, fr->nthreads); + divide_bondeds_over_threads(idef, + fr->nthreads, + fr->bonded_max_nthread_uniform); if (fr->nthreads == 1) { @@ -229,3 +417,55 @@ void setup_bonded_threading(t_forcerec *fr, t_idef *idef) ctot*(1<red_ashift)/(double)(fr->natoms_force*fr->nthreads)); } } + +void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp) +{ + /* These thread local data structures are used for bondeds only */ + fr->nthreads = gmx_omp_nthreads_get(emntBonded); + + if (fr->nthreads > 1) + { + int t; + + snew(fr->f_t, fr->nthreads); +#pragma omp parallel for num_threads(fr->nthreads) schedule(static) + for (t = 0; t < fr->nthreads; t++) + { + /* Thread 0 uses the global force and energy arrays */ + if (t > 0) + { + int i; + + fr->f_t[t].f = NULL; + fr->f_t[t].f_nalloc = 0; + snew(fr->f_t[t].fshift, SHIFTS); + fr->f_t[t].grpp.nener = nenergrp*nenergrp; + for (i = 0; i < egNR; i++) + { + snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener); + } + } + } + + /* The optimal value after which to switch from uniform to localized + * bonded interaction distribution is 3, 4 or 5 depending on the system + * and hardware. + */ + const int max_nthread_uniform = 4; + char * ptr; + + if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL) + { + sscanf(ptr, "%d", &fr->bonded_max_nthread_uniform); + if (fplog != NULL) + { + fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n", + fr->bonded_max_nthread_uniform); + } + } + else + { + fr->bonded_max_nthread_uniform = max_nthread_uniform; + } + } +} diff --git a/src/gromacs/listed-forces/manage-threading.h b/src/gromacs/listed-forces/manage-threading.h index 1cf0844ce2..2096ec63ae 100644 --- a/src/gromacs/listed-forces/manage-threading.h +++ b/src/gromacs/listed-forces/manage-threading.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -60,6 +60,9 @@ extern "C" { */ void setup_bonded_threading(t_forcerec *fr, t_idef *idef); +/*! \brief Initialize the bonded threading data structures */ +void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp); + #ifdef __cplusplus } #endif diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index a34f2dd83c..193894b7cf 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -64,6 +64,7 @@ #include "gromacs/legacyheaders/typedefs.h" #include "gromacs/legacyheaders/types/commrec.h" #include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h" +#include "gromacs/listed-forces/manage-threading.h" #include "gromacs/math/calculate-ewald-splitting-coefficient.h" #include "gromacs/math/units.h" #include "gromacs/math/utilities.h" @@ -1537,32 +1538,6 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec * } -static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp) -{ - int t, i; - - /* These thread local data structures are used for bondeds only */ - fr->nthreads = gmx_omp_nthreads_get(emntBonded); - - if (fr->nthreads > 1) - { - snew(fr->f_t, fr->nthreads); - /* Thread 0 uses the global force and energy arrays */ - for (t = 1; t < fr->nthreads; t++) - { - fr->f_t[t].f = NULL; - fr->f_t[t].f_nalloc = 0; - snew(fr->f_t[t].fshift, SHIFTS); - fr->f_t[t].grpp.nener = nenergrp*nenergrp; - for (i = 0; i < egNR; i++) - { - snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener); - } - } - } -} - - gmx_bool nbnxn_acceleration_supported(FILE *fplog, const t_commrec *cr, const t_inputrec *ir, @@ -3228,7 +3203,7 @@ void init_forcerec(FILE *fp, } /* Initialize the thread working data for bonded interactions */ - init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr); + init_bonded_threading(fp, fr, mtop->groups.grps[egcENER].nr); snew(fr->excl_load, fr->nthreads+1); -- 2.22.0