src/gromacs/listed-forces/manage-threading.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  * \brief This file defines functions for managing threading of listed
  39  * interactions.
  40  *
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  * \ingroup module_listed-forces
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "manage-threading.h"
  47
  48 #include <assert.h>
  49 #include <limits.h>
  50 #include <stdlib.h>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
  55 #include "gromacs/listed-forces/listed-forces.h"
  56 #include "gromacs/pbcutil/ishift.h"
  57 #include "gromacs/utility/fatalerror.h"
  58 #include "gromacs/utility/smalloc.h"
  59 #include "gromacs/utility/stringutil.h"
  60
  61 #include "listed-internal.h"
  62
  63 /*! \brief struct for passing all data required for a function type */
  64 typedef struct {
  65     int      ftype; /**< the function type index */
  66     t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
  67     int      nat;   /**< nr of atoms involved in a single ftype interaction */
  68 } ilist_data_t;
  69
  70 /*! \brief Divides listed interactions over threads
  71  *
  72  * This routine attempts to divide all interactions of the ntype bondeds
  73  * types stored in ild over the threads such that each thread has roughly
  74  * equal load and different threads avoid touching the same atoms as much
  75  * as possible.
  76  */
  77 static void divide_bondeds_by_locality(int                 ntype,
  78                                        const ilist_data_t *ild,
  79                                        int                 nthread,
  80                                        t_idef             *idef)
  81 {
  82     int nat_tot, nat_sum;
  83     int ind[F_NRE];    /* index into the ild[].il->iatoms */
  84     int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
  85     int f, t;
  86
  87     assert(ntype <= F_NRE);
  88
  89     nat_tot = 0;
  90     for (f = 0; f < ntype; f++)
  91     {
  92         /* Sum #bondeds*#atoms_per_bond over all bonded types */
  93         nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
  94         /* The start bound for thread 0 is 0 for all interactions */
  95         ind[f]    = 0;
  96         /* Initialize the next atom index array */
  97         assert(ild[f].il->nr > 0);
  98         at_ind[f] = ild[f].il->iatoms[1];
  99     }
 100
 101     nat_sum = 0;
 102     /* Loop over the end bounds of the nthread threads to determine
 103      * which interactions threads 0 to nthread shall calculate.
 104      *
 105      * NOTE: The cost of these combined loops is #interactions*ntype.
 106      * This code is running single threaded (difficult to parallelize
 107      * over threads). So the relative cost of this function increases
 108      * linearly with the number of threads. Since the inner-most loop
 109      * is cheap and this is done only at DD repartitioning, the cost should
 110      * be negligble. At high thread count many other parts of the code
 111      * scale the same way, so it's (currently) not worth improving this.
 112      */
 113     for (t = 1; t <= nthread; t++)
 114     {
 115         int nat_thread;
 116
 117         /* Here we assume that the computational cost is proportional
 118          * to the number of atoms in the interaction. This is a rough
 119          * measure, but roughly correct. Usually there are very few
 120          * interactions anyhow and there are distributed relatively
 121          * uniformly. Proper and RB dihedrals are often distributed
 122          * non-uniformly, but their cost is roughly equal.
 123          */
 124         nat_thread = (nat_tot*t)/nthread;
 125
 126         while (nat_sum < nat_thread)
 127         {
 128             /* To divide bonds based on atom order, we compare
 129              * the index of the first atom in the bonded interaction.
 130              * This works well, since the domain decomposition generates
 131              * bondeds in order of the atoms by looking up interactions
 132              * which are linked to the first atom in each interaction.
 133              * It usually also works well without DD, since than the atoms
 134              * in bonded interactions are usually in increasing order.
 135              * If they are not assigned in increasing order, the balancing
 136              * is still good, but the memory access and reduction cost will
 137              * be higher.
 138              */
 139             int f_min;
 140
 141             /* Find out which of the types has the lowest atom index */
 142             f_min = 0;
 143             for (f = 1; f < ntype; f++)
 144             {
 145                 if (at_ind[f] < at_ind[f_min])
 146                 {
 147                     f_min = f;
 148                 }
 149             }
 150             assert(f_min >= 0 && f_min < ntype);
 151
 152             /* Assign the interaction with the lowest atom index (of type
 153              * index f_min) to thread t-1 by increasing ind.
 154              */
 155             ind[f_min] += ild[f_min].nat + 1;
 156             nat_sum    += ild[f_min].nat;
 157
 158             /* Update the first unassigned atom index for this type */
 159             if (ind[f_min] < ild[f_min].il->nr)
 160             {
 161                 at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
 162             }
 163             else
 164             {
 165                 /* We have assigned all interactions of this type.
 166                  * Setting at_ind to INT_MAX ensures this type will not be
 167                  * chosen in the for loop above during next iterations.
 168                  */
 169                 at_ind[f_min] = INT_MAX;
 170             }
 171         }
 172
 173         /* Store the bonded end boundaries (at index t) for thread t-1 */
 174         for (f = 0; f < ntype; f++)
 175         {
 176             idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
 177         }
 178     }
 179
 180     for (f = 0; f < ntype; f++)
 181     {
 182         assert(ind[f] == ild[f].il->nr);
 183     }
 184 }
 185
 186 //! Divides bonded interactions over threads
 187 static void divide_bondeds_over_threads(t_idef *idef,
 188                                         int     nthread,
 189                                         int     max_nthread_uniform)
 190 {
 191     ilist_data_t ild[F_NRE];
 192     int          ntype;
 193     int          f;
 194
 195     assert(nthread > 0);
 196
 197     idef->nthreads = nthread;
 198
 199     if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
 200     {
 201         idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
 202         snew(idef->il_thread_division, idef->il_thread_division_nalloc);
 203     }
 204
 205     ntype = 0;
 206     for (f = 0; f < F_NRE; f++)
 207     {
 208         if (!ftype_is_bonded_potential(f))
 209         {
 210             continue;
 211         }
 212
 213         if (idef->il[f].nr == 0)
 214         {
 215             /* No interactions, avoid all the integer math below */
 216             int t;
 217             for (t = 0; t <= nthread; t++)
 218             {
 219                 idef->il_thread_division[f*(nthread + 1) + t] = 0;
 220             }
 221         }
 222         else if (nthread <= max_nthread_uniform || f == F_DISRES)
 223         {
 224             /* On up to 4 threads, load balancing the bonded work
 225              * is more important than minimizing the reduction cost.
 226              */
 227             int nat1, t;
 228
 229             /* nat1 = 1 + #atoms(ftype) which is the stride use for iatoms */
 230             nat1 = 1 + NRAL(f);
 231
 232             for (t = 0; t <= nthread; t++)
 233             {
 234                 int nr_t;
 235
 236                 /* Divide equally over the threads */
 237                 nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
 238
 239                 if (f == F_DISRES)
 240                 {
 241                     /* Ensure that distance restraint pairs with the same label
 242                      * end up on the same thread.
 243                      */
 244                     while (nr_t > 0 && nr_t < idef->il[f].nr &&
 245                            idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
 246                            idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
 247                     {
 248                         nr_t += nat1;
 249                     }
 250                 }
 251
 252                 idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
 253             }
 254         }
 255         else
 256         {
 257             /* Add this ftype to the list to be distributed */
 258             int nat;
 259
 260             nat              = NRAL(f);
 261             ild[ntype].ftype = f;
 262             ild[ntype].il    = &idef->il[f];
 263             ild[ntype].nat   = nat;
 264
 265             /* The first index for the thread division is always 0 */
 266             idef->il_thread_division[f*(nthread + 1)] = 0;
 267
 268             ntype++;
 269         }
 270     }
 271
 272     if (ntype > 0)
 273     {
 274         divide_bondeds_by_locality(ntype, ild, nthread, idef);
 275     }
 276
 277     if (debug)
 278     {
 279         int f;
 280
 281         fprintf(debug, "Division of bondeds over threads:\n");
 282         for (f = 0; f < F_NRE; f++)
 283         {
 284             if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
 285             {
 286                 int t;
 287
 288                 fprintf(debug, "%16s", interaction_function[f].name);
 289                 for (t = 0; t < nthread; t++)
 290                 {
 291                     fprintf(debug, " %4d",
 292                             (idef->il_thread_division[f*(nthread + 1) + t + 1] -
 293                              idef->il_thread_division[f*(nthread + 1) + t])/
 294                             (1 + NRAL(f)));
 295                 }
 296                 fprintf(debug, "\n");
 297             }
 298         }
 299     }
 300 }
 301
 302 //! Construct a reduction mask for which interaction was computed on which thread
 303 static void
 304 calc_bonded_reduction_mask(gmx_bitmask_t *mask,
 305                            const t_idef *idef,
 306                            int shift,
 307                            int t, int nt)
 308 {
 309     int           ftype, nb, nat1, nb0, nb1, i, a;
 310     bitmask_clear(mask);
 311
 312     for (ftype = 0; ftype < F_NRE; ftype++)
 313     {
 314         if (ftype_is_bonded_potential(ftype))
 315         {
 316             nb = idef->il[ftype].nr;
 317             if (nb > 0)
 318             {
 319                 nat1 = interaction_function[ftype].nratoms + 1;
 320
 321                 /* Divide this interaction equally over the threads.
 322                  * This is not stored: should match division in calc_bonds.
 323                  */
 324                 nb0 = idef->il_thread_division[ftype*(nt+1)+t];
 325                 nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
 326
 327                 for (i = nb0; i < nb1; i += nat1)
 328                 {
 329                     for (a = 1; a < nat1; a++)
 330                     {
 331                         bitmask_set_bit(mask, idef->il[ftype].iatoms[i+a]>>shift);
 332                     }
 333                 }
 334             }
 335         }
 336     }
 337 }
 338
 339
 340 /*! \brief We divide the force array in a maximum of BITMASK_SIZE (default 32) blocks.
 341  * Minimum force block reduction size is thus 2^6=64.
 342  */
 343 const int maxBlockBits = BITMASK_SIZE;
 344
 345 void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
 346 {
 347     bonded_threading_t *bt;
 348     int                 t;
 349     int                 ctot, c, b;
 350
 351     bt = fr->bonded_threading;
 352
 353     assert(bt->nthreads >= 1);
 354
 355     /* Divide the bonded interaction over the threads */
 356     divide_bondeds_over_threads(idef,
 357                                 bt->nthreads,
 358                                 bt->bonded_max_nthread_uniform);
 359
 360     if (bt->nthreads == 1)
 361     {
 362         bt->red_nblock = 0;
 363
 364         return;
 365     }
 366
 367     bt->red_ashift = 6;
 368     while (fr->natoms_force > (int)(maxBlockBits*(1U<<bt->red_ashift)))
 369     {
 370         bt->red_ashift++;
 371     }
 372     if (debug)
 373     {
 374         fprintf(debug, "bonded force buffer block atom shift %d bits\n",
 375                 bt->red_ashift);
 376     }
 377
 378     /* Determine to which blocks each thread's bonded force calculation
 379      * contributes. Store this is a mask for each thread.
 380      */
 381 #pragma omp parallel for num_threads(bt->nthreads) schedule(static)
 382     for (t = 1; t < bt->nthreads; t++)
 383     {
 384         calc_bonded_reduction_mask(&bt->f_t[t].red_mask,
 385                                    idef, bt->red_ashift, t, bt->nthreads);
 386     }
 387
 388     /* Determine the maximum number of blocks we need to reduce over */
 389     bt->red_nblock = 0;
 390     ctot           = 0;
 391     for (t = 0; t < bt->nthreads; t++)
 392     {
 393         c = 0;
 394         for (b = 0; b < maxBlockBits; b++)
 395         {
 396             if (bitmask_is_set(bt->f_t[t].red_mask, b))
 397             {
 398                 bt->red_nblock = std::max(bt->red_nblock, b+1);
 399                 c++;
 400             }
 401         }
 402         if (debug)
 403         {
 404 #if BITMASK_SIZE <= 64 //move into bitmask when it is C++
 405             std::string flags = gmx::formatString("%x", bt->f_t[t].red_mask);
 406 #else
 407             std::string flags = gmx::formatAndJoin(bt->f_t[t].red_mask,
 408                                                    bt->f_t[t].red_mask+BITMASK_ALEN,
 409                                                    "", gmx::StringFormatter("%x"));
 410 #endif
 411             fprintf(debug, "thread %d flags %s count %d\n",
 412                     t, flags.c_str(), c);
 413         }
 414         ctot += c;
 415     }
 416     if (debug)
 417     {
 418         fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
 419                 bt->red_nblock, 1<<bt->red_ashift);
 420         fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
 421                 ctot*(1<<bt->red_ashift)/(double)fr->natoms_force,
 422                 ctot*(1<<bt->red_ashift)/(double)(fr->natoms_force*bt->nthreads));
 423     }
 424 }
 425
 426 void init_bonded_threading(FILE *fplog, int nenergrp,
 427                            struct bonded_threading_t **bt_ptr)
 428 {
 429     bonded_threading_t *bt;
 430
 431     snew(bt, 1);
 432
 433     /* These thread local data structures are used for bondeds only */
 434     bt->nthreads = gmx_omp_nthreads_get(emntBonded);
 435
 436     if (bt->nthreads > 1)
 437     {
 438         int t;
 439
 440         snew(bt->f_t, bt->nthreads);
 441 #pragma omp parallel for num_threads(bt->nthreads) schedule(static)
 442         for (t = 0; t < bt->nthreads; t++)
 443         {
 444             /* Thread 0 uses the global force and energy arrays */
 445             if (t > 0)
 446             {
 447                 int i;
 448
 449                 bt->f_t[t].f        = NULL;
 450                 bt->f_t[t].f_nalloc = 0;
 451                 snew(bt->f_t[t].fshift, SHIFTS);
 452                 bt->f_t[t].grpp.nener = nenergrp*nenergrp;
 453                 for (i = 0; i < egNR; i++)
 454                 {
 455                     snew(bt->f_t[t].grpp.ener[i], bt->f_t[t].grpp.nener);
 456                 }
 457             }
 458         }
 459
 460         /* The optimal value after which to switch from uniform to localized
 461          * bonded interaction distribution is 3, 4 or 5 depending on the system
 462          * and hardware.
 463          */
 464         const int max_nthread_uniform = 4;
 465         char *    ptr;
 466
 467         if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
 468         {
 469             sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
 470             if (fplog != NULL)
 471             {
 472                 fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
 473                         bt->bonded_max_nthread_uniform);
 474             }
 475         }
 476         else
 477         {
 478             bt->bonded_max_nthread_uniform = max_nthread_uniform;
 479         }
 480     }
 481
 482     *bt_ptr = bt;
 483 }