src/gromacs/listed-forces/manage-threading.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  * \brief This file defines functions for managing threading of listed
  39  * interactions.
  40  *
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  * \ingroup module_listed-forces
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "manage-threading.h"
  47
  48 #include <assert.h>
  49 #include <limits.h>
  50 #include <stdlib.h>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
  55 #include "gromacs/listed-forces/listed-forces.h"
  56 #include "gromacs/mdlib/forcerec-threading.h"
  57 #include "gromacs/pbcutil/ishift.h"
  58 #include "gromacs/utility/fatalerror.h"
  59 #include "gromacs/utility/smalloc.h"
  60 #include "gromacs/utility/stringutil.h"
  61
  62 /*! \brief struct for passing all data required for a function type */
  63 typedef struct {
  64     int      ftype; /**< the function type index */
  65     t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
  66     int      nat;   /**< nr of atoms involved in a single ftype interaction */
  67 } ilist_data_t;
  68
  69 /*! \brief Divides listed interactions over threads
  70  *
  71  * This routine attempts to divide all interactions of the ntype bondeds
  72  * types stored in ild over the threads such that each thread has roughly
  73  * equal load and different threads avoid touching the same atoms as much
  74  * as possible.
  75  */
  76 static void divide_bondeds_by_locality(int                 ntype,
  77                                        const ilist_data_t *ild,
  78                                        int                 nthread,
  79                                        t_idef             *idef)
  80 {
  81     int nat_tot, nat_sum;
  82     int ind[F_NRE];    /* index into the ild[].il->iatoms */
  83     int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
  84     int f, t;
  85
  86     assert(ntype <= F_NRE);
  87
  88     nat_tot = 0;
  89     for (f = 0; f < ntype; f++)
  90     {
  91         /* Sum #bondeds*#atoms_per_bond over all bonded types */
  92         nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
  93         /* The start bound for thread 0 is 0 for all interactions */
  94         ind[f]    = 0;
  95         /* Initialize the next atom index array */
  96         assert(ild[f].il->nr > 0);
  97         at_ind[f] = ild[f].il->iatoms[1];
  98     }
  99
 100     nat_sum = 0;
 101     /* Loop over the end bounds of the nthread threads to determine
 102      * which interactions threads 0 to nthread shall calculate.
 103      *
 104      * NOTE: The cost of these combined loops is #interactions*ntype.
 105      * This code is running single threaded (difficult to parallelize
 106      * over threads). So the relative cost of this function increases
 107      * linearly with the number of threads. Since the inner-most loop
 108      * is cheap and this is done only at DD repartitioning, the cost should
 109      * be negligble. At high thread count many other parts of the code
 110      * scale the same way, so it's (currently) not worth improving this.
 111      */
 112     for (t = 1; t <= nthread; t++)
 113     {
 114         int nat_thread;
 115
 116         /* Here we assume that the computational cost is proportional
 117          * to the number of atoms in the interaction. This is a rough
 118          * measure, but roughly correct. Usually there are very few
 119          * interactions anyhow and there are distributed relatively
 120          * uniformly. Proper and RB dihedrals are often distributed
 121          * non-uniformly, but their cost is roughly equal.
 122          */
 123         nat_thread = (nat_tot*t)/nthread;
 124
 125         while (nat_sum < nat_thread)
 126         {
 127             /* To divide bonds based on atom order, we compare
 128              * the index of the first atom in the bonded interaction.
 129              * This works well, since the domain decomposition generates
 130              * bondeds in order of the atoms by looking up interactions
 131              * which are linked to the first atom in each interaction.
 132              * It usually also works well without DD, since than the atoms
 133              * in bonded interactions are usually in increasing order.
 134              * If they are not assigned in increasing order, the balancing
 135              * is still good, but the memory access and reduction cost will
 136              * be higher.
 137              */
 138             int f_min;
 139
 140             /* Find out which of the types has the lowest atom index */
 141             f_min = 0;
 142             for (f = 1; f < ntype; f++)
 143             {
 144                 if (at_ind[f] < at_ind[f_min])
 145                 {
 146                     f_min = f;
 147                 }
 148             }
 149             assert(f_min >= 0 && f_min < ntype);
 150
 151             /* Assign the interaction with the lowest atom index (of type
 152              * index f_min) to thread t-1 by increasing ind.
 153              */
 154             ind[f_min] += ild[f_min].nat + 1;
 155             nat_sum    += ild[f_min].nat;
 156
 157             /* Update the first unassigned atom index for this type */
 158             if (ind[f_min] < ild[f_min].il->nr)
 159             {
 160                 at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
 161             }
 162             else
 163             {
 164                 /* We have assigned all interactions of this type.
 165                  * Setting at_ind to INT_MAX ensures this type will not be
 166                  * chosen in the for loop above during next iterations.
 167                  */
 168                 at_ind[f_min] = INT_MAX;
 169             }
 170         }
 171
 172         /* Store the bonded end boundaries (at index t) for thread t-1 */
 173         for (f = 0; f < ntype; f++)
 174         {
 175             idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
 176         }
 177     }
 178
 179     for (f = 0; f < ntype; f++)
 180     {
 181         assert(ind[f] == ild[f].il->nr);
 182     }
 183 }
 184
 185 //! Divides bonded interactions over threads
 186 static void divide_bondeds_over_threads(t_idef *idef,
 187                                         int     nthread,
 188                                         int     max_nthread_uniform)
 189 {
 190     ilist_data_t ild[F_NRE];
 191     int          ntype;
 192     int          f;
 193
 194     assert(nthread > 0);
 195
 196     idef->nthreads = nthread;
 197
 198     if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
 199     {
 200         idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
 201         snew(idef->il_thread_division, idef->il_thread_division_nalloc);
 202     }
 203
 204     ntype = 0;
 205     for (f = 0; f < F_NRE; f++)
 206     {
 207         if (!ftype_is_bonded_potential(f))
 208         {
 209             continue;
 210         }
 211
 212         if (idef->il[f].nr == 0)
 213         {
 214             /* No interactions, avoid all the integer math below */
 215             int t;
 216             for (t = 0; t <= nthread; t++)
 217             {
 218                 idef->il_thread_division[f*(nthread + 1) + t] = 0;
 219             }
 220         }
 221         else if (nthread <= max_nthread_uniform || f == F_DISRES)
 222         {
 223             /* On up to 4 threads, load balancing the bonded work
 224              * is more important than minimizing the reduction cost.
 225              */
 226             int nat1, t;
 227
 228             /* nat1 = 1 + #atoms(ftype) which is the stride use for iatoms */
 229             nat1 = 1 + NRAL(f);
 230
 231             for (t = 0; t <= nthread; t++)
 232             {
 233                 int nr_t;
 234
 235                 /* Divide equally over the threads */
 236                 nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
 237
 238                 if (f == F_DISRES)
 239                 {
 240                     /* Ensure that distance restraint pairs with the same label
 241                      * end up on the same thread.
 242                      */
 243                     while (nr_t > 0 && nr_t < idef->il[f].nr &&
 244                            idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
 245                            idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
 246                     {
 247                         nr_t += nat1;
 248                     }
 249                 }
 250
 251                 idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
 252             }
 253         }
 254         else
 255         {
 256             /* Add this ftype to the list to be distributed */
 257             int nat;
 258
 259             nat              = NRAL(f);
 260             ild[ntype].ftype = f;
 261             ild[ntype].il    = &idef->il[f];
 262             ild[ntype].nat   = nat;
 263
 264             /* The first index for the thread division is always 0 */
 265             idef->il_thread_division[f*(nthread + 1)] = 0;
 266
 267             ntype++;
 268         }
 269     }
 270
 271     if (ntype > 0)
 272     {
 273         divide_bondeds_by_locality(ntype, ild, nthread, idef);
 274     }
 275
 276     if (debug)
 277     {
 278         int f;
 279
 280         fprintf(debug, "Division of bondeds over threads:\n");
 281         for (f = 0; f < F_NRE; f++)
 282         {
 283             if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
 284             {
 285                 int t;
 286
 287                 fprintf(debug, "%16s", interaction_function[f].name);
 288                 for (t = 0; t < nthread; t++)
 289                 {
 290                     fprintf(debug, " %4d",
 291                             (idef->il_thread_division[f*(nthread + 1) + t + 1] -
 292                              idef->il_thread_division[f*(nthread + 1) + t])/
 293                             (1 + NRAL(f)));
 294                 }
 295                 fprintf(debug, "\n");
 296             }
 297         }
 298     }
 299 }
 300
 301 //! Construct a reduction mask for which interaction was computed on which thread
 302 static void
 303 calc_bonded_reduction_mask(gmx_bitmask_t *mask,
 304                            const t_idef *idef,
 305                            int shift,
 306                            int t, int nt)
 307 {
 308     int           ftype, nb, nat1, nb0, nb1, i, a;
 309     bitmask_clear(mask);
 310
 311     for (ftype = 0; ftype < F_NRE; ftype++)
 312     {
 313         if (ftype_is_bonded_potential(ftype))
 314         {
 315             nb = idef->il[ftype].nr;
 316             if (nb > 0)
 317             {
 318                 nat1 = interaction_function[ftype].nratoms + 1;
 319
 320                 /* Divide this interaction equally over the threads.
 321                  * This is not stored: should match division in calc_bonds.
 322                  */
 323                 nb0 = idef->il_thread_division[ftype*(nt+1)+t];
 324                 nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
 325
 326                 for (i = nb0; i < nb1; i += nat1)
 327                 {
 328                     for (a = 1; a < nat1; a++)
 329                     {
 330                         bitmask_set_bit(mask, idef->il[ftype].iatoms[i+a]>>shift);
 331                     }
 332                 }
 333             }
 334         }
 335     }
 336 }
 337
 338
 339 /*! \brief We divide the force array in a maximum of BITMASK_SIZE (default 32) blocks.
 340  * Minimum force block reduction size is thus 2^6=64.
 341  */
 342 const int maxBlockBits = BITMASK_SIZE;
 343
 344 void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
 345 {
 346     int t;
 347     int ctot, c, b;
 348
 349     assert(fr->nthreads >= 1);
 350
 351     /* Divide the bonded interaction over the threads */
 352     divide_bondeds_over_threads(idef,
 353                                 fr->nthreads,
 354                                 fr->bonded_max_nthread_uniform);
 355
 356     if (fr->nthreads == 1)
 357     {
 358         fr->red_nblock = 0;
 359
 360         return;
 361     }
 362
 363     fr->red_ashift = 6;
 364     while (fr->natoms_force > (int)(maxBlockBits*(1U<<fr->red_ashift)))
 365     {
 366         fr->red_ashift++;
 367     }
 368     if (debug)
 369     {
 370         fprintf(debug, "bonded force buffer block atom shift %d bits\n",
 371                 fr->red_ashift);
 372     }
 373
 374     /* Determine to which blocks each thread's bonded force calculation
 375      * contributes. Store this is a mask for each thread.
 376      */
 377 #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
 378     for (t = 1; t < fr->nthreads; t++)
 379     {
 380         calc_bonded_reduction_mask(&fr->f_t[t].red_mask,
 381                                    idef, fr->red_ashift, t, fr->nthreads);
 382     }
 383
 384     /* Determine the maximum number of blocks we need to reduce over */
 385     fr->red_nblock = 0;
 386     ctot           = 0;
 387     for (t = 0; t < fr->nthreads; t++)
 388     {
 389         c = 0;
 390         for (b = 0; b < maxBlockBits; b++)
 391         {
 392             if (bitmask_is_set(fr->f_t[t].red_mask, b))
 393             {
 394                 fr->red_nblock = std::max(fr->red_nblock, b+1);
 395                 c++;
 396             }
 397         }
 398         if (debug)
 399         {
 400 #if BITMASK_SIZE <= 64 //move into bitmask when it is C++
 401             std::string flags = gmx::formatString("%x", fr->f_t[t].red_mask);
 402 #else
 403             std::string flags = gmx::formatAndJoin(fr->f_t[t].red_mask,
 404                                                    fr->f_t[t].red_mask+BITMASK_ALEN,
 405                                                    "", gmx::StringFormatter("%x"));
 406 #endif
 407             fprintf(debug, "thread %d flags %s count %d\n",
 408                     t, flags.c_str(), c);
 409         }
 410         ctot += c;
 411     }
 412     if (debug)
 413     {
 414         fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
 415                 fr->red_nblock, 1<<fr->red_ashift);
 416         fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
 417                 ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
 418                 ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
 419     }
 420 }
 421
 422 void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp)
 423 {
 424     /* These thread local data structures are used for bondeds only */
 425     fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 426
 427     if (fr->nthreads > 1)
 428     {
 429         int t;
 430
 431         snew(fr->f_t, fr->nthreads);
 432 #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
 433         for (t = 0; t < fr->nthreads; t++)
 434         {
 435             /* Thread 0 uses the global force and energy arrays */
 436             if (t > 0)
 437             {
 438                 int i;
 439
 440                 fr->f_t[t].f        = NULL;
 441                 fr->f_t[t].f_nalloc = 0;
 442                 snew(fr->f_t[t].fshift, SHIFTS);
 443                 fr->f_t[t].grpp.nener = nenergrp*nenergrp;
 444                 for (i = 0; i < egNR; i++)
 445                 {
 446                     snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
 447                 }
 448             }
 449         }
 450
 451         /* The optimal value after which to switch from uniform to localized
 452          * bonded interaction distribution is 3, 4 or 5 depending on the system
 453          * and hardware.
 454          */
 455         const int max_nthread_uniform = 4;
 456         char *    ptr;
 457
 458         if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
 459         {
 460             sscanf(ptr, "%d", &fr->bonded_max_nthread_uniform);
 461             if (fplog != NULL)
 462             {
 463                 fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
 464                         fr->bonded_max_nthread_uniform);
 465             }
 466         }
 467         else
 468         {
 469             fr->bonded_max_nthread_uniform = max_nthread_uniform;
 470         }
 471     }
 472 }