src/gromacs/listed_forces/manage_threading.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 /*! \internal \file
  39  * \brief This file defines functions for managing threading of listed
  40  * interactions.
  41  *
  42  * \author Mark Abraham <mark.j.abraham@gmail.com>
  43  * \author Berk Hess <hess@kth.se>
  44  * \ingroup module_listed_forces
  45  */
  46 #include "gmxpre.h"
  47
  48 #include "manage_threading.h"
  49
  50 #include "config.h"
  51
  52 #include <cassert>
  53 #include <cinttypes>
  54 #include <climits>
  55 #include <cstdlib>
  56
  57 #include <algorithm>
  58 #include <string>
  59
  60 #include "gromacs/listed_forces/gpubonded.h"
  61 #include "gromacs/pbcutil/ishift.h"
  62 #include "gromacs/topology/ifunc.h"
  63 #include "gromacs/utility/exceptions.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/gmxassert.h"
  66
  67 #include "listed_internal.h"
  68 #include "utilities.h"
  69
  70 /*! \brief struct for passing all data required for a function type */
  71 typedef struct
  72 {
  73     const InteractionList* il;    /**< pointer to t_ilist entry corresponding to ftype */
  74     int                    ftype; /**< the function type index */
  75     int                    nat;   /**< nr of atoms involved in a single ftype interaction */
  76 } ilist_data_t;
  77
  78 /*! \brief Divides listed interactions over threads
  79  *
  80  * This routine attempts to divide all interactions of the numType bondeds
  81  * types stored in ild over the threads such that each thread has roughly
  82  * equal load and different threads avoid touching the same atoms as much
  83  * as possible.
  84  */
  85 static void divide_bondeds_by_locality(bonded_threading_t* bt, int numType, const ilist_data_t* ild)
  86 {
  87     int nat_tot, nat_sum;
  88     int ind[F_NRE];    /* index into the ild[].il->iatoms */
  89     int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
  90     int f, t;
  91
  92     assert(numType <= F_NRE);
  93
  94     nat_tot = 0;
  95     for (f = 0; f < numType; f++)
  96     {
  97         /* Sum #bondeds*#atoms_per_bond over all bonded types */
  98         nat_tot += ild[f].il->size() / (ild[f].nat + 1) * ild[f].nat;
  99         /* The start bound for thread 0 is 0 for all interactions */
 100         ind[f] = 0;
 101         /* Initialize the next atom index array */
 102         assert(!ild[f].il->empty());
 103         at_ind[f] = ild[f].il->iatoms[1];
 104     }
 105
 106     nat_sum = 0;
 107     /* Loop over the end bounds of the nthreads threads to determine
 108      * which interactions threads 0 to nthreads shall calculate.
 109      *
 110      * NOTE: The cost of these combined loops is #interactions*numType.
 111      * This code is running single threaded (difficult to parallelize
 112      * over threads). So the relative cost of this function increases
 113      * linearly with the number of threads. Since the inner-most loop
 114      * is cheap and this is done only at DD repartitioning, the cost should
 115      * be negligble. At high thread count many other parts of the code
 116      * scale the same way, so it's (currently) not worth improving this.
 117      */
 118     for (t = 1; t <= bt->nthreads; t++)
 119     {
 120         int nat_thread;
 121
 122         /* Here we assume that the computational cost is proportional
 123          * to the number of atoms in the interaction. This is a rough
 124          * measure, but roughly correct. Usually there are very few
 125          * interactions anyhow and there are distributed relatively
 126          * uniformly. Proper and RB dihedrals are often distributed
 127          * non-uniformly, but their cost is roughly equal.
 128          */
 129         nat_thread = (nat_tot * t) / bt->nthreads;
 130
 131         while (nat_sum < nat_thread)
 132         {
 133             /* To divide bonds based on atom order, we compare
 134              * the index of the first atom in the bonded interaction.
 135              * This works well, since the domain decomposition generates
 136              * bondeds in order of the atoms by looking up interactions
 137              * which are linked to the first atom in each interaction.
 138              * It usually also works well without DD, since than the atoms
 139              * in bonded interactions are usually in increasing order.
 140              * If they are not assigned in increasing order, the balancing
 141              * is still good, but the memory access and reduction cost will
 142              * be higher.
 143              */
 144             int f_min;
 145
 146             /* Find out which of the types has the lowest atom index */
 147             f_min = 0;
 148             for (f = 1; f < numType; f++)
 149             {
 150                 if (at_ind[f] < at_ind[f_min])
 151                 {
 152                     f_min = f;
 153                 }
 154             }
 155             assert(f_min >= 0 && f_min < numType);
 156
 157             /* Assign the interaction with the lowest atom index (of type
 158              * index f_min) to thread t-1 by increasing ind.
 159              */
 160             ind[f_min] += ild[f_min].nat + 1;
 161             nat_sum += ild[f_min].nat;
 162
 163             /* Update the first unassigned atom index for this type */
 164             if (ind[f_min] < ild[f_min].il->size())
 165             {
 166                 at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
 167             }
 168             else
 169             {
 170                 /* We have assigned all interactions of this type.
 171                  * Setting at_ind to INT_MAX ensures this type will not be
 172                  * chosen in the for loop above during next iterations.
 173                  */
 174                 at_ind[f_min] = INT_MAX;
 175             }
 176         }
 177
 178         /* Store the bonded end boundaries (at index t) for thread t-1 */
 179         for (f = 0; f < numType; f++)
 180         {
 181             bt->workDivision.setBound(ild[f].ftype, t, ind[f]);
 182         }
 183     }
 184
 185     for (f = 0; f < numType; f++)
 186     {
 187         assert(ind[f] == ild[f].il->size());
 188     }
 189 }
 190
 191 //! Return whether function type \p ftype in \p idef has perturbed interactions
 192 static bool ftypeHasPerturbedEntries(const InteractionDefinitions& idef, int ftype)
 193 {
 194     GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
 195                "Perturbed interations should be sorted here");
 196
 197     const InteractionList& ilist = idef.il[ftype];
 198
 199     return (idef.ilsort != ilsortNO_FE && idef.numNonperturbedInteractions[ftype] != ilist.size());
 200 }
 201
 202 //! Divides bonded interactions over threads and GPU
 203 static void divide_bondeds_over_threads(bonded_threading_t*           bt,
 204                                         bool                          useGpuForBondeds,
 205                                         const InteractionDefinitions& idef)
 206 {
 207     ilist_data_t ild[F_NRE];
 208
 209     GMX_ASSERT(bt->nthreads > 0, "Must have positive number of threads");
 210     const int numThreads = bt->nthreads;
 211
 212     gmx::ArrayRef<const t_iparams> iparams = idef.iparams;
 213
 214     bt->haveBondeds      = false;
 215     int    numType       = 0;
 216     size_t fTypeGpuIndex = 0;
 217     for (int fType = 0; fType < F_NRE; fType++)
 218     {
 219         if (!ftype_is_bonded_potential(fType))
 220         {
 221             continue;
 222         }
 223
 224         const InteractionList& il                     = idef.il[fType];
 225         int                    nrToAssignToCpuThreads = il.size();
 226
 227         if (useGpuForBondeds && fTypeGpuIndex < gmx::fTypesOnGpu.size()
 228             && gmx::fTypesOnGpu[fTypeGpuIndex] == fType)
 229         {
 230             fTypeGpuIndex++;
 231
 232             /* Perturbation is not implemented in the GPU bonded kernels.
 233              * But instead of doing all on the CPU, we could do only
 234              * the actually perturbed interactions on the CPU.
 235              */
 236             if (!ftypeHasPerturbedEntries(idef, fType))
 237             {
 238                 /* We will assign this interaction type to the GPU */
 239                 nrToAssignToCpuThreads = 0;
 240             }
 241         }
 242
 243         if (nrToAssignToCpuThreads > 0)
 244         {
 245             bt->haveBondeds = true;
 246         }
 247
 248         if (nrToAssignToCpuThreads == 0)
 249         {
 250             /* No interactions, avoid all the integer math below */
 251             for (int t = 0; t <= numThreads; t++)
 252             {
 253                 bt->workDivision.setBound(fType, t, 0);
 254             }
 255         }
 256         else if (numThreads <= bt->max_nthread_uniform || fType == F_DISRES)
 257         {
 258             /* On up to 4 threads, load balancing the bonded work
 259              * is more important than minimizing the reduction cost.
 260              */
 261
 262             const int stride = 1 + NRAL(fType);
 263
 264             for (int t = 0; t <= numThreads; t++)
 265             {
 266                 /* Divide equally over the threads */
 267                 int nr_t = (((nrToAssignToCpuThreads / stride) * t) / numThreads) * stride;
 268
 269                 if (fType == F_DISRES)
 270                 {
 271                     /* Ensure that distance restraint pairs with the same label
 272                      * end up on the same thread.
 273                      */
 274                     while (nr_t > 0 && nr_t < nrToAssignToCpuThreads
 275                            && iparams[il.iatoms[nr_t]].disres.label
 276                                       == iparams[il.iatoms[nr_t - stride]].disres.label)
 277                     {
 278                         nr_t += stride;
 279                     }
 280                 }
 281
 282                 bt->workDivision.setBound(fType, t, nr_t);
 283             }
 284         }
 285         else
 286         {
 287             /* Add this fType to the list to be distributed */
 288             int nat            = NRAL(fType);
 289             ild[numType].ftype = fType;
 290             ild[numType].il    = &il;
 291             ild[numType].nat   = nat;
 292
 293             /* The first index for the thread division is always 0 */
 294             bt->workDivision.setBound(fType, 0, 0);
 295
 296             numType++;
 297         }
 298     }
 299
 300     if (numType > 0)
 301     {
 302         divide_bondeds_by_locality(bt, numType, ild);
 303     }
 304
 305     if (debug)
 306     {
 307         int f;
 308
 309         fprintf(debug, "Division of bondeds over threads:\n");
 310         for (f = 0; f < F_NRE; f++)
 311         {
 312             if (ftype_is_bonded_potential(f) && !idef.il[f].empty())
 313             {
 314                 int t;
 315
 316                 fprintf(debug, "%16s", interaction_function[f].name);
 317                 for (t = 0; t < numThreads; t++)
 318                 {
 319                     fprintf(debug, " %4d",
 320                             (bt->workDivision.bound(f, t + 1) - bt->workDivision.bound(f, t))
 321                                     / (1 + NRAL(f)));
 322                 }
 323                 fprintf(debug, "\n");
 324             }
 325         }
 326     }
 327 }
 328
 329 //! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
 330 static void calc_bonded_reduction_mask(int                           natoms,
 331                                        f_thread_t*                   f_thread,
 332                                        const InteractionDefinitions& idef,
 333                                        int                           thread,
 334                                        const bonded_threading_t&     bondedThreading)
 335 {
 336     static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS,
 337                   "For the error message below we assume these two are equal.");
 338
 339     if (bondedThreading.nthreads > BITMASK_SIZE)
 340     {
 341         gmx_fatal(FARGS,
 342                   "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS "
 343                   "(%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger "
 344                   "value for GMX_OPENMP_MAX_THREADS passed to CMake.",
 345                   bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
 346     }
 347     GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE,
 348                "We need at least nthreads bits in the mask");
 349
 350     const int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
 351
 352     f_thread->mask.resize(nblock);
 353     f_thread->block_index.resize(nblock);
 354     // NOTE: It seems f_thread->f does not need to be aligned
 355     f_thread->fBuffer.resize(nblock * reduction_block_size * sizeof(rvec4) / sizeof(real));
 356     f_thread->f = reinterpret_cast<rvec4*>(f_thread->fBuffer.data());
 357
 358     for (gmx_bitmask_t& mask : f_thread->mask)
 359     {
 360         bitmask_clear(&mask);
 361     }
 362
 363     gmx::ArrayRef<gmx_bitmask_t> mask = f_thread->mask;
 364
 365     for (int ftype = 0; ftype < F_NRE; ftype++)
 366     {
 367         if (ftype_is_bonded_potential(ftype))
 368         {
 369             int nb = idef.il[ftype].size();
 370             if (nb > 0)
 371             {
 372                 int nat1 = interaction_function[ftype].nratoms + 1;
 373
 374                 int nb0 = bondedThreading.workDivision.bound(ftype, thread);
 375                 int nb1 = bondedThreading.workDivision.bound(ftype, thread + 1);
 376
 377                 for (int i = nb0; i < nb1; i += nat1)
 378                 {
 379                     for (int a = 1; a < nat1; a++)
 380                     {
 381                         bitmask_set_bit(&mask[idef.il[ftype].iatoms[i + a] >> reduction_block_bits], thread);
 382                     }
 383                 }
 384             }
 385         }
 386     }
 387
 388     /* Make an index of the blocks our thread touches, so we can do fast
 389      * force buffer clearing.
 390      */
 391     f_thread->nblock_used = 0;
 392     for (int b = 0; b < nblock; b++)
 393     {
 394         if (bitmask_is_set(mask[b], thread))
 395         {
 396             f_thread->block_index[f_thread->nblock_used++] = b;
 397         }
 398     }
 399 }
 400
 401 void setup_bonded_threading(bonded_threading_t*           bt,
 402                             int                           numAtomsForce,
 403                             bool                          useGpuForBondeds,
 404                             const InteractionDefinitions& idef)
 405 {
 406     int ctot = 0;
 407
 408     assert(bt->nthreads >= 1);
 409
 410     bt->numAtomsForce = numAtomsForce;
 411
 412     /* Divide the bonded interaction over the threads */
 413     divide_bondeds_over_threads(bt, useGpuForBondeds, idef);
 414
 415     if (!bt->haveBondeds)
 416     {
 417         /* We don't have bondeds, so there is nothing to reduce */
 418         return;
 419     }
 420
 421     /* Determine to which blocks each thread's bonded force calculation
 422      * contributes. Store this as a mask for each thread.
 423      */
 424 #pragma omp parallel for num_threads(bt->nthreads) schedule(static)
 425     for (int t = 0; t < bt->nthreads; t++)
 426     {
 427         try
 428         {
 429             calc_bonded_reduction_mask(numAtomsForce, bt->f_t[t].get(), idef, t, *bt);
 430         }
 431         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 432     }
 433
 434     /* Reduce the masks over the threads and determine which blocks
 435      * we need to reduce over.
 436      */
 437     int nblock_tot = (numAtomsForce + reduction_block_size - 1) >> reduction_block_bits;
 438     /* Ensure we have sufficient space for all blocks */
 439     if (static_cast<size_t>(nblock_tot) > bt->block_index.size())
 440     {
 441         bt->block_index.resize(nblock_tot);
 442     }
 443     if (static_cast<size_t>(nblock_tot) > bt->mask.size())
 444     {
 445         bt->mask.resize(nblock_tot);
 446     }
 447     bt->nblock_used = 0;
 448     for (int b = 0; b < nblock_tot; b++)
 449     {
 450         gmx_bitmask_t* mask = &bt->mask[b];
 451
 452         /* Generate the union over the threads of the bitmask */
 453         bitmask_clear(mask);
 454         for (int t = 0; t < bt->nthreads; t++)
 455         {
 456             bitmask_union(mask, bt->f_t[t]->mask[b]);
 457         }
 458         if (!bitmask_is_zero(*mask))
 459         {
 460             bt->block_index[bt->nblock_used++] = b;
 461         }
 462
 463         if (debug)
 464         {
 465             int c = 0;
 466             for (int t = 0; t < bt->nthreads; t++)
 467             {
 468                 if (bitmask_is_set(*mask, t))
 469                 {
 470                     c++;
 471                 }
 472             }
 473             ctot += c;
 474
 475             if (gmx_debug_at)
 476             {
 477                 fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(*mask).c_str(), c);
 478             }
 479         }
 480     }
 481     if (debug)
 482     {
 483         fprintf(debug, "Number of %d atom blocks to reduce: %d\n", reduction_block_size, bt->nblock_used);
 484         fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
 485                 ctot * reduction_block_size / static_cast<double>(numAtomsForce),
 486                 ctot / static_cast<double>(bt->nblock_used));
 487     }
 488 }
 489
 490 f_thread_t::f_thread_t(int numEnergyGroups) : fshift(SHIFTS), grpp(numEnergyGroups) {}
 491
 492 bonded_threading_t::bonded_threading_t(const int numThreads, const int numEnergyGroups, FILE* fplog) :
 493     nthreads(numThreads),
 494     nblock_used(0),
 495     haveBondeds(false),
 496     workDivision(nthreads),
 497     foreignLambdaWorkDivision(1)
 498 {
 499     /* These thread local data structures are used for bondeds only.
 500      *
 501      * Note that we also use there structures when running single-threaded.
 502      * This is because the bonded force buffer uses type rvec4, whereas
 503      * the normal force buffer is uses type rvec. This leads to a little
 504      * reduction overhead, but the speed gain in the bonded calculations
 505      * of doing transposeScatterIncr/DecrU with aligment 4 instead of 3
 506      * is much larger than the reduction overhead.
 507      */
 508     f_t.resize(numThreads);
 509 #pragma omp parallel for num_threads(nthreads) schedule(static)
 510     for (int t = 0; t < nthreads; t++)
 511     {
 512         try
 513         {
 514             /* Note that thread 0 uses the global fshift and energy arrays,
 515              * but to keep the code simple, we initialize all data here.
 516              */
 517             f_t[t] = std::make_unique<f_thread_t>(numEnergyGroups);
 518         }
 519         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 520     }
 521
 522     /* The optimal value after which to switch from uniform to localized
 523      * bonded interaction distribution is 3, 4 or 5 depending on the system
 524      * and hardware.
 525      */
 526     const int max_nthread_uniform_default = 4;
 527     char*     ptr;
 528
 529     if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
 530     {
 531         sscanf(ptr, "%d", &max_nthread_uniform);
 532         if (fplog != nullptr)
 533         {
 534             fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
 535                     max_nthread_uniform);
 536         }
 537     }
 538     else
 539     {
 540         max_nthread_uniform = max_nthread_uniform_default;
 541     }
 542 }