src/gromacs/listed-forces/manage-threading.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  * \brief This file defines functions for managing threading of listed
  39  * interactions.
  40  *
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  * \ingroup module_listed-forces
  43  */
  44 #include "gmxpre.h"
  45
  46 #include "manage-threading.h"
  47
  48 #include <assert.h>
  49
  50 #include <algorithm>
  51
  52 #include "gromacs/listed-forces/bonded.h"
  53 #include "gromacs/utility/fatalerror.h"
  54 #include "gromacs/utility/smalloc.h"
  55
  56 //! Divides listed interactions over threads
  57 static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
  58 {
  59     int ftype;
  60     int nat1;
  61     int t;
  62     int il_nr_thread;
  63
  64     idef->nthreads = nthreads;
  65
  66     if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
  67     {
  68         idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
  69         snew(idef->il_thread_division, idef->il_thread_division_nalloc);
  70     }
  71
  72     for (ftype = 0; ftype < F_NRE; ftype++)
  73     {
  74         if (ftype_is_bonded_potential(ftype))
  75         {
  76             nat1 = interaction_function[ftype].nratoms + 1;
  77
  78             for (t = 0; t <= nthreads; t++)
  79             {
  80                 /* Divide the interactions equally over the threads.
  81                  * When the different types of bonded interactions
  82                  * are distributed roughly equally over the threads,
  83                  * this should lead to well localized output into
  84                  * the force buffer on each thread.
  85                  * If this is not the case, a more advanced scheme
  86                  * (not implemented yet) will do better.
  87                  */
  88                 il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
  89
  90                 /* Ensure that distance restraint pairs with the same label
  91                  * end up on the same thread.
  92                  * This is slighlty tricky code, since the next for iteration
  93                  * may have an initial il_nr_thread lower than the final value
  94                  * in the previous iteration, but this will anyhow be increased
  95                  * to the approriate value again by this while loop.
  96                  */
  97                 while (ftype == F_DISRES &&
  98                        il_nr_thread > 0 &&
  99                        il_nr_thread < idef->il[ftype].nr &&
 100                        idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
 101                        idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
 102                 {
 103                     il_nr_thread += nat1;
 104                 }
 105
 106                 idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
 107             }
 108         }
 109     }
 110 }
 111
 112 //! Construct a reduction mask for which interaction was computed on which thread
 113 static unsigned
 114 calc_bonded_reduction_mask(const t_idef *idef,
 115                            int shift,
 116                            int t, int nt)
 117 {
 118     unsigned mask;
 119     int      ftype, nb, nat1, nb0, nb1, i, a;
 120
 121     mask = 0;
 122
 123     for (ftype = 0; ftype < F_NRE; ftype++)
 124     {
 125         if (ftype_is_bonded_potential(ftype))
 126         {
 127             nb = idef->il[ftype].nr;
 128             if (nb > 0)
 129             {
 130                 nat1 = interaction_function[ftype].nratoms + 1;
 131
 132                 /* Divide this interaction equally over the threads.
 133                  * This is not stored: should match division in calc_bonds.
 134                  */
 135                 nb0 = idef->il_thread_division[ftype*(nt+1)+t];
 136                 nb1 = idef->il_thread_division[ftype*(nt+1)+t+1];
 137
 138                 for (i = nb0; i < nb1; i += nat1)
 139                 {
 140                     for (a = 1; a < nat1; a++)
 141                     {
 142                         mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift));
 143                     }
 144                 }
 145             }
 146         }
 147     }
 148
 149     return mask;
 150 }
 151
 152
 153 /*! \brief We divide the force array in a maximum of 32 blocks.
 154  * Minimum force block reduction size is thus 2^6=64.
 155  */
 156 const int maxBlockBits = 32;
 157
 158 void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
 159 {
 160     int t;
 161     int ctot, c, b;
 162
 163     assert(fr->nthreads >= 1);
 164
 165     /* Divide the bonded interaction over the threads */
 166     divide_bondeds_over_threads(idef, fr->nthreads);
 167
 168     if (fr->nthreads == 1)
 169     {
 170         fr->red_nblock = 0;
 171
 172         return;
 173     }
 174
 175     fr->red_ashift = 6;
 176     while (fr->natoms_force > (int)(maxBlockBits*(1U<<fr->red_ashift)))
 177     {
 178         fr->red_ashift++;
 179     }
 180     if (debug)
 181     {
 182         fprintf(debug, "bonded force buffer block atom shift %d bits\n",
 183                 fr->red_ashift);
 184     }
 185
 186     /* Determine to which blocks each thread's bonded force calculation
 187      * contributes. Store this is a mask for each thread.
 188      */
 189 #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
 190     for (t = 1; t < fr->nthreads; t++)
 191     {
 192         fr->f_t[t].red_mask =
 193             calc_bonded_reduction_mask(idef, fr->red_ashift, t, fr->nthreads);
 194     }
 195
 196     /* Determine the maximum number of blocks we need to reduce over */
 197     fr->red_nblock = 0;
 198     ctot           = 0;
 199     for (t = 0; t < fr->nthreads; t++)
 200     {
 201         c = 0;
 202         for (b = 0; b < maxBlockBits; b++)
 203         {
 204             if (fr->f_t[t].red_mask & (1U<<b))
 205             {
 206                 fr->red_nblock = std::max(fr->red_nblock, b+1);
 207                 c++;
 208             }
 209         }
 210         if (debug)
 211         {
 212             fprintf(debug, "thread %d flags %x count %d\n",
 213                     t, fr->f_t[t].red_mask, c);
 214         }
 215         ctot += c;
 216     }
 217     if (debug)
 218     {
 219         fprintf(debug, "Number of blocks to reduce: %d of size %d\n",
 220                 fr->red_nblock, 1<<fr->red_ashift);
 221         fprintf(debug, "Reduction density %.2f density/#thread %.2f\n",
 222                 ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
 223                 ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
 224     }
 225 }