src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU == GMX_GPU_CUDA
  51 #include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU == GMX_GPU_OPENCL
  55 #include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #include "gromacs/gpu_utils/gpu_utils.h"
  59 #include "gromacs/math/vec.h"
  60 #include "gromacs/mdlib/force_flags.h"
  61 #include "gromacs/nbnxm/nbnxm.h"
  62 #include "gromacs/pbcutil/ishift.h"
  63 #include "gromacs/timing/gpu_timing.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/stringutil.h"
  66
  67 #include "gpu_common_utils.h"
  68 #include "nbnxm_gpu.h"
  69
  70 /*! \brief Check that atom locality values are valid for the GPU module.
  71  *
  72  *  In the GPU module atom locality "all" is not supported, the local and
  73  *  non-local ranges are treated separately.
  74  *
  75  *  \param[in] atomLocality atom locality specifier
  76  */
  77 static inline void validateGpuAtomLocality(int atomLocality)
  78 {
  79     std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
  80                                         "local (%d) or nonlocal (%d)", atomLocality, eatLocal, eatNonlocal);
  81
  82     GMX_ASSERT(LOCAL_OR_NONLOCAL_A(atomLocality), str.c_str());
  83 }
  84
  85 /*! \brief Convert atom locality to interaction locality.
  86  *
  87  *  In the current implementation the this is straightforward conversion:
  88  *  local to local, non-local to non-local.
  89  *
  90  *  \param[in] atomLocality Atom locality specifier
  91  *  \returns                Interaction locality corresponding to the atom locality passed.
  92  */
  93 static inline int gpuAtomToInteractionLocality(int atomLocality)
  94 {
  95     validateGpuAtomLocality(atomLocality);
  96
  97     /* determine interaction locality from atom locality */
  98     if (LOCAL_A(atomLocality))
  99     {
 100         return eintLocal;
 101     }
 102     else if (NONLOCAL_A(atomLocality))
 103     {
 104         return eintNonlocal;
 105     }
 106     else
 107     {
 108         gmx_incons("Wrong locality");
 109     }
 110 }
 111
 112 /*! \brief Calculate atom range and return start index and length.
 113  *
 114  * \param[in] atomData Atom descriptor data structure
 115  * \param[in] atomLocality Atom locality specifier
 116  * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 117  * \param[out] atomRangeLen Atom range length in the atom data array.
 118  */
 119 template <typename AtomDataT>
 120 static inline void getGpuAtomRange(const AtomDataT *atomData,
 121                                    int              atomLocality,
 122                                    int             *atomRangeBegin,
 123                                    int             *atomRangeLen)
 124 {
 125     assert(atomData);
 126     validateGpuAtomLocality(atomLocality);
 127
 128     /* calculate the atom data index range based on locality */
 129     if (LOCAL_A(atomLocality))
 130     {
 131         *atomRangeBegin  = 0;
 132         *atomRangeLen    = atomData->natoms_local;
 133     }
 134     else
 135     {
 136         *atomRangeBegin  = atomData->natoms_local;
 137         *atomRangeLen    = atomData->natoms - atomData->natoms_local;
 138     }
 139 }
 140
 141
 142 /*! \brief Count pruning kernel time if either kernel has been triggered
 143  *
 144  *  We do the accounting for either of the two pruning kernel flavors:
 145  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 146  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 147  *
 148  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 149  * after calling this function.
 150  *
 151  * \param[in] timers   structs with GPU timer objects
 152  * \param[inout] timings  GPU task timing data
 153  * \param[in] iloc        interaction locality
 154  */
 155 template <typename GpuTimers>
 156 static void countPruneKernelTime(GpuTimers                 *timers,
 157                                  gmx_wallclock_gpu_nbnxn_t *timings,
 158                                  const int                  iloc)
 159 {
 160     // We might have not done any pruning (e.g. if we skipped with empty domains).
 161     if (!timers->didPrune[iloc] && !timers->didRollingPrune[iloc])
 162     {
 163         return;
 164     }
 165
 166     if (timers->didPrune[iloc])
 167     {
 168         timings->pruneTime.c++;
 169         timings->pruneTime.t += timers->prune_k[iloc].getLastRangeTime();
 170     }
 171
 172     if (timers->didRollingPrune[iloc])
 173     {
 174         timings->dynamicPruneTime.c++;
 175         timings->dynamicPruneTime.t += timers->rollingPrune_k[iloc].getLastRangeTime();
 176     }
 177 }
 178
 179 /*! \brief Reduce data staged internally in the nbnxn module.
 180  *
 181  * Shift forces and electrostatic/LJ energies copied from the GPU into
 182  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 183  * after having waited for the transfers' completion.
 184  *
 185  * Note that this function should always be called after the transfers into the
 186  * staging buffers has completed.
 187  *
 188  * \tparam     StagingData    Type of staging data
 189  * \param[in]  nbst           Nonbonded staging data
 190  * \param[in]  iLocality      Interaction locality specifier
 191  * \param[in]  reduceEnergies True if energy reduction should be done
 192  * \param[in]  reduceFshift   True if shift force reduction should be done
 193  * \param[out] e_lj           Variable to accumulate LJ energy into
 194  * \param[out] e_el           Variable to accumulate electrostatic energy into
 195  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 196  */
 197 template <typename StagingData>
 198 static inline void nbnxn_gpu_reduce_staged_outputs(const StagingData &nbst,
 199                                                    int                iLocality,
 200                                                    bool               reduceEnergies,
 201                                                    bool               reduceFshift,
 202                                                    real              *e_lj,
 203                                                    real              *e_el,
 204                                                    rvec              *fshift)
 205 {
 206     /* add up energies and shift forces (only once at local F wait) */
 207     if (LOCAL_I(iLocality))
 208     {
 209         if (reduceEnergies)
 210         {
 211             *e_lj += *nbst.e_lj;
 212             *e_el += *nbst.e_el;
 213         }
 214
 215         if (reduceFshift)
 216         {
 217             for (int i = 0; i < SHIFTS; i++)
 218             {
 219                 rvec_inc(fshift[i], nbst.fshift[i]);
 220             }
 221         }
 222     }
 223 }
 224
 225 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 226  *
 227  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 228  *  Note that this function should be called after the current step's nonbonded
 229  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 230  *  that are accounted for during the following step.
 231  *
 232  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 233  *      counters could end up being inconsistent due to not being incremented
 234  *      on some of the node when this is skipped on empty local domains!
 235  *
 236  * \tparam     GpuTimers         GPU timers type
 237  * \tparam     GpuPairlist       Pair list type
 238  * \param[out] timings           Pointer to the NB GPU timings data
 239  * \param[in]  timers            Pointer to GPU timers data
 240  * \param[in]  plist             Pointer to the pair list data
 241  * \param[in]  atomLocality      Atom locality specifier
 242  * \param[in]  didEnergyKernels  True if energy kernels have been called in the current step
 243  * \param[in]  doTiming          True if timing is enabled.
 244  *
 245  */
 246 template <typename GpuTimers, typename GpuPairlist>
 247 static inline void nbnxn_gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 248                                                 GpuTimers                 *timers,
 249                                                 const GpuPairlist         *plist,
 250                                                 int                        atomLocality,
 251                                                 bool                       didEnergyKernels,
 252                                                 bool                       doTiming)
 253 {
 254     /* timing data accumulation */
 255     if (!doTiming)
 256     {
 257         return;
 258     }
 259
 260     /* determine interaction locality from atom locality */
 261     int iLocality = gpuAtomToInteractionLocality(atomLocality);
 262
 263     /* only increase counter once (at local F wait) */
 264     if (LOCAL_I(iLocality))
 265     {
 266         timings->nb_c++;
 267         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 268     }
 269
 270     /* kernel timings */
 271     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 272         timers->nb_k[iLocality].getLastRangeTime();
 273
 274     /* X/q H2D and F D2H timings */
 275     timings->nb_h2d_t += timers->nb_h2d[iLocality].getLastRangeTime();
 276     timings->nb_d2h_t += timers->nb_d2h[iLocality].getLastRangeTime();
 277
 278     /* Count the pruning kernel times for both cases:1st pass (at search step)
 279        and rolling pruning (if called at the previous step).
 280        We do the accounting here as this is the only sync point where we
 281        know (without checking or additional sync-ing) that prune tasks in
 282        in the current stream have completed (having just blocking-waited
 283        for the force D2H). */
 284     countPruneKernelTime(timers, timings, iLocality);
 285
 286     /* only count atdat and pair-list H2D at pair-search step */
 287     if (timers->didPairlistH2D[iLocality])
 288     {
 289         /* atdat transfer timing (add only once, at local F wait) */
 290         if (LOCAL_A(atomLocality))
 291         {
 292             timings->pl_h2d_c++;
 293             timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 294         }
 295
 296         timings->pl_h2d_t += timers->pl_h2d[iLocality].getLastRangeTime();
 297
 298         /* Clear the timing flag for the next step */
 299         timers->didPairlistH2D[iLocality] = false;
 300     }
 301 }
 302
 303 //TODO: move into shared source file with gmx_compile_cpp_as_cuda
 304 //NOLINTNEXTLINE(misc-definitions-in-headers)
 305 bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t  *nb,
 306                                int               flags,
 307                                int               aloc,
 308                                bool              haveOtherWork,
 309                                real             *e_lj,
 310                                real             *e_el,
 311                                rvec             *fshift,
 312                                GpuTaskCompletion completionKind)
 313 {
 314     /* determine interaction locality from atom locality */
 315     int iLocality = gpuAtomToInteractionLocality(aloc);
 316
 317     //  We skip when during the non-local phase there was actually no work to do.
 318     //  This is consistent with nbnxn_gpu_launch_kernel.
 319     if (haveOtherWork || !canSkipWork(nb, iLocality))
 320     {
 321         // Query the state of the GPU stream and return early if we're not done
 322         if (completionKind == GpuTaskCompletion::Check)
 323         {
 324             if (!haveStreamTasksCompleted(nb->stream[iLocality]))
 325             {
 326                 // Early return to skip the steps below that we have to do only
 327                 // after the NB task completed
 328                 return false;
 329             }
 330         }
 331         else
 332         {
 333             gpuStreamSynchronize(nb->stream[iLocality]);
 334         }
 335
 336         bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
 337         bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
 338
 339         nbnxn_gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
 340                                      nb->bDoTime != 0);
 341
 342         nbnxn_gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
 343     }
 344
 345     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
 346     nb->timers->didPrune[iLocality] = nb->timers->didRollingPrune[iLocality] = false;
 347
 348     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 349     nb->plist[iLocality]->haveFreshList = false;
 350
 351     return true;
 352 }
 353
 354 /*! \brief
 355  * Wait for the asynchronously launched nonbonded tasks and data
 356  * transfers to finish.
 357  *
 358  * Also does timing accounting and reduction of the internal staging buffers.
 359  * As this is called at the end of the step, it also resets the pair list and
 360  * pruning flags.
 361  *
 362  * \param[in] nb The nonbonded data GPU structure
 363  * \param[in] flags Force flags
 364  * \param[in] aloc Atom locality identifier
 365  * \param[in] haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
 366  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 367  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 368  * \param[out] fshift Pointer to the shift force buffer to accumulate into
 369  */
 370 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 371 void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
 372                                 int              flags,
 373                                 int              aloc,
 374                                 bool             haveOtherWork,
 375                                 real            *e_lj,
 376                                 real            *e_el,
 377                                 rvec            *fshift)
 378 {
 379     nbnxn_gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
 380                               GpuTaskCompletion::Wait);
 381 }
 382
 383 #endif