src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU == GMX_GPU_CUDA
  51 #include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU == GMX_GPU_OPENCL
  55 #include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #include "gromacs/gpu_utils/gpu_utils.h"
  59 #include "gromacs/listed_forces/gpubonded.h"
  60 #include "gromacs/math/vec.h"
  61 #include "gromacs/mdlib/ppforceworkload.h"
  62 #include "gromacs/nbnxm/nbnxm.h"
  63 #include "gromacs/pbcutil/ishift.h"
  64 #include "gromacs/timing/gpu_timing.h"
  65 #include "gromacs/timing/wallcycle.h"
  66 #include "gromacs/utility/fatalerror.h"
  67 #include "gromacs/utility/stringutil.h"
  68
  69 #include "gpu_common_utils.h"
  70 #include "nbnxm_gpu.h"
  71
  72 namespace gmx
  73 {
  74 class GpuBonded;
  75 }
  76
  77 namespace Nbnxm
  78 {
  79
  80 /*! \brief Check that atom locality values are valid for the GPU module.
  81  *
  82  *  In the GPU module atom locality "all" is not supported, the local and
  83  *  non-local ranges are treated separately.
  84  *
  85  *  \param[in] atomLocality atom locality specifier
  86  */
  87 static inline void
  88 validateGpuAtomLocality(const AtomLocality atomLocality)
  89 {
  90     std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
  91                                         "local (%d) or nonlocal (%d)",
  92                                         static_cast<int>(atomLocality),
  93                                         static_cast<int>(AtomLocality::Local),
  94                                         static_cast<int>(AtomLocality::NonLocal));
  95
  96     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
  97 }
  98
  99 /*! \brief Convert atom locality to interaction locality.
 100  *
 101  *  In the current implementation the this is straightforward conversion:
 102  *  local to local, non-local to non-local.
 103  *
 104  *  \param[in] atomLocality Atom locality specifier
 105  *  \returns                Interaction locality corresponding to the atom locality passed.
 106  */
 107 static inline InteractionLocality
 108 gpuAtomToInteractionLocality(const AtomLocality atomLocality)
 109 {
 110     validateGpuAtomLocality(atomLocality);
 111
 112     /* determine interaction locality from atom locality */
 113     if (atomLocality == AtomLocality::Local)
 114     {
 115         return InteractionLocality::Local;
 116     }
 117     else if (atomLocality == AtomLocality::NonLocal)
 118     {
 119         return InteractionLocality::NonLocal;
 120     }
 121     else
 122     {
 123         gmx_incons("Wrong locality");
 124     }
 125 }
 126
 127
 128 //NOLINTNEXTLINE(misc-definitions-in-headers)
 129 void setupGpuShortRangeWork(gmx_nbnxn_gpu_t                  *nb,
 130                             const gmx::GpuBonded             *gpuBonded,
 131                             const Nbnxm::InteractionLocality  iLocality)
 132 {
 133     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 134
 135     // There is short-range work if the pair list for the provided
 136     // interaction locality contains entries or if there is any
 137     // bonded work (as this is not split into local/nonlocal).
 138     nb->haveWork[iLocality] =
 139         ((nb->plist[iLocality]->nsci != 0) ||
 140          (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 141 }
 142
 143 /*! \brief Returns true if there is GPU short-range work for the given interaction locality.
 144  *
 145  * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
 146  * and therefore if there are GPU offloaded bonded interactions, this function will return
 147  * true for all interaction localities.
 148  *
 149  * \param[inout]  nb        Pointer to the nonbonded GPU data structure
 150  * \param[in]     iLocality Interaction locality identifier
 151  */
 152 static bool
 153 haveGpuShortRangeWork(const gmx_nbnxn_gpu_t            &nb,
 154                       const Nbnxm::InteractionLocality  iLocality)
 155 {
 156     return nb.haveWork[iLocality];
 157 }
 158
 159 //NOLINTNEXTLINE(misc-definitions-in-headers)
 160 bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t     *nb,
 161                            const Nbnxm::AtomLocality  aLocality)
 162 {
 163     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 164
 165     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 166 }
 167
 168
 169
 170 /*! \brief Calculate atom range and return start index and length.
 171  *
 172  * \param[in] atomData Atom descriptor data structure
 173  * \param[in] atomLocality Atom locality specifier
 174  * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 175  * \param[out] atomRangeLen Atom range length in the atom data array.
 176  */
 177 template <typename AtomDataT>
 178 static inline void
 179 getGpuAtomRange(const AtomDataT    *atomData,
 180                 const AtomLocality  atomLocality,
 181                 int                *atomRangeBegin,
 182                 int                *atomRangeLen)
 183 {
 184     assert(atomData);
 185     validateGpuAtomLocality(atomLocality);
 186
 187     /* calculate the atom data index range based on locality */
 188     if (atomLocality == AtomLocality::Local)
 189     {
 190         *atomRangeBegin  = 0;
 191         *atomRangeLen    = atomData->natoms_local;
 192     }
 193     else
 194     {
 195         *atomRangeBegin  = atomData->natoms_local;
 196         *atomRangeLen    = atomData->natoms - atomData->natoms_local;
 197     }
 198 }
 199
 200
 201 /*! \brief Count pruning kernel time if either kernel has been triggered
 202  *
 203  *  We do the accounting for either of the two pruning kernel flavors:
 204  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 205  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 206  *
 207  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 208  * after calling this function.
 209  *
 210  * \param[in] timers   structs with GPU timer objects
 211  * \param[inout] timings  GPU task timing data
 212  * \param[in] iloc        interaction locality
 213  */
 214 template <typename GpuTimers>
 215 static void countPruneKernelTime(GpuTimers                 *timers,
 216                                  gmx_wallclock_gpu_nbnxn_t *timings,
 217                                  const InteractionLocality  iloc)
 218 {
 219     gpu_timers_t::Interaction &iTimers = timers->interaction[iloc];
 220
 221     // We might have not done any pruning (e.g. if we skipped with empty domains).
 222     if (!iTimers.didPrune &&
 223         !iTimers.didRollingPrune)
 224     {
 225         return;
 226     }
 227
 228     if (iTimers.didPrune)
 229     {
 230         timings->pruneTime.c++;
 231         timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
 232     }
 233
 234     if (iTimers.didRollingPrune)
 235     {
 236         timings->dynamicPruneTime.c++;
 237         timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
 238     }
 239 }
 240
 241 /*! \brief Reduce data staged internally in the nbnxn module.
 242  *
 243  * Shift forces and electrostatic/LJ energies copied from the GPU into
 244  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 245  * after having waited for the transfers' completion.
 246  *
 247  * Note that this function should always be called after the transfers into the
 248  * staging buffers has completed.
 249  *
 250  * \tparam     StagingData    Type of staging data
 251  * \param[in]  nbst           Nonbonded staging data
 252  * \param[in]  iLocality      Interaction locality specifier
 253  * \param[in]  reduceEnergies True if energy reduction should be done
 254  * \param[in]  reduceFshift   True if shift force reduction should be done
 255  * \param[out] e_lj           Variable to accumulate LJ energy into
 256  * \param[out] e_el           Variable to accumulate electrostatic energy into
 257  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 258  */
 259 template <typename StagingData>
 260 static inline void
 261 gpu_reduce_staged_outputs(const StagingData         &nbst,
 262                           const InteractionLocality  iLocality,
 263                           const bool                 reduceEnergies,
 264                           const bool                 reduceFshift,
 265                           real                      *e_lj,
 266                           real                      *e_el,
 267                           rvec                      *fshift)
 268 {
 269     /* add up energies and shift forces (only once at local F wait) */
 270     if (iLocality == InteractionLocality::Local)
 271     {
 272         if (reduceEnergies)
 273         {
 274             *e_lj += *nbst.e_lj;
 275             *e_el += *nbst.e_el;
 276         }
 277
 278         if (reduceFshift)
 279         {
 280             for (int i = 0; i < SHIFTS; i++)
 281             {
 282                 rvec_inc(fshift[i], nbst.fshift[i]);
 283             }
 284         }
 285     }
 286 }
 287
 288 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 289  *
 290  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 291  *  Note that this function should be called after the current step's nonbonded
 292  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 293  *  that are accounted for during the following step.
 294  *
 295  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 296  *      counters could end up being inconsistent due to not being incremented
 297  *      on some of the node when this is skipped on empty local domains!
 298  *
 299  * \tparam     GpuTimers         GPU timers type
 300  * \tparam     GpuPairlist       Pair list type
 301  * \param[out] timings           Pointer to the NB GPU timings data
 302  * \param[in]  timers            Pointer to GPU timers data
 303  * \param[in]  plist             Pointer to the pair list data
 304  * \param[in]  atomLocality      Atom locality specifier
 305  * \param[in]  didEnergyKernels  True if energy kernels have been called in the current step
 306  * \param[in]  doTiming          True if timing is enabled.
 307  *
 308  */
 309 template <typename GpuTimers, typename GpuPairlist>
 310 static inline void
 311 gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 312                        GpuTimers                 *timers,
 313                        const GpuPairlist         *plist,
 314                        AtomLocality               atomLocality,
 315                        bool                       didEnergyKernels,
 316                        bool                       doTiming)
 317 {
 318     /* timing data accumulation */
 319     if (!doTiming)
 320     {
 321         return;
 322     }
 323
 324     /* determine interaction locality from atom locality */
 325     const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
 326
 327     /* only increase counter once (at local F wait) */
 328     if (iLocality == InteractionLocality::Local)
 329     {
 330         timings->nb_c++;
 331         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 332     }
 333
 334     /* kernel timings */
 335     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 336         timers->interaction[iLocality].nb_k.getLastRangeTime();
 337
 338     /* X/q H2D and F D2H timings */
 339     timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
 340     timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
 341
 342     /* Count the pruning kernel times for both cases:1st pass (at search step)
 343        and rolling pruning (if called at the previous step).
 344        We do the accounting here as this is the only sync point where we
 345        know (without checking or additional sync-ing) that prune tasks in
 346        in the current stream have completed (having just blocking-waited
 347        for the force D2H). */
 348     countPruneKernelTime(timers, timings, iLocality);
 349
 350     /* only count atdat and pair-list H2D at pair-search step */
 351     if (timers->interaction[iLocality].didPairlistH2D)
 352     {
 353         /* atdat transfer timing (add only once, at local F wait) */
 354         if (atomLocality == AtomLocality::Local)
 355         {
 356             timings->pl_h2d_c++;
 357             timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 358         }
 359
 360         timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
 361
 362         /* Clear the timing flag for the next step */
 363         timers->interaction[iLocality].didPairlistH2D = false;
 364     }
 365 }
 366
 367 //TODO: move into shared source file with gmx_compile_cpp_as_cuda
 368 //NOLINTNEXTLINE(misc-definitions-in-headers)
 369 bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
 370                          const gmx::ForceFlags    &forceFlags,
 371                          const AtomLocality        aloc,
 372                          real                     *e_lj,
 373                          real                     *e_el,
 374                          gmx::ArrayRef<gmx::RVec>  shiftForces,
 375                          GpuTaskCompletion         completionKind,
 376                          gmx_wallcycle            *wcycle)
 377 {
 378     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 379
 380     /* determine interaction locality from atom locality */
 381     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 382
 383     //  We skip when during the non-local phase there was actually no work to do.
 384     //  This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
 385     //  bonded GPU work.
 386     if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
 387     {
 388         // Query the state of the GPU stream and return early if we're not done
 389         if (completionKind == GpuTaskCompletion::Check)
 390         {
 391             // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
 392             // we start without counting and only when the task finished we issue a
 393             // start/stop to increment.
 394             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
 395             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 396
 397             if (!haveStreamTasksCompleted(nb->stream[iLocality]))
 398             {
 399                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 400
 401                 // Early return to skip the steps below that we have to do only
 402                 // after the NB task completed
 403                 return false;
 404             }
 405
 406             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
 407         }
 408         else
 409         {
 410             gpuStreamSynchronize(nb->stream[iLocality]);
 411         }
 412
 413         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, forceFlags.computeEnergy,
 414                                nb->bDoTime != 0);
 415
 416         gpu_reduce_staged_outputs(nb->nbst, iLocality, forceFlags.computeEnergy, forceFlags.computeVirial,
 417                                   e_lj, e_el, as_rvec_array(shiftForces.data()));
 418     }
 419
 420     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
 421     nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false;
 422
 423     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 424     nb->plist[iLocality]->haveFreshList = false;
 425
 426     return true;
 427 }
 428
 429 /*! \brief
 430  * Wait for the asynchronously launched nonbonded tasks and data
 431  * transfers to finish.
 432  *
 433  * Also does timing accounting and reduction of the internal staging buffers.
 434  * As this is called at the end of the step, it also resets the pair list and
 435  * pruning flags.
 436  *
 437  * \param[in] nb The nonbonded data GPU structure
 438  * \param[in]  forceFlags     Force schedule flags
 439  * \param[in] aloc Atom locality identifier
 440  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 441  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 442  * \param[out] shiftForces Shift forces buffer to accumulate into
 443  * \param[out] wcycle Pointer to wallcycle data structure
 444  * \return            The number of cycles the gpu wait took
 445  */
 446 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 447 float gpu_wait_finish_task(gmx_nbnxn_gpu_t         *nb,
 448                            const gmx::ForceFlags   &forceFlags,
 449                            AtomLocality             aloc,
 450                            real                    *e_lj,
 451                            real                    *e_el,
 452                            gmx::ArrayRef<gmx::RVec> shiftForces,
 453                            gmx_wallcycle           *wcycle)
 454 {
 455     auto cycleCounter =
 456         (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL;
 457
 458     wallcycle_start(wcycle, cycleCounter);
 459     gpu_try_finish_task(nb, forceFlags, aloc, e_lj, e_el, shiftForces,
 460                         GpuTaskCompletion::Wait, wcycle);
 461     float waitTime = wallcycle_stop(wcycle, cycleCounter);
 462
 463     return waitTime;
 464 }
 465
 466 } // namespace Nbnxm
 467
 468 #endif