src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU_CUDA
  51 #    include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU_OPENCL
  55 #    include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #if GMX_GPU_SYCL
  59 #    include "sycl/nbnxm_sycl_types.h"
  60 #endif
  61
  62 #include "gromacs/gpu_utils/gpu_utils.h"
  63 #include "gromacs/listed_forces/gpubonded.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/mdtypes/simulation_workload.h"
  66 #include "gromacs/nbnxm/nbnxm.h"
  67 #include "gromacs/pbcutil/ishift.h"
  68 #include "gromacs/timing/gpu_timing.h"
  69 #include "gromacs/timing/wallcycle.h"
  70 #include "gromacs/utility/fatalerror.h"
  71 #include "gromacs/utility/stringutil.h"
  72
  73 #include "gpu_common_utils.h"
  74 #include "nbnxm_gpu.h"
  75
  76 namespace gmx
  77 {
  78 class GpuBonded;
  79 }
  80
  81 namespace Nbnxm
  82 {
  83
  84 /*! \brief Check that atom locality values are valid for the GPU module.
  85  *
  86  *  In the GPU module atom locality "all" is not supported, the local and
  87  *  non-local ranges are treated separately.
  88  *
  89  *  \param[in] atomLocality atom locality specifier
  90  */
  91 static inline void validateGpuAtomLocality(const AtomLocality atomLocality)
  92 {
  93     std::string str = gmx::formatString(
  94             "Invalid atom locality passed (%d); valid here is only "
  95             "local (%d) or nonlocal (%d)",
  96             static_cast<int>(atomLocality),
  97             static_cast<int>(AtomLocality::Local),
  98             static_cast<int>(AtomLocality::NonLocal));
  99
 100     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
 101 }
 102
 103 /*! \brief Convert atom locality to interaction locality.
 104  *
 105  *  In the current implementation the this is straightforward conversion:
 106  *  local to local, non-local to non-local.
 107  *
 108  *  \param[in] atomLocality Atom locality specifier
 109  *  \returns                Interaction locality corresponding to the atom locality passed.
 110  */
 111 static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality)
 112 {
 113     validateGpuAtomLocality(atomLocality);
 114
 115     /* determine interaction locality from atom locality */
 116     if (atomLocality == AtomLocality::Local)
 117     {
 118         return InteractionLocality::Local;
 119     }
 120     else if (atomLocality == AtomLocality::NonLocal)
 121     {
 122         return InteractionLocality::NonLocal;
 123     }
 124     else
 125     {
 126         gmx_incons("Wrong locality");
 127     }
 128 }
 129
 130
 131 //NOLINTNEXTLINE(misc-definitions-in-headers)
 132 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
 133 {
 134     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 135
 136     // There is short-range work if the pair list for the provided
 137     // interaction locality contains entries or if there is any
 138     // bonded work (as this is not split into local/nonlocal).
 139     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
 140                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 141 }
 142
 143 /*! \brief Returns true if there is GPU short-range work for the given interaction locality.
 144  *
 145  * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
 146  * and therefore if there are GPU offloaded bonded interactions, this function will return
 147  * true for all interaction localities.
 148  *
 149  * \param[inout]  nb        Pointer to the nonbonded GPU data structure
 150  * \param[in]     iLocality Interaction locality identifier
 151  */
 152 static bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality)
 153 {
 154     return nb.haveWork[iLocality];
 155 }
 156
 157 //NOLINTNEXTLINE(misc-definitions-in-headers)
 158 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
 159 {
 160     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 161
 162     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 163 }
 164
 165
 166 /*! \brief Calculate atom range and return start index and length.
 167  *
 168  * \param[in] atomData Atom descriptor data structure
 169  * \param[in] atomLocality Atom locality specifier
 170  * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 171  * \param[out] atomRangeLen Atom range length in the atom data array.
 172  */
 173 template<typename AtomDataT>
 174 static inline void getGpuAtomRange(const AtomDataT*   atomData,
 175                                    const AtomLocality atomLocality,
 176                                    int*               atomRangeBegin,
 177                                    int*               atomRangeLen)
 178 {
 179     assert(atomData);
 180     validateGpuAtomLocality(atomLocality);
 181
 182     /* calculate the atom data index range based on locality */
 183     if (atomLocality == AtomLocality::Local)
 184     {
 185         *atomRangeBegin = 0;
 186         *atomRangeLen   = atomData->natoms_local;
 187     }
 188     else
 189     {
 190         *atomRangeBegin = atomData->natoms_local;
 191         *atomRangeLen   = atomData->natoms - atomData->natoms_local;
 192     }
 193 }
 194
 195
 196 /*! \brief Count pruning kernel time if either kernel has been triggered
 197  *
 198  *  We do the accounting for either of the two pruning kernel flavors:
 199  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 200  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 201  *
 202  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 203  * after calling this function.
 204  *
 205  * \param[in] timers   structs with GPU timer objects
 206  * \param[inout] timings  GPU task timing data
 207  * \param[in] iloc        interaction locality
 208  */
 209 template<typename GpuTimers>
 210 static void countPruneKernelTime(GpuTimers*                 timers,
 211                                  gmx_wallclock_gpu_nbnxn_t* timings,
 212                                  const InteractionLocality  iloc)
 213 {
 214     gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];
 215
 216     // We might have not done any pruning (e.g. if we skipped with empty domains).
 217     if (!iTimers.didPrune && !iTimers.didRollingPrune)
 218     {
 219         return;
 220     }
 221
 222     if (iTimers.didPrune)
 223     {
 224         timings->pruneTime.c++;
 225         timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
 226     }
 227
 228     if (iTimers.didRollingPrune)
 229     {
 230         timings->dynamicPruneTime.c++;
 231         timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
 232     }
 233 }
 234
 235 /*! \brief Reduce data staged internally in the nbnxn module.
 236  *
 237  * Shift forces and electrostatic/LJ energies copied from the GPU into
 238  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 239  * after having waited for the transfers' completion.
 240  *
 241  * Note that this function should always be called after the transfers into the
 242  * staging buffers has completed.
 243  *
 244  * \tparam     StagingData    Type of staging data
 245  * \param[in]  nbst           Nonbonded staging data
 246  * \param[in]  iLocality      Interaction locality specifier
 247  * \param[in]  reduceEnergies True if energy reduction should be done
 248  * \param[in]  reduceFshift   True if shift force reduction should be done
 249  * \param[out] e_lj           Variable to accumulate LJ energy into
 250  * \param[out] e_el           Variable to accumulate electrostatic energy into
 251  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 252  */
 253 static inline void gpu_reduce_staged_outputs(const nb_staging_t&       nbst,
 254                                              const InteractionLocality iLocality,
 255                                              const bool                reduceEnergies,
 256                                              const bool                reduceFshift,
 257                                              real*                     e_lj,
 258                                              real*                     e_el,
 259                                              rvec*                     fshift)
 260 {
 261     /* add up energies and shift forces (only once at local F wait) */
 262     if (iLocality == InteractionLocality::Local)
 263     {
 264         if (reduceEnergies)
 265         {
 266             *e_lj += *nbst.e_lj;
 267             *e_el += *nbst.e_el;
 268         }
 269
 270         if (reduceFshift)
 271         {
 272             for (int i = 0; i < SHIFTS; i++)
 273             {
 274                 rvec_inc(fshift[i], nbst.fshift[i]);
 275             }
 276         }
 277     }
 278 }
 279
 280 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 281  *
 282  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 283  *  Note that this function should be called after the current step's nonbonded
 284  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 285  *  that are accounted for during the following step.
 286  *
 287  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 288  *      counters could end up being inconsistent due to not being incremented
 289  *      on some of the node when this is skipped on empty local domains!
 290  *
 291  * \tparam     GpuTimers         GPU timers type
 292  * \tparam     GpuPairlist       Pair list type
 293  * \param[out] timings           Pointer to the NB GPU timings data
 294  * \param[in]  timers            Pointer to GPU timers data
 295  * \param[in]  plist             Pointer to the pair list data
 296  * \param[in]  atomLocality      Atom locality specifier
 297  * \param[in]  stepWork          Force schedule flags
 298  * \param[in]  doTiming          True if timing is enabled.
 299  *
 300  */
 301 template<typename GpuTimers, typename GpuPairlist>
 302 static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
 303                                           GpuTimers*                 timers,
 304                                           const GpuPairlist*         plist,
 305                                           AtomLocality               atomLocality,
 306                                           const gmx::StepWorkload&   stepWork,
 307                                           bool                       doTiming)
 308 {
 309     /* timing data accumulation */
 310     if (!doTiming)
 311     {
 312         return;
 313     }
 314
 315     /* determine interaction locality from atom locality */
 316     const InteractionLocality iLocality        = gpuAtomToInteractionLocality(atomLocality);
 317     const bool                didEnergyKernels = stepWork.computeEnergy;
 318
 319     /* only increase counter once (at local F wait) */
 320     if (iLocality == InteractionLocality::Local)
 321     {
 322         timings->nb_c++;
 323         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 324     }
 325
 326     /* kernel timings */
 327     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 328             timers->interaction[iLocality].nb_k.getLastRangeTime();
 329
 330     /* X/q H2D and F D2H timings */
 331     timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
 332     timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
 333
 334     /* Count the pruning kernel times for both cases:1st pass (at search step)
 335        and rolling pruning (if called at the previous step).
 336        We do the accounting here as this is the only sync point where we
 337        know (without checking or additional sync-ing) that prune tasks in
 338        in the current stream have completed (having just blocking-waited
 339        for the force D2H). */
 340     countPruneKernelTime(timers, timings, iLocality);
 341
 342     /* only count atdat at pair-search steps (add only once, at local F wait) */
 343     if (stepWork.doNeighborSearch && atomLocality == AtomLocality::Local)
 344     {
 345         /* atdat transfer timing */
 346         timings->pl_h2d_c++;
 347         timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 348     }
 349
 350     /* only count pair-list H2D when actually performed */
 351     if (timers->interaction[iLocality].didPairlistH2D)
 352     {
 353         timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
 354
 355         /* Clear the timing flag for the next step */
 356         timers->interaction[iLocality].didPairlistH2D = false;
 357     }
 358 }
 359
 360 /*! \brief Attempts to complete nonbonded GPU task.
 361  *
 362  * See documentation in nbnxm_gpu.h for details.
 363  *
 364  * \todo Move into shared source file, perhaps including
 365  * cuda_runtime.h if needed for any remaining CUDA-specific
 366  * objects.
 367  */
 368 //NOLINTNEXTLINE(misc-definitions-in-headers)
 369 bool gpu_try_finish_task(NbnxmGpu*                nb,
 370                          const gmx::StepWorkload& stepWork,
 371                          const AtomLocality       aloc,
 372                          real*                    e_lj,
 373                          real*                    e_el,
 374                          gmx::ArrayRef<gmx::RVec> shiftForces,
 375                          GpuTaskCompletion        completionKind,
 376                          gmx_wallcycle*           wcycle)
 377 {
 378     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 379
 380     /* determine interaction locality from atom locality */
 381     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 382
 383
 384     // Transfers are launched and therefore need to be waited on if:
 385     // - buffer ops is not offloaded
 386     // - energies or virials are needed (on the local stream)
 387     //
 388     // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
 389     // in current code as virial steps do CPU reduction.)
 390     const bool haveResultToWaitFor =
 391             (!stepWork.useGpuFBufferOps
 392              || (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
 393
 394     //  We skip when during the non-local phase there was actually no work to do.
 395     //  This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
 396     //  bonded GPU work.
 397     if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
 398     {
 399         // Query the state of the GPU stream and return early if we're not done
 400         if (completionKind == GpuTaskCompletion::Check)
 401         {
 402             // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
 403             // we start without counting and only when the task finished we issue a
 404             // start/stop to increment.
 405             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
 406             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 407
 408             if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
 409             {
 410                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 411
 412                 // Early return to skip the steps below that we have to do only
 413                 // after the NB task completed
 414                 return false;
 415             }
 416
 417             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
 418         }
 419         else if (haveResultToWaitFor)
 420         {
 421             nb->deviceStreams[iLocality]->synchronize();
 422         }
 423
 424         // TODO: this needs to be moved later because conditional wait could brake timing
 425         // with a future OpenCL implementation, but with CUDA timing is anyway disabled
 426         // in all cases where we skip the wait.
 427         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork, nb->bDoTime);
 428
 429         if (stepWork.computeEnergy || stepWork.computeVirial)
 430         {
 431             gpu_reduce_staged_outputs(nb->nbst,
 432                                       iLocality,
 433                                       stepWork.computeEnergy,
 434                                       stepWork.computeVirial,
 435                                       e_lj,
 436                                       e_el,
 437                                       as_rvec_array(shiftForces.data()));
 438         }
 439     }
 440
 441     /* Reset both pruning flags. */
 442     if (nb->bDoTime)
 443     {
 444         nb->timers->interaction[iLocality].didPrune =
 445                 nb->timers->interaction[iLocality].didRollingPrune = false;
 446     }
 447
 448     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 449     nb->plist[iLocality]->haveFreshList = false;
 450
 451     return true;
 452 }
 453
 454 /*! \brief
 455  * Wait for the asynchronously launched nonbonded tasks and data
 456  * transfers to finish.
 457  *
 458  * Also does timing accounting and reduction of the internal staging buffers.
 459  * As this is called at the end of the step, it also resets the pair list and
 460  * pruning flags.
 461  *
 462  * \param[in] nb The nonbonded data GPU structure
 463  * \param[in]  stepWork     Force schedule flags
 464  * \param[in] aloc Atom locality identifier
 465  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 466  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 467  * \param[out] shiftForces Shift forces buffer to accumulate into
 468  * \param[out] wcycle Pointer to wallcycle data structure
 469  * \return            The number of cycles the gpu wait took
 470  */
 471 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 472 float gpu_wait_finish_task(NbnxmGpu*                nb,
 473                            const gmx::StepWorkload& stepWork,
 474                            AtomLocality             aloc,
 475                            real*                    e_lj,
 476                            real*                    e_el,
 477                            gmx::ArrayRef<gmx::RVec> shiftForces,
 478                            gmx_wallcycle*           wcycle)
 479 {
 480     auto cycleCounter = (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local)
 481                                 ? ewcWAIT_GPU_NB_L
 482                                 : ewcWAIT_GPU_NB_NL;
 483
 484     wallcycle_start(wcycle, cycleCounter);
 485     gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
 486     float waitTime = wallcycle_stop(wcycle, cycleCounter);
 487
 488     return waitTime;
 489 }
 490
 491 } // namespace Nbnxm
 492
 493 #endif