src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU_CUDA
  51 #    include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU_OPENCL
  55 #    include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #if GMX_GPU_SYCL
  59 #    include "sycl/nbnxm_sycl_types.h"
  60 #endif
  61
  62 #include "gromacs/gpu_utils/gpu_utils.h"
  63 #include "gromacs/listed_forces/gpubonded.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/mdtypes/simulation_workload.h"
  66 #include "gromacs/nbnxm/nbnxm.h"
  67 #include "gromacs/pbcutil/ishift.h"
  68 #include "gromacs/timing/gpu_timing.h"
  69 #include "gromacs/timing/wallcycle.h"
  70 #include "gromacs/utility/fatalerror.h"
  71 #include "gromacs/utility/range.h"
  72 #include "gromacs/utility/stringutil.h"
  73
  74 #include "gpu_common_utils.h"
  75 #include "nbnxm_gpu.h"
  76
  77 namespace gmx
  78 {
  79 class GpuBonded;
  80 }
  81
  82 namespace Nbnxm
  83 {
  84
  85 /*! \brief Check that atom locality values are valid for the GPU module.
  86  *
  87  *  In the GPU module atom locality "all" is not supported, the local and
  88  *  non-local ranges are treated separately.
  89  *
  90  *  \param[in] atomLocality atom locality specifier
  91  */
  92 static inline void validateGpuAtomLocality(const AtomLocality atomLocality)
  93 {
  94     std::string str = gmx::formatString(
  95             "Invalid atom locality passed (%d); valid here is only "
  96             "local (%d) or nonlocal (%d)",
  97             static_cast<int>(atomLocality),
  98             static_cast<int>(AtomLocality::Local),
  99             static_cast<int>(AtomLocality::NonLocal));
 100
 101     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
 102 }
 103
 104 /*! \brief Convert atom locality to interaction locality.
 105  *
 106  *  In the current implementation the this is straightforward conversion:
 107  *  local to local, non-local to non-local.
 108  *
 109  *  \param[in] atomLocality Atom locality specifier
 110  *  \returns                Interaction locality corresponding to the atom locality passed.
 111  */
 112 static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality)
 113 {
 114     validateGpuAtomLocality(atomLocality);
 115
 116     /* determine interaction locality from atom locality */
 117     if (atomLocality == AtomLocality::Local)
 118     {
 119         return InteractionLocality::Local;
 120     }
 121     else if (atomLocality == AtomLocality::NonLocal)
 122     {
 123         return InteractionLocality::NonLocal;
 124     }
 125     else
 126     {
 127         gmx_incons("Wrong locality");
 128     }
 129 }
 130
 131
 132 //NOLINTNEXTLINE(misc-definitions-in-headers)
 133 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
 134 {
 135     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 136
 137     // There is short-range work if the pair list for the provided
 138     // interaction locality contains entries or if there is any
 139     // bonded work (as this is not split into local/nonlocal).
 140     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
 141                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 142 }
 143
 144 /*! \brief Returns true if there is GPU short-range work for the given interaction locality.
 145  *
 146  * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
 147  * and therefore if there are GPU offloaded bonded interactions, this function will return
 148  * true for all interaction localities.
 149  *
 150  * \param[inout]  nb        Pointer to the nonbonded GPU data structure
 151  * \param[in]     iLocality Interaction locality identifier
 152  */
 153 static bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality)
 154 {
 155     return nb.haveWork[iLocality];
 156 }
 157
 158 //NOLINTNEXTLINE(misc-definitions-in-headers)
 159 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
 160 {
 161     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 162
 163     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 164 }
 165
 166
 167 /*! \brief Calculate atom range and return start index and length.
 168  *
 169  * \param[in] atomData Atom descriptor data structure
 170  * \param[in] atomLocality Atom locality specifier
 171  * \returns Range of indexes for selected locality.
 172  */
 173 static inline gmx::Range<int> getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality)
 174 {
 175     assert(atomData);
 176     validateGpuAtomLocality(atomLocality);
 177
 178     /* calculate the atom data index range based on locality */
 179     if (atomLocality == AtomLocality::Local)
 180     {
 181         return gmx::Range<int>(0, atomData->numAtomsLocal);
 182     }
 183     else
 184     {
 185         return gmx::Range<int>(atomData->numAtomsLocal, atomData->numAtoms);
 186     }
 187 }
 188
 189
 190 /*! \brief Count pruning kernel time if either kernel has been triggered
 191  *
 192  *  We do the accounting for either of the two pruning kernel flavors:
 193  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 194  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 195  *
 196  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 197  * after calling this function.
 198  *
 199  * \param[in] timers   structs with GPU timer objects
 200  * \param[inout] timings  GPU task timing data
 201  * \param[in] iloc        interaction locality
 202  */
 203 template<typename GpuTimers>
 204 static void countPruneKernelTime(GpuTimers*                 timers,
 205                                  gmx_wallclock_gpu_nbnxn_t* timings,
 206                                  const InteractionLocality  iloc)
 207 {
 208     gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];
 209
 210     // We might have not done any pruning (e.g. if we skipped with empty domains).
 211     if (!iTimers.didPrune && !iTimers.didRollingPrune)
 212     {
 213         return;
 214     }
 215
 216     if (iTimers.didPrune)
 217     {
 218         timings->pruneTime.c++;
 219         timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
 220     }
 221
 222     if (iTimers.didRollingPrune)
 223     {
 224         timings->dynamicPruneTime.c++;
 225         timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
 226     }
 227 }
 228
 229 /*! \brief Reduce data staged internally in the nbnxn module.
 230  *
 231  * Shift forces and electrostatic/LJ energies copied from the GPU into
 232  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 233  * after having waited for the transfers' completion.
 234  *
 235  * Note that this function should always be called after the transfers into the
 236  * staging buffers has completed.
 237  *
 238  * \param[in]  nbst           Nonbonded staging data
 239  * \param[in]  iLocality      Interaction locality specifier
 240  * \param[in]  reduceEnergies True if energy reduction should be done
 241  * \param[in]  reduceFshift   True if shift force reduction should be done
 242  * \param[out] e_lj           Variable to accumulate LJ energy into
 243  * \param[out] e_el           Variable to accumulate electrostatic energy into
 244  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 245  */
 246 static inline void gpu_reduce_staged_outputs(const NBStagingData&      nbst,
 247                                              const InteractionLocality iLocality,
 248                                              const bool                reduceEnergies,
 249                                              const bool                reduceFshift,
 250                                              real*                     e_lj,
 251                                              real*                     e_el,
 252                                              rvec*                     fshift)
 253 {
 254     /* add up energies and shift forces (only once at local F wait) */
 255     if (iLocality == InteractionLocality::Local)
 256     {
 257         if (reduceEnergies)
 258         {
 259             *e_lj += *nbst.eLJ;
 260             *e_el += *nbst.eElec;
 261         }
 262
 263         if (reduceFshift)
 264         {
 265             for (int i = 0; i < SHIFTS; i++)
 266             {
 267                 rvec_inc(fshift[i], nbst.fShift[i]);
 268             }
 269         }
 270     }
 271 }
 272
 273 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 274  *
 275  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 276  *  Note that this function should be called after the current step's nonbonded
 277  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 278  *  that are accounted for during the following step.
 279  *
 280  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 281  *      counters could end up being inconsistent due to not being incremented
 282  *      on some of the node when this is skipped on empty local domains!
 283  *
 284  * \tparam     GpuTimers         GPU timers type
 285  * \tparam     GpuPairlist       Pair list type
 286  * \param[out] timings           Pointer to the NB GPU timings data
 287  * \param[in]  timers            Pointer to GPU timers data
 288  * \param[in]  plist             Pointer to the pair list data
 289  * \param[in]  atomLocality      Atom locality specifier
 290  * \param[in]  stepWork          Force schedule flags
 291  * \param[in]  doTiming          True if timing is enabled.
 292  *
 293  */
 294 template<typename GpuTimers, typename GpuPairlist>
 295 static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
 296                                           GpuTimers*                 timers,
 297                                           const GpuPairlist*         plist,
 298                                           AtomLocality               atomLocality,
 299                                           const gmx::StepWorkload&   stepWork,
 300                                           bool                       doTiming)
 301 {
 302     /* timing data accumulation */
 303     if (!doTiming)
 304     {
 305         return;
 306     }
 307
 308     /* determine interaction locality from atom locality */
 309     const InteractionLocality iLocality        = gpuAtomToInteractionLocality(atomLocality);
 310     const bool                didEnergyKernels = stepWork.computeEnergy;
 311
 312     /* only increase counter once (at local F wait) */
 313     if (iLocality == InteractionLocality::Local)
 314     {
 315         timings->nb_c++;
 316         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 317     }
 318
 319     /* kernel timings */
 320     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 321             timers->interaction[iLocality].nb_k.getLastRangeTime();
 322
 323     /* X/q H2D and F D2H timings */
 324     timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
 325     timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
 326
 327     /* Count the pruning kernel times for both cases:1st pass (at search step)
 328        and rolling pruning (if called at the previous step).
 329        We do the accounting here as this is the only sync point where we
 330        know (without checking or additional sync-ing) that prune tasks in
 331        in the current stream have completed (having just blocking-waited
 332        for the force D2H). */
 333     countPruneKernelTime(timers, timings, iLocality);
 334
 335     /* only count atdat at pair-search steps (add only once, at local F wait) */
 336     if (stepWork.doNeighborSearch && atomLocality == AtomLocality::Local)
 337     {
 338         /* atdat transfer timing */
 339         timings->pl_h2d_c++;
 340         timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 341     }
 342
 343     /* only count pair-list H2D when actually performed */
 344     if (timers->interaction[iLocality].didPairlistH2D)
 345     {
 346         timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
 347
 348         /* Clear the timing flag for the next step */
 349         timers->interaction[iLocality].didPairlistH2D = false;
 350     }
 351 }
 352
 353 /*! \brief Attempts to complete nonbonded GPU task.
 354  *
 355  * See documentation in nbnxm_gpu.h for details.
 356  *
 357  * \todo Move into shared source file, perhaps including
 358  * cuda_runtime.h if needed for any remaining CUDA-specific
 359  * objects.
 360  */
 361 //NOLINTNEXTLINE(misc-definitions-in-headers)
 362 bool gpu_try_finish_task(NbnxmGpu*                nb,
 363                          const gmx::StepWorkload& stepWork,
 364                          const AtomLocality       aloc,
 365                          real*                    e_lj,
 366                          real*                    e_el,
 367                          gmx::ArrayRef<gmx::RVec> shiftForces,
 368                          GpuTaskCompletion        completionKind,
 369                          gmx_wallcycle*           wcycle)
 370 {
 371     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 372
 373     /* determine interaction locality from atom locality */
 374     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 375
 376
 377     // Transfers are launched and therefore need to be waited on if:
 378     // - buffer ops is not offloaded
 379     // - energies or virials are needed (on the local stream)
 380     //
 381     // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
 382     // in current code as virial steps do CPU reduction.)
 383     const bool haveResultToWaitFor =
 384             (!stepWork.useGpuFBufferOps
 385              || (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
 386
 387     //  We skip when during the non-local phase there was actually no work to do.
 388     //  This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
 389     //  bonded GPU work.
 390     if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
 391     {
 392         // Query the state of the GPU stream and return early if we're not done
 393         if (completionKind == GpuTaskCompletion::Check)
 394         {
 395             // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
 396             // we start without counting and only when the task finished we issue a
 397             // start/stop to increment.
 398             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
 399             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 400
 401             if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
 402             {
 403                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 404
 405                 // Early return to skip the steps below that we have to do only
 406                 // after the NB task completed
 407                 return false;
 408             }
 409
 410             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
 411         }
 412         else if (haveResultToWaitFor)
 413         {
 414             nb->deviceStreams[iLocality]->synchronize();
 415         }
 416
 417         // TODO: this needs to be moved later because conditional wait could brake timing
 418         // with a future OpenCL implementation, but with CUDA timing is anyway disabled
 419         // in all cases where we skip the wait.
 420         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork, nb->bDoTime);
 421
 422         if (stepWork.computeEnergy || stepWork.computeVirial)
 423         {
 424             gpu_reduce_staged_outputs(nb->nbst,
 425                                       iLocality,
 426                                       stepWork.computeEnergy,
 427                                       stepWork.computeVirial,
 428                                       e_lj,
 429                                       e_el,
 430                                       as_rvec_array(shiftForces.data()));
 431         }
 432     }
 433
 434     /* Reset both pruning flags. */
 435     if (nb->bDoTime)
 436     {
 437         nb->timers->interaction[iLocality].didPrune =
 438                 nb->timers->interaction[iLocality].didRollingPrune = false;
 439     }
 440
 441     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 442     nb->plist[iLocality]->haveFreshList = false;
 443
 444     return true;
 445 }
 446
 447 /*! \brief
 448  * Wait for the asynchronously launched nonbonded tasks and data
 449  * transfers to finish.
 450  *
 451  * Also does timing accounting and reduction of the internal staging buffers.
 452  * As this is called at the end of the step, it also resets the pair list and
 453  * pruning flags.
 454  *
 455  * \param[in] nb The nonbonded data GPU structure
 456  * \param[in]  stepWork     Force schedule flags
 457  * \param[in] aloc Atom locality identifier
 458  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 459  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 460  * \param[out] shiftForces Shift forces buffer to accumulate into
 461  * \param[out] wcycle Pointer to wallcycle data structure
 462  * \return            The number of cycles the gpu wait took
 463  */
 464 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 465 float gpu_wait_finish_task(NbnxmGpu*                nb,
 466                            const gmx::StepWorkload& stepWork,
 467                            AtomLocality             aloc,
 468                            real*                    e_lj,
 469                            real*                    e_el,
 470                            gmx::ArrayRef<gmx::RVec> shiftForces,
 471                            gmx_wallcycle*           wcycle)
 472 {
 473     auto cycleCounter = (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local)
 474                                 ? ewcWAIT_GPU_NB_L
 475                                 : ewcWAIT_GPU_NB_NL;
 476
 477     wallcycle_start(wcycle, cycleCounter);
 478     gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
 479     float waitTime = wallcycle_stop(wcycle, cycleCounter);
 480
 481     return waitTime;
 482 }
 483
 484 } // namespace Nbnxm
 485
 486 #endif