/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Common functions for the different NBNXN GPU implementations.
 *
 * \author Szilard Pall <pall.szilard@gmail.com>
 *
 * \ingroup module_nbnxm
 */
#ifndef GMX_NBNXM_GPU_COMMON_H
#define GMX_NBNXM_GPU_COMMON_H

#include "config.h"

#include <string>

#if GMX_GPU == GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU == GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/stringutil.h"
#include "gpu_common_utils.h"
#include "nbnxm_gpu.h"

namespace Nbnxm
{
/*! \brief Check that atom locality values are valid for the GPU module.
 *
 * In the GPU module atom locality "all" is not supported; the local and
 * non-local ranges are treated separately.
 *
 * \param[in] atomLocality Atom locality specifier
 */
static inline void validateGpuAtomLocality(const AtomLocality atomLocality)
{
    std::string str = gmx::formatString(
            "Invalid atom locality passed (%d); valid here is only "
            "local (%d) or nonlocal (%d)",
            static_cast<int>(atomLocality), static_cast<int>(AtomLocality::Local),
            static_cast<int>(AtomLocality::NonLocal));

    GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
               str.c_str());
}
/*! \brief Convert atom locality to interaction locality.
 *
 * In the current implementation this is a straightforward conversion:
 * local to local, non-local to non-local.
 *
 * \param[in] atomLocality Atom locality specifier
 * \returns   Interaction locality corresponding to the atom locality passed.
 */
static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality)
{
    validateGpuAtomLocality(atomLocality);

    /* determine interaction locality from atom locality */
    if (atomLocality == AtomLocality::Local)
    {
        return InteractionLocality::Local;
    }
    else if (atomLocality == AtomLocality::NonLocal)
    {
        return InteractionLocality::NonLocal;
    }
    else
    {
        gmx_incons("Wrong locality");
    }
}
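
/* Illustrative usage (a minimal sketch; `aloc` and `nb` stand for caller-side
 * state, not names defined in this header): GPU tasks are issued into the
 * stream selected by the derived interaction locality.
 *
 *     const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
 *     // e.g. subsequent launches/waits then use nb->stream[iloc]
 */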
//NOLINTNEXTLINE(misc-definitions-in-headers)
void setupGpuShortRangeWork(gmx_nbnxm_gpu_t*               nb,
                            const gmx::GpuBonded*          gpuBonded,
                            const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}
/*! \brief Returns true if there is GPU short-range work for the given interaction locality.
 *
 * Note that, unlike nonbonded tasks, bonded tasks are not split into local/non-local;
 * therefore, if there are GPU offloaded bonded interactions, this function will return
 * true for all interaction localities.
 *
 * \param[in] nb        Reference to the nonbonded GPU data structure
 * \param[in] iLocality Interaction locality identifier
 */
static bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t& nb, const gmx::InteractionLocality iLocality)
{
    return nb.haveWork[iLocality];
}
//NOLINTNEXTLINE(misc-definitions-in-headers)
bool haveGpuShortRangeWork(const gmx_nbnxm_gpu_t* nb, const gmx::AtomLocality aLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
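
/* Illustrative usage of the work-flag setup/query pair (a sketch; `nb` and
 * `gpuBonded` are assumed caller-owned objects):
 *
 *     setupGpuShortRangeWork(nb, gpuBonded, gmx::InteractionLocality::NonLocal);
 *     // ... later in the step:
 *     if (haveGpuShortRangeWork(nb, gmx::AtomLocality::NonLocal))
 *     {
 *         // launch non-local kernels / transfers
 *     }
 */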
/*! \brief Calculate atom range and return start index and length.
 *
 * \param[in]  atomData       Atom descriptor data structure
 * \param[in]  atomLocality   Atom locality specifier
 * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 * \param[out] atomRangeLen   Atom range length in the atom data array.
 */
template<typename AtomDataT>
static inline void getGpuAtomRange(const AtomDataT*   atomData,
                                   const AtomLocality atomLocality,
                                   int*               atomRangeBegin,
                                   int*               atomRangeLen)
{
    GMX_ASSERT(atomData, "Need a valid atomdata object");

    validateGpuAtomLocality(atomLocality);

    /* calculate the atom data index range based on locality */
    if (atomLocality == AtomLocality::Local)
    {
        *atomRangeBegin = 0;
        *atomRangeLen   = atomData->natoms_local;
    }
    else
    {
        *atomRangeBegin = atomData->natoms_local;
        *atomRangeLen   = atomData->natoms - atomData->natoms_local;
    }
}
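
/* Illustrative usage (a sketch; `adat` stands for an implementation-specific
 * atom data struct with natoms/natoms_local fields, e.g. the CUDA or OpenCL
 * flavor included above):
 *
 *     int adatBegin, adatLen;
 *     getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
 *     // transfer adatLen atoms starting at index adatBegin
 */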
/*! \brief Count pruning kernel time if either kernel has been triggered.
 *
 * We do the accounting for either of the two pruning kernel flavors:
 *  - 1st pass prune: ran during the current step (prior to the force kernel);
 *  - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 *
 * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 * after calling this function.
 *
 * \param[in]    timers  Structs with GPU timer objects
 * \param[inout] timings GPU task timing data
 * \param[in]    iloc    Interaction locality
 */
template<typename GpuTimers>
static void countPruneKernelTime(GpuTimers*                 timers,
                                 gmx_wallclock_gpu_nbnxn_t* timings,
                                 const InteractionLocality  iloc)
{
    gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];

    // We might have not done any pruning (e.g. if we skipped with empty domains).
    if (!iTimers.didPrune && !iTimers.didRollingPrune)
    {
        return;
    }

    if (iTimers.didPrune)
    {
        timings->pruneTime.c++;
        timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
    }

    if (iTimers.didRollingPrune)
    {
        timings->dynamicPruneTime.c++;
        timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
    }
}
/*! \brief Reduce data staged internally in the nbnxn module.
 *
 * Shift forces and electrostatic/LJ energies copied from the GPU into
 * a module-internal staging area are immediately reduced into the CPU-side
 * buffers passed, after having waited for the transfers' completion.
 *
 * Note that this function should always be called after the transfers into the
 * staging buffers have completed.
 *
 * \tparam     StagingData    Type of staging data
 * \param[in]  nbst           Nonbonded staging data
 * \param[in]  iLocality      Interaction locality specifier
 * \param[in]  reduceEnergies True if energy reduction should be done
 * \param[in]  reduceFshift   True if shift force reduction should be done
 * \param[out] e_lj           Variable to accumulate LJ energy into
 * \param[out] e_el           Variable to accumulate electrostatic energy into
 * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 */
template<typename StagingData>
static inline void gpu_reduce_staged_outputs(const StagingData&        nbst,
                                             const InteractionLocality iLocality,
                                             const bool                reduceEnergies,
                                             const bool                reduceFshift,
                                             real*                     e_lj,
                                             real*                     e_el,
                                             rvec*                     fshift)
{
    /* add up energies and shift forces (only once at local F wait) */
    if (iLocality == InteractionLocality::Local)
    {
        if (reduceEnergies)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (reduceFshift)
        {
            for (int i = 0; i < SHIFTS; i++)
            {
                rvec_inc(fshift[i], nbst.fshift[i]);
            }
        }
    }
}
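
/* Illustrative usage (a sketch with hypothetical caller-side accumulators;
 * this mirrors how gpu_try_finish_task() below invokes the reduction):
 *
 *     real eLj = 0, eEl = 0;
 *     gpu_reduce_staged_outputs(nb->nbst, InteractionLocality::Local,
 *                               computeEnergy, computeVirial, &eLj, &eEl, fshift);
 */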
/*! \brief Do the per-step timing accounting of the nonbonded tasks.
 *
 * Does timing accumulation and call-count increments for the nonbonded kernels.
 * Note that this function should be called after the current step's nonbonded
 * tasks have completed, with the exception of the rolling pruning kernels
 * that are accounted for during the following step.
 *
 * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 * counters could end up being inconsistent due to not being incremented
 * on some of the nodes when this is skipped on empty local domains!
 *
 * \tparam     GpuTimers    GPU timers type
 * \tparam     GpuPairlist  Pair list type
 * \param[out] timings      Pointer to the NB GPU timings data
 * \param[in]  timers       Pointer to GPU timers data
 * \param[in]  plist        Pointer to the pair list data
 * \param[in]  atomLocality Atom locality specifier
 * \param[in]  stepWork     Force schedule flags
 * \param[in]  doTiming     True if timing is enabled.
 */
template<typename GpuTimers, typename GpuPairlist>
static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
                                          GpuTimers*                 timers,
                                          const GpuPairlist*         plist,
                                          AtomLocality               atomLocality,
                                          const gmx::StepWorkload&   stepWork,
                                          bool                       doTiming)
{
    /* timing data accumulation */
    if (!doTiming)
    {
        return;
    }

    /* determine interaction locality from atom locality */
    const InteractionLocality iLocality        = gpuAtomToInteractionLocality(atomLocality);
    const bool                didEnergyKernels = stepWork.computeEnergy;

    /* only increase counter once (at local F wait) */
    if (iLocality == InteractionLocality::Local)
    {
        timings->nb_c++;
        timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
    }

    /* kernel timings */
    timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
            timers->interaction[iLocality].nb_k.getLastRangeTime();

    /* X/q H2D and F D2H timings */
    timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
    timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();

    /* Count the pruning kernel times for both cases: 1st pass (at search step)
       and rolling pruning (if called at the previous step).
       We do the accounting here as this is the only sync point where we
       know (without checking or additional synchronization) that prune tasks
       in the current stream have completed (having just blocking-waited
       for the force D2H). */
    countPruneKernelTime(timers, timings, iLocality);

    /* only count atdat at pair-search steps (add only once, at local F wait) */
    if (stepWork.doNeighborSearch && atomLocality == AtomLocality::Local)
    {
        /* atdat transfer timing */
        timings->pl_h2d_c++;
        timings->pl_h2d_t += timers->atdat.getLastRangeTime();
    }

    /* only count pair-list H2D when actually performed */
    if (timers->interaction[iLocality].didPairlistH2D)
    {
        timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();

        /* Clear the timing flag for the next step */
        timers->interaction[iLocality].didPairlistH2D = false;
    }
}
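
/* Illustrative post-processing (a sketch, not GROMACS reporting code): the
 * accumulated totals can be turned into per-call averages, e.g. for the
 * ktime entry of steps with a fresh list and energy computation:
 *
 *     const auto& kt    = timings->ktime[1][1];
 *     double      avgMs = (kt.c > 0) ? kt.t / kt.c : 0.0;
 */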
/*! \brief Attempts to complete the nonbonded GPU task.
 *
 * See documentation in nbnxm_gpu.h for details.
 *
 * \todo Move into shared source file with gmx_compile_cpp_as_cuda
 */
//NOLINTNEXTLINE(misc-definitions-in-headers)
bool gpu_try_finish_task(gmx_nbnxm_gpu_t*         nb,
                         const gmx::StepWorkload& stepWork,
                         const AtomLocality       aloc,
                         real*                    e_lj,
                         real*                    e_el,
                         gmx::ArrayRef<gmx::RVec> shiftForces,
                         GpuTaskCompletion        completionKind,
                         gmx_wallcycle*           wcycle)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);

    // Transfers are launched and therefore need to be waited on if:
    // - buffer ops are not offloaded
    // - energies or virials are needed (on the local stream)
    //
    // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
    // in current code as virial steps do CPU reduction.)
    const bool haveResultToWaitFor =
            (!stepWork.useGpuFBufferOps
             || (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));

    // We skip when, during the non-local phase, there was actually no work to do.
    // This is consistent with nbnxn_gpu_launch_kernel, but it also considers possible
    // bonded GPU work.
    if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
    {
        // Query the state of the GPU stream and return early if we're not done
        if (completionKind == GpuTaskCompletion::Check)
        {
            // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
            // we start without counting, and only when the task has finished do we issue
            // a start/stop to increment.
            // In GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);

            if (!haveStreamTasksCompleted(nb->stream[iLocality]))
            {
                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

                // Early return to skip the steps below that we have to do only
                // after the NB task completed
                return false;
            }

            wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
        }
        else if (haveResultToWaitFor)
        {
            gpuStreamSynchronize(nb->stream[iLocality]);
        }

        // TODO: this needs to be moved later because conditional wait could break timing
        // with a future OpenCL implementation, but with CUDA timing is anyway disabled
        // in all cases where we skip the wait.
        gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
                               nb->bDoTime != 0);

        if (stepWork.computeEnergy || stepWork.computeVirial)
        {
            gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy,
                                      stepWork.computeVirial, e_lj, e_el,
                                      as_rvec_array(shiftForces.data()));
        }
    }

    /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
    nb->timers->interaction[iLocality].didPrune =
            nb->timers->interaction[iLocality].didRollingPrune = false;

    /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
    nb->plist[iLocality]->haveFreshList = false;

    return true;
}
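
/* Illustrative polling pattern (a sketch; doCpuWork() is a hypothetical
 * placeholder for useful CPU-side work to overlap with the GPU task):
 *
 *     while (!gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces,
 *                                 GpuTaskCompletion::Check, wcycle))
 *     {
 *         doCpuWork();
 *     }
 */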
/*! \brief
 * Wait for the asynchronously launched nonbonded tasks and data
 * transfers to finish.
 *
 * Also does timing accounting and reduction of the internal staging buffers.
 * As this is called at the end of the step, it also resets the pair list and
 * pruning flags.
 *
 * \param[in]  nb          The nonbonded data GPU structure
 * \param[in]  stepWork    Force schedule flags
 * \param[in]  aloc        Atom locality identifier
 * \param[out] e_lj        Pointer to the LJ energy output to accumulate into
 * \param[out] e_el        Pointer to the electrostatics energy output to accumulate into
 * \param[out] shiftForces Shift forces buffer to accumulate into
 * \param[out] wcycle      Pointer to wallcycle data structure
 * \return                 The number of cycles the GPU wait took
 */
//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
float gpu_wait_finish_task(gmx_nbnxm_gpu_t*         nb,
                           const gmx::StepWorkload& stepWork,
                           AtomLocality             aloc,
                           real*                    e_lj,
                           real*                    e_el,
                           gmx::ArrayRef<gmx::RVec> shiftForces,
                           gmx_wallcycle*           wcycle)
{
    auto cycleCounter = (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local)
                                ? ewcWAIT_GPU_NB_L
                                : ewcWAIT_GPU_NB_NL;

    wallcycle_start(wcycle, cycleCounter);
    gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
    float waitTime = wallcycle_stop(wcycle, cycleCounter);

    return waitTime;
}

} // namespace Nbnxm

#endif