src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU_CUDA
  51 #    include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU_OPENCL
  55 #    include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #if GMX_GPU_SYCL
  59 #    include "sycl/nbnxm_sycl_types.h"
  60 #    include "gromacs/gpu_utils/syclutils.h"
  61 #endif
  62
  63 #include "gromacs/gpu_utils/gpu_utils.h"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/mdtypes/simulation_workload.h"
  66 #include "gromacs/nbnxm/nbnxm.h"
  67 #include "gromacs/pbcutil/ishift.h"
  68 #include "gromacs/timing/gpu_timing.h"
  69 #include "gromacs/timing/wallcycle.h"
  70 #include "gromacs/utility/stringutil.h"
  71
  72 #include "gpu_common_utils.h"
  73 #include "nbnxm_gpu.h"
  74
  75 namespace gmx
  76 {
  77 class ListedForcesGpu;
  78 }
  79
  80 namespace Nbnxm
  81 {
  82
  83 /*! \brief Count pruning kernel time if either kernel has been triggered
  84  *
  85  *  We do the accounting for either of the two pruning kernel flavors:
  86  *   - 1st pass prune: ran during the current step (prior to the force kernel);
  87  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
  88  *
  89  * Note that the resetting of Nbnxm::GpuTimers::didPrune and Nbnxm::GpuTimers::didRollingPrune
  90  * should happen after calling this function.
  91  *
  92  * \param[in] timers   structs with GPU timer objects
  93  * \param[inout] timings  GPU task timing data
  94  * \param[in] iloc        interaction locality
  95  */
  96 static void countPruneKernelTime(Nbnxm::GpuTimers*          timers,
  97                                  gmx_wallclock_gpu_nbnxn_t* timings,
  98                                  const InteractionLocality  iloc)
  99 {
 100     GpuTimers::Interaction& iTimers = timers->interaction[iloc];
 101
 102     // We might have not done any pruning (e.g. if we skipped with empty domains).
 103     if (!iTimers.didPrune && !iTimers.didRollingPrune)
 104     {
 105         return;
 106     }
 107
 108     if (iTimers.didPrune)
 109     {
 110         timings->pruneTime.c++;
 111         timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
 112     }
 113
 114     if (iTimers.didRollingPrune)
 115     {
 116         timings->dynamicPruneTime.c++;
 117         timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
 118     }
 119 }
 120
 121 /*! \brief Reduce data staged internally in the nbnxn module.
 122  *
 123  * Shift forces and electrostatic/LJ energies copied from the GPU into
 124  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 125  * after having waited for the transfers' completion.
 126  *
 127  * Note that this function should always be called after the transfers into the
 128  * staging buffers has completed.
 129  *
 130  * \param[in]  nbst           Nonbonded staging data
 131  * \param[in]  iLocality      Interaction locality specifier
 132  * \param[in]  reduceEnergies True if energy reduction should be done
 133  * \param[in]  reduceFshift   True if shift force reduction should be done
 134  * \param[out] e_lj           Variable to accumulate LJ energy into
 135  * \param[out] e_el           Variable to accumulate electrostatic energy into
 136  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 137  */
 138 static inline void gpu_reduce_staged_outputs(const NBStagingData&      nbst,
 139                                              const InteractionLocality iLocality,
 140                                              const bool                reduceEnergies,
 141                                              const bool                reduceFshift,
 142                                              real*                     e_lj,
 143                                              real*                     e_el,
 144                                              rvec*                     fshift)
 145 {
 146     /* add up energies and shift forces (only once at local F wait) */
 147     if (iLocality == InteractionLocality::Local)
 148     {
 149         if (reduceEnergies)
 150         {
 151             *e_lj += *nbst.eLJ;
 152             *e_el += *nbst.eElec;
 153         }
 154
 155         if (reduceFshift)
 156         {
 157             for (int i = 0; i < gmx::c_numShiftVectors; i++)
 158             {
 159                 rvec_inc(fshift[i], nbst.fShift[i]);
 160             }
 161         }
 162     }
 163 }
 164
 165 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 166  *
 167  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 168  *  Note that this function should be called after the current step's nonbonded
 169  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 170  *  that are accounted for during the following step.
 171  *
 172  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 173  *      counters could end up being inconsistent due to not being incremented
 174  *      on some of the node when this is skipped on empty local domains!
 175  *
 176  * \tparam     GpuPairlist       Pair list type
 177  * \param[out] timings           Pointer to the NB GPU timings data
 178  * \param[in]  timers            Pointer to GPU timers data
 179  * \param[in]  plist             Pointer to the pair list data
 180  * \param[in]  atomLocality      Atom locality specifier
 181  * \param[in]  stepWork          Force schedule flags
 182  * \param[in]  doTiming          True if timing is enabled.
 183  *
 184  */
 185 template<typename GpuPairlist>
 186 static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
 187                                           Nbnxm::GpuTimers*          timers,
 188                                           const GpuPairlist*         plist,
 189                                           AtomLocality               atomLocality,
 190                                           const gmx::StepWorkload&   stepWork,
 191                                           bool                       doTiming)
 192 {
 193     /* timing data accumulation */
 194     if (!doTiming)
 195     {
 196         return;
 197     }
 198
 199     /* determine interaction locality from atom locality */
 200     const InteractionLocality iLocality        = atomToInteractionLocality(atomLocality);
 201     const bool                didEnergyKernels = stepWork.computeEnergy;
 202
 203     /* only increase counter once (at local F wait) */
 204     if (iLocality == InteractionLocality::Local)
 205     {
 206         timings->nb_c++;
 207         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 208     }
 209
 210     /* kernel timings */
 211     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 212             timers->interaction[iLocality].nb_k.getLastRangeTime();
 213
 214     /* X/q H2D and F D2H timings */
 215     timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
 216     timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
 217
 218     /* Count the pruning kernel times for both cases:1st pass (at search step)
 219        and rolling pruning (if called at the previous step).
 220        We do the accounting here as this is the only sync point where we
 221        know (without checking or additional sync-ing) that prune tasks in
 222        in the current stream have completed (having just blocking-waited
 223        for the force D2H). */
 224     countPruneKernelTime(timers, timings, iLocality);
 225
 226     /* only count atdat at pair-search steps (add only once, at local F wait) */
 227     if (stepWork.doNeighborSearch && atomLocality == AtomLocality::Local)
 228     {
 229         /* atdat transfer timing */
 230         timings->pl_h2d_c++;
 231         timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 232     }
 233
 234     /* only count pair-list H2D when actually performed */
 235     if (timers->interaction[iLocality].didPairlistH2D)
 236     {
 237         timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
 238
 239         /* Clear the timing flag for the next step */
 240         timers->interaction[iLocality].didPairlistH2D = false;
 241     }
 242 }
 243
 244 /*! \brief Attempts to complete nonbonded GPU task.
 245  *
 246  * See documentation in nbnxm_gpu.h for details.
 247  *
 248  * \todo Move into shared source file, perhaps including
 249  * cuda_runtime.h if needed for any remaining CUDA-specific
 250  * objects.
 251  */
 252 //NOLINTNEXTLINE(misc-definitions-in-headers)
 253 bool gpu_try_finish_task(NbnxmGpu*                nb,
 254                          const gmx::StepWorkload& stepWork,
 255                          const AtomLocality       aloc,
 256                          real*                    e_lj,
 257                          real*                    e_el,
 258                          gmx::ArrayRef<gmx::RVec> shiftForces,
 259                          GpuTaskCompletion        completionKind,
 260                          gmx_wallcycle*           wcycle)
 261 {
 262     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 263
 264     /* determine interaction locality from atom locality */
 265     const InteractionLocality iLocality = atomToInteractionLocality(aloc);
 266
 267
 268     // Transfers are launched and therefore need to be waited on if:
 269     // - buffer ops is not offloaded
 270     // - energies or virials are needed (on the local stream)
 271     //
 272     // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
 273     // in current code as virial steps do CPU reduction.)
 274     const bool haveResultToWaitFor =
 275             (!stepWork.useGpuFBufferOps
 276              || (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
 277
 278     //  We skip when during the non-local phase there was actually no work to do.
 279     //  This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
 280     //  bonded GPU work.
 281     if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(nb, iLocality))
 282     {
 283         // Query the state of the GPU stream and return early if we're not done
 284         if (completionKind == GpuTaskCompletion::Check)
 285         {
 286             // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
 287             // we start without counting and only when the task finished we issue a
 288             // start/stop to increment.
 289             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
 290             wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL);
 291
 292             if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
 293             {
 294                 wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL);
 295
 296                 // Early return to skip the steps below that we have to do only
 297                 // after the NB task completed
 298                 return false;
 299             }
 300
 301             wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL);
 302         }
 303         else if (haveResultToWaitFor)
 304         {
 305             nb->deviceStreams[iLocality]->synchronize();
 306         }
 307
 308         // TODO: this needs to be moved later because conditional wait could brake timing
 309         // with a future OpenCL implementation, but with CUDA timing is anyway disabled
 310         // in all cases where we skip the wait.
 311         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork, nb->bDoTime);
 312
 313         if (stepWork.computeEnergy || stepWork.computeVirial)
 314         {
 315             gpu_reduce_staged_outputs(nb->nbst,
 316                                       iLocality,
 317                                       stepWork.computeEnergy,
 318                                       stepWork.computeVirial,
 319                                       e_lj,
 320                                       e_el,
 321                                       as_rvec_array(shiftForces.data()));
 322         }
 323     }
 324
 325     /* Reset both pruning flags. */
 326     if (nb->bDoTime)
 327     {
 328         nb->timers->interaction[iLocality].didPrune =
 329                 nb->timers->interaction[iLocality].didRollingPrune = false;
 330     }
 331
 332     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 333     nb->plist[iLocality]->haveFreshList = false;
 334
 335     return true;
 336 }
 337
 338 /*! \brief
 339  * Wait for the asynchronously launched nonbonded tasks and data
 340  * transfers to finish.
 341  *
 342  * Also does timing accounting and reduction of the internal staging buffers.
 343  * As this is called at the end of the step, it also resets the pair list and
 344  * pruning flags.
 345  *
 346  * \param[in] nb The nonbonded data GPU structure
 347  * \param[in]  stepWork     Force schedule flags
 348  * \param[in] aloc Atom locality identifier
 349  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 350  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 351  * \param[out] shiftForces Shift forces buffer to accumulate into
 352  * \param[out] wcycle Pointer to wallcycle data structure
 353  * \return            The number of cycles the gpu wait took
 354  */
 355 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 356 float gpu_wait_finish_task(NbnxmGpu*                nb,
 357                            const gmx::StepWorkload& stepWork,
 358                            AtomLocality             aloc,
 359                            real*                    e_lj,
 360                            real*                    e_el,
 361                            gmx::ArrayRef<gmx::RVec> shiftForces,
 362                            gmx_wallcycle*           wcycle)
 363 {
 364     auto cycleCounter = (atomToInteractionLocality(aloc) == InteractionLocality::Local)
 365                                 ? WallCycleCounter::WaitGpuNbL
 366                                 : WallCycleCounter::WaitGpuNbNL;
 367
 368     wallcycle_start(wcycle, cycleCounter);
 369     gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
 370     float waitTime = wallcycle_stop(wcycle, cycleCounter);
 371
 372     return waitTime;
 373 }
 374
 375 } // namespace Nbnxm
 376
 377 #endif