src/gromacs/nbnxm/gpu_common.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  * \brief Common functions for the different NBNXN GPU implementations.
  37  *
  38  * \author Szilard Pall <pall.szilard@gmail.com>
  39  *
  40  * \ingroup module_nbnxm
  41  */
  42
  43 #ifndef GMX_NBNXM_GPU_COMMON_H
  44 #define GMX_NBNXM_GPU_COMMON_H
  45
  46 #include "config.h"
  47
  48 #include <string>
  49
  50 #if GMX_GPU == GMX_GPU_CUDA
  51 #include "cuda/nbnxm_cuda_types.h"
  52 #endif
  53
  54 #if GMX_GPU == GMX_GPU_OPENCL
  55 #include "opencl/nbnxm_ocl_types.h"
  56 #endif
  57
  58 #include "gromacs/gpu_utils/gpu_utils.h"
  59 #include "gromacs/math/vec.h"
  60 #include "gromacs/mdlib/force_flags.h"
  61 #include "gromacs/nbnxm/nbnxm.h"
  62 #include "gromacs/pbcutil/ishift.h"
  63 #include "gromacs/timing/gpu_timing.h"
  64 #include "gromacs/utility/fatalerror.h"
  65 #include "gromacs/utility/stringutil.h"
  66
  67 #include "gpu_common_utils.h"
  68 #include "nbnxm_gpu.h"
  69
  70 namespace Nbnxm
  71 {
  72
  73 /*! \brief Check that atom locality values are valid for the GPU module.
  74  *
  75  *  In the GPU module atom locality "all" is not supported, the local and
  76  *  non-local ranges are treated separately.
  77  *
  78  *  \param[in] atomLocality atom locality specifier
  79  */
  80 static inline void
  81 validateGpuAtomLocality(const AtomLocality atomLocality)
  82 {
  83     std::string str = gmx::formatString("Invalid atom locality passed (%d); valid here is only "
  84                                         "local (%d) or nonlocal (%d)",
  85                                         static_cast<int>(atomLocality),
  86                                         static_cast<int>(AtomLocality::Local),
  87                                         static_cast<int>(AtomLocality::NonLocal));
  88
  89     GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
  90 }
  91
  92 /*! \brief Convert atom locality to interaction locality.
  93  *
  94  *  In the current implementation the this is straightforward conversion:
  95  *  local to local, non-local to non-local.
  96  *
  97  *  \param[in] atomLocality Atom locality specifier
  98  *  \returns                Interaction locality corresponding to the atom locality passed.
  99  */
 100 static inline InteractionLocality
 101 gpuAtomToInteractionLocality(const AtomLocality atomLocality)
 102 {
 103     validateGpuAtomLocality(atomLocality);
 104
 105     /* determine interaction locality from atom locality */
 106     if (atomLocality == AtomLocality::Local)
 107     {
 108         return InteractionLocality::Local;
 109     }
 110     else if (atomLocality == AtomLocality::NonLocal)
 111     {
 112         return InteractionLocality::NonLocal;
 113     }
 114     else
 115     {
 116         gmx_incons("Wrong locality");
 117     }
 118 }
 119
 120 /*! \brief Calculate atom range and return start index and length.
 121  *
 122  * \param[in] atomData Atom descriptor data structure
 123  * \param[in] atomLocality Atom locality specifier
 124  * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
 125  * \param[out] atomRangeLen Atom range length in the atom data array.
 126  */
 127 template <typename AtomDataT>
 128 static inline void
 129 getGpuAtomRange(const AtomDataT    *atomData,
 130                 const AtomLocality  atomLocality,
 131                 int                *atomRangeBegin,
 132                 int                *atomRangeLen)
 133 {
 134     assert(atomData);
 135     validateGpuAtomLocality(atomLocality);
 136
 137     /* calculate the atom data index range based on locality */
 138     if (atomLocality == AtomLocality::Local)
 139     {
 140         *atomRangeBegin  = 0;
 141         *atomRangeLen    = atomData->natoms_local;
 142     }
 143     else
 144     {
 145         *atomRangeBegin  = atomData->natoms_local;
 146         *atomRangeLen    = atomData->natoms - atomData->natoms_local;
 147     }
 148 }
 149
 150
 151 /*! \brief Count pruning kernel time if either kernel has been triggered
 152  *
 153  *  We do the accounting for either of the two pruning kernel flavors:
 154  *   - 1st pass prune: ran during the current step (prior to the force kernel);
 155  *   - rolling prune:  ran at the end of the previous step (prior to the current step H2D xq);
 156  *
 157  * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
 158  * after calling this function.
 159  *
 160  * \param[in] timers   structs with GPU timer objects
 161  * \param[inout] timings  GPU task timing data
 162  * \param[in] iloc        interaction locality
 163  */
 164 template <typename GpuTimers>
 165 static void countPruneKernelTime(GpuTimers                 *timers,
 166                                  gmx_wallclock_gpu_nbnxn_t *timings,
 167                                  const InteractionLocality  iloc)
 168 {
 169     gpu_timers_t::Interaction &iTimers = timers->interaction[iloc];
 170
 171     // We might have not done any pruning (e.g. if we skipped with empty domains).
 172     if (!iTimers.didPrune &&
 173         !iTimers.didRollingPrune)
 174     {
 175         return;
 176     }
 177
 178     if (iTimers.didPrune)
 179     {
 180         timings->pruneTime.c++;
 181         timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
 182     }
 183
 184     if (iTimers.didRollingPrune)
 185     {
 186         timings->dynamicPruneTime.c++;
 187         timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
 188     }
 189 }
 190
 191 /*! \brief Reduce data staged internally in the nbnxn module.
 192  *
 193  * Shift forces and electrostatic/LJ energies copied from the GPU into
 194  * a module-internal staging area are immediately reduced (CPU-side buffers passed)
 195  * after having waited for the transfers' completion.
 196  *
 197  * Note that this function should always be called after the transfers into the
 198  * staging buffers has completed.
 199  *
 200  * \tparam     StagingData    Type of staging data
 201  * \param[in]  nbst           Nonbonded staging data
 202  * \param[in]  iLocality      Interaction locality specifier
 203  * \param[in]  reduceEnergies True if energy reduction should be done
 204  * \param[in]  reduceFshift   True if shift force reduction should be done
 205  * \param[out] e_lj           Variable to accumulate LJ energy into
 206  * \param[out] e_el           Variable to accumulate electrostatic energy into
 207  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
 208  */
 209 template <typename StagingData>
 210 static inline void
 211 gpu_reduce_staged_outputs(const StagingData         &nbst,
 212                           const InteractionLocality  iLocality,
 213                           const bool                 reduceEnergies,
 214                           const bool                 reduceFshift,
 215                           real                      *e_lj,
 216                           real                      *e_el,
 217                           rvec                      *fshift)
 218 {
 219     /* add up energies and shift forces (only once at local F wait) */
 220     if (iLocality == InteractionLocality::Local)
 221     {
 222         if (reduceEnergies)
 223         {
 224             *e_lj += *nbst.e_lj;
 225             *e_el += *nbst.e_el;
 226         }
 227
 228         if (reduceFshift)
 229         {
 230             for (int i = 0; i < SHIFTS; i++)
 231             {
 232                 rvec_inc(fshift[i], nbst.fshift[i]);
 233             }
 234         }
 235     }
 236 }
 237
 238 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
 239  *
 240  *  Does timing accumulation and call-count increments for the nonbonded kernels.
 241  *  Note that this function should be called after the current step's nonbonded
 242  *  nonbonded tasks have completed with the exception of the rolling pruning kernels
 243  *  that are accounted for during the following step.
 244  *
 245  * NOTE: if timing with multiple GPUs (streams) becomes possible, the
 246  *      counters could end up being inconsistent due to not being incremented
 247  *      on some of the node when this is skipped on empty local domains!
 248  *
 249  * \tparam     GpuTimers         GPU timers type
 250  * \tparam     GpuPairlist       Pair list type
 251  * \param[out] timings           Pointer to the NB GPU timings data
 252  * \param[in]  timers            Pointer to GPU timers data
 253  * \param[in]  plist             Pointer to the pair list data
 254  * \param[in]  atomLocality      Atom locality specifier
 255  * \param[in]  didEnergyKernels  True if energy kernels have been called in the current step
 256  * \param[in]  doTiming          True if timing is enabled.
 257  *
 258  */
 259 template <typename GpuTimers, typename GpuPairlist>
 260 static inline void
 261 gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings,
 262                        GpuTimers                 *timers,
 263                        const GpuPairlist         *plist,
 264                        AtomLocality               atomLocality,
 265                        bool                       didEnergyKernels,
 266                        bool                       doTiming)
 267 {
 268     /* timing data accumulation */
 269     if (!doTiming)
 270     {
 271         return;
 272     }
 273
 274     /* determine interaction locality from atom locality */
 275     const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
 276
 277     /* only increase counter once (at local F wait) */
 278     if (iLocality == InteractionLocality::Local)
 279     {
 280         timings->nb_c++;
 281         timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
 282     }
 283
 284     /* kernel timings */
 285     timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
 286         timers->interaction[iLocality].nb_k.getLastRangeTime();
 287
 288     /* X/q H2D and F D2H timings */
 289     timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
 290     timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
 291
 292     /* Count the pruning kernel times for both cases:1st pass (at search step)
 293        and rolling pruning (if called at the previous step).
 294        We do the accounting here as this is the only sync point where we
 295        know (without checking or additional sync-ing) that prune tasks in
 296        in the current stream have completed (having just blocking-waited
 297        for the force D2H). */
 298     countPruneKernelTime(timers, timings, iLocality);
 299
 300     /* only count atdat and pair-list H2D at pair-search step */
 301     if (timers->interaction[iLocality].didPairlistH2D)
 302     {
 303         /* atdat transfer timing (add only once, at local F wait) */
 304         if (atomLocality == AtomLocality::Local)
 305         {
 306             timings->pl_h2d_c++;
 307             timings->pl_h2d_t += timers->atdat.getLastRangeTime();
 308         }
 309
 310         timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
 311
 312         /* Clear the timing flag for the next step */
 313         timers->interaction[iLocality].didPairlistH2D = false;
 314     }
 315 }
 316
 317 //TODO: move into shared source file with gmx_compile_cpp_as_cuda
 318 //NOLINTNEXTLINE(misc-definitions-in-headers)
 319 bool gpu_try_finish_task(gmx_nbnxn_gpu_t    *nb,
 320                          const int           flags,
 321                          const AtomLocality  aloc,
 322                          const bool          haveOtherWork,
 323                          real               *e_lj,
 324                          real               *e_el,
 325                          rvec               *fshift,
 326                          GpuTaskCompletion   completionKind)
 327 {
 328     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 329
 330     /* determine interaction locality from atom locality */
 331     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 332
 333     //  We skip when during the non-local phase there was actually no work to do.
 334     //  This is consistent with nbnxn_gpu_launch_kernel.
 335     if (haveOtherWork || !canSkipWork(*nb, iLocality))
 336     {
 337         // Query the state of the GPU stream and return early if we're not done
 338         if (completionKind == GpuTaskCompletion::Check)
 339         {
 340             if (!haveStreamTasksCompleted(nb->stream[iLocality]))
 341             {
 342                 // Early return to skip the steps below that we have to do only
 343                 // after the NB task completed
 344                 return false;
 345             }
 346         }
 347         else
 348         {
 349             gpuStreamSynchronize(nb->stream[iLocality]);
 350         }
 351
 352         bool calcEner   = (flags & GMX_FORCE_ENERGY) != 0;
 353         bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0;
 354
 355         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner,
 356                                nb->bDoTime != 0);
 357
 358         gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, e_lj, e_el, fshift);
 359     }
 360
 361     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
 362     nb->timers->interaction[iLocality].didPrune = nb->timers->interaction[iLocality].didRollingPrune = false;
 363
 364     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
 365     nb->plist[iLocality]->haveFreshList = false;
 366
 367     return true;
 368 }
 369
 370 /*! \brief
 371  * Wait for the asynchronously launched nonbonded tasks and data
 372  * transfers to finish.
 373  *
 374  * Also does timing accounting and reduction of the internal staging buffers.
 375  * As this is called at the end of the step, it also resets the pair list and
 376  * pruning flags.
 377  *
 378  * \param[in] nb The nonbonded data GPU structure
 379  * \param[in] flags Force flags
 380  * \param[in] aloc Atom locality identifier
 381  * \param[in] haveOtherWork  Tells whether there is other work than non-bonded work in the nbnxn stream(s)
 382  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
 383  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
 384  * \param[out] fshift Pointer to the shift force buffer to accumulate into
 385  */
 386 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
 387 void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
 388                           int              flags,
 389                           AtomLocality     aloc,
 390                           bool             haveOtherWork,
 391                           real            *e_lj,
 392                           real            *e_el,
 393                           rvec            *fshift)
 394 {
 395     gpu_try_finish_task(nb, flags, aloc, haveOtherWork, e_lj, e_el, fshift,
 396                         GpuTaskCompletion::Wait);
 397 }
 398
 399 } // namespace Nbnxm
 400
 401 #endif