2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Common functions for the different NBNXN GPU implementations.
38 * \author Szilard Pall <pall.szilard@gmail.com>
40 * \ingroup module_nbnxm
43 #ifndef GMX_NBNXM_GPU_COMMON_H
44 #define GMX_NBNXM_GPU_COMMON_H
51 # include "cuda/nbnxm_cuda_types.h"
55 # include "opencl/nbnxm_ocl_types.h"
59 # include "sycl/nbnxm_sycl_types.h"
62 #include "gromacs/gpu_utils/gpu_utils.h"
63 #include "gromacs/listed_forces/gpubonded.h"
64 #include "gromacs/math/vec.h"
65 #include "gromacs/mdtypes/simulation_workload.h"
66 #include "gromacs/nbnxm/nbnxm.h"
67 #include "gromacs/pbcutil/ishift.h"
68 #include "gromacs/timing/gpu_timing.h"
69 #include "gromacs/timing/wallcycle.h"
70 #include "gromacs/utility/fatalerror.h"
71 #include "gromacs/utility/stringutil.h"
73 #include "gpu_common_utils.h"
74 #include "nbnxm_gpu.h"
84 /*! \brief Check that atom locality values are valid for the GPU module.
86 * In the GPU module atom locality "all" is not supported, the local and
87 * non-local ranges are treated separately.
89 * \param[in] atomLocality atom locality specifier
91 static inline void validateGpuAtomLocality(const AtomLocality atomLocality)
93 std::string str = gmx::formatString(
94 "Invalid atom locality passed (%d); valid here is only "
95 "local (%d) or nonlocal (%d)",
96 static_cast<int>(atomLocality),
97 static_cast<int>(AtomLocality::Local),
98 static_cast<int>(AtomLocality::NonLocal));
100 GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, str.c_str());
103 /*! \brief Convert atom locality to interaction locality.
105 * In the current implementation the this is straightforward conversion:
106 * local to local, non-local to non-local.
108 * \param[in] atomLocality Atom locality specifier
109 * \returns Interaction locality corresponding to the atom locality passed.
111 static inline InteractionLocality gpuAtomToInteractionLocality(const AtomLocality atomLocality)
113 validateGpuAtomLocality(atomLocality);
115 /* determine interaction locality from atom locality */
116 if (atomLocality == AtomLocality::Local)
118 return InteractionLocality::Local;
120 else if (atomLocality == AtomLocality::NonLocal)
122 return InteractionLocality::NonLocal;
126 gmx_incons("Wrong locality");
131 //NOLINTNEXTLINE(misc-definitions-in-headers)
132 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
134 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
136 // There is short-range work if the pair list for the provided
137 // interaction locality contains entries or if there is any
138 // bonded work (as this is not split into local/nonlocal).
139 nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
140 || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
143 /*! \brief Returns true if there is GPU short-range work for the given interaction locality.
145 * Note that as, unlike nonbonded tasks, bonded tasks are not split into local/nonlocal,
146 * and therefore if there are GPU offloaded bonded interactions, this function will return
147 * true for all interaction localities.
149 * \param[inout] nb Pointer to the nonbonded GPU data structure
150 * \param[in] iLocality Interaction locality identifier
152 static bool haveGpuShortRangeWork(const NbnxmGpu& nb, const gmx::InteractionLocality iLocality)
154 return nb.haveWork[iLocality];
157 //NOLINTNEXTLINE(misc-definitions-in-headers)
158 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
160 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
162 return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
166 /*! \brief Calculate atom range and return start index and length.
168 * \param[in] atomData Atom descriptor data structure
169 * \param[in] atomLocality Atom locality specifier
170 * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
171 * \param[out] atomRangeLen Atom range length in the atom data array.
173 template<typename AtomDataT>
174 static inline void getGpuAtomRange(const AtomDataT* atomData,
175 const AtomLocality atomLocality,
180 validateGpuAtomLocality(atomLocality);
182 /* calculate the atom data index range based on locality */
183 if (atomLocality == AtomLocality::Local)
186 *atomRangeLen = atomData->natoms_local;
190 *atomRangeBegin = atomData->natoms_local;
191 *atomRangeLen = atomData->natoms - atomData->natoms_local;
196 /*! \brief Count pruning kernel time if either kernel has been triggered
198 * We do the accounting for either of the two pruning kernel flavors:
199 * - 1st pass prune: ran during the current step (prior to the force kernel);
200 * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
202 * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
203 * after calling this function.
205 * \param[in] timers structs with GPU timer objects
206 * \param[inout] timings GPU task timing data
207 * \param[in] iloc interaction locality
209 template<typename GpuTimers>
210 static void countPruneKernelTime(GpuTimers* timers,
211 gmx_wallclock_gpu_nbnxn_t* timings,
212 const InteractionLocality iloc)
214 gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];
216 // We might have not done any pruning (e.g. if we skipped with empty domains).
217 if (!iTimers.didPrune && !iTimers.didRollingPrune)
222 if (iTimers.didPrune)
224 timings->pruneTime.c++;
225 timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
228 if (iTimers.didRollingPrune)
230 timings->dynamicPruneTime.c++;
231 timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
235 /*! \brief Reduce data staged internally in the nbnxn module.
237 * Shift forces and electrostatic/LJ energies copied from the GPU into
238 * a module-internal staging area are immediately reduced (CPU-side buffers passed)
239 * after having waited for the transfers' completion.
241 * Note that this function should always be called after the transfers into the
242 * staging buffers has completed.
244 * \tparam StagingData Type of staging data
245 * \param[in] nbst Nonbonded staging data
246 * \param[in] iLocality Interaction locality specifier
247 * \param[in] reduceEnergies True if energy reduction should be done
248 * \param[in] reduceFshift True if shift force reduction should be done
249 * \param[out] e_lj Variable to accumulate LJ energy into
250 * \param[out] e_el Variable to accumulate electrostatic energy into
251 * \param[out] fshift Pointer to the array of shift forces to accumulate into
253 static inline void gpu_reduce_staged_outputs(const nb_staging_t& nbst,
254 const InteractionLocality iLocality,
255 const bool reduceEnergies,
256 const bool reduceFshift,
261 /* add up energies and shift forces (only once at local F wait) */
262 if (iLocality == InteractionLocality::Local)
272 for (int i = 0; i < SHIFTS; i++)
274 rvec_inc(fshift[i], nbst.fshift[i]);
280 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
282 * Does timing accumulation and call-count increments for the nonbonded kernels.
283 * Note that this function should be called after the current step's nonbonded
284 * nonbonded tasks have completed with the exception of the rolling pruning kernels
285 * that are accounted for during the following step.
287 * NOTE: if timing with multiple GPUs (streams) becomes possible, the
288 * counters could end up being inconsistent due to not being incremented
289 * on some of the node when this is skipped on empty local domains!
291 * \tparam GpuTimers GPU timers type
292 * \tparam GpuPairlist Pair list type
293 * \param[out] timings Pointer to the NB GPU timings data
294 * \param[in] timers Pointer to GPU timers data
295 * \param[in] plist Pointer to the pair list data
296 * \param[in] atomLocality Atom locality specifier
297 * \param[in] stepWork Force schedule flags
298 * \param[in] doTiming True if timing is enabled.
301 template<typename GpuTimers, typename GpuPairlist>
302 static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
304 const GpuPairlist* plist,
305 AtomLocality atomLocality,
306 const gmx::StepWorkload& stepWork,
309 /* timing data accumulation */
315 /* determine interaction locality from atom locality */
316 const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
317 const bool didEnergyKernels = stepWork.computeEnergy;
319 /* only increase counter once (at local F wait) */
320 if (iLocality == InteractionLocality::Local)
323 timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
327 timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
328 timers->interaction[iLocality].nb_k.getLastRangeTime();
330 /* X/q H2D and F D2H timings */
331 timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
332 timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
334 /* Count the pruning kernel times for both cases:1st pass (at search step)
335 and rolling pruning (if called at the previous step).
336 We do the accounting here as this is the only sync point where we
337 know (without checking or additional sync-ing) that prune tasks in
338 in the current stream have completed (having just blocking-waited
339 for the force D2H). */
340 countPruneKernelTime(timers, timings, iLocality);
342 /* only count atdat at pair-search steps (add only once, at local F wait) */
343 if (stepWork.doNeighborSearch && atomLocality == AtomLocality::Local)
345 /* atdat transfer timing */
347 timings->pl_h2d_t += timers->atdat.getLastRangeTime();
350 /* only count pair-list H2D when actually performed */
351 if (timers->interaction[iLocality].didPairlistH2D)
353 timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
355 /* Clear the timing flag for the next step */
356 timers->interaction[iLocality].didPairlistH2D = false;
360 /*! \brief Attempts to complete nonbonded GPU task.
362 * See documentation in nbnxm_gpu.h for details.
364 * \todo Move into shared source file, perhaps including
365 * cuda_runtime.h if needed for any remaining CUDA-specific
368 //NOLINTNEXTLINE(misc-definitions-in-headers)
369 bool gpu_try_finish_task(NbnxmGpu* nb,
370 const gmx::StepWorkload& stepWork,
371 const AtomLocality aloc,
374 gmx::ArrayRef<gmx::RVec> shiftForces,
375 GpuTaskCompletion completionKind,
376 gmx_wallcycle* wcycle)
378 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
380 /* determine interaction locality from atom locality */
381 const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
384 // Transfers are launched and therefore need to be waited on if:
385 // - buffer ops is not offloaded
386 // - energies or virials are needed (on the local stream)
388 // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
389 // in current code as virial steps do CPU reduction.)
390 const bool haveResultToWaitFor =
391 (!stepWork.useGpuFBufferOps
392 || (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
394 // We skip when during the non-local phase there was actually no work to do.
395 // This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
397 if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(*nb, iLocality))
399 // Query the state of the GPU stream and return early if we're not done
400 if (completionKind == GpuTaskCompletion::Check)
402 // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
403 // we start without counting and only when the task finished we issue a
404 // start/stop to increment.
405 // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
406 wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
408 if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
410 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
412 // Early return to skip the steps below that we have to do only
413 // after the NB task completed
417 wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
419 else if (haveResultToWaitFor)
421 nb->deviceStreams[iLocality]->synchronize();
424 // TODO: this needs to be moved later because conditional wait could brake timing
425 // with a future OpenCL implementation, but with CUDA timing is anyway disabled
426 // in all cases where we skip the wait.
427 gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork, nb->bDoTime);
429 if (stepWork.computeEnergy || stepWork.computeVirial)
431 gpu_reduce_staged_outputs(nb->nbst,
433 stepWork.computeEnergy,
434 stepWork.computeVirial,
437 as_rvec_array(shiftForces.data()));
441 /* Reset both pruning flags. */
444 nb->timers->interaction[iLocality].didPrune =
445 nb->timers->interaction[iLocality].didRollingPrune = false;
448 /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
449 nb->plist[iLocality]->haveFreshList = false;
455 * Wait for the asynchronously launched nonbonded tasks and data
456 * transfers to finish.
458 * Also does timing accounting and reduction of the internal staging buffers.
459 * As this is called at the end of the step, it also resets the pair list and
462 * \param[in] nb The nonbonded data GPU structure
463 * \param[in] stepWork Force schedule flags
464 * \param[in] aloc Atom locality identifier
465 * \param[out] e_lj Pointer to the LJ energy output to accumulate into
466 * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
467 * \param[out] shiftForces Shift forces buffer to accumulate into
468 * \param[out] wcycle Pointer to wallcycle data structure
469 * \return The number of cycles the gpu wait took
471 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
472 float gpu_wait_finish_task(NbnxmGpu* nb,
473 const gmx::StepWorkload& stepWork,
477 gmx::ArrayRef<gmx::RVec> shiftForces,
478 gmx_wallcycle* wcycle)
480 auto cycleCounter = (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local)
484 wallcycle_start(wcycle, cycleCounter);
485 gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
486 float waitTime = wallcycle_stop(wcycle, cycleCounter);