2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * \brief Common functions for the different NBNXN GPU implementations.
38 * \author Szilard Pall <pall.szilard@gmail.com>
40 * \ingroup module_nbnxm
43 #ifndef GMX_NBNXM_GPU_COMMON_H
44 #define GMX_NBNXM_GPU_COMMON_H
51 # include "cuda/nbnxm_cuda_types.h"
55 # include "opencl/nbnxm_ocl_types.h"
59 # include "sycl/nbnxm_sycl_types.h"
60 # include "gromacs/gpu_utils/syclutils.h"
63 #include "gromacs/gpu_utils/gpu_utils.h"
64 #include "gromacs/math/vec.h"
65 #include "gromacs/mdtypes/simulation_workload.h"
66 #include "gromacs/nbnxm/nbnxm.h"
67 #include "gromacs/pbcutil/ishift.h"
68 #include "gromacs/timing/gpu_timing.h"
69 #include "gromacs/timing/wallcycle.h"
70 #include "gromacs/utility/stringutil.h"
72 #include "gpu_common_utils.h"
73 #include "nbnxm_gpu.h"
77 class ListedForcesGpu;
83 /*! \brief Count pruning kernel time if either kernel has been triggered
85 * We do the accounting for either of the two pruning kernel flavors:
86 * - 1st pass prune: ran during the current step (prior to the force kernel);
87 * - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
89 * Note that the resetting of Nbnxm::GpuTimers::didPrune and Nbnxm::GpuTimers::didRollingPrune
90 * should happen after calling this function.
92 * \param[in] timers structs with GPU timer objects
93 * \param[inout] timings GPU task timing data
94 * \param[in] iloc interaction locality
96 static void countPruneKernelTime(Nbnxm::GpuTimers* timers,
97 gmx_wallclock_gpu_nbnxn_t* timings,
98 const InteractionLocality iloc)
100 GpuTimers::Interaction& iTimers = timers->interaction[iloc];
102 // We might have not done any pruning (e.g. if we skipped with empty domains).
103 if (!iTimers.didPrune && !iTimers.didRollingPrune)
108 if (iTimers.didPrune)
110 timings->pruneTime.c++;
111 timings->pruneTime.t += iTimers.prune_k.getLastRangeTime();
114 if (iTimers.didRollingPrune)
116 timings->dynamicPruneTime.c++;
117 timings->dynamicPruneTime.t += iTimers.rollingPrune_k.getLastRangeTime();
121 /*! \brief Reduce data staged internally in the nbnxn module.
123 * Shift forces and electrostatic/LJ energies copied from the GPU into
124 * a module-internal staging area are immediately reduced (CPU-side buffers passed)
125 * after having waited for the transfers' completion.
127 * Note that this function should always be called after the transfers into the
128 * staging buffers has completed.
130 * \param[in] nbst Nonbonded staging data
131 * \param[in] iLocality Interaction locality specifier
132 * \param[in] reduceEnergies True if energy reduction should be done
133 * \param[in] reduceFshift True if shift force reduction should be done
134 * \param[out] e_lj Variable to accumulate LJ energy into
135 * \param[out] e_el Variable to accumulate electrostatic energy into
136 * \param[out] fshift Pointer to the array of shift forces to accumulate into
138 static inline void gpu_reduce_staged_outputs(const NBStagingData& nbst,
139 const InteractionLocality iLocality,
140 const bool reduceEnergies,
141 const bool reduceFshift,
146 /* add up energies and shift forces (only once at local F wait) */
147 if (iLocality == InteractionLocality::Local)
152 *e_el += *nbst.eElec;
157 for (int i = 0; i < gmx::c_numShiftVectors; i++)
159 rvec_inc(fshift[i], nbst.fShift[i]);
165 /*! \brief Do the per-step timing accounting of the nonbonded tasks.
167 * Does timing accumulation and call-count increments for the nonbonded kernels.
168 * Note that this function should be called after the current step's nonbonded
169 * nonbonded tasks have completed with the exception of the rolling pruning kernels
170 * that are accounted for during the following step.
172 * NOTE: if timing with multiple GPUs (streams) becomes possible, the
173 * counters could end up being inconsistent due to not being incremented
174 * on some of the node when this is skipped on empty local domains!
176 * \tparam GpuPairlist Pair list type
177 * \param[out] timings Pointer to the NB GPU timings data
178 * \param[in] timers Pointer to GPU timers data
179 * \param[in] plist Pointer to the pair list data
180 * \param[in] atomLocality Atom locality specifier
181 * \param[in] stepWork Force schedule flags
182 * \param[in] doTiming True if timing is enabled.
185 template<typename GpuPairlist>
186 static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
187 Nbnxm::GpuTimers* timers,
188 const GpuPairlist* plist,
189 AtomLocality atomLocality,
190 const gmx::StepWorkload& stepWork,
193 /* timing data accumulation */
199 /* determine interaction locality from atom locality */
200 const InteractionLocality iLocality = atomToInteractionLocality(atomLocality);
201 const bool didEnergyKernels = stepWork.computeEnergy;
203 /* only increase counter once (at local F wait) */
204 if (iLocality == InteractionLocality::Local)
207 timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].c += 1;
211 timings->ktime[plist->haveFreshList ? 1 : 0][didEnergyKernels ? 1 : 0].t +=
212 timers->interaction[iLocality].nb_k.getLastRangeTime();
214 /* X/q H2D and F D2H timings */
215 timings->nb_h2d_t += timers->xf[atomLocality].nb_h2d.getLastRangeTime();
216 timings->nb_d2h_t += timers->xf[atomLocality].nb_d2h.getLastRangeTime();
218 /* Count the pruning kernel times for both cases:1st pass (at search step)
219 and rolling pruning (if called at the previous step).
220 We do the accounting here as this is the only sync point where we
221 know (without checking or additional sync-ing) that prune tasks in
222 in the current stream have completed (having just blocking-waited
223 for the force D2H). */
224 countPruneKernelTime(timers, timings, iLocality);
226 /* only count atdat at pair-search steps (add only once, at local F wait) */
227 if (stepWork.doNeighborSearch && atomLocality == AtomLocality::Local)
229 /* atdat transfer timing */
231 timings->pl_h2d_t += timers->atdat.getLastRangeTime();
234 /* only count pair-list H2D when actually performed */
235 if (timers->interaction[iLocality].didPairlistH2D)
237 timings->pl_h2d_t += timers->interaction[iLocality].pl_h2d.getLastRangeTime();
239 /* Clear the timing flag for the next step */
240 timers->interaction[iLocality].didPairlistH2D = false;
244 /*! \brief Attempts to complete nonbonded GPU task.
246 * See documentation in nbnxm_gpu.h for details.
248 * \todo Move into shared source file, perhaps including
249 * cuda_runtime.h if needed for any remaining CUDA-specific
252 //NOLINTNEXTLINE(misc-definitions-in-headers)
253 bool gpu_try_finish_task(NbnxmGpu* nb,
254 const gmx::StepWorkload& stepWork,
255 const AtomLocality aloc,
258 gmx::ArrayRef<gmx::RVec> shiftForces,
259 GpuTaskCompletion completionKind,
260 gmx_wallcycle* wcycle)
262 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
264 /* determine interaction locality from atom locality */
265 const InteractionLocality iLocality = atomToInteractionLocality(aloc);
268 // Transfers are launched and therefore need to be waited on if:
269 // - buffer ops is not offloaded
270 // - energies or virials are needed (on the local stream)
272 // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
273 // in current code as virial steps do CPU reduction.)
274 const bool haveResultToWaitFor =
275 (!stepWork.useGpuFBufferOps
276 || (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
278 // We skip when during the non-local phase there was actually no work to do.
279 // This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
281 if ((iLocality == InteractionLocality::Local) || haveGpuShortRangeWork(nb, iLocality))
283 // Query the state of the GPU stream and return early if we're not done
284 if (completionKind == GpuTaskCompletion::Check)
286 // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
287 // we start without counting and only when the task finished we issue a
288 // start/stop to increment.
289 // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
290 wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL);
292 if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
294 wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL);
296 // Early return to skip the steps below that we have to do only
297 // after the NB task completed
301 wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL);
303 else if (haveResultToWaitFor)
305 nb->deviceStreams[iLocality]->synchronize();
308 // TODO: this needs to be moved later because conditional wait could brake timing
309 // with a future OpenCL implementation, but with CUDA timing is anyway disabled
310 // in all cases where we skip the wait.
311 gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork, nb->bDoTime);
313 if (stepWork.computeEnergy || stepWork.computeVirial)
315 gpu_reduce_staged_outputs(nb->nbst,
317 stepWork.computeEnergy,
318 stepWork.computeVirial,
321 as_rvec_array(shiftForces.data()));
325 /* Reset both pruning flags. */
328 nb->timers->interaction[iLocality].didPrune =
329 nb->timers->interaction[iLocality].didRollingPrune = false;
332 /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
333 nb->plist[iLocality]->haveFreshList = false;
339 * Wait for the asynchronously launched nonbonded tasks and data
340 * transfers to finish.
342 * Also does timing accounting and reduction of the internal staging buffers.
343 * As this is called at the end of the step, it also resets the pair list and
346 * \param[in] nb The nonbonded data GPU structure
347 * \param[in] stepWork Force schedule flags
348 * \param[in] aloc Atom locality identifier
349 * \param[out] e_lj Pointer to the LJ energy output to accumulate into
350 * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
351 * \param[out] shiftForces Shift forces buffer to accumulate into
352 * \param[out] wcycle Pointer to wallcycle data structure
353 * \return The number of cycles the gpu wait took
355 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
356 float gpu_wait_finish_task(NbnxmGpu* nb,
357 const gmx::StepWorkload& stepWork,
361 gmx::ArrayRef<gmx::RVec> shiftForces,
362 gmx_wallcycle* wcycle)
364 auto cycleCounter = (atomToInteractionLocality(aloc) == InteractionLocality::Local)
365 ? WallCycleCounter::WaitGpuNbL
366 : WallCycleCounter::WaitGpuNbNL;
368 wallcycle_start(wcycle, cycleCounter);
369 gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
370 float waitTime = wallcycle_stop(wcycle, cycleCounter);