/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/hardware/device_information.h"
66 #include "gromacs/mdtypes/interaction_const.h"
67 #include "gromacs/mdtypes/simulation_workload.h"
68 #include "gromacs/nbnxm/gpu_common_utils.h"
69 #include "gromacs/nbnxm/gpu_data_mgmt.h"
70 #include "gromacs/pbcutil/ishift.h"
71 #include "gromacs/timing/gpu_timing.h"
72 #include "gromacs/utility/cstringutil.h"
73 #include "gromacs/utility/exceptions.h"
74 #include "gromacs/utility/fatalerror.h"
76 #include "nbnxm_gpu.h"
77 #include "pairlistsets.h"
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
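/* Illustrative usage pattern (a sketch based on how this helper is used later
 * in this file, e.g. in nbnxnInsertNonlocalGpuDependency()): after marking an
 * event in a stream, flush so that another stream can actually sync with it:
 *
 *     nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
 *     issueClFlushInStream(deviceStream);
 *
 * Without the flush, an OpenCL runtime may hold back command submission and
 * delay the cross-stream wait. On CUDA/SYCL builds this helper is a no-op.
 */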
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
}
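/* Illustrative usage of the environment variables checked above (assuming a
 * POSIX shell; intended for benchmarking/debugging only):
 *
 *     GMX_GPU_NB_ANA_EWALD=1     gmx mdrun ...   # force analytical Ewald kernels
 *     GMX_GPU_NB_TAB_EWALD=1     gmx mdrun ...   # force tabulated Ewald kernels
 *     GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...   # force twin-cutoff kernels
 *
 * Setting both GMX_GPU_NB_ANA_EWALD and GMX_GPU_NB_TAB_EWALD triggers the
 * gmx_incons() error above.
 */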
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
{
    nbp->ewald_beta = ic->ewaldcoeff_q;
    nbp->sh_ewald   = ic->sh_ewald;
    nbp->epsfac     = ic->epsfac;
    nbp->two_k_rf   = 2.0 * ic->reactionFieldCoefficient;
    nbp->c_rf       = ic->reactionFieldShift;
    /* Cutoff distances are stored squared so the kernels can compare against
       squared interatomic distances without taking a square root. */
    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
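/* Note: rollingPruningNumParts and rollingPruningPart track the state of
 * rolling dynamic pruning, where a pair list is pruned in several parts spread
 * over consecutive MD steps instead of all at once; both start at zero here
 * because no list has been set up yet. */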
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            char sbuf[STRLEN];
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    /* Note: only (re)allocated, not copied: the interaction masks are filled
       on the device by the pruning kernels. */
    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e atdat->natoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            freeDeviceBuffer(&atdat->ljComb);
            freeDeviceBuffer(&atdat->atomTypes);
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
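/* Note on the reallocation strategy above: over_alloc_small() pads the
 * requested capacity, so a slowly growing atom count does not force a
 * free + allocate + clear cycle on every (re)init; buffers are reallocated
 * only when the padded capacity is actually exceeded. */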
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic->eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic->eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic->eeltype))));
    }
}
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
{
    if (ic->vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic->vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic->vdw_modifier))));
        }
    }
    else if (ic->vdwtype == VanDerWaalsType::Pme)
    {
        if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic->vdwtype))));
    }
}
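/* For reference, the mapping implemented above:
 *
 *   vdwtype   vdw_modifier       LJ combination rule   ->  VdwType
 *   Cut       None/PotShift      None                      Cut
 *   Cut       None/PotShift      Geometric                 CutCombGeom
 *   Cut       None/PotShift      LorentzBerthelot          CutCombLB
 *   Cut       ForceSwitch        (any)                     FSwitch
 *   Cut       PotSwitch          (any)                     PSwitch
 *   Pme       (LJ-PME geom.)     Geometric                 EwaldGeom
 *   Pme       (otherwise)        LorentzBerthelot          EwaldLB
 *
 * Anything else throws InconsistentInputError. */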
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                  "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                         &adat->f,
                         atomsRange.begin(),
                         atomsRange.size(),
                         deviceStream,
                         GpuApiCallBehavior::Async,
                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

    issueClFlushInStream(deviceStream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}
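/* A sketch of the copy-back choreography above when two streams are in use
 * (derived from the comments and event logic in this function):
 *
 *   non-local stream:  [non-local D2H f] -> markEvent(nonlocal_done)
 *   local stream:      enqueueWaitEvent(nonlocal_done) -> [local D2H f, fshift, energies]
 *
 * i.e. the local download only proceeds once the non-local forces are back,
 * which is what bNonLocalStreamDoneMarked coordinates on the host side. */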
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

} // namespace Nbnxm