/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/hardware/device_information.h"
66 #include "gromacs/mdtypes/interaction_const.h"
67 #include "gromacs/mdtypes/simulation_workload.h"
68 #include "gromacs/nbnxm/gpu_common_utils.h"
69 #include "gromacs/nbnxm/gpu_data_mgmt.h"
70 #include "gromacs/pbcutil/ishift.h"
71 #include "gromacs/timing/gpu_timing.h"
72 #include "gromacs/pbcutil/ishift.h"
73 #include "gromacs/utility/cstringutil.h"
74 #include "gromacs/utility/exceptions.h"
75 #include "gromacs/utility/fatalerror.h"
77 #include "nbnxm_gpu.h"
78 #include "pairlistsets.h"
static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on section 5.13 of the OpenCL v1.2 specification, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
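
/* Illustrative sketch (not part of the build): the cross-stream sync pattern
 * this flush supports, assuming two DeviceStream objects and a
 * GpuEventSynchronizer event as used elsewhere in this file:
 *
 *   event.markEvent(producerStream);        // record completion in the producer stream
 *   issueClFlushInStream(producerStream);   // needed on OpenCL so the consumer can see it
 *   event.enqueueWaitEvent(consumerStream); // consumer stream waits without blocking the host
 */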
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
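
/* Illustrative note (not part of the build): kernels look up the Ewald
 * correction force by scaled distance, conceptually
 *
 *   float index = r * nbp->coulomb_tab_scale;
 *   float f     = fetchFromTable(nbp->coulomb_tab, index); // hypothetical fetch helper
 *
 * where fetchFromTable stands in for the texture/buffer load (with
 * interpolation) that the actual kernels perform.
 */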
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernels. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0. */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
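
/* Illustrative sketch (not part of the build): forcing a kernel flavor from
 * the shell for benchmarking, e.g.
 *
 *   GMX_GPU_NB_TAB_EWALD=1 gmx mdrun ...      # force tabulated Ewald kernels
 *   GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...  # force twin cut-off kernels
 *
 * Setting both GMX_GPU_NB_ANA_EWALD and GMX_GPU_NB_TAB_EWALD trips the
 * gmx_incons() check above.
 */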
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic->ewaldcoeff_q;
    nbp->sh_ewald          = ic->sh_ewald;
    nbp->epsfac            = ic->epsfac;
    nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
    nbp->c_rf              = ic->reactionFieldShift;
    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}
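
/* Illustrative note (not part of the build): two_k_rf and c_rf enter the
 * reaction-field interaction as
 *
 *   V(r) = epsfac * q_i * q_j * (1/r + k_rf * r^2 - c_rf)
 *
 * and precomputing 2*k_rf saves a multiply in the force expression
 *
 *   F(r)/r = epsfac * q_i * q_j * (1/r^3 - two_k_rf).
 */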
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}
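
/* Illustrative sketch (not part of the build): during PME tuning the caller
 * adjusts the Coulomb cut-off and the Ewald coefficient before refreshing the
 * device-side parameters, conceptually:
 *
 *   ic->rcoulomb     = newCutoff;  // newCutoff chosen by the load balancer
 *   ic->ewaldcoeff_q = calc_ewaldcoeff_q(ic->rcoulomb, ewaldRtol);
 *   gpu_pme_loadbal_update_param(nbv, ic);
 *
 * calc_ewaldcoeff_q() is the existing GROMACS helper for the Ewald splitting
 * parameter; the surrounding assignments are schematic.
 */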
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
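
/* Illustrative note (not part of the build): rolling pruning splits the pair
 * list into rollingPruningNumParts chunks and prunes one chunk per step,
 * cycling rollingPruningPart through 0 .. rollingPruningNumParts-1, so the
 * whole list is pruned once per pruning period instead of in a single step.
 */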
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char stmp[STRLEN];

    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else if (d_plist->na_c != h_plist->na_ci)
    {
        sprintf(stmp,
                "In init_plist: the #atoms per cell has changed (from %d to %d)",
                d_plist->na_c,
                h_plist->na_ci);
        gmx_incons(stmp);
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
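
/* Illustrative sketch (not part of the build): a typical call sequence after
 * a search step, assuming a freshly built GPU pair list:
 *
 *   gpu_init_pairlist(nb, h_plist, InteractionLocality::Local);
 *   // haveFreshList is now true, so the next force launch will run the
 *   // pruning kernel on this list before using it.
 */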
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            freeDeviceBuffer(&atdat->ljComb);
            freeDeviceBuffer(&atdat->atomTypes);
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
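
/* Illustrative note (not part of the build): over_alloc_small() returns the
 * requested element count scaled by a modest over-allocation factor, so that
 * buffers grow geometrically and a slowly increasing atom count does not
 * trigger a reallocation (and the full f-buffer clear above) on every call.
 */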
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic->eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic->eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic->eeltype))));
    }
}
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
{
    if (ic->vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic->vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic->vdw_modifier))));
        }
    }
    else if (ic->vdwtype == VanDerWaalsType::Pme)
    {
        if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic->vdwtype))));
    }
}
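
/* Illustrative note (not part of the build): the mapping realized above, in
 * rough mdp terms (settings on the left, kernel flavor on the right):
 *
 *   vdwtype = cut-off, vdw-modifier = potential-shift, no comb. rule -> VdwType::Cut
 *   vdwtype = cut-off, vdw-modifier = force-switch                   -> VdwType::FSwitch
 *   vdwtype = PME,     lj-pme-comb-rule = geometric                  -> VdwType::EwaldGeom
 */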
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
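
/* Illustrative sketch (not part of the build): typical use around a domain
 * (re)partitioning step, assuming a GpuBonded instance that may be null:
 *
 *   setupGpuShortRangeWork(nb, gpuBonded, InteractionLocality::NonLocal);
 *   ...
 *   if (haveGpuShortRangeWork(nb, AtomLocality::NonLocal))
 *   {
 *       // launch non-local transfers and kernels
 *   }
 */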
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       one has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                  "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                         &adat->f,
                         atomsRange.begin(),
                         atomsRange.size(),
                         deviceStream,
                         GpuApiCallBehavior::Async,
                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

    issueClFlushInStream(deviceStream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}
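
/* Illustrative sketch (not part of the build): with domain decomposition the
 * event/flag pair above enforces the per-step ordering, roughly
 *
 *   gpu_launch_cpyback(nb, nbatom, stepWork, AtomLocality::NonLocal);
 *       // marks nonlocal_done and sets bNonLocalStreamDoneMarked
 *   gpu_launch_cpyback(nb, nbatom, stepWork, AtomLocality::Local);
 *       // enqueues a wait on nonlocal_done before the local D2H
 */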
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here, all misc operations issued in the local stream as well
       as the local xq H2D are done, so we record that in the local stream and
       wait for it in the nonlocal one. This wait needs to precede any PP tasks,
       bonded or nonbonded, that may compute on interactions between local and
       nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here, all misc operations issued in the local stream as well
       as the local xq H2D are done, so we record that in the local stream and
       wait for it in the nonlocal one. This wait needs to precede any PP tasks,
       bonded or nonbonded, that may compute on interactions between local and
       nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
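
/* Illustrative sketch (not part of the build): the per-step H2D ordering that
 * the dependency above supports, roughly
 *
 *   gpu_copy_xq_to_gpu(nb, nbatom, AtomLocality::Local);    // marks misc_ops_and_local_H2D_done
 *   gpu_copy_xq_to_gpu(nb, nbatom, AtomLocality::NonLocal); // waits on that event
 *   // kernel launches and gpu_launch_cpyback() follow in each stream
 */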