/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{
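
/*! \brief On OpenCL, flushes \p deviceStream so that events marked in it can be
 * waited on from other streams; no-op for the CUDA and SYCL backends.
 */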
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
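
/*! \brief Initializes the device table used for the tabulated Ewald Coulomb
 * correction, destroying any previously set up table first.
 */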
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
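
/*! \brief Picks the Ewald GPU kernel flavor (analytical/tabulated, single/twin cut-off).
 *
 * The choice is based on the interaction settings and the device; for benchmarking
 * and development, the GMX_GPU_NB_ANA_EWALD, GMX_GPU_NB_TAB_EWALD, and
 * GMX_GPU_NB_EWALD_TWINCUT environment variables override the defaults.
 */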
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if bTwinCut requests them or the env. var.
       forces it (use the latter for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
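
/*! \brief Copies the cut-off, switching, and Ewald parameters from the host-side
 * interaction constants and pairlist parameters into the device parameter struct.
 */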
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic->ewaldcoeff_q;
    nbp->sh_ewald          = ic->sh_ewald;
    nbp->epsfac            = ic->epsfac;
    nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
    nbp->c_rf              = ic->reactionFieldShift;
    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}
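
/*! \brief Updates device parameters after PME load balancing changed the Coulomb
 * cut-off: re-sets the cut-off parameters, re-picks the Ewald kernel flavor, and
 * re-initializes the Ewald correction table.
 */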
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}
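
/*! \brief Initializes a device pairlist: null pointers for buffers not allocated
 * here and -1 sizes, marking the buffers for (re)allocation in gpu_init_pairlist.
 */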
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
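
//! Zeroes all counters and timings of the GPU non-bonded timing data structure.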
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];

    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
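
/*! \brief Initializes the device atom data: (re)allocates the coordinate+charge,
 * force, and LJ-parameter or atom-type buffers as needed and copies the per-atom
 * parameters to the device, asynchronously in the local stream.
 */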
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            freeDeviceBuffer(&atdat->ljComb);
            freeDeviceBuffer(&atdat->atomTypes);
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
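
/*! \brief Maps the host-side electrostatics interaction type to the corresponding
 * GPU kernel flavor, throwing for types without a GPU implementation.
 */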
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic->eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic->eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic->eeltype))));
    }
}
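
/*! \brief Maps the host-side Van der Waals type, interaction modifier, and LJ
 * combination rule to the corresponding GPU kernel flavor, throwing for
 * combinations without a GPU implementation.
 */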
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
{
    if (ic->vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic->vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic->vdw_modifier))));
        }
    }
    else if (ic->vdwtype == VanDerWaalsType::Pme)
    {
        if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic->vdwtype))));
    }
}
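
/*! \brief Records whether there is short-range GPU work for the given interaction
 * locality: a non-empty pair list or any bonded GPU work (which is not split into
 * local/nonlocal).
 */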
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}
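
/*! \brief Returns whether there is GPU short-range work for the interaction
 * locality corresponding to the given atom locality.
 */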
bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
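
/*! \brief When two streams are in use, makes the nonlocal stream wait for the
 * local-stream work (misc. operations and the local xq H2D copy) to complete.
 */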
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks, bonded
       or nonbonded, that may compute on interactions between local and nonlocal
       atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks, bonded
       or nonbonded, that may compute on interactions between local and nonlocal
       atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

} // namespace Nbnxm