2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define common implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Artem Zhmurov <zhmurov@gmail.com>
45 * \ingroup module_nbnxm
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/hardware/device_information.h"
66 #include "gromacs/mdtypes/interaction_const.h"
67 #include "gromacs/nbnxm/gpu_common_utils.h"
68 #include "gromacs/nbnxm/gpu_data_mgmt.h"
69 #include "gromacs/timing/gpu_timing.h"
70 #include "gromacs/utility/cstringutil.h"
71 #include "gromacs/utility/fatalerror.h"
73 #include "nbnxm_gpu.h"
74 #include "pairlistsets.h"
79 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
81 const DeviceContext& deviceContext)
85 destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
88 nbp->coulomb_tab_scale = tables.scale;
90 &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
93 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
94 const DeviceInformation gmx_unused& deviceInfo)
96 bool bTwinCut = (ic.rcoulomb != ic.rvdw);
98 /* Benchmarking/development environment variables to force the use of
99 analytical or tabulated Ewald kernel. */
100 const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
101 const bool forceTabulatedEwald = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
102 const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
104 if (forceAnalyticalEwald && forceTabulatedEwald)
107 "Both analytical and tabulated Ewald GPU non-bonded kernels "
108 "requested through environment variables.");
111 /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
113 const bool c_useTabulatedEwaldDefault =
115 (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
116 || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
120 bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
121 if (forceAnalyticalEwald)
123 bUseAnalyticalEwald = true;
126 fprintf(debug, "Using analytical Ewald GPU kernels\n");
129 else if (forceTabulatedEwald)
131 bUseAnalyticalEwald = false;
135 fprintf(debug, "Using tabulated Ewald GPU kernels\n");
139 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
140 forces it (use it for debugging/benchmarking only). */
141 if (!bTwinCut && !forceTwinCutoffEwald)
143 return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
147 return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
151 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
153 nbp->ewald_beta = ic->ewaldcoeff_q;
154 nbp->sh_ewald = ic->sh_ewald;
155 nbp->epsfac = ic->epsfac;
156 nbp->two_k_rf = 2.0 * ic->reactionFieldCoefficient;
157 nbp->c_rf = ic->reactionFieldShift;
158 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
159 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
160 nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
161 nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
162 nbp->useDynamicPruning = listParams.useDynamicPruning;
164 nbp->sh_lj_ewald = ic->sh_lj_ewald;
165 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
167 nbp->rvdw_switch = ic->rvdw_switch;
168 nbp->dispersion_shift = ic->dispersion_shift;
169 nbp->repulsion_shift = ic->repulsion_shift;
170 nbp->vdw_switch = ic->vdw_switch;
173 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
175 if (!nbv || !nbv->useGpu())
179 NbnxmGpu* nb = nbv->gpu_nbv;
180 NBParamGpu* nbp = nb->nbparam;
182 set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
184 nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
186 GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
187 init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
190 void init_plist(gpu_plist* pl)
192 /* initialize to nullptr pointers to data that is not allocated here and will
193 need reallocation in nbnxn_gpu_init_pairlist */
199 /* size -1 indicates that the respective array hasn't been initialized yet */
206 pl->imask_nalloc = -1;
208 pl->excl_nalloc = -1;
209 pl->haveFreshList = false;
210 pl->rollingPruningNumParts = 0;
211 pl->rollingPruningPart = 0;
214 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
221 for (int i = 0; i < 2; i++)
223 for (int j = 0; j < 2; j++)
225 t->ktime[i][j].t = 0.0;
226 t->ktime[i][j].c = 0;
230 t->pruneTime.t = 0.0;
231 t->dynamicPruneTime.c = 0;
232 t->dynamicPruneTime.t = 0.0;
235 //! This function is documented in the header file
236 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
239 // Timing accumulation should happen only if there was work to do
240 // because getLastRangeTime() gets skipped with empty lists later
241 // which leads to the counter not being reset.
242 bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
243 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
244 gpu_plist* d_plist = nb->plist[iloc];
246 if (d_plist->na_c < 0)
248 d_plist->na_c = h_plist->na_ci;
252 if (d_plist->na_c != h_plist->na_ci)
255 "In init_plist: the #atoms per cell has changed (from %d to %d)",
262 GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
266 iTimers.pl_h2d.openTimingRegion(deviceStream);
267 iTimers.didPairlistH2D = true;
270 // TODO most of this function is same in CUDA and OpenCL, move into the header
271 const DeviceContext& deviceContext = *nb->deviceContext_;
273 reallocateDeviceBuffer(
274 &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
275 copyToDeviceBuffer(&d_plist->sci,
280 GpuApiCallBehavior::Async,
281 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
283 reallocateDeviceBuffer(
284 &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
285 copyToDeviceBuffer(&d_plist->cj4,
290 GpuApiCallBehavior::Async,
291 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
293 reallocateDeviceBuffer(&d_plist->imask,
294 h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
296 &d_plist->imask_nalloc,
299 reallocateDeviceBuffer(
300 &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
301 copyToDeviceBuffer(&d_plist->excl,
302 h_plist->excl.data(),
304 h_plist->excl.size(),
306 GpuApiCallBehavior::Async,
307 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
311 iTimers.pl_h2d.closeTimingRegion(deviceStream);
314 /* need to prune the pair list during the next step */
315 d_plist->haveFreshList = true;
318 //! This function is documented in the header file
319 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
321 return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
324 //! This function is documented in the header file
325 void gpu_reset_timings(nonbonded_verlet_t* nbv)
327 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
329 init_timings(nbv->gpu_nbv->timings);
333 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
335 return ((nb->nbparam->elecType == ElecType::EwaldAna)
336 || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
339 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
340 const DeviceInformation& deviceInfo)
342 if (ic->eeltype == CoulombInteractionType::Cut)
344 return ElecType::Cut;
346 else if (EEL_RF(ic->eeltype))
350 else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
352 return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
356 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
357 GMX_THROW(gmx::InconsistentInputError(
358 gmx::formatString("The requested electrostatics type %s is not implemented in "
359 "the GPU accelerated kernels!",
360 enumValueToString(ic->eeltype))));
365 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
367 if (ic->vdwtype == VanDerWaalsType::Cut)
369 switch (ic->vdw_modifier)
371 case InteractionModifiers::None:
372 case InteractionModifiers::PotShift:
373 switch (ljCombinationRule)
375 case LJCombinationRule::None: return VdwType::Cut;
376 case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
377 case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
379 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
380 "The requested LJ combination rule %s is not implemented in "
381 "the GPU accelerated kernels!",
382 enumValueToString(ljCombinationRule))));
384 case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
385 case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
387 GMX_THROW(gmx::InconsistentInputError(
388 gmx::formatString("The requested VdW interaction modifier %s is not "
389 "implemented in the GPU accelerated kernels!",
390 enumValueToString(ic->vdw_modifier))));
393 else if (ic->vdwtype == VanDerWaalsType::Pme)
395 if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
397 assert(ljCombinationRule == LJCombinationRule::Geometric);
398 return VdwType::EwaldGeom;
402 assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
403 return VdwType::EwaldLB;
408 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
409 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
410 enumValueToString(ic->vdwtype))));
414 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
416 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
418 // There is short-range work if the pair list for the provided
419 // interaction locality contains entries or if there is any
420 // bonded work (as this is not split into local/nonlocal).
421 nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
422 || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
425 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
427 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
429 return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
432 /*! \brief Launch asynchronously the xq buffer host to device copy. */
433 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
435 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
437 const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
439 NBAtomData* adat = nb->atdat;
440 gpu_plist* plist = nb->plist[iloc];
441 Nbnxm::GpuTimers* timers = nb->timers;
442 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
444 const bool bDoTime = nb->bDoTime;
446 /* Don't launch the non-local H2D copy if there is no dependent
447 work to do: neither non-local nor other (e.g. bonded) work
448 to do that has as input the nbnxn coordaintes.
449 Doing the same for the local kernel is more complicated, since the
450 local part of the force array also depends on the non-local kernel.
451 So to avoid complicating the code and to reduce the risk of bugs,
452 we always call the local local x+q copy (and the rest of the local
453 work in nbnxn_gpu_launch_kernel().
455 if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
457 plist->haveFreshList = false;
459 // The event is marked for Local interactions unconditionally,
460 // so it has to be released here because of the early return
461 // for NonLocal interactions.
462 nb->misc_ops_and_local_H2D_done.reset();
467 /* local/nonlocal offset and length used for xq and f */
468 const auto atomsRange = getGpuAtomRange(adat, atomLocality);
470 /* beginning of timed HtoD section */
473 timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
477 GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
478 "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
479 copyToDeviceBuffer(&adat->xq,
480 reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
484 GpuApiCallBehavior::Async,
489 timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
492 /* When we get here all misc operations issued in the local stream as well as
493 the local xq H2D are done,
494 so we record that in the local stream and wait for it in the nonlocal one.
495 This wait needs to precede any PP tasks, bonded or nonbonded, that may
496 compute on interactions between local and nonlocal atoms.
498 nbnxnInsertNonlocalGpuDependency(nb, iloc);