2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define common implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Artem Zhmurov <zhmurov@gmail.com>
45 * \ingroup module_nbnxm
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/hardware/device_information.h"
66 #include "gromacs/mdtypes/interaction_const.h"
67 #include "gromacs/nbnxm/gpu_common_utils.h"
68 #include "gromacs/nbnxm/gpu_data_mgmt.h"
69 #include "gromacs/timing/gpu_timing.h"
70 #include "gromacs/utility/cstringutil.h"
71 #include "gromacs/utility/exceptions.h"
72 #include "gromacs/utility/fatalerror.h"
74 #include "nbnxm_gpu.h"
75 #include "pairlistsets.h"
80 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
82 const DeviceContext& deviceContext)
86 destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
89 nbp->coulomb_tab_scale = tables.scale;
91 &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
94 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
95 const DeviceInformation gmx_unused& deviceInfo)
97 bool bTwinCut = (ic.rcoulomb != ic.rvdw);
99 /* Benchmarking/development environment variables to force the use of
100 analytical or tabulated Ewald kernel. */
101 const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
102 const bool forceTabulatedEwald = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
103 const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
105 if (forceAnalyticalEwald && forceTabulatedEwald)
108 "Both analytical and tabulated Ewald GPU non-bonded kernels "
109 "requested through environment variables.");
112 /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
114 const bool c_useTabulatedEwaldDefault =
116 (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
117 || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
121 bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
122 if (forceAnalyticalEwald)
124 bUseAnalyticalEwald = true;
127 fprintf(debug, "Using analytical Ewald GPU kernels\n");
130 else if (forceTabulatedEwald)
132 bUseAnalyticalEwald = false;
136 fprintf(debug, "Using tabulated Ewald GPU kernels\n");
140 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
141 forces it (use it for debugging/benchmarking only). */
142 if (!bTwinCut && !forceTwinCutoffEwald)
144 return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
148 return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
152 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
154 nbp->ewald_beta = ic->ewaldcoeff_q;
155 nbp->sh_ewald = ic->sh_ewald;
156 nbp->epsfac = ic->epsfac;
157 nbp->two_k_rf = 2.0 * ic->reactionFieldCoefficient;
158 nbp->c_rf = ic->reactionFieldShift;
159 nbp->rvdw_sq = ic->rvdw * ic->rvdw;
160 nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
161 nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
162 nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
163 nbp->useDynamicPruning = listParams.useDynamicPruning;
165 nbp->sh_lj_ewald = ic->sh_lj_ewald;
166 nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
168 nbp->rvdw_switch = ic->rvdw_switch;
169 nbp->dispersion_shift = ic->dispersion_shift;
170 nbp->repulsion_shift = ic->repulsion_shift;
171 nbp->vdw_switch = ic->vdw_switch;
174 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
176 if (!nbv || !nbv->useGpu())
180 NbnxmGpu* nb = nbv->gpu_nbv;
181 NBParamGpu* nbp = nb->nbparam;
183 set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
185 nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
187 GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
188 init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
191 void init_plist(gpu_plist* pl)
193 /* initialize to nullptr pointers to data that is not allocated here and will
194 need reallocation in nbnxn_gpu_init_pairlist */
200 /* size -1 indicates that the respective array hasn't been initialized yet */
207 pl->imask_nalloc = -1;
209 pl->excl_nalloc = -1;
210 pl->haveFreshList = false;
211 pl->rollingPruningNumParts = 0;
212 pl->rollingPruningPart = 0;
215 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
222 for (int i = 0; i < 2; i++)
224 for (int j = 0; j < 2; j++)
226 t->ktime[i][j].t = 0.0;
227 t->ktime[i][j].c = 0;
231 t->pruneTime.t = 0.0;
232 t->dynamicPruneTime.c = 0;
233 t->dynamicPruneTime.t = 0.0;
236 //! This function is documented in the header file
237 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
240 // Timing accumulation should happen only if there was work to do
241 // because getLastRangeTime() gets skipped with empty lists later
242 // which leads to the counter not being reset.
243 bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
244 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
245 gpu_plist* d_plist = nb->plist[iloc];
247 if (d_plist->na_c < 0)
249 d_plist->na_c = h_plist->na_ci;
253 if (d_plist->na_c != h_plist->na_ci)
256 "In init_plist: the #atoms per cell has changed (from %d to %d)",
263 GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
267 iTimers.pl_h2d.openTimingRegion(deviceStream);
268 iTimers.didPairlistH2D = true;
271 // TODO most of this function is same in CUDA and OpenCL, move into the header
272 const DeviceContext& deviceContext = *nb->deviceContext_;
274 reallocateDeviceBuffer(
275 &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
276 copyToDeviceBuffer(&d_plist->sci,
281 GpuApiCallBehavior::Async,
282 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
284 reallocateDeviceBuffer(
285 &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
286 copyToDeviceBuffer(&d_plist->cj4,
291 GpuApiCallBehavior::Async,
292 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
294 reallocateDeviceBuffer(&d_plist->imask,
295 h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
297 &d_plist->imask_nalloc,
300 reallocateDeviceBuffer(
301 &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
302 copyToDeviceBuffer(&d_plist->excl,
303 h_plist->excl.data(),
305 h_plist->excl.size(),
307 GpuApiCallBehavior::Async,
308 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
312 iTimers.pl_h2d.closeTimingRegion(deviceStream);
315 /* need to prune the pair list during the next step */
316 d_plist->haveFreshList = true;
319 //! This function is documented in the header file
320 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
322 return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
325 //! This function is documented in the header file
326 void gpu_reset_timings(nonbonded_verlet_t* nbv)
328 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
330 init_timings(nbv->gpu_nbv->timings);
334 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
336 return ((nb->nbparam->elecType == ElecType::EwaldAna)
337 || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
340 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
341 const DeviceInformation& deviceInfo)
343 if (ic->eeltype == CoulombInteractionType::Cut)
345 return ElecType::Cut;
347 else if (EEL_RF(ic->eeltype))
351 else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
353 return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
357 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
358 GMX_THROW(gmx::InconsistentInputError(
359 gmx::formatString("The requested electrostatics type %s is not implemented in "
360 "the GPU accelerated kernels!",
361 enumValueToString(ic->eeltype))));
366 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
368 if (ic->vdwtype == VanDerWaalsType::Cut)
370 switch (ic->vdw_modifier)
372 case InteractionModifiers::None:
373 case InteractionModifiers::PotShift:
374 switch (ljCombinationRule)
376 case LJCombinationRule::None: return VdwType::Cut;
377 case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
378 case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
380 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
381 "The requested LJ combination rule %s is not implemented in "
382 "the GPU accelerated kernels!",
383 enumValueToString(ljCombinationRule))));
385 case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
386 case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
388 GMX_THROW(gmx::InconsistentInputError(
389 gmx::formatString("The requested VdW interaction modifier %s is not "
390 "implemented in the GPU accelerated kernels!",
391 enumValueToString(ic->vdw_modifier))));
394 else if (ic->vdwtype == VanDerWaalsType::Pme)
396 if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
398 assert(ljCombinationRule == LJCombinationRule::Geometric);
399 return VdwType::EwaldGeom;
403 assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
404 return VdwType::EwaldLB;
409 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
410 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
411 enumValueToString(ic->vdwtype))));
415 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
417 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
419 // There is short-range work if the pair list for the provided
420 // interaction locality contains entries or if there is any
421 // bonded work (as this is not split into local/nonlocal).
422 nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
423 || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
426 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
428 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
430 return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
433 inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
436 /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
437 * in the stream after marking an event in it in order to be able to sync with
438 * the event from another stream.
440 cl_int cl_error = clFlush(deviceStream.stream());
441 if (cl_error != CL_SUCCESS)
443 GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
448 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
450 const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
452 /* When we get here all misc operations issued in the local stream as well as
453 the local xq H2D are done,
454 so we record that in the local stream and wait for it in the nonlocal one.
455 This wait needs to precede any PP tasks, bonded or nonbonded, that may
456 compute on interactions between local and nonlocal atoms.
458 if (nb->bUseTwoStreams)
460 if (interactionLocality == InteractionLocality::Local)
462 nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
463 issueClFlushInStream(deviceStream);
467 nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
472 /*! \brief Launch asynchronously the xq buffer host to device copy. */
473 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
475 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
477 const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
479 NBAtomData* adat = nb->atdat;
480 gpu_plist* plist = nb->plist[iloc];
481 Nbnxm::GpuTimers* timers = nb->timers;
482 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
484 const bool bDoTime = nb->bDoTime;
486 /* Don't launch the non-local H2D copy if there is no dependent
487 work to do: neither non-local nor other (e.g. bonded) work
488 to do that has as input the nbnxn coordaintes.
489 Doing the same for the local kernel is more complicated, since the
490 local part of the force array also depends on the non-local kernel.
491 So to avoid complicating the code and to reduce the risk of bugs,
492 we always call the local local x+q copy (and the rest of the local
493 work in nbnxn_gpu_launch_kernel().
495 if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
497 plist->haveFreshList = false;
499 // The event is marked for Local interactions unconditionally,
500 // so it has to be released here because of the early return
501 // for NonLocal interactions.
502 nb->misc_ops_and_local_H2D_done.reset();
507 /* local/nonlocal offset and length used for xq and f */
508 const auto atomsRange = getGpuAtomRange(adat, atomLocality);
510 /* beginning of timed HtoD section */
513 timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
517 GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
518 "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
519 copyToDeviceBuffer(&adat->xq,
520 reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
524 GpuApiCallBehavior::Async,
529 timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
532 /* When we get here all misc operations issued in the local stream as well as
533 the local xq H2D are done,
534 so we record that in the local stream and wait for it in the nonlocal one.
535 This wait needs to precede any PP tasks, bonded or nonbonded, that may
536 compute on interactions between local and nonlocal atoms.
538 nbnxnInsertNonlocalGpuDependency(nb, iloc);