/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif
#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

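/*! \brief Issues a flush in the given GPU stream on OpenCL; a no-op for the other backends. */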
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}

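/*! \brief Initializes the Ewald Coulomb correction force table on the device,
 *  releasing any previously allocated table first.
 */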
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}

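/*! \brief Selects the flavor of Ewald GPU kernel (analytical/tabulated, single/twin cut-off)
 *  from the interaction settings, the device, and the override environment variables.
 */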
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

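/* A sketch of how the overrides above are typically used (assumed invocation; these
 * variables are intended for benchmarking/debugging only):
 *   GMX_GPU_NB_ANA_EWALD=1 gmx mdrun ...      force the analytical Ewald kernels
 *   GMX_GPU_NB_TAB_EWALD=1 gmx mdrun ...      force the tabulated Ewald kernels
 *   GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...  force the twin cut-off kernels
 */

/*! \brief Copies the electrostatics and VdW cut-off/switching parameters from \p ic and
 *  \p listParams into the device parameter struct \p nbp.
 */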
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}

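/*! \brief Updates device parameters after PME load balancing has changed the Coulomb cut-off,
 *  re-selecting the Ewald kernel flavor and regenerating the correction table.
 */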
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

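/*! \brief Initializes the pairlist data structure to an empty, not-yet-allocated state. */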
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c          = -1;
    pl->nsci          = -1;
    pl->sci_nalloc    = -1;
    pl->ncj4          = -1;
    pl->cj4_nalloc    = -1;
    pl->nimask        = -1;
    pl->imask_nalloc  = -1;
    pl->nexcl         = -1;
    pl->excl_nalloc   = -1;
    pl->haveFreshList = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

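/*! \brief Zeroes all counters and accumulated times in \p t. */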
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static void initAtomdataFirst(NBAtomData*          atomdata,
                              int                  numTypes,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}

/*! \brief Initialize the nonbonded parameter data structure. */
static void initNbparam(NBParamGpu*                     nbp,
                        const interaction_const_t&      ic,
                        const PairlistParams&           listParams,
                        const nbnxn_atomdata_t::Params& nbatParams,
                        const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
    }
}

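//! This function is documented in the header file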
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomData;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect nr of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

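//! This function is documented in the header file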
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

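//! This function is documented in the header file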
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

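/*! \brief Returns whether the selected electrostatics kernel flavor uses analytical Ewald correction. */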
bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

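/*! \brief Maps the Coulomb interaction settings in \p ic to the matching GPU electrostatics kernel type. */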
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

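/*! \brief Maps the VdW settings in \p ic and the LJ combination rule to the matching GPU VdW kernel type. */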
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

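/*! \brief Records whether the given interaction locality has any short-range work for the GPU. */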
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

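/*! \brief Returns whether short-range GPU work was recorded for the given interaction locality. */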
bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                  "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                         &adat->f,
                         atomsRange.begin(),
                         atomsRange.size(),
                         deviceStream,
                         GpuApiCallBehavior::Async,
                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

    issueClFlushInStream(deviceStream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

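/*! \brief Records the misc-ops/local x+q H2D completion event in the local stream,
 *  or makes the non-local stream wait on that event, to order work between the two streams.
 */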
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

} // namespace Nbnxm