/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif
#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
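
/*! \brief (Re)initialize the tabulated Ewald Coulomb correction force table on the device,
 *  destroying any previously initialized table first. */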
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
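
/*! \brief Pick the Ewald electrostatics kernel flavor (analytical vs. tabulated,
 *  single vs. twin cut-off), honoring the benchmarking/development environment
 *  variables checked below (e.g. setting GMX_GPU_NB_ANA_EWALD forces the analytical flavor). */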
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
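
/*! \brief Copy the cut-off, switching and reaction-field constants from \p ic and
 *  \p listParams into the GPU non-bonded parameter struct. */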
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}
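
/*! \brief Update the GPU electrostatics parameters after the Coulomb cut-off changed
 *  (e.g. during PME load balancing): re-select the kernel type and re-upload the
 *  Ewald correction table. */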
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}
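
//! \brief Reset a GPU pair-list to an empty, unallocated state.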
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c          = -1;
    pl->nsci          = -1;
    pl->sci_nalloc    = -1;
    pl->ncj4          = -1;
    pl->cj4_nalloc    = -1;
    pl->nimask        = -1;
    pl->imask_nalloc  = -1;
    pl->nexcl         = -1;
    pl->excl_nalloc   = -1;
    pl->haveFreshList = false;

    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
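
//! \brief Zero all counters and timings in the GPU timing struct.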
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}
/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static void initAtomdataFirst(NBAtomData*          atomdata,
                              int                  numTypes,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}
/*! \brief Initialize the nonbonded parameter data structure. */
static void initNbparam(NBParamGpu*                     nbp,
                        const interaction_const_t&      ic,
                        const PairlistParams&           listParams,
                        const nbnxn_atomdata_t::Params& nbatParams,
                        const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
    }
}
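
/*! \brief Allocate and initialize all GPU non-bonded data structures: device context and
 *  streams, kernel parameters, pair lists, atom-data placeholders and (optional) timers.
 *
 *  A minimal usage sketch, based on the functions in this file (caller-owned objects and
 *  variable names are illustrative):
 *  \code
 *  NbnxmGpu* nb = gpu_init(deviceStreamManager, &ic, listParams, nbat, haveNonlocalWork);
 *  gpu_init_pairlist(nb, h_plist, InteractionLocality::Local); // at pair search
 *  gpu_init_atomdata(nb, nbat);                                // after pair search
 *  \endcode
 */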
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb           = new NbnxmGpu();
    nb->deviceContext_ = &deviceStreamManager.context();
    nb->atdat          = new NBAtomData;
    nb->nbparam        = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect nr of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];

    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
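
/*! \brief Upload per-atom data (LJ types or combination-rule parameters) to the device,
 *  growing the atom-data buffers first when the atom count exceeds the allocation. */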
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
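
/*! \brief Clear the device force buffer, plus the shift forces and energies when
 *  \p computeVirial indicates those outputs are used in the current step. */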
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);

    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }

    issueClFlushInStream(localStream);
}
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}
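
//! \brief Return whether the selected Ewald kernels use analytical (rather than tabulated) correction.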
bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
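
/*! \brief Map the Coulomb interaction type in \p ic to the matching GPU electrostatics
 *  kernel type, throwing for types without a GPU implementation. */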
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}
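
/*! \brief Map the VdW settings in \p ic and the LJ combination rule to the matching
 *  GPU VdW kernel type, throwing for setups without a GPU implementation. */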
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}
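
/*! \brief Record whether any short-range GPU work exists for \p iLocality, i.e. pair-list
 *  entries or (locality-independent) GPU bonded work, for later queries. */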
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}
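
//! \brief Return whether short-range GPU work was recorded for \p interactionLocality.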
bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                  "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                         &adat->f,
                         atomsRange.begin(),
                         atomsRange.size(),
                         deviceStream,
                         GpuApiCallBehavior::Async,
                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

    issueClFlushInStream(deviceStream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}
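
/*! \brief Make the non-local stream wait for the local stream's x/q H2D copy and misc.
 *  local work, by marking the misc_ops_and_local_H2D_done event in the local stream and
 *  enqueuing a wait on it in the non-local one. */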
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that takes the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* The misc operations and the local xq H2D issued in the local stream are now
       done; record that in the local stream and make the non-local stream wait on it
       (see nbnxnInsertNonlocalGpuDependency()). */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

} // namespace Nbnxm