/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 * \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 * \author Anca Hamuraru <anca@streamcomputing.eu>
 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 * \author Teemu Virolainen <teemu@streamcomputing.eu>
 * \author Szilárd Páll <pall.szilard@gmail.com>
 * \author Artem Zhmurov <zhmurov@gmail.com>
 *
 * \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif
#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

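/*! \brief Issue a flush on \p deviceStream when building for OpenCL; a no-op otherwise. */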
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}

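/*! \brief Initialize the Ewald Coulomb correction force table and upload it to the device,
 * freeing any previously allocated table first.
 */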
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}

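/*! \brief Pick the Ewald kernel flavor (analytical/tabulated, single/twin cut-off)
 * based on the settings in \p ic, the device, and override environment variables.
 */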
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

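/*! \brief Copy the interaction cut-off and switching parameters from \p ic and \p listParams
 * to the device parameter struct \p nbp.
 */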
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}

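//! This function is documented in the header file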
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

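/*! \brief Initialize \p pl to an empty state, with null device pointers and sizes marked uninitialized. */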
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

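/*! \brief Reset all GPU timing counters in \p t to zero. */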
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static void initAtomdataFirst(NBAtomData*          atomdata,
                              int                  numTypes,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}

/*! \brief Initialize the nonbonded parameter data structure. */
static void initNbparam(NBParamGpu*                     nbp,
                        const interaction_const_t&      ic,
                        const PairlistParams&           listParams,
                        const nbnxn_atomdata_t::Params& nbatParams,
                        const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
    }
}

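//! This function is documented in the header file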
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomData;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect nr of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by
     * the GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}

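//! This function is documented in the header file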
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           SHIFTS,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

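//! This function is documented in the header file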
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

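//! This function is documented in the header file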
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

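/*! \brief Pick the electrostatics GPU kernel flavor that matches \p ic, throwing for
 * types without a GPU implementation.
 */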
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

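/*! \brief Pick the Van der Waals GPU kernel flavor that matches \p ic and
 * \p ljCombinationRule, throwing for unsupported combinations.
 */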
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

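/*! \brief Set up internal flags that indicate what type of short-range work there is for \p iLocality. */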
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

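/*! \brief Return whether there is short-range work for \p interactionLocality,
 * as recorded by setupGpuShortRangeWork().
 */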
bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       one has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                  "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                         &adat->f,
                         atomsRange.begin(),
                         atomsRange.size(),
                         deviceStream,
                         GpuApiCallBehavior::Async,
                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

    issueClFlushInStream(deviceStream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

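/*! \brief Synchronize the non-local stream with dependent tasks in the local stream.
 *
 * For the Local locality this records an event in the local stream;
 * for the NonLocal locality it enqueues a wait on that event in the non-local stream.
 */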
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

} // namespace Nbnxm