2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Define common implementation of nbnxm_gpu_data_mgmt.h
39 * \author Anca Hamuraru <anca@streamcomputing.eu>
40 * \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41 * \author Teemu Virolainen <teemu@streamcomputing.eu>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Artem Zhmurov <zhmurov@gmail.com>
45 * \ingroup module_nbnxm
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/pmalloc.h"
67 #include "gromacs/hardware/device_information.h"
68 #include "gromacs/mdtypes/interaction_const.h"
69 #include "gromacs/mdtypes/simulation_workload.h"
70 #include "gromacs/nbnxm/gpu_common_utils.h"
71 #include "gromacs/nbnxm/gpu_data_mgmt.h"
72 #include "gromacs/pbcutil/ishift.h"
73 #include "gromacs/timing/gpu_timing.h"
74 #include "gromacs/pbcutil/ishift.h"
75 #include "gromacs/utility/cstringutil.h"
76 #include "gromacs/utility/exceptions.h"
77 #include "gromacs/utility/fatalerror.h"
79 #include "nbnxm_gpu.h"
80 #include "pairlistsets.h"
85 inline void issueClFlushInStream(const DeviceStream& deviceStream)
88 /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
89 * in the stream after marking an event in it in order to be able to sync with
90 * the event from another stream.
92 cl_int cl_error = clFlush(deviceStream.stream());
93 if (cl_error != CL_SUCCESS)
95 GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
98 GMX_UNUSED_VALUE(deviceStream);
102 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
104 const DeviceContext& deviceContext)
106 if (nbp->coulomb_tab)
108 destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
111 nbp->coulomb_tab_scale = tables.scale;
112 initParamLookupTable(
113 &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
116 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
117 const DeviceInformation gmx_unused& deviceInfo)
119 bool bTwinCut = (ic.rcoulomb != ic.rvdw);
121 /* Benchmarking/development environment variables to force the use of
122 analytical or tabulated Ewald kernel. */
123 const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
124 const bool forceTabulatedEwald = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
125 const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
127 if (forceAnalyticalEwald && forceTabulatedEwald)
130 "Both analytical and tabulated Ewald GPU non-bonded kernels "
131 "requested through environment variables.");
134 /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
136 const bool c_useTabulatedEwaldDefault =
138 (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
139 || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
143 bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
144 if (forceAnalyticalEwald)
146 bUseAnalyticalEwald = true;
149 fprintf(debug, "Using analytical Ewald GPU kernels\n");
152 else if (forceTabulatedEwald)
154 bUseAnalyticalEwald = false;
158 fprintf(debug, "Using tabulated Ewald GPU kernels\n");
162 /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
163 forces it (use it for debugging/benchmarking only). */
164 if (!bTwinCut && !forceTwinCutoffEwald)
166 return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
170 return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
174 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
176 nbp->ewald_beta = ic.ewaldcoeff_q;
177 nbp->sh_ewald = ic.sh_ewald;
178 nbp->epsfac = ic.epsfac;
179 nbp->two_k_rf = 2.0 * ic.reactionFieldCoefficient;
180 nbp->c_rf = ic.reactionFieldShift;
181 nbp->rvdw_sq = ic.rvdw * ic.rvdw;
182 nbp->rcoulomb_sq = ic.rcoulomb * ic.rcoulomb;
183 nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
184 nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
185 nbp->useDynamicPruning = listParams.useDynamicPruning;
187 nbp->sh_lj_ewald = ic.sh_lj_ewald;
188 nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
190 nbp->rvdw_switch = ic.rvdw_switch;
191 nbp->dispersion_shift = ic.dispersion_shift;
192 nbp->repulsion_shift = ic.repulsion_shift;
193 nbp->vdw_switch = ic.vdw_switch;
196 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
198 if (!nbv || !nbv->useGpu())
202 NbnxmGpu* nb = nbv->gpu_nbv;
203 NBParamGpu* nbp = nb->nbparam;
205 set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
207 nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
209 GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
210 init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
213 void init_plist(gpu_plist* pl)
215 /* initialize to nullptr pointers to data that is not allocated here and will
216 need reallocation in nbnxn_gpu_init_pairlist */
222 /* size -1 indicates that the respective array hasn't been initialized yet */
229 pl->imask_nalloc = -1;
231 pl->excl_nalloc = -1;
232 pl->haveFreshList = false;
233 pl->rollingPruningNumParts = 0;
234 pl->rollingPruningPart = 0;
237 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
244 for (int i = 0; i < 2; i++)
246 for (int j = 0; j < 2; j++)
248 t->ktime[i][j].t = 0.0;
249 t->ktime[i][j].c = 0;
253 t->pruneTime.t = 0.0;
254 t->dynamicPruneTime.c = 0;
255 t->dynamicPruneTime.t = 0.0;
258 /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
259 static void initAtomdataFirst(NBAtomData* atomdata,
261 const DeviceContext& deviceContext,
262 const DeviceStream& localStream)
264 atomdata->numTypes = numTypes;
265 allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
266 atomdata->shiftVecUploaded = false;
268 allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
269 allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
270 allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
272 clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
273 clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
274 clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
276 /* initialize to nullptr pointers to data that is not allocated here and will
277 need reallocation in later */
278 atomdata->xq = nullptr;
279 atomdata->f = nullptr;
281 /* size -1 indicates that the respective array hasn't been initialized yet */
282 atomdata->numAtoms = -1;
283 atomdata->numAtomsAlloc = -1;
286 /*! \brief Initialize the nonbonded parameter data structure. */
287 static void initNbparam(NBParamGpu* nbp,
288 const interaction_const_t& ic,
289 const PairlistParams& listParams,
290 const nbnxn_atomdata_t::Params& nbatParams,
291 const DeviceContext& deviceContext)
293 const int numTypes = nbatParams.numTypes;
295 set_cutoff_parameters(nbp, ic, listParams);
297 nbp->vdwType = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
298 nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
300 if (ic.vdwtype == VanDerWaalsType::Pme)
302 if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
304 GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
305 "Combination rule mismatch!");
309 GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
310 "Combination rule mismatch!");
314 /* generate table for PME */
315 if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
317 GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
318 init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
322 // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
323 allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
326 /* set up LJ parameter lookup table */
327 if (!useLjCombRule(nbp->vdwType))
329 static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
330 "Mismatch in the size of host / device data types");
331 initParamLookupTable(&nbp->nbfp,
333 reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
339 // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
340 allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
343 /* set up LJ-PME parameter lookup table */
344 if (ic.vdwtype == VanDerWaalsType::Pme)
346 static_assert(sizeof(decltype(nbp->nbfp_comb))
347 == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
348 "Mismatch in the size of host / device data types");
349 initParamLookupTable(&nbp->nbfp_comb,
350 &nbp->nbfp_comb_texobj,
351 reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
357 // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
358 allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
362 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
363 const interaction_const_t* ic,
364 const PairlistParams& listParams,
365 const nbnxn_atomdata_t* nbat,
366 const bool bLocalAndNonlocal)
368 auto* nb = new NbnxmGpu();
369 nb->deviceContext_ = &deviceStreamManager.context();
370 nb->atdat = new NBAtomData;
371 nb->nbparam = new NBParamGpu;
372 nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
373 if (bLocalAndNonlocal)
375 nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
378 nb->bUseTwoStreams = bLocalAndNonlocal;
380 GMX_ASSERT(!(GMX_GPU_SYCL && !nb->bDoTime), "GPU timing is not supported in SYCL");
381 nb->timers = new Nbnxm::GpuTimers();
382 snew(nb->timings, 1);
384 /* WARNING: CUDA timings are incorrect with multiple streams.
385 * This is the main reason why they are disabled by default.
386 * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
387 * TODO: Consider turning on by default when we can detect nr of streams.
389 * OpenCL timing is enabled by default and can be disabled by
390 * GMX_DISABLE_GPU_TIMING environment variable.
392 * Timing is disabled in SYCL.
394 nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
395 || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
399 init_timings(nb->timings);
403 pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
404 pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
405 pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
407 init_plist(nb->plist[InteractionLocality::Local]);
409 /* local/non-local GPU streams */
410 GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
411 "Local non-bonded stream should be initialized to use GPU for non-bonded.");
412 const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
413 nb->deviceStreams[InteractionLocality::Local] = &localStream;
414 // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
415 // out-of-order. But for the time being, it will be less disruptive to keep them.
416 if (nb->bUseTwoStreams)
418 init_plist(nb->plist[InteractionLocality::NonLocal]);
420 GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
421 "Non-local non-bonded stream should be initialized to use GPU for "
422 "non-bonded with domain decomposition.");
423 nb->deviceStreams[InteractionLocality::NonLocal] =
424 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
427 const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
428 const DeviceContext& deviceContext = *nb->deviceContext_;
430 initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
431 initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
433 gpu_init_platform_specific(nb);
437 fprintf(debug, "Initialized NBNXM GPU data structures.\n");
443 //! This function is documented in the header file
444 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
447 // Timing accumulation should happen only if there was work to do
448 // because getLastRangeTime() gets skipped with empty lists later
449 // which leads to the counter not being reset.
450 bool bDoTime = (nb->bDoTime && !h_plist->sci.empty());
451 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
452 gpu_plist* d_plist = nb->plist[iloc];
454 if (d_plist->na_c < 0)
456 d_plist->na_c = h_plist->na_ci;
460 if (d_plist->na_c != h_plist->na_ci)
463 "In init_plist: the #atoms per cell has changed (from %d to %d)",
470 GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
474 iTimers.pl_h2d.openTimingRegion(deviceStream);
475 iTimers.didPairlistH2D = true;
478 // TODO most of this function is same in CUDA and OpenCL, move into the header
479 const DeviceContext& deviceContext = *nb->deviceContext_;
481 reallocateDeviceBuffer(
482 &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
483 copyToDeviceBuffer(&d_plist->sci,
488 GpuApiCallBehavior::Async,
489 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
491 reallocateDeviceBuffer(
492 &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
493 copyToDeviceBuffer(&d_plist->cj4,
498 GpuApiCallBehavior::Async,
499 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
501 reallocateDeviceBuffer(&d_plist->imask,
502 h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
504 &d_plist->imask_nalloc,
507 reallocateDeviceBuffer(
508 &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
509 copyToDeviceBuffer(&d_plist->excl,
510 h_plist->excl.data(),
512 h_plist->excl.size(),
514 GpuApiCallBehavior::Async,
515 bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
519 iTimers.pl_h2d.closeTimingRegion(deviceStream);
522 /* need to prune the pair list during the next step */
523 d_plist->haveFreshList = true;
526 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
528 bool bDoTime = nb->bDoTime;
529 Nbnxm::GpuTimers* timers = bDoTime ? nb->timers : nullptr;
530 NBAtomData* atdat = nb->atdat;
531 const DeviceContext& deviceContext = *nb->deviceContext_;
532 const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
534 int numAtoms = nbat->numAtoms();
535 bool realloced = false;
539 /* time async copy */
540 timers->atdat.openTimingRegion(localStream);
543 /* need to reallocate if we have to copy more atoms than the amount of space
544 available and only allocate if we haven't initialized yet, i.e atdat->natoms == -1 */
545 if (numAtoms > atdat->numAtomsAlloc)
547 int numAlloc = over_alloc_small(numAtoms);
549 /* free up first if the arrays have already been initialized */
550 if (atdat->numAtomsAlloc != -1)
552 freeDeviceBuffer(&atdat->f);
553 freeDeviceBuffer(&atdat->xq);
554 if (useLjCombRule(nb->nbparam->vdwType))
556 freeDeviceBuffer(&atdat->ljComb);
560 freeDeviceBuffer(&atdat->atomTypes);
565 allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
566 allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
568 if (useLjCombRule(nb->nbparam->vdwType))
570 // Two Lennard-Jones parameters per atom
571 allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
575 allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
578 atdat->numAtomsAlloc = numAlloc;
582 atdat->numAtoms = numAtoms;
583 atdat->numAtomsLocal = nbat->natoms_local;
585 /* need to clear GPU f output if realloc happened */
588 clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
591 if (useLjCombRule(nb->nbparam->vdwType))
594 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
595 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
596 copyToDeviceBuffer(&atdat->ljComb,
597 reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
601 GpuApiCallBehavior::Async,
602 bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
606 static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
607 "Sizes of host- and device-side atom types should be the same.");
608 copyToDeviceBuffer(&atdat->atomTypes,
609 nbat->params().type.data(),
613 GpuApiCallBehavior::Async,
614 bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
619 timers->atdat.closeTimingRegion(localStream);
622 /* kick off the tasks enqueued above to ensure concurrency with the search */
623 issueClFlushInStream(localStream);
626 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
628 NBAtomData* adat = nb->atdat;
629 const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
631 clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
632 // Clear shift force array and energies if the outputs were used in the current step
635 clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
636 clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
637 clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
639 issueClFlushInStream(localStream);
642 //! This function is documented in the header file
643 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
645 return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
648 //! This function is documented in the header file
649 void gpu_reset_timings(nonbonded_verlet_t* nbv)
651 if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
653 init_timings(nbv->gpu_nbv->timings);
657 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
659 return ((nb->nbparam->elecType == ElecType::EwaldAna)
660 || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
663 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
664 const DeviceInformation& deviceInfo)
666 if (ic.eeltype == CoulombInteractionType::Cut)
668 return ElecType::Cut;
670 else if (EEL_RF(ic.eeltype))
674 else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
676 return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
680 /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
681 GMX_THROW(gmx::InconsistentInputError(
682 gmx::formatString("The requested electrostatics type %s is not implemented in "
683 "the GPU accelerated kernels!",
684 enumValueToString(ic.eeltype))));
689 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
691 if (ic.vdwtype == VanDerWaalsType::Cut)
693 switch (ic.vdw_modifier)
695 case InteractionModifiers::None:
696 case InteractionModifiers::PotShift:
697 switch (ljCombinationRule)
699 case LJCombinationRule::None: return VdwType::Cut;
700 case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
701 case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
703 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
704 "The requested LJ combination rule %s is not implemented in "
705 "the GPU accelerated kernels!",
706 enumValueToString(ljCombinationRule))));
708 case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
709 case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
711 GMX_THROW(gmx::InconsistentInputError(
712 gmx::formatString("The requested VdW interaction modifier %s is not "
713 "implemented in the GPU accelerated kernels!",
714 enumValueToString(ic.vdw_modifier))));
717 else if (ic.vdwtype == VanDerWaalsType::Pme)
719 if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
721 assert(ljCombinationRule == LJCombinationRule::Geometric);
722 return VdwType::EwaldGeom;
726 assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
727 return VdwType::EwaldLB;
732 GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
733 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
734 enumValueToString(ic.vdwtype))));
738 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
740 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
742 // There is short-range work if the pair list for the provided
743 // interaction locality contains entries or if there is any
744 // bonded work (as this is not split into local/nonlocal).
745 nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
746 || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
749 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
751 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
753 return nb->haveWork[interactionLocality];
757 * Launch asynchronously the download of nonbonded forces from the GPU
758 * (and energies/shift forces if required).
760 void gpu_launch_cpyback(NbnxmGpu* nb,
761 struct nbnxn_atomdata_t* nbatom,
762 const gmx::StepWorkload& stepWork,
763 const AtomLocality atomLocality)
765 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
767 /* determine interaction locality from atom locality */
768 const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
769 GMX_ASSERT(iloc == InteractionLocality::Local
770 || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
771 "Non-local stream is indicating that the copy back event is enqueued at the "
772 "beginning of the copy back function.");
774 /* extract the data */
775 NBAtomData* adat = nb->atdat;
776 Nbnxm::GpuTimers* timers = nb->timers;
777 bool bDoTime = nb->bDoTime;
778 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
780 /* don't launch non-local copy-back if there was no non-local work to do */
781 if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
783 /* TODO An alternative way to signal that non-local work is
784 complete is to use a clEnqueueMarker+clEnqueueBarrier
785 pair. However, the use of bNonLocalStreamDoneMarked has the
786 advantage of being local to the host, so probably minimizes
787 overhead. Curiously, for NVIDIA OpenCL with an empty-domain
788 test case, overall simulation performance was higher with
789 the API calls, but this has not been tested on AMD OpenCL,
790 so could be worth considering in future. */
791 nb->bNonLocalStreamDoneMarked = false;
795 /* local/nonlocal offset and length used for xq and f */
796 auto atomsRange = getGpuAtomRange(adat, atomLocality);
798 /* beginning of timed D2H section */
801 timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
804 /* With DD the local D2H transfer can only start after the non-local
805 has been launched. */
806 if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
808 nb->nonlocal_done.enqueueWaitEvent(deviceStream);
809 nb->bNonLocalStreamDoneMarked = false;
813 static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
814 "The host force buffer should be in single precision to match device data size.");
815 copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
820 GpuApiCallBehavior::Async,
821 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
823 issueClFlushInStream(deviceStream);
825 /* After the non-local D2H is launched the nonlocal_done event can be
826 recorded which signals that the local D2H can proceed. This event is not
827 placed after the non-local kernel because we first need the non-local
829 if (iloc == InteractionLocality::NonLocal)
831 nb->nonlocal_done.markEvent(deviceStream);
832 nb->bNonLocalStreamDoneMarked = true;
835 /* only transfer energies in the local stream */
836 if (iloc == InteractionLocality::Local)
838 /* DtoH fshift when virial is needed */
839 if (stepWork.computeVirial)
842 sizeof(*nb->nbst.fShift) == sizeof(Float3),
843 "Sizes of host- and device-side shift vector elements should be the same.");
844 copyFromDeviceBuffer(nb->nbst.fShift,
849 GpuApiCallBehavior::Async,
850 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
854 if (stepWork.computeEnergy)
856 static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
857 "Sizes of host- and device-side LJ energy terms should be the same.");
858 copyFromDeviceBuffer(nb->nbst.eLJ,
863 GpuApiCallBehavior::Async,
864 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
865 static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
866 "Sizes of host- and device-side electrostatic energy terms should be the "
868 copyFromDeviceBuffer(nb->nbst.eElec,
873 GpuApiCallBehavior::Async,
874 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
880 timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
884 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
886 const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
888 /* When we get here all misc operations issued in the local stream as well as
889 the local xq H2D are done,
890 so we record that in the local stream and wait for it in the nonlocal one.
891 This wait needs to precede any PP tasks, bonded or nonbonded, that may
892 compute on interactions between local and nonlocal atoms.
894 if (nb->bUseTwoStreams)
896 if (interactionLocality == InteractionLocality::Local)
898 nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
899 issueClFlushInStream(deviceStream);
903 nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
908 /*! \brief Launch asynchronously the xq buffer host to device copy. */
909 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
911 GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
913 const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
915 NBAtomData* adat = nb->atdat;
916 gpu_plist* plist = nb->plist[iloc];
917 Nbnxm::GpuTimers* timers = nb->timers;
918 const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
920 const bool bDoTime = nb->bDoTime;
922 /* Don't launch the non-local H2D copy if there is no dependent
923 work to do: neither non-local nor other (e.g. bonded) work
924 to do that has as input the nbnxn coordaintes.
925 Doing the same for the local kernel is more complicated, since the
926 local part of the force array also depends on the non-local kernel.
927 So to avoid complicating the code and to reduce the risk of bugs,
928 we always call the local local x+q copy (and the rest of the local
929 work in nbnxn_gpu_launch_kernel().
931 if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
933 plist->haveFreshList = false;
935 // The event is marked for Local interactions unconditionally,
936 // so it has to be released here because of the early return
937 // for NonLocal interactions.
938 nb->misc_ops_and_local_H2D_done.reset();
943 /* local/nonlocal offset and length used for xq and f */
944 const auto atomsRange = getGpuAtomRange(adat, atomLocality);
946 /* beginning of timed HtoD section */
949 timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
953 GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
954 "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
955 copyToDeviceBuffer(&adat->xq,
956 reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
960 GpuApiCallBehavior::Async,
965 timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
968 /* When we get here all misc operations issued in the local stream as well as
969 the local xq H2D are done,
970 so we record that in the local stream and wait for it in the nonlocal one.
971 This wait needs to precede any PP tasks, bonded or nonbonded, that may
972 compute on interactions between local and nonlocal atoms.
974 nbnxnInsertNonlocalGpuDependency(nb, iloc);