/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif
#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}

void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}

enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0. */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
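
/*! \brief Copy the cutoff and switching parameters from the interaction
 *  constants and pairlist parameters into the GPU parameter struct.
 */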
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}

void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}
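
/*! \brief Reset \p pl to an empty, unallocated state; the buffers are
 *  (re)allocated later in nbnxn_gpu_init_pairlist.
 */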
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
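
/*! \brief Zero all GPU non-bonded timing counters. */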
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static void initAtomdataFirst(NBAtomData*          atomdata,
                              int                  numTypes,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}

/*! \brief Initialize the nonbonded parameter data structure. */
static void initNbparam(NBParamGpu*                     nbp,
                        const interaction_const_t&      ic,
                        const PairlistParams&           listParams,
                        const nbnxn_atomdata_t::Params& nbatParams,
                        const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}

NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomData;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect nr of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
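
    /* Example (illustrative only): collecting CUDA kernel timings in a run
       that uses a single GPU stream:
         GMX_ENABLE_GPU_TIMING=1 gmx mdrun ...
     */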
    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* pinned host staging buffers for energies and shift forces */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}
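
/*! \brief Upload the shift vectors to the device if the box is dynamic
 *  or they have not been uploaded yet.
 */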
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           SHIFTS,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];

    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
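
/*! \brief Clear the device-side force buffer, and also the shift-force and
 *  energy outputs when \p computeVirial is set.
 */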
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    /* clear shift force array and energies if the outputs were used in the current step */
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}
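
/*! \brief Return whether the selected Ewald GPU kernel flavor is analytical
 *  (rather than tabulated).
 */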
bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
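
/*! \brief Map the requested Coulomb interaction settings onto a GPU
 *  electrostatics kernel flavor.
 */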
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}
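
/*! \brief Record whether the given interaction locality has short-range GPU
 *  work (pairlist entries or GPU bonded interactions).
 */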
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}
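
/*! \brief Return whether short-range GPU work was recorded for
 *  \p interactionLocality by setupGpuShortRangeWork().
 */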
bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       transfer has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}
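
/*! \brief Make the non-local stream wait on the local-stream work recorded so
 *  far (misc. operations and the local xq H2D copy).
 */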
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks,
       bonded or nonbonded, that may compute on interactions between local and
       nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always launch the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks,
       bonded or nonbonded, that may compute on interactions between local and
       nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

} // namespace Nbnxm