/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/gputraits.h"
67 #include "gromacs/gpu_utils/pmalloc.h"
68 #include "gromacs/hardware/device_information.h"
69 #include "gromacs/mdtypes/interaction_const.h"
70 #include "gromacs/mdtypes/simulation_workload.h"
71 #include "gromacs/nbnxm/gpu_common_utils.h"
72 #include "gromacs/nbnxm/gpu_data_mgmt.h"
73 #include "gromacs/nbnxm/gridset.h"
74 #include "gromacs/pbcutil/ishift.h"
75 #include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/pbcutil/ishift.h"
77 #include "gromacs/utility/cstringutil.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
81 #include "nbnxm_gpu.h"
82 #include "pairlistsets.h"
namespace Nbnxm
{

static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
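
/* A minimal usage sketch of the event/flush pattern this helper supports
 * (assuming a GpuEventSynchronizer \c event and two valid streams; see
 * nbnxnInsertNonlocalGpuDependency() below for the real use):
 * \code
 *     event.markEvent(producerStream);        // enqueue a marker in the producer stream
 *     issueClFlushInStream(producerStream);   // OpenCL: make the marker visible to other queues
 *     event.enqueueWaitEvent(consumerStream); // consumer stream waits on the marker
 * \endcode
 */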
static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                                  NBParamGpu*                  nbp,
                                                  const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                                        const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_fatal(FARGS,
                  "Both analytical and tabulated Ewald GPU non-bonded kernels "
                  "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0. */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested through bTwinCut or if the env. var.
       forces it (use the latter for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
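
/* For benchmarking, the kernel flavor can be forced through the environment
 * variables checked above, e.g. (illustrative shell invocations; the mdrun
 * arguments are elided):
 *
 *     GMX_GPU_NB_ANA_EWALD=1 gmx mdrun ...      # force analytical Ewald kernels
 *     GMX_GPU_NB_TAB_EWALD=1 gmx mdrun ...      # force tabulated Ewald kernels
 *     GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...  # force twin cut-off kernels
 *
 * Setting both GMX_GPU_NB_ANA_EWALD and GMX_GPU_NB_TAB_EWALD is a fatal error.
 */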
static inline void set_cutoff_parameters(NBParamGpu*                nbp,
                                         const interaction_const_t& ic,
                                         const PairlistParams&      listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}
static inline void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c          = -1;
    pl->nsci          = -1;
    pl->sci_nalloc    = -1;
    pl->ncj4          = -1;
    pl->cj4_nalloc    = -1;
    pl->nimask        = -1;
    pl->imask_nalloc  = -1;
    pl->nexcl         = -1;
    pl->excl_nalloc   = -1;
    pl->haveFreshList = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
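
/* A minimal sketch (hypothetical helper, not the GROMACS API) of the growth
 * pattern that the -1 size sentinels above enable in the later
 * reallocateDeviceBuffer() calls: the first request always exceeds a -1
 * capacity, so the buffer gets allocated on first use.
 * \code
 *     #include <algorithm>
 *     #include <vector>
 *
 *     template<typename T>
 *     void growIfNeeded(std::vector<T>* buffer, int requiredSize, int* capacity)
 *     {
 *         if (requiredSize > *capacity) // always true on first use, when *capacity == -1
 *         {
 *             *capacity = std::max(requiredSize, 2 * (*capacity));
 *             buffer->resize(*capacity);
 *         }
 *     }
 * \endcode
 */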
static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;

    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}
/*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
                                     int                  numTypes,
                                     const DeviceContext& deviceContext,
                                     const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}
static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
                                                LJCombinationRule          ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::Geometric,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldGeom;
        }
        else
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}
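
/* For illustration: a plain cut-off LJ potential with the potential-shift
 * modifier and a geometric combination rule resolves to VdwType::CutCombGeom.
 * A minimal call sketch (assuming an initialized interaction_const_t ic):
 * \code
 *     const VdwType vdwType = nbnxmGpuPickVdwKernelType(ic, LJCombinationRule::Geometric);
 * \endcode
 */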
static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                            const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}
/*! \brief Initialize the nonbonded parameter data structure. */
static inline void initNbparam(NBParamGpu*                     nbp,
                               const interaction_const_t&      ic,
                               const PairlistParams&           listParams,
                               const nbnxn_atomdata_t::Params& nbatParams,
                               const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomDataGpu;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    nb->bDoTime = decideGpuTimingsUsage();

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}
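
/* A minimal call-site sketch (assuming an already set-up gmx::DeviceStreamManager
 * and valid ic, listParams and nbat objects; pass true as the last argument when
 * domain decomposition requires a non-local stream and pairlist):
 * \code
 *     NbnxmGpu* gpuNbv = gpu_init(deviceStreamManager, &ic, listParams, nbat, haveNonlocalWork);
 *     // ... use gpuNbv for the run ...
 *     gpu_free(gpuNbv);
 * \endcode
 */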
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           gmx::c_numShiftVectors,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move it into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomDataGpu*       atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);

    /* Clear shift force array and energies if the outputs were used in the current step */
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
void setupGpuShortRangeWork(NbnxmGpu*                      nb,
                            const gmx::ListedForcesGpu*    listedForcesGpu,
                            const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}
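
/* A minimal sketch of how the two functions above pair up around a step
 * (assuming valid nb and listedForcesGpu pointers):
 * \code
 *     setupGpuShortRangeWork(nb, listedForcesGpu, gmx::InteractionLocality::NonLocal);
 *     if (haveGpuShortRangeWork(nb, gmx::InteractionLocality::NonLocal))
 *     {
 *         // launch the non-local transfers and kernels
 *     }
 * \endcode
 */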
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomDataGpu*      adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       one has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 gmx::c_numShiftVectors,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomDataGpu*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}
//! This function is documented in the header file
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    gpu_free_platform_specific(nb);

    delete nb->timers;
    sfree(nb->timings);

    NBAtomDataGpu* atdat   = nb->atdat;
    NBParamGpu*    nbparam = nb->nbparam;

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->eLJ));
    freeDeviceBuffer(&(nb->atdat->eElec));
    freeDeviceBuffer(&(nb->atdat->fShift));
    freeDeviceBuffer(&(nb->atdat->shiftVec));
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        freeDeviceBuffer(&atdat->ljComb);
    }
    else
    {
        freeDeviceBuffer(&atdat->atomTypes);
    }

    /* Free nbparam */
    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
    }

    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
    }

    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    delete plist;
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        delete plist_nl;
    }

    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    delete atdat;
    delete nbparam;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
    }
}
DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
{
    GMX_ASSERT(nb != nullptr, "nb pointer must be valid");

    return nb->atdat->f;
}

} // namespace Nbnxm