/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on section 5.13 of the OpenCL v1.2 specification, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
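
/* Note: the call sites below (e.g. nbnxnInsertNonlocalGpuDependency(),
 * gpu_init_atomdata()) rely on this helper to actually submit enqueued work on
 * OpenCL; on the CUDA and SYCL backends it compiles to a no-op, so it is safe
 * to call unconditionally. */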

static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                                  NBParamGpu*                  nbp,
                                                  const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
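
/* Only the force table (tables.tableF) is uploaded here; the tabulated Ewald
 * kernels read it through the lookup-table/texture object, with
 * coulomb_tab_scale mapping an interatomic distance to a table index. */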

static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                                        const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or if the environment
       variable forces it (use for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
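
/* Illustrative use of the override variables above: running e.g.
 *
 *     GMX_GPU_NB_TAB_EWALD=1 gmx mdrun -nb gpu ...
 *
 * forces the tabulated kernels on hardware where the analytical ones are the
 * default, while setting both GMX_GPU_NB_ANA_EWALD and GMX_GPU_NB_TAB_EWALD
 * trips the gmx_incons() check above. */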

static inline void set_cutoff_parameters(NBParamGpu*                nbp,
                                         const interaction_const_t& ic,
                                         const PairlistParams&      listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}
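
/* All cut-off radii are stored squared (e.g. rvdw_sq = rvdw * rvdw) so that the
 * kernels can compare squared interatomic distances directly and avoid a sqrt
 * per pair interaction. */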

static inline void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c                   = -1;
    pl->nsci                   = -1;
    pl->sci_nalloc             = -1;
    pl->ncj4                   = -1;
    pl->cj4_nalloc             = -1;
    pl->nimask                 = -1;
    pl->imask_nalloc           = -1;
    pl->nexcl                  = -1;
    pl->excl_nalloc            = -1;
    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
                                     int                  numTypes,
                                     const DeviceContext& deviceContext,
                                     const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}
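
/* eLJ and eElec are single-element accumulation buffers (hence the size-1
 * allocations above) into which the kernels reduce the total LJ and
 * electrostatic energies; fShift holds one force per PBC shift vector, hence
 * the SHIFTS-sized allocation. */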

static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
                                                LJCombinationRule          ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::Geometric,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldGeom;
        }
        else
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                            const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

/*! \brief Initialize the nonbonded parameter data structure. */
static inline void initNbparam(NBParamGpu*                     nbp,
                               const interaction_const_t&      ic,
                               const PairlistParams&           listParams,
                               const nbnxn_atomdata_t::Params& nbatParams,
                               const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}

NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomDataGpu;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect nr of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by the
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}
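
/* Typical call sequence (illustrative sketch only; dsm, ic, listParams, nbat,
 * havePPDomainDecomposition and pairlist stand for the caller's objects):
 *
 *   NbnxmGpu* nb = gpu_init(dsm, &ic, listParams, nbat, havePPDomainDecomposition);
 *   // at each pair search:
 *   gpu_init_pairlist(nb, pairlist, InteractionLocality::Local);
 *   gpu_init_atomdata(nb, nbat);
 *   // at teardown:
 *   gpu_free(nb);
 */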

void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }
    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           SHIFTS,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}
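
/* With a static box the shift vectors never change, so the copy above runs only
 * on the first call (tracked by shiftVecUploaded); with a dynamic box it is
 * re-issued on every call. */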

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
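
/* The haveFreshList flag set above is what later triggers pruning of this list
 * on its first use; the rollingPruningNumParts/rollingPruningPart counters set
 * in init_plist() then schedule the dynamic pruning that follows in parts. */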

void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomDataGpu*       atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
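
/* over_alloc_small() pads the requested size, so moderate growth of the atom
 * count between pair searches does not force a free/allocate/clear cycle on
 * every call. */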

void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    /* clear shift force array and energies if the outputs were used in the current step */
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomDataGpu*      adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we want the non-local data
       back first. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks,
       bonded or nonbonded, that may compute on interactions between local and
       nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
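
/* Call pattern: a call with InteractionLocality::Local records the
 * misc_ops_and_local_H2D_done event in the local stream, and a subsequent call
 * with InteractionLocality::NonLocal makes the non-local stream wait on that
 * event; see the paired calls at the end of nbnxn_gpu_init_x_to_nbat_x(). */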

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomDataGpu*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work) in nbnxn_gpu_launch_kernel().
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* Record the completion of the local xq H2D (and misc ops issued before it)
       in the local stream and make the non-local stream wait on it; see
       nbnxnInsertNonlocalGpuDependency() above for details. */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

//! This function is documented in the header file
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    gpu_free_platform_specific(nb);

    delete nb->timers;
    sfree(nb->timings);

    NBAtomDataGpu* atdat   = nb->atdat;
    NBParamGpu*    nbparam = nb->nbparam;

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->eLJ));
    freeDeviceBuffer(&(nb->atdat->eElec));
    freeDeviceBuffer(&(nb->atdat->fShift));
    freeDeviceBuffer(&(nb->atdat->shiftVec));
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        freeDeviceBuffer(&atdat->ljComb);
    }
    else
    {
        freeDeviceBuffer(&atdat->atomTypes);
    }

    /* Free nbparam */
    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
    }

    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
    }

    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    delete plist;
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        delete plist_nl;
    }

    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    delete atdat;
    delete nbparam;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
    }
}

} // namespace Nbnxm