/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021 by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on section 5.13 of the OpenCL 1.2 specification, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}

static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                                  NBParamGpu*                  nbp,
                                                  const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(&nbp->coulomb_tab,
                         &nbp->coulomb_tab_texobj,
                         tables.tableF.data(),
                         tables.tableF.size(),
                         deviceContext);
}

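/* Note: tableF holds the tabulated Ewald force correction factor sampled on
 * a uniform grid with spacing 1/tables.scale (stored as coulomb_tab_scale
 * above); the tabulated kernels interpolate in this table instead of
 * evaluating the erfc()-based correction analytically.
 */
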
static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                                        const DeviceInformation gmx_unused& deviceInfo)
{
    const bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernels. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

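/* Example usage of the variables above (development/benchmarking only):
 *   GMX_GPU_NB_TAB_EWALD=1 gmx mdrun ...      # force tabulated Ewald kernels
 *   GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...  # force twin-cutoff kernels
 */
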
static inline void set_cutoff_parameters(NBParamGpu*                nbp,
                                         const interaction_const_t& ic,
                                         const PairlistParams&      listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}

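/* Note that set_cutoff_parameters() is used both at initialization (from
 * initNbparam()) and during PME tuning (from gpu_pme_loadbal_update_param()
 * below), so it is limited to parameters that can change during PME load
 * balancing.
 */
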
static inline void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

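/* A note on the rollingPruning* fields: with dynamic pruning the pair list
 * is pruned in rollingPruningNumParts chunks on consecutive MD steps
 * (rollingPruningPart tracks which chunk is due next), which spreads the
 * pruning cost over the lifetime of the list.
 */
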
static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
                                     int                  numTypes,
                                     const DeviceContext& deviceContext,
                                     const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}

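/* Note that eLJ and eElec are single-element device buffers: the nonbonded
 * kernels reduce all per-thread energy contributions into these accumulators,
 * which is why one-element clears (above and in gpu_clear_outputs()) suffice.
 */
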
static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
                                                LJCombinationRule          ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::Geometric,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldGeom;
        }
        else
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                            const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

/*! \brief Initialize the nonbonded parameter data structure. */
static inline void initNbparam(NBParamGpu*                     nbp,
                               const interaction_const_t&      ic,
                               const PairlistParams&           listParams,
                               const nbnxn_atomdata_t::Params& nbatParams,
                               const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}

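/* Layout note: nbfp is the numTypes x numTypes matrix of LJ (C6, C12) pairs
 * packed as Float2, which is what the reinterpret_cast and the 2x-size
 * static_asserts above rely on; nbfp_comb holds one such pair per type for
 * the LJ-PME path.
 */
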
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb           = new NbnxmGpu();
    nb->deviceContext_ = &deviceStreamManager.context();
    nb->atdat          = new NBAtomDataGpu;
    nb->nbparam        = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect the number of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by setting the
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}

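/* A minimal caller sketch (hypothetical variable names, for orientation only):
 *
 *   NbnxmGpu* gpuNbv = Nbnxm::gpu_init(
 *           deviceStreamManager, &interactionConst, listParams, nbat, haveMultipleDomains);
 *   // ... simulation steps using gpuNbv ...
 *   Nbnxm::gpu_free(gpuNbv);
 */
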
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           gmx::c_numShiftVectors,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}

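/* The shift vectors depend only on the box dimensions, so with a static box
 * a single upload suffices for the whole run; with a dynamic box (e.g. with
 * pressure coupling) they are re-uploaded on every call.
 */
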
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

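/* Note on sizing above: imask holds one interaction mask per cluster-pair
 * split, hence the factor c_nbnxnGpuClusterpairSplit. Unlike sci/cj4/excl,
 * no host-to-device copy is issued for it here; it is filled on the device
 * when the freshly marked list (haveFreshList) is pruned.
 */
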
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomDataGpu*       atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

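/* over_alloc_small() above over-allocates by a modest factor, so small
 * fluctuations in the atom count (e.g. between domain-decomposition
 * repartitionings) do not trigger a free/allocate/clear cycle on every call.
 */
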
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

void setupGpuShortRangeWork(NbnxmGpu*                      nb,
                            const gmx::ListedForcesGpu*    listedForcesGpu,
                            const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomDataGpu*      adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 gmx::c_numShiftVectors,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

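/* All transfers above are asynchronous. The host-side nbst buffers are
 * pinned (allocated with pmalloc() in gpu_init()), and consumers must
 * synchronize on the stream, or on the timing events, before reading them.
 */
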
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

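/* Rough timeline of the mark/wait pair above (a sketch):
 *   local stream:     misc ops ... local xq H2D ... markEvent(misc_ops_and_local_H2D_done)
 *   non-local stream: enqueueWaitEvent(misc_ops_and_local_H2D_done) ... non-local work ...
 * so non-local work can only start once the local H2D transfer has completed.
 */
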
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomDataGpu*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

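/* The xq device buffer stores one Float4 per atom holding {x, y, z, q},
 * which is why the nbatXYZQ host layout can be reinterpret_cast to Float4
 * and moved in a single contiguous transfer.
 */
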
/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

//! This function is documented in the header file
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    gpu_free_platform_specific(nb);

    delete nb->timers;
    sfree(nb->timings);

    NBAtomDataGpu* atdat   = nb->atdat;
    NBParamGpu*    nbparam = nb->nbparam;

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->eLJ));
    freeDeviceBuffer(&(nb->atdat->eElec));
    freeDeviceBuffer(&(nb->atdat->fShift));
    freeDeviceBuffer(&(nb->atdat->shiftVec));
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        freeDeviceBuffer(&atdat->ljComb);
    }
    else
    {
        freeDeviceBuffer(&atdat->atomTypes);
    }

    /* Free nbparam */
    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
    }

    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
    }

    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    delete plist;
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        delete plist_nl;
    }

    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    delete atdat;
    delete nbparam;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
    }
}

DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
{
    GMX_ASSERT(nb != nullptr, "nb pointer must be valid");

    return nb->atdat->f;
}

} // namespace Nbnxm