1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 /*! \internal \file
37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
38  *
39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
42  *  \author Szilárd Páll <pall.szilard@gmail.com>
43  *  \author Artem Zhmurov <zhmurov@gmail.com>
44  *
45  *  \ingroup module_nbnxm
46  */
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #if GMX_GPU_CUDA
52 #    include "cuda/nbnxm_cuda_types.h"
53 #endif
54
55 #if GMX_GPU_OPENCL
56 #    include "opencl/nbnxm_ocl_types.h"
57 #endif
58
59 #if GMX_GPU_SYCL
60 #    include "sycl/nbnxm_sycl_types.h"
61 #endif
62
63 #include "nbnxm_gpu_data_mgmt.h"
64
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/gputraits.h"
67 #include "gromacs/gpu_utils/pmalloc.h"
68 #include "gromacs/hardware/device_information.h"
69 #include "gromacs/mdtypes/interaction_const.h"
70 #include "gromacs/mdtypes/simulation_workload.h"
71 #include "gromacs/nbnxm/gpu_common_utils.h"
72 #include "gromacs/nbnxm/gpu_data_mgmt.h"
73 #include "gromacs/nbnxm/gridset.h"
74 #include "gromacs/pbcutil/ishift.h"
75 #include "gromacs/timing/gpu_timing.h"
77 #include "gromacs/utility/cstringutil.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
80
81 #include "nbnxm_gpu.h"
82 #include "pairlistsets.h"
83
84 namespace Nbnxm
85 {
86
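/* A rough sketch of how the entry points below are typically used by the host code
 * (an overview, not a contract): gpu_init() once at setup; gpu_init_pairlist() and
 * gpu_init_atomdata() after each pair search; gpu_upload_shiftvec(), gpu_copy_xq_to_gpu()
 * and gpu_launch_cpyback() around the kernel launches every MD step; gpu_clear_outputs()
 * to reset the force/energy accumulators; and gpu_free() at the end of the run.
 */
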
87 static inline void issueClFlushInStream(const DeviceStream& deviceStream)
88 {
89 #if GMX_GPU_OPENCL
90     /* Based on section 5.13 of the OpenCL v1.2 specification, a flush is needed
91      * in the stream after marking an event in it, in order to be able to sync with
92      * the event from another stream.
93      */
94     cl_int cl_error = clFlush(deviceStream.stream());
95     if (cl_error != CL_SUCCESS)
96     {
97         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
98     }
99 #else
100     GMX_UNUSED_VALUE(deviceStream);
101 #endif
102 }
103
104 static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
105                                                   NBParamGpu*                  nbp,
106                                                   const DeviceContext&         deviceContext)
107 {
108     if (nbp->coulomb_tab)
109     {
110         destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
111     }
112
113     nbp->coulomb_tab_scale = tables.scale;
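    /* Upload the new table data (and, where supported, set up the texture object
       used for cached reads). */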
114     initParamLookupTable(
115             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
116 }
117
118 static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
119                                                         const DeviceInformation gmx_unused& deviceInfo)
120 {
121     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
122
123     /* Benchmarking/development environment variables to force the use of
124        analytical or tabulated Ewald kernel. */
125     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
126     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
127     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
128
129     if (forceAnalyticalEwald && forceTabulatedEwald)
130     {
131         gmx_incons(
132                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
133                 "requested through environment variables.");
134     }
135
136     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
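     * (the tabulated kernels are reported to perform somewhat better on those architectures).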
137      */
138     const bool c_useTabulatedEwaldDefault =
139 #if GMX_GPU_CUDA
140             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
141             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
142 #else
143             false;
144 #endif
145     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
146     if (forceAnalyticalEwald)
147     {
148         bUseAnalyticalEwald = true;
149         if (debug)
150         {
151             fprintf(debug, "Using analytical Ewald GPU kernels\n");
152         }
153     }
154     else if (forceTabulatedEwald)
155     {
156         bUseAnalyticalEwald = false;
157
158         if (debug)
159         {
160             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
161         }
162     }
163
164     /* Use twin cut-off kernels if bTwinCut requires it or if the env. var.
165        forces it (the latter is meant for debugging/benchmarking only). */
166     if (!bTwinCut && !forceTwinCutoffEwald)
167     {
168         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
169     }
170     else
171     {
172         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
173     }
174 }
175
176 static inline void set_cutoff_parameters(NBParamGpu*                nbp,
177                                          const interaction_const_t& ic,
178                                          const PairlistParams&      listParams)
179 {
180     nbp->ewald_beta        = ic.ewaldcoeff_q;
181     nbp->sh_ewald          = ic.sh_ewald;
182     nbp->epsfac            = ic.epsfac;
183     nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
184     nbp->c_rf              = ic.reactionFieldShift;
185     nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
186     nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
187     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
188     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
189     nbp->useDynamicPruning = listParams.useDynamicPruning;
190
191     nbp->sh_lj_ewald   = ic.sh_lj_ewald;
192     nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
193
194     nbp->rvdw_switch      = ic.rvdw_switch;
195     nbp->dispersion_shift = ic.dispersion_shift;
196     nbp->repulsion_shift  = ic.repulsion_shift;
197     nbp->vdw_switch       = ic.vdw_switch;
198 }
199
200 static inline void init_plist(gpu_plist* pl)
201 {
202     /* Initialize to nullptr the pointers to data that are not allocated here and
203        will need reallocation in nbnxn_gpu_init_pairlist */
204     pl->sci   = nullptr;
205     pl->cj4   = nullptr;
206     pl->imask = nullptr;
207     pl->excl  = nullptr;
208
209     /* size -1 indicates that the respective array hasn't been initialized yet */
210     pl->na_c                   = -1;
211     pl->nsci                   = -1;
212     pl->sci_nalloc             = -1;
213     pl->ncj4                   = -1;
214     pl->cj4_nalloc             = -1;
215     pl->nimask                 = -1;
216     pl->imask_nalloc           = -1;
217     pl->nexcl                  = -1;
218     pl->excl_nalloc            = -1;
219     pl->haveFreshList          = false;
220     pl->rollingPruningNumParts = 0;
221     pl->rollingPruningPart     = 0;
222 }
223
224 static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
225 {
226     t->nb_h2d_t = 0.0;
227     t->nb_d2h_t = 0.0;
228     t->nb_c     = 0;
229     t->pl_h2d_t = 0.0;
230     t->pl_h2d_c = 0;
231     for (int i = 0; i < 2; i++)
232     {
233         for (int j = 0; j < 2; j++)
234         {
235             t->ktime[i][j].t = 0.0;
236             t->ktime[i][j].c = 0;
237         }
238     }
239     t->pruneTime.c        = 0;
240     t->pruneTime.t        = 0.0;
241     t->dynamicPruneTime.c = 0;
242     t->dynamicPruneTime.t = 0.0;
243 }
244
245 /*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
246 static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
247                                      int                  numTypes,
248                                      const DeviceContext& deviceContext,
249                                      const DeviceStream&  localStream)
250 {
251     atomdata->numTypes = numTypes;
252     allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
253     atomdata->shiftVecUploaded = false;
254
255     allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
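    /* Single-element accumulation buffers for the LJ and electrostatic energies. */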
256     allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
257     allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
258
259     clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
260     clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
261     clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
262
263     /* Initialize to nullptr the pointers to data that are not allocated here and
264        will need reallocation later */
265     atomdata->xq = nullptr;
266     atomdata->f  = nullptr;
267
268     /* size -1 indicates that the respective array hasn't been initialized yet */
269     atomdata->numAtoms      = -1;
270     atomdata->numAtomsAlloc = -1;
271 }
272
273 static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
274                                                 LJCombinationRule          ljCombinationRule)
275 {
276     if (ic.vdwtype == VanDerWaalsType::Cut)
277     {
278         switch (ic.vdw_modifier)
279         {
280             case InteractionModifiers::None:
281             case InteractionModifiers::PotShift:
282                 switch (ljCombinationRule)
283                 {
284                     case LJCombinationRule::None: return VdwType::Cut;
285                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
286                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
287                     default:
288                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
289                                 "The requested LJ combination rule %s is not implemented in "
290                                 "the GPU accelerated kernels!",
291                                 enumValueToString(ljCombinationRule))));
292                 }
293             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
294             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
295             default:
296                 GMX_THROW(gmx::InconsistentInputError(
297                         gmx::formatString("The requested VdW interaction modifier %s is not "
298                                           "implemented in the GPU accelerated kernels!",
299                                           enumValueToString(ic.vdw_modifier))));
300         }
301     }
302     else if (ic.vdwtype == VanDerWaalsType::Pme)
303     {
304         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
305         {
306             GMX_RELEASE_ASSERT(
307                     ljCombinationRule == LJCombinationRule::Geometric,
308                     "Combination rules for long- and short-range interactions should match.");
309             return VdwType::EwaldGeom;
310         }
311         else
312         {
313             GMX_RELEASE_ASSERT(
314                     ljCombinationRule == LJCombinationRule::LorentzBerthelot,
315                     "Combination rules for long- and short-range interactions should match.");
316             return VdwType::EwaldLB;
317         }
318     }
319     else
320     {
321         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
322                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
323                 enumValueToString(ic.vdwtype))));
324     }
325 }
326
327 static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
328                                                             const DeviceInformation&   deviceInfo)
329 {
330     if (ic.eeltype == CoulombInteractionType::Cut)
331     {
332         return ElecType::Cut;
333     }
334     else if (EEL_RF(ic.eeltype))
335     {
336         return ElecType::RF;
337     }
338     else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
339     {
340         return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
341     }
342     else
343     {
344         /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
345         GMX_THROW(gmx::InconsistentInputError(
346                 gmx::formatString("The requested electrostatics type %s is not implemented in "
347                                   "the GPU accelerated kernels!",
348                                   enumValueToString(ic.eeltype))));
349     }
350 }
351
352 /*! \brief Initialize the nonbonded parameter data structure. */
353 static inline void initNbparam(NBParamGpu*                     nbp,
354                                const interaction_const_t&      ic,
355                                const PairlistParams&           listParams,
356                                const nbnxn_atomdata_t::Params& nbatParams,
357                                const DeviceContext&            deviceContext)
358 {
359     const int numTypes = nbatParams.numTypes;
360
361     set_cutoff_parameters(nbp, ic, listParams);
362
363     nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
364     nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
365
366     if (ic.vdwtype == VanDerWaalsType::Pme)
367     {
368         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
369         {
370             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
371                        "Combination rule mismatch!");
372         }
373         else
374         {
375             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
376                        "Combination rule mismatch!");
377         }
378     }
379
380     /* generate table for PME */
381     if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
382     {
383         GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
384         init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
385     }
386
387     /* set up LJ parameter lookup table */
388     if (!useLjCombRule(nbp->vdwType))
389     {
390         static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
391                       "Mismatch in the size of host / device data types");
392         initParamLookupTable(&nbp->nbfp,
393                              &nbp->nbfp_texobj,
394                              reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
395                              numTypes * numTypes,
396                              deviceContext);
397     }
398
399     /* set up LJ-PME parameter lookup table */
400     if (ic.vdwtype == VanDerWaalsType::Pme)
401     {
402         static_assert(sizeof(decltype(nbp->nbfp_comb))
403                               == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
404                       "Mismatch in the size of host / device data types");
405         initParamLookupTable(&nbp->nbfp_comb,
406                              &nbp->nbfp_comb_texobj,
407                              reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
408                              numTypes,
409                              deviceContext);
410     }
411 }
412
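//! This function is documented in the header file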
413 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
414                    const interaction_const_t*      ic,
415                    const PairlistParams&           listParams,
416                    const nbnxn_atomdata_t*         nbat,
417                    const bool                      bLocalAndNonlocal)
418 {
419     auto* nb                              = new NbnxmGpu();
420     nb->deviceContext_                    = &deviceStreamManager.context();
421     nb->atdat                             = new NBAtomDataGpu;
422     nb->nbparam                           = new NBParamGpu;
423     nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
424     if (bLocalAndNonlocal)
425     {
426         nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
427     }
428
429     nb->bUseTwoStreams = bLocalAndNonlocal;
430
431     nb->timers = new Nbnxm::GpuTimers();
432     snew(nb->timings, 1);
433
434     /* WARNING: CUDA timings are incorrect with multiple streams.
435      * This is the main reason why they are disabled by default.
436      * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
437      * TODO: Consider turning on by default when we can detect nr of streams.
438      *
439      * OpenCL timing is enabled by default and can be disabled by
440      * GMX_DISABLE_GPU_TIMING environment variable.
441      *
442      * Timing is disabled in SYCL.
443      */
444     nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
445                   || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
446
447     if (nb->bDoTime)
448     {
449         init_timings(nb->timings);
450     }
451
452     /* Initialize nbst: pinned host-side staging buffers for energies and shift forces */
453     pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
454     pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
455     pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));
456
457     init_plist(nb->plist[InteractionLocality::Local]);
458
459     /* local/non-local GPU streams */
460     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
461                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
462     const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
463     nb->deviceStreams[InteractionLocality::Local] = &localStream;
464     // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
465     // out-of-order. But for the time being, it will be less disruptive to keep them.
466     if (nb->bUseTwoStreams)
467     {
468         init_plist(nb->plist[InteractionLocality::NonLocal]);
469
470         GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
471                            "Non-local non-bonded stream should be initialized to use GPU for "
472                            "non-bonded with domain decomposition.");
473         nb->deviceStreams[InteractionLocality::NonLocal] =
474                 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
475     }
476
477     const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
478     const DeviceContext&            deviceContext = *nb->deviceContext_;
479
480     initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
481     initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
482
483     gpu_init_platform_specific(nb);
484
485     if (debug)
486     {
487         fprintf(debug, "Initialized NBNXM GPU data structures.\n");
488     }
489
490     return nb;
491 }
492
493 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
494 {
495     if (!nbv || !nbv->useGpu())
496     {
497         return;
498     }
499     NbnxmGpu*   nb  = nbv->gpu_nbv;
500     NBParamGpu* nbp = nb->nbparam;
501
502     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
503
504     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
505
506     GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
507     init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
508 }
509
510 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
511 {
512     NBAtomDataGpu*      adat        = nb->atdat;
513     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
514
515     /* only if we have a dynamic box */
516     if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
517     {
518         copyToDeviceBuffer(&adat->shiftVec,
519                            gmx::asGenericFloat3Pointer(nbatom->shift_vec),
520                            0,
521                            gmx::c_numShiftVectors,
522                            localStream,
523                            GpuApiCallBehavior::Async,
524                            nullptr);
525         adat->shiftVecUploaded = true;
526     }
527 }
528
529 //! This function is documented in the header file
530 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
531 {
532     char sbuf[STRLEN];
533     // Timing accumulation should happen only if there was work to do,
534     // because otherwise getLastRangeTime() gets skipped for empty lists later,
535     // which leads to the counter not being reset.
536     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
537     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
538     gpu_plist*          d_plist      = nb->plist[iloc];
539
540     if (d_plist->na_c < 0)
541     {
542         d_plist->na_c = h_plist->na_ci;
543     }
544     else
545     {
546         if (d_plist->na_c != h_plist->na_ci)
547         {
548             sprintf(sbuf,
549                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
550                     d_plist->na_c,
551                     h_plist->na_ci);
552             gmx_incons(sbuf);
553         }
554     }
555
556     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
557
558     if (bDoTime)
559     {
560         iTimers.pl_h2d.openTimingRegion(deviceStream);
561         iTimers.didPairlistH2D = true;
562     }
563
564     // TODO most of this function is same in CUDA and OpenCL, move into the header
565     const DeviceContext& deviceContext = *nb->deviceContext_;
566
567     reallocateDeviceBuffer(
568             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
569     copyToDeviceBuffer(&d_plist->sci,
570                        h_plist->sci.data(),
571                        0,
572                        h_plist->sci.size(),
573                        deviceStream,
574                        GpuApiCallBehavior::Async,
575                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
576
577     reallocateDeviceBuffer(
578             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
579     copyToDeviceBuffer(&d_plist->cj4,
580                        h_plist->cj4.data(),
581                        0,
582                        h_plist->cj4.size(),
583                        deviceStream,
584                        GpuApiCallBehavior::Async,
585                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
586
587     reallocateDeviceBuffer(&d_plist->imask,
588                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
589                            &d_plist->nimask,
590                            &d_plist->imask_nalloc,
591                            deviceContext);
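    /* Note: the interaction mask is only (re)allocated here; no host-to-device copy
       is needed since it gets filled on the device. */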
592
593     reallocateDeviceBuffer(
594             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
595     copyToDeviceBuffer(&d_plist->excl,
596                        h_plist->excl.data(),
597                        0,
598                        h_plist->excl.size(),
599                        deviceStream,
600                        GpuApiCallBehavior::Async,
601                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
602
603     if (bDoTime)
604     {
605         iTimers.pl_h2d.closeTimingRegion(deviceStream);
606     }
607
608     /* need to prune the pair list during the next step */
609     d_plist->haveFreshList = true;
610 }
611
612 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
613 {
614     bool                 bDoTime       = nb->bDoTime;
615     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
616     NBAtomDataGpu*       atdat         = nb->atdat;
617     const DeviceContext& deviceContext = *nb->deviceContext_;
618     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
619
620     int  numAtoms  = nbat->numAtoms();
621     bool realloced = false;
622
623     if (bDoTime)
624     {
625         /* time async copy */
626         timers->atdat.openTimingRegion(localStream);
627     }
628
629     /* We need to reallocate if we have to copy more atoms than there is space for;
630        this also covers the initial allocation, i.e. when atdat->numAtomsAlloc == -1 */
631     if (numAtoms > atdat->numAtomsAlloc)
632     {
633         int numAlloc = over_alloc_small(numAtoms);
634
635         /* free up first if the arrays have already been initialized */
636         if (atdat->numAtomsAlloc != -1)
637         {
638             freeDeviceBuffer(&atdat->f);
639             freeDeviceBuffer(&atdat->xq);
640             if (useLjCombRule(nb->nbparam->vdwType))
641             {
642                 freeDeviceBuffer(&atdat->ljComb);
643             }
644             else
645             {
646                 freeDeviceBuffer(&atdat->atomTypes);
647             }
648         }
649
650
651         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
652         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
653
654         if (useLjCombRule(nb->nbparam->vdwType))
655         {
656             // Two Lennard-Jones parameters per atom
657             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
658         }
659         else
660         {
661             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
662         }
663
664         atdat->numAtomsAlloc = numAlloc;
665         realloced            = true;
666     }
667
668     atdat->numAtoms      = numAtoms;
669     atdat->numAtomsLocal = nbat->natoms_local;
670
671     /* need to clear GPU f output if realloc happened */
672     if (realloced)
673     {
674         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
675     }
676
677     if (useLjCombRule(nb->nbparam->vdwType))
678     {
679         static_assert(
680                 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
681                 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
682         copyToDeviceBuffer(&atdat->ljComb,
683                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
684                            0,
685                            numAtoms,
686                            localStream,
687                            GpuApiCallBehavior::Async,
688                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
689     }
690     else
691     {
692         static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
693                       "Sizes of host- and device-side atom types should be the same.");
694         copyToDeviceBuffer(&atdat->atomTypes,
695                            nbat->params().type.data(),
696                            0,
697                            numAtoms,
698                            localStream,
699                            GpuApiCallBehavior::Async,
700                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
701     }
702
703     if (bDoTime)
704     {
705         timers->atdat.closeTimingRegion(localStream);
706     }
707
708     /* kick off the tasks enqueued above to ensure concurrency with the search */
709     issueClFlushInStream(localStream);
710 }
711
712 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
713 {
714     NBAtomDataGpu*      adat        = nb->atdat;
715     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
716     // Clear forces
717     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
718     // Clear shift force array and energies if the outputs were used in the current step
719     if (computeVirial)
720     {
721         clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
722         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
723         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
724     }
725     issueClFlushInStream(localStream);
726 }
727
728 //! This function is documented in the header file
729 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
730 {
731     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
732 }
733
734 //! This function is documented in the header file
735 void gpu_reset_timings(nonbonded_verlet_t* nbv)
736 {
737     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
738     {
739         init_timings(nbv->gpu_nbv->timings);
740     }
741 }
742
743 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
744 {
745     return ((nb->nbparam->elecType == ElecType::EwaldAna)
746             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
747 }
748
749 void setupGpuShortRangeWork(NbnxmGpu*                      nb,
750                             const gmx::ListedForcesGpu*    listedForcesGpu,
751                             const gmx::InteractionLocality iLocality)
752 {
753     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
754
755     // There is short-range work if the pair list for the provided
756     // interaction locality contains entries or if there is any
757     // bonded work (as this is not split into local/nonlocal).
758     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
759                                || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
760 }
761
762 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
763 {
764     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
765
766     return nb->haveWork[interactionLocality];
767 }
768
769 /*! \brief
770  * Launch asynchronously the download of nonbonded forces from the GPU
771  * (and energies/shift forces if required).
772  */
773 void gpu_launch_cpyback(NbnxmGpu*                nb,
774                         struct nbnxn_atomdata_t* nbatom,
775                         const gmx::StepWorkload& stepWork,
776                         const AtomLocality       atomLocality)
777 {
778     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
779
780     /* determine interaction locality from atom locality */
781     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
782     GMX_ASSERT(iloc == InteractionLocality::Local
783                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
784                "Non-local stream is indicating that the copy back event is enqueued at the "
785                "beginning of the copy back function.");
786
787     /* extract the data */
788     NBAtomDataGpu*      adat         = nb->atdat;
789     Nbnxm::GpuTimers*   timers       = nb->timers;
790     bool                bDoTime      = nb->bDoTime;
791     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
792
793     /* don't launch non-local copy-back if there was no non-local work to do */
794     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
795     {
796         /* TODO An alternative way to signal that non-local work is
797            complete is to use a clEnqueueMarker+clEnqueueBarrier
798            pair. However, the use of bNonLocalStreamDoneMarked has the
799            advantage of being local to the host, so probably minimizes
800            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
801            test case, overall simulation performance was higher with
802            the API calls, but this has not been tested on AMD OpenCL,
803            so could be worth considering in future. */
804         nb->bNonLocalStreamDoneMarked = false;
805         return;
806     }
807
808     /* local/nonlocal offset and length used for xq and f */
809     auto atomsRange = getGpuAtomRange(adat, atomLocality);
810
811     /* beginning of timed D2H section */
812     if (bDoTime)
813     {
814         timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
815     }
816
817     /* With DD the local D2H transfer can only start after the non-local
818        has been launched. */
819     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
820     {
821         nb->nonlocal_done.enqueueWaitEvent(deviceStream);
822         nb->bNonLocalStreamDoneMarked = false;
823     }
824
825     /* DtoH f */
826     if (!stepWork.useGpuFBufferOps)
827     {
828         static_assert(
829                 sizeof(*nbatom->out[0].f.data()) == sizeof(float),
830                 "The host force buffer should be in single precision to match device data size.");
831         copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
832                              &adat->f,
833                              atomsRange.begin(),
834                              atomsRange.size(),
835                              deviceStream,
836                              GpuApiCallBehavior::Async,
837                              bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
838
839         issueClFlushInStream(deviceStream);
840     }
841
842     /* After the non-local D2H is launched the nonlocal_done event can be
843        recorded which signals that the local D2H can proceed. This event is not
844        placed after the non-local kernel because we need the non-local
845        data back first. */
846     if (iloc == InteractionLocality::NonLocal)
847     {
848         nb->nonlocal_done.markEvent(deviceStream);
849         nb->bNonLocalStreamDoneMarked = true;
850     }
851
852     /* only transfer energies in the local stream */
853     if (iloc == InteractionLocality::Local)
854     {
855         /* DtoH fshift when virial is needed */
856         if (stepWork.computeVirial)
857         {
858             static_assert(
859                     sizeof(*nb->nbst.fShift) == sizeof(Float3),
860                     "Sizes of host- and device-side shift vector elements should be the same.");
861             copyFromDeviceBuffer(nb->nbst.fShift,
862                                  &adat->fShift,
863                                  0,
864                                  gmx::c_numShiftVectors,
865                                  deviceStream,
866                                  GpuApiCallBehavior::Async,
867                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
868         }
869
870         /* DtoH energies */
871         if (stepWork.computeEnergy)
872         {
873             static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
874                           "Sizes of host- and device-side LJ energy terms should be the same.");
875             copyFromDeviceBuffer(nb->nbst.eLJ,
876                                  &adat->eLJ,
877                                  0,
878                                  1,
879                                  deviceStream,
880                                  GpuApiCallBehavior::Async,
881                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
882             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
883                           "Sizes of host- and device-side electrostatic energy terms should be the "
884                           "same.");
885             copyFromDeviceBuffer(nb->nbst.eElec,
886                                  &adat->eElec,
887                                  0,
888                                  1,
889                                  deviceStream,
890                                  GpuApiCallBehavior::Async,
891                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
892         }
893     }
894
895     if (bDoTime)
896     {
897         timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
898     }
899 }
900
901 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
902 {
903     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
904
905     /* When we get here all misc operations issued in the local stream as well as
906        the local xq H2D are done,
907        so we record that in the local stream and wait for it in the nonlocal one.
908        This wait needs to precede any PP tasks, bonded or nonbonded, that may
909        compute on interactions between local and nonlocal atoms.
910      */
911     if (nb->bUseTwoStreams)
912     {
913         if (interactionLocality == InteractionLocality::Local)
914         {
915             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
916             issueClFlushInStream(deviceStream);
917         }
918         else
919         {
920             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
921         }
922     }
923 }
924
925 /*! \brief Launch asynchronously the xq buffer host to device copy. */
926 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
927 {
928     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
929
930     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
931
932     NBAtomDataGpu*      adat         = nb->atdat;
933     gpu_plist*          plist        = nb->plist[iloc];
934     Nbnxm::GpuTimers*   timers       = nb->timers;
935     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
936
937     const bool bDoTime = nb->bDoTime;
938
939     /* Don't launch the non-local H2D copy if there is no dependent
940        work to do: neither non-local nor other (e.g. bonded) work
941        that takes the nbnxn coordinates as input.
942        Doing the same for the local kernel is more complicated, since the
943        local part of the force array also depends on the non-local kernel.
944        So to avoid complicating the code and to reduce the risk of bugs,
945        we always call the local x+q copy (and the rest of the local
946        work) in nbnxn_gpu_launch_kernel().
947      */
948     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
949     {
950         plist->haveFreshList = false;
951
952         // The event is marked for Local interactions unconditionally,
953         // so it has to be released here because of the early return
954         // for NonLocal interactions.
955         nb->misc_ops_and_local_H2D_done.reset();
956
957         return;
958     }
959
960     /* local/nonlocal offset and length used for xq and f */
961     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
962
963     /* beginning of timed HtoD section */
964     if (bDoTime)
965     {
966         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
967     }
968
969     /* HtoD x, q */
970     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
971                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
972     copyToDeviceBuffer(&adat->xq,
973                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
974                        atomsRange.begin(),
975                        atomsRange.size(),
976                        deviceStream,
977                        GpuApiCallBehavior::Async,
978                        nullptr);
979
980     if (bDoTime)
981     {
982         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
983     }
984
985     /* When we get here all misc operations issued in the local stream as well as
986        the local xq H2D are done,
987        so we record that in the local stream and wait for it in the nonlocal one.
988        This wait needs to precede any PP tasks, bonded or nonbonded, that may
989        compute on interactions between local and nonlocal atoms.
990      */
991     nbnxnInsertNonlocalGpuDependency(nb, iloc);
992 }
993
994
995 /* Initialization for X buffer operations on GPU. */
996 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
997 {
998     const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
999     const bool          bDoTime       = gpu_nbv->bDoTime;
1000     const int           maxNumColumns = gridSet.numColumnsMax();
1001
1002     reallocateDeviceBuffer(&gpu_nbv->cxy_na,
1003                            maxNumColumns * gridSet.grids().size(),
1004                            &gpu_nbv->ncxy_na,
1005                            &gpu_nbv->ncxy_na_alloc,
1006                            *gpu_nbv->deviceContext_);
1007     reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
1008                            maxNumColumns * gridSet.grids().size(),
1009                            &gpu_nbv->ncxy_ind,
1010                            &gpu_nbv->ncxy_ind_alloc,
1011                            *gpu_nbv->deviceContext_);
1012
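    /* Per-grid column data: cxy_na holds the number of atoms in each column and cxy_ind
       the index of the first cell of each column; both buffers use a stride of
       maxNumColumns per grid. */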
1013     for (unsigned int g = 0; g < gridSet.grids().size(); g++)
1014     {
1015         const Nbnxm::Grid& grid = gridSet.grids()[g];
1016
1017         const int  numColumns      = grid.numColumns();
1018         const int* atomIndices     = gridSet.atomIndices().data();
1019         const int  atomIndicesSize = gridSet.atomIndices().size();
1020         const int* cxy_na          = grid.cxy_na().data();
1021         const int* cxy_ind         = grid.cxy_ind().data();
1022
1023         auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
1024
1025         reallocateDeviceBuffer(&gpu_nbv->atomIndices,
1026                                atomIndicesSize,
1027                                &gpu_nbv->atomIndicesSize,
1028                                &gpu_nbv->atomIndicesSize_alloc,
1029                                *gpu_nbv->deviceContext_);
1030
1031         if (atomIndicesSize > 0)
1032         {
1033             if (bDoTime)
1034             {
1035                 timerH2D->openTimingRegion(localStream);
1036             }
1037
1038             copyToDeviceBuffer(&gpu_nbv->atomIndices,
1039                                atomIndices,
1040                                0,
1041                                atomIndicesSize,
1042                                localStream,
1043                                GpuApiCallBehavior::Async,
1044                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1045
1046             if (bDoTime)
1047             {
1048                 timerH2D->closeTimingRegion(localStream);
1049             }
1050         }
1051
1052         if (numColumns > 0)
1053         {
1054             if (bDoTime)
1055             {
1056                 timerH2D->openTimingRegion(localStream);
1057             }
1058
1059             copyToDeviceBuffer(&gpu_nbv->cxy_na,
1060                                cxy_na,
1061                                maxNumColumns * g,
1062                                numColumns,
1063                                localStream,
1064                                GpuApiCallBehavior::Async,
1065                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1066
1067             if (bDoTime)
1068             {
1069                 timerH2D->closeTimingRegion(localStream);
1070             }
1071
1072             if (bDoTime)
1073             {
1074                 timerH2D->openTimingRegion(localStream);
1075             }
1076
1077             copyToDeviceBuffer(&gpu_nbv->cxy_ind,
1078                                cxy_ind,
1079                                maxNumColumns * g,
1080                                numColumns,
1081                                localStream,
1082                                GpuApiCallBehavior::Async,
1083                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1084
1085             if (bDoTime)
1086             {
1087                 timerH2D->closeTimingRegion(localStream);
1088             }
1089         }
1090     }
1091
1092     // The above data is transferred on the local stream but is a
1093     // dependency of the nonlocal stream (specifically the nonlocal X
1094     // buf ops kernel).  We therefore set a dependency to ensure
1095     // that the nonlocal stream waits on the local stream here.
1096     // This call records an event in the local stream:
1097     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
1098     // ...and this call instructs the nonlocal stream to wait on that event:
1099     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1100 }
1101
1102 //! This function is documented in the header file
1103 void gpu_free(NbnxmGpu* nb)
1104 {
1105     if (nb == nullptr)
1106     {
1107         return;
1108     }
1109
1110     gpu_free_platform_specific(nb);
1111
1112     delete nb->timers;
1113     sfree(nb->timings);
1114
1115     NBAtomDataGpu* atdat   = nb->atdat;
1116     NBParamGpu*    nbparam = nb->nbparam;
1117
1118     /* Free atdat */
1119     freeDeviceBuffer(&(nb->atdat->xq));
1120     freeDeviceBuffer(&(nb->atdat->f));
1121     freeDeviceBuffer(&(nb->atdat->eLJ));
1122     freeDeviceBuffer(&(nb->atdat->eElec));
1123     freeDeviceBuffer(&(nb->atdat->fShift));
1124     freeDeviceBuffer(&(nb->atdat->shiftVec));
1125     if (useLjCombRule(nb->nbparam->vdwType))
1126     {
1127         freeDeviceBuffer(&atdat->ljComb);
1128     }
1129     else
1130     {
1131         freeDeviceBuffer(&atdat->atomTypes);
1132     }
1133
1134     /* Free nbparam */
1135     if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
1136     {
1137         destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
1138     }
1139
1140     if (!useLjCombRule(nb->nbparam->vdwType))
1141     {
1142         destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
1143     }
1144
1145     if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
1146     {
1147         destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
1148     }
1149
1150     /* Free plist */
1151     auto* plist = nb->plist[InteractionLocality::Local];
1152     freeDeviceBuffer(&plist->sci);
1153     freeDeviceBuffer(&plist->cj4);
1154     freeDeviceBuffer(&plist->imask);
1155     freeDeviceBuffer(&plist->excl);
1156     delete plist;
1157     if (nb->bUseTwoStreams)
1158     {
1159         auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
1160         freeDeviceBuffer(&plist_nl->sci);
1161         freeDeviceBuffer(&plist_nl->cj4);
1162         freeDeviceBuffer(&plist_nl->imask);
1163         freeDeviceBuffer(&plist_nl->excl);
1164         delete plist_nl;
1165     }
1166
1167     /* Free nbst */
1168     pfree(nb->nbst.eLJ);
1169     nb->nbst.eLJ = nullptr;
1170
1171     pfree(nb->nbst.eElec);
1172     nb->nbst.eElec = nullptr;
1173
1174     pfree(nb->nbst.fShift);
1175     nb->nbst.fShift = nullptr;
1176
1177     delete atdat;
1178     delete nbparam;
1179     delete nb;
1180
1181     if (debug)
1182     {
1183         fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
1184     }
1185 }
1186
1187 DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
1188 {
1189     GMX_ASSERT(nb != nullptr, "nb pointer must be valid");
1190
1191     return nb->atdat->f;
1192 }
1193
1194 } // namespace Nbnxm