/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
52 # include "cuda/nbnxm_cuda_types.h"
56 # include "opencl/nbnxm_ocl_types.h"
60 # include "sycl/nbnxm_sycl_types.h"
63 #include "nbnxm_gpu_data_mgmt.h"
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/gputraits.h"
67 #include "gromacs/gpu_utils/pmalloc.h"
68 #include "gromacs/hardware/device_information.h"
69 #include "gromacs/mdtypes/interaction_const.h"
70 #include "gromacs/mdtypes/simulation_workload.h"
71 #include "gromacs/nbnxm/gpu_common_utils.h"
72 #include "gromacs/nbnxm/gpu_data_mgmt.h"
73 #include "gromacs/nbnxm/gridset.h"
74 #include "gromacs/pbcutil/ishift.h"
75 #include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/pbcutil/ishift.h"
77 #include "gromacs/utility/cstringutil.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
81 #include "nbnxm_gpu.h"
82 #include "pairlistsets.h"
namespace Nbnxm
{

static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
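
/* A minimal usage sketch of the event/flush pattern this helper supports
 * (assuming a GpuEventSynchronizer \c event and two valid streams; see
 * nbnxnInsertNonlocalGpuDependency() below for the real use):
 * \code
 *     event.markEvent(producerStream);        // enqueue a marker in the producer stream
 *     issueClFlushInStream(producerStream);   // OpenCL: make the marker visible to other queues
 *     event.enqueueWaitEvent(consumerStream); // consumer stream waits on the marker
 * \endcode
 */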
static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                                  NBParamGpu*                  nbp,
                                                  const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                                        const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_fatal(FARGS,
                  "Both analytical and tabulated Ewald GPU non-bonded kernels "
                  "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0. */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested through bTwinCut or if the env. var.
       forces it (use the latter for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
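
/* For benchmarking, the kernel flavor can be forced through the environment
 * variables checked above, e.g. (illustrative shell invocations; the mdrun
 * arguments are elided):
 *
 *     GMX_GPU_NB_ANA_EWALD=1 gmx mdrun ...      # force analytical Ewald kernels
 *     GMX_GPU_NB_TAB_EWALD=1 gmx mdrun ...      # force tabulated Ewald kernels
 *     GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...  # force twin cut-off kernels
 *
 * Setting both GMX_GPU_NB_ANA_EWALD and GMX_GPU_NB_TAB_EWALD is a fatal error.
 */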
static inline void set_cutoff_parameters(NBParamGpu*                nbp,
                                         const interaction_const_t& ic,
                                         const PairlistParams&      listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}
static inline void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c          = -1;
    pl->nsci          = -1;
    pl->sci_nalloc    = -1;
    pl->ncj4          = -1;
    pl->cj4_nalloc    = -1;
    pl->nimask        = -1;
    pl->imask_nalloc  = -1;
    pl->nexcl         = -1;
    pl->excl_nalloc   = -1;
    pl->haveFreshList = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
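
/* A minimal sketch (hypothetical helper, not the GROMACS API) of the growth
 * pattern that the -1 size sentinels above enable in the later
 * reallocateDeviceBuffer() calls: the first request always exceeds a -1
 * capacity, so the buffer gets allocated on first use.
 * \code
 *     #include <algorithm>
 *     #include <vector>
 *
 *     template<typename T>
 *     void growIfNeeded(std::vector<T>* buffer, int requiredSize, int* capacity)
 *     {
 *         if (requiredSize > *capacity) // always true on first use, when *capacity == -1
 *         {
 *             *capacity = std::max(requiredSize, 2 * (*capacity));
 *             buffer->resize(*capacity);
 *         }
 *     }
 * \endcode
 */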
static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;

    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}
/*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
                                     int                  numTypes,
                                     const DeviceContext& deviceContext,
                                     const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}
static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
                                                LJCombinationRule          ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::Geometric,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldGeom;
        }
        else
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}
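
/* For illustration: a plain cut-off LJ potential with the potential-shift
 * modifier and a geometric combination rule resolves to VdwType::CutCombGeom.
 * A minimal call sketch (assuming an initialized interaction_const_t ic):
 * \code
 *     const VdwType vdwType = nbnxmGpuPickVdwKernelType(ic, LJCombinationRule::Geometric);
 * \endcode
 */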
static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                            const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}
/*! \brief Initialize the nonbonded parameter data structure. */
static inline void initNbparam(NBParamGpu*                     nbp,
                               const interaction_const_t&      ic,
                               const PairlistParams&           listParams,
                               const nbnxn_atomdata_t::Params& nbatParams,
                               const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomDataGpu;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    nb->bDoTime = decideGpuTimingsUsage();

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}
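
/* A minimal call-site sketch (assuming an already set-up gmx::DeviceStreamManager
 * and valid ic, listParams and nbat objects; pass true as the last argument when
 * domain decomposition requires a non-local stream and pairlist):
 * \code
 *     NbnxmGpu* gpuNbv = gpu_init(deviceStreamManager, &ic, listParams, nbat, haveNonlocalWork);
 *     // ... use gpuNbv for the run ...
 *     gpu_free(gpuNbv);
 * \endcode
 */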
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           gmx::c_numShiftVectors,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move it into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomDataGpu*       atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);

    /* Clear shift force array and energies if the outputs were used in the current step */
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
void setupGpuShortRangeWork(NbnxmGpu*                      nb,
                            const gmx::ListedForcesGpu*    listedForcesGpu,
                            const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}
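
/* A minimal sketch of how the two functions above pair up around a step
 * (assuming valid nb and listedForcesGpu pointers):
 * \code
 *     setupGpuShortRangeWork(nb, listedForcesGpu, gmx::InteractionLocality::NonLocal);
 *     if (haveGpuShortRangeWork(nb, gmx::InteractionLocality::NonLocal))
 *     {
 *         // launch the non-local transfers and kernels
 *     }
 * \endcode
 */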
/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomDataGpu*      adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       one has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 gmx::c_numShiftVectors,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomDataGpu*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}
//! This function is documented in the header file
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    gpu_free_platform_specific(nb);

    delete nb->timers;
    sfree(nb->timings);

    NBAtomDataGpu* atdat   = nb->atdat;
    NBParamGpu*    nbparam = nb->nbparam;

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->eLJ));
    freeDeviceBuffer(&(nb->atdat->eElec));
    freeDeviceBuffer(&(nb->atdat->fShift));
    freeDeviceBuffer(&(nb->atdat->shiftVec));
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        freeDeviceBuffer(&atdat->ljComb);
    }
    else
    {
        freeDeviceBuffer(&atdat->atomTypes);
    }

    /* Free nbparam */
    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
    }

    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
    }

    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    delete plist;
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        delete plist_nl;
    }

    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    delete atdat;
    delete nbparam;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
    }
}
DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
{
    GMX_ASSERT(nb != nullptr, "nb pointer must be valid");

    return nb->atdat->f;
}

} // namespace Nbnxm