/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on section 5.13 of the OpenCL v1.2 specification, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
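
/* Note: the call sites below (e.g. nbnxnInsertNonlocalGpuDependency(),
 * gpu_init_atomdata()) rely on this helper to actually submit enqueued work on
 * OpenCL; on the CUDA and SYCL backends it compiles to a no-op, so it is safe
 * to call unconditionally. */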

static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                                  NBParamGpu*                  nbp,
                                                  const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
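
/* Only the force table (tables.tableF) is uploaded here; the tabulated Ewald
 * kernels read it through the lookup-table/texture object, with
 * coulomb_tab_scale mapping an interatomic distance to a table index. */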

static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                                        const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or if the environment
       variable forces it (use for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
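
/* Illustrative use of the override variables above: running e.g.
 *
 *     GMX_GPU_NB_TAB_EWALD=1 gmx mdrun -nb gpu ...
 *
 * forces the tabulated kernels on hardware where the analytical ones are the
 * default, while setting both GMX_GPU_NB_ANA_EWALD and GMX_GPU_NB_TAB_EWALD
 * trips the gmx_incons() check above. */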

static inline void set_cutoff_parameters(NBParamGpu*                nbp,
                                         const interaction_const_t& ic,
                                         const PairlistParams&      listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}
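
/* All cut-off radii are stored squared (e.g. rvdw_sq = rvdw * rvdw) so that the
 * kernels can compare squared interatomic distances directly and avoid a sqrt
 * per pair interaction. */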

static inline void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c                   = -1;
    pl->nsci                   = -1;
    pl->sci_nalloc             = -1;
    pl->ncj4                   = -1;
    pl->cj4_nalloc             = -1;
    pl->nimask                 = -1;
    pl->imask_nalloc           = -1;
    pl->nexcl                  = -1;
    pl->excl_nalloc            = -1;
    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
                                     int                  numTypes,
                                     const DeviceContext& deviceContext,
                                     const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}
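
/* eLJ and eElec are single-element accumulation buffers (hence the size-1
 * allocations above) into which the kernels reduce the total LJ and
 * electrostatic energies; fShift holds one force per PBC shift vector, hence
 * the SHIFTS-sized allocation. */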

static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
                                                LJCombinationRule          ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::Geometric,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldGeom;
        }
        else
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                            const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

/*! \brief Initialize the nonbonded parameter data structure. */
static inline void initNbparam(NBParamGpu*                     nbp,
                               const interaction_const_t&      ic,
                               const PairlistParams&           listParams,
                               const nbnxn_atomdata_t::Params& nbatParams,
                               const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}

NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomDataGpu;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect nr of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by the
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}
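
/* Typical call sequence (illustrative sketch only; dsm, ic, listParams, nbat,
 * havePPDomainDecomposition and pairlist stand for the caller's objects):
 *
 *   NbnxmGpu* nb = gpu_init(dsm, &ic, listParams, nbat, havePPDomainDecomposition);
 *   // at each pair search:
 *   gpu_init_pairlist(nb, pairlist, InteractionLocality::Local);
 *   gpu_init_atomdata(nb, nbat);
 *   // at teardown:
 *   gpu_free(nb);
 */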

void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }
    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           SHIFTS,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}
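
/* With a static box the shift vectors never change, so the copy above runs only
 * on the first call (tracked by shiftVecUploaded); with a dynamic box it is
 * re-issued on every call. */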

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
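
/* The haveFreshList flag set above is what later triggers pruning of this list
 * on its first use; the rollingPruningNumParts/rollingPruningPart counters set
 * in init_plist() then schedule the dynamic pruning that follows in parts. */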

void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomDataGpu*       atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}
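
/* over_alloc_small() pads the requested size, so moderate growth of the atom
 * count between pair searches does not force a free/allocate/clear cycle on
 * every call. */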

void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    /* clear shift force array and energies if the outputs were used in the current step */
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomDataGpu*      adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we want the non-local data
       back first. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks,
       bonded or nonbonded, that may compute on interactions between local and
       nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}
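
/* Call pattern: a call with InteractionLocality::Local records the
 * misc_ops_and_local_H2D_done event in the local stream, and a subsequent call
 * with InteractionLocality::NonLocal makes the non-local stream wait on that
 * event; see the paired calls at the end of nbnxn_gpu_init_x_to_nbat_x(). */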

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomDataGpu*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work) in nbnxn_gpu_launch_kernel().
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* Record the completion of the local xq H2D (and misc ops issued before it)
       in the local stream and make the non-local stream wait on it; see
       nbnxnInsertNonlocalGpuDependency() above for details. */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);

            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

//! This function is documented in the header file
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    gpu_free_platform_specific(nb);

    delete nb->timers;
    sfree(nb->timings);

    NBAtomDataGpu* atdat   = nb->atdat;
    NBParamGpu*    nbparam = nb->nbparam;

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->eLJ));
    freeDeviceBuffer(&(nb->atdat->eElec));
    freeDeviceBuffer(&(nb->atdat->fShift));
    freeDeviceBuffer(&(nb->atdat->shiftVec));
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        freeDeviceBuffer(&atdat->ljComb);
    }
    else
    {
        freeDeviceBuffer(&atdat->atomTypes);
    }

    /* Free nbparam */
    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
    }

    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
    }

    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    delete plist;
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        delete plist_nl;
    }

    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    delete atdat;
    delete nbparam;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
    }
}

} // namespace Nbnxm