/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021 by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on section 5.13 of the OpenCL 1.2 specification, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}

static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                                  NBParamGpu*                  nbp,
                                                  const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(&nbp->coulomb_tab,
                         &nbp->coulomb_tab_texobj,
                         tables.tableF.data(),
                         tables.tableF.size(),
                         deviceContext);
}

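/* Note: tableF holds the tabulated Ewald force correction factor sampled on
 * a uniform grid with spacing 1/tables.scale (stored as coulomb_tab_scale
 * above); the tabulated kernels interpolate in this table instead of
 * evaluating the erfc()-based correction analytically.
 */
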
static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                                        const DeviceInformation gmx_unused& deviceInfo)
{
    const bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernels. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
       forces it (use it for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

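/* Example usage of the variables above (development/benchmarking only):
 *   GMX_GPU_NB_TAB_EWALD=1 gmx mdrun ...      # force tabulated Ewald kernels
 *   GMX_GPU_NB_EWALD_TWINCUT=1 gmx mdrun ...  # force twin-cutoff kernels
 */
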
static inline void set_cutoff_parameters(NBParamGpu*                nbp,
                                         const interaction_const_t& ic,
                                         const PairlistParams&      listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}

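/* Note that set_cutoff_parameters() is used both at initialization (from
 * initNbparam()) and during PME tuning (from gpu_pme_loadbal_update_param()
 * below), so it is limited to parameters that can change during PME load
 * balancing.
 */
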
static inline void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

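/* A note on the rollingPruning* fields: with dynamic pruning the pair list
 * is pruned in rollingPruningNumParts chunks on consecutive MD steps
 * (rollingPruningPart tracks which chunk is due next), which spreads the
 * pruning cost over the lifetime of the list.
 */
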
static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
                                     int                  numTypes,
                                     const DeviceContext& deviceContext,
                                     const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation later */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}

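/* Note that eLJ and eElec are single-element device buffers: the nonbonded
 * kernels reduce all per-thread energy contributions into these accumulators,
 * which is why one-element clears (above and in gpu_clear_outputs()) suffice.
 */
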
static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
                                                LJCombinationRule          ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::Geometric,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldGeom;
        }
        else
        {
            GMX_RELEASE_ASSERT(
                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                    "Combination rules for long- and short-range interactions should match.");
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                            const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}

/*! \brief Initialize the nonbonded parameter data structure. */
static inline void initNbparam(NBParamGpu*                     nbp,
                               const interaction_const_t&      ic,
                               const PairlistParams&           listParams,
                               const nbnxn_atomdata_t::Params& nbatParams,
                               const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
}

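/* Layout note: nbfp is the numTypes x numTypes matrix of LJ (C6, C12) pairs
 * packed as Float2, which is what the reinterpret_cast and the 2x-size
 * static_asserts above rely on; nbfp_comb holds one such pair per type for
 * the LJ-PME path.
 */
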
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb           = new NbnxmGpu();
    nb->deviceContext_ = &deviceStreamManager.context();
    nb->atdat          = new NBAtomDataGpu;
    nb->nbparam        = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * Can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect the number of streams.
     *
     * OpenCL timing is enabled by default and can be disabled by setting the
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}

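/* A minimal caller sketch (hypothetical variable names, for orientation only):
 *
 *   NbnxmGpu* gpuNbv = Nbnxm::gpu_init(
 *           deviceStreamManager, &interactionConst, listParams, nbat, haveMultipleDomains);
 *   // ... simulation steps using gpuNbv ...
 *   Nbnxm::gpu_free(gpuNbv);
 */
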
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only if we have a dynamic box */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        copyToDeviceBuffer(&adat->shiftVec,
                           gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                           0,
                           gmx::c_numShiftVectors,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}

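/* The shift vectors depend only on the box dimensions, so with a static box
 * a single upload suffices for the whole run; with a dynamic box (e.g. with
 * pressure coupling) they are re-uploaded on every call.
 */
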
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

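/* Note on sizing above: imask holds one interaction mask per cluster-pair
 * split, hence the factor c_nbnxnGpuClusterpairSplit. Unlike sci/cj4/excl,
 * no host-to-device copy is issued for it here; it is filled on the device
 * when the freshly marked list (haveFreshList) is pruned.
 */
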
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomDataGpu*       atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtoms == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

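/* over_alloc_small() above over-allocates by a modest factor, so small
 * fluctuations in the atom count (e.g. between domain-decomposition
 * repartitionings) do not trigger a free/allocate/clear cycle on every call.
 */
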
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomDataGpu*      adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

void setupGpuShortRangeWork(NbnxmGpu*                      nb,
                            const gmx::ListedForcesGpu*    listedForcesGpu,
                            const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomDataGpu*      adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    if (!stepWork.useGpuFBufferOps)
    {
        static_assert(
                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                             &adat->f,
                             atomsRange.begin(),
                             atomsRange.size(),
                             deviceStream,
                             GpuApiCallBehavior::Async,
                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

        issueClFlushInStream(deviceStream);
    }

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 gmx::c_numShiftVectors,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

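/* All transfers above are asynchronous. The host-side nbst buffers are
 * pinned (allocated with pmalloc() in gpu_init()), and consumers must
 * synchronize on the stream, or on the timing events, before reading them.
 */
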
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

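/* Rough timeline of the mark/wait pair above (a sketch):
 *   local stream:     misc ops ... local xq H2D ... markEvent(misc_ops_and_local_H2D_done)
 *   non-local stream: enqueueWaitEvent(misc_ops_and_local_H2D_done) ... non-local work ...
 * so non-local work can only start once the local H2D transfer has completed.
 */
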
/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomDataGpu*      adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

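/* The xq device buffer stores one Float4 per atom holding {x, y, z, q},
 * which is why the nbatXYZQ host layout can be reinterpret_cast to Float4
 * and moved in a single contiguous transfer.
 */
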
/* Initialization for X buffer operations on GPU. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();

    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);

    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);

        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }

        if (numColumns > 0)
        {
            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_na,
                               cxy_na,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                timerH2D->openTimingRegion(localStream);
            }
            copyToDeviceBuffer(&gpu_nbv->cxy_ind,
                               cxy_ind,
                               maxNumColumns * g,
                               numColumns,
                               localStream,
                               GpuApiCallBehavior::Async,
                               bDoTime ? timerH2D->fetchNextEvent() : nullptr);
            if (bDoTime)
            {
                timerH2D->closeTimingRegion(localStream);
            }
        }
    }

    // The above data is transferred on the local stream but is a
    // dependency of the nonlocal stream (specifically the nonlocal X
    // buf ops kernel). We therefore set a dependency to ensure
    // that the nonlocal stream waits on the local stream here.
    // This call records an event in the local stream:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
    // ...and this call instructs the nonlocal stream to wait on that event:
    nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
}

//! This function is documented in the header file
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    gpu_free_platform_specific(nb);

    delete nb->timers;
    sfree(nb->timings);

    NBAtomDataGpu* atdat   = nb->atdat;
    NBParamGpu*    nbparam = nb->nbparam;

    /* Free atdat */
    freeDeviceBuffer(&(nb->atdat->xq));
    freeDeviceBuffer(&(nb->atdat->f));
    freeDeviceBuffer(&(nb->atdat->eLJ));
    freeDeviceBuffer(&(nb->atdat->eElec));
    freeDeviceBuffer(&(nb->atdat->fShift));
    freeDeviceBuffer(&(nb->atdat->shiftVec));
    if (useLjCombRule(nb->nbparam->vdwType))
    {
        freeDeviceBuffer(&atdat->ljComb);
    }
    else
    {
        freeDeviceBuffer(&atdat->atomTypes);
    }

    /* Free nbparam */
    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
    }

    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
    }

    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    delete plist;
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        delete plist_nl;
    }

    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    delete atdat;
    delete nbparam;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
    }
}

DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
{
    GMX_ASSERT(nb != nullptr, "nb pointer must be valid");

    return nb->atdat->f;
}

} // namespace Nbnxm