Unify nbnxn_gpu_init_x_to_nbat_x
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 /*! \internal \file
37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
38  *
39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
42  *  \author Szilárd Páll <pall.szilard@gmail.com>
43  *  \author Artem Zhmurov <zhmurov@gmail.com>
44  *
45  *  \ingroup module_nbnxm
46  */
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #if GMX_GPU_CUDA
52 #    include "cuda/nbnxm_cuda_types.h"
53 #endif
54
55 #if GMX_GPU_OPENCL
56 #    include "opencl/nbnxm_ocl_types.h"
57 #endif
58
59 #if GMX_GPU_SYCL
60 #    include "sycl/nbnxm_sycl_types.h"
61 #endif
62
63 #include "nbnxm_gpu_data_mgmt.h"
64
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/pmalloc.h"
67 #include "gromacs/hardware/device_information.h"
68 #include "gromacs/mdtypes/interaction_const.h"
69 #include "gromacs/mdtypes/simulation_workload.h"
70 #include "gromacs/nbnxm/gpu_common_utils.h"
71 #include "gromacs/nbnxm/gpu_data_mgmt.h"
72 #include "gromacs/nbnxm/gridset.h"
73 #include "gromacs/pbcutil/ishift.h"
74 #include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/utility/cstringutil.h"
77 #include "gromacs/utility/exceptions.h"
78 #include "gromacs/utility/fatalerror.h"
79
80 #include "nbnxm_gpu.h"
81 #include "pairlistsets.h"
82
83 namespace Nbnxm
84 {
85
86 inline void issueClFlushInStream(const DeviceStream& deviceStream)
87 {
88 #if GMX_GPU_OPENCL
89     /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
90      * in the stream after marking an event in it in order to be able to sync with
91      * the event from another stream.
92      */
93     cl_int cl_error = clFlush(deviceStream.stream());
94     if (cl_error != CL_SUCCESS)
95     {
96         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
97     }
98 #else
99     GMX_UNUSED_VALUE(deviceStream);
100 #endif
101 }
102
103 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
104                                     NBParamGpu*                  nbp,
105                                     const DeviceContext&         deviceContext)
106 {
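    /* A table may already be present when this is a re-initialization, e.g. after PME
       load balancing has changed the Ewald parameters; release it before rebuilding. */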
107     if (nbp->coulomb_tab)
108     {
109         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
110     }
111
112     nbp->coulomb_tab_scale = tables.scale;
113     initParamLookupTable(
114             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
115 }
116
117 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
118                                                const DeviceInformation gmx_unused& deviceInfo)
119 {
120     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
121
122     /* Benchmarking/development environment variables to force the use of the
123        analytical or tabulated Ewald kernels and of the twin-cutoff kernels. */
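    /* For example, running with GMX_GPU_NB_ANA_EWALD=1 in the environment forces the
       analytical kernel; only the presence of a variable is checked, not its value. */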
124     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
125     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
126     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
127
128     if (forceAnalyticalEwald && forceTabulatedEwald)
129     {
130         gmx_incons(
131                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
132                 "requested through environment variables.");
133     }
134
135     /* By default, use analytical Ewald, except with CUDA on NVIDIA CC 7.0 and 8.0,
136      * where the tabulated kernel tends to perform better. */
137     const bool c_useTabulatedEwaldDefault =
138 #if GMX_GPU_CUDA
139             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
140             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
141 #else
142             false;
143 #endif
144     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
145     if (forceAnalyticalEwald)
146     {
147         bUseAnalyticalEwald = true;
148         if (debug)
149         {
150             fprintf(debug, "Using analytical Ewald GPU kernels\n");
151         }
152     }
153     else if (forceTabulatedEwald)
154     {
155         bUseAnalyticalEwald = false;
156
157         if (debug)
158         {
159             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
160         }
161     }
162
163     /* Use twin cut-off kernels if required by bTwinCut or if the environment
164        variable forces it (the latter for debugging/benchmarking only). */
165     if (!bTwinCut && !forceTwinCutoffEwald)
166     {
167         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
168     }
169     else
170     {
171         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
172     }
173 }
174
175 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
176 {
177     nbp->ewald_beta        = ic.ewaldcoeff_q;
178     nbp->sh_ewald          = ic.sh_ewald;
179     nbp->epsfac            = ic.epsfac;
180     nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
181     nbp->c_rf              = ic.reactionFieldShift;
182     nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
183     nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
184     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
185     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
186     nbp->useDynamicPruning = listParams.useDynamicPruning;
187
188     nbp->sh_lj_ewald   = ic.sh_lj_ewald;
189     nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
190
191     nbp->rvdw_switch      = ic.rvdw_switch;
192     nbp->dispersion_shift = ic.dispersion_shift;
193     nbp->repulsion_shift  = ic.repulsion_shift;
194     nbp->vdw_switch       = ic.vdw_switch;
195 }
196
197 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
198 {
199     if (!nbv || !nbv->useGpu())
200     {
201         return;
202     }
203     NbnxmGpu*   nb  = nbv->gpu_nbv;
204     NBParamGpu* nbp = nb->nbparam;
205
206     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
207
208     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
209
210     GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
211     init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
212 }
213
214 void init_plist(gpu_plist* pl)
215 {
216     /* Initialize to nullptr the pointers to data that is not allocated here and
217        will need (re)allocation in gpu_init_pairlist */
218     pl->sci   = nullptr;
219     pl->cj4   = nullptr;
220     pl->imask = nullptr;
221     pl->excl  = nullptr;
222
223     /* size -1 indicates that the respective array hasn't been initialized yet */
224     pl->na_c                   = -1;
225     pl->nsci                   = -1;
226     pl->sci_nalloc             = -1;
227     pl->ncj4                   = -1;
228     pl->cj4_nalloc             = -1;
229     pl->nimask                 = -1;
230     pl->imask_nalloc           = -1;
231     pl->nexcl                  = -1;
232     pl->excl_nalloc            = -1;
233     pl->haveFreshList          = false;
234     pl->rollingPruningNumParts = 0;
235     pl->rollingPruningPart     = 0;
236 }
237
238 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
239 {
240     t->nb_h2d_t = 0.0;
241     t->nb_d2h_t = 0.0;
242     t->nb_c     = 0;
243     t->pl_h2d_t = 0.0;
244     t->pl_h2d_c = 0;
245     for (int i = 0; i < 2; i++)
246     {
247         for (int j = 0; j < 2; j++)
248         {
249             t->ktime[i][j].t = 0.0;
250             t->ktime[i][j].c = 0;
251         }
252     }
253     t->pruneTime.c        = 0;
254     t->pruneTime.t        = 0.0;
255     t->dynamicPruneTime.c = 0;
256     t->dynamicPruneTime.t = 0.0;
257 }
258
259 /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
260 static void initAtomdataFirst(NBAtomData*          atomdata,
261                               int                  numTypes,
262                               const DeviceContext& deviceContext,
263                               const DeviceStream&  localStream)
264 {
265     atomdata->numTypes = numTypes;
266     allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
267     atomdata->shiftVecUploaded = false;
268
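    /* Shift-force and (single-element) energy accumulation buffers; cleared below so
       that accumulation starts from zero. */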
269     allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
270     allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
271     allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
272
273     clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
274     clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
275     clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
276
277     /* Initialize to nullptr the pointers to data that is not allocated here and
278        will need allocation later */
279     atomdata->xq = nullptr;
280     atomdata->f  = nullptr;
281
282     /* size -1 indicates that the respective array hasn't been initialized yet */
283     atomdata->numAtoms      = -1;
284     atomdata->numAtomsAlloc = -1;
285 }
286
287 /*! \brief Initialize the nonbonded parameter data structure. */
288 static void initNbparam(NBParamGpu*                     nbp,
289                         const interaction_const_t&      ic,
290                         const PairlistParams&           listParams,
291                         const nbnxn_atomdata_t::Params& nbatParams,
292                         const DeviceContext&            deviceContext)
293 {
294     const int numTypes = nbatParams.numTypes;
295
296     set_cutoff_parameters(nbp, ic, listParams);
297
298     nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
299     nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
300
301     if (ic.vdwtype == VanDerWaalsType::Pme)
302     {
303         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
304         {
305             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
306                        "Combination rule mismatch!");
307         }
308         else
309         {
310             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
311                        "Combination rule mismatch!");
312         }
313     }
314
315     /* generate table for PME */
316     if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
317     {
318         GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
319         init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
320     }
321     else
322     {
323         // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
324         allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
325     }
326
327     /* set up LJ parameter lookup table */
328     if (!useLjCombRule(nbp->vdwType))
329     {
330         static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
331                       "Mismatch in the size of host / device data types");
332         initParamLookupTable(&nbp->nbfp,
333                              &nbp->nbfp_texobj,
334                              reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
335                              numTypes * numTypes,
336                              deviceContext);
337     }
338     else
339     {
340         // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
341         allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
342     }
343
344     /* set up LJ-PME parameter lookup table */
345     if (ic.vdwtype == VanDerWaalsType::Pme)
346     {
347         static_assert(sizeof(decltype(nbp->nbfp_comb))
348                               == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
349                       "Mismatch in the size of host / device data types");
350         initParamLookupTable(&nbp->nbfp_comb,
351                              &nbp->nbfp_comb_texobj,
352                              reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
353                              numTypes,
354                              deviceContext);
355     }
356     else
357     {
358         // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
359         allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
360     }
361 }
362
363 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
364                    const interaction_const_t*      ic,
365                    const PairlistParams&           listParams,
366                    const nbnxn_atomdata_t*         nbat,
367                    const bool                      bLocalAndNonlocal)
368 {
369     auto* nb                              = new NbnxmGpu();
370     nb->deviceContext_                    = &deviceStreamManager.context();
371     nb->atdat                             = new NBAtomData;
372     nb->nbparam                           = new NBParamGpu;
373     nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
374     if (bLocalAndNonlocal)
375     {
376         nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
377     }
378
379     nb->bUseTwoStreams = bLocalAndNonlocal;
380
381     nb->timers = new Nbnxm::GpuTimers();
382     snew(nb->timings, 1);
383
384     /* WARNING: CUDA timings are incorrect with multiple streams.
385      * This is the main reason why they are disabled by default.
386      * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
387      * TODO: Consider turning on by default when we can detect nr of streams.
388      *
389      * OpenCL timing is enabled by default and can be disabled by
390      * GMX_DISABLE_GPU_TIMING environment variable.
391      *
392      * Timing is disabled in SYCL.
393      */
394     nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
395                   || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
396
397     if (nb->bDoTime)
398     {
399         init_timings(nb->timings);
400     }
401
402     /* init nbst */
403     pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
404     pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
405     pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
406
407     init_plist(nb->plist[InteractionLocality::Local]);
408
409     /* local/non-local GPU streams */
410     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
411                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
412     const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
413     nb->deviceStreams[InteractionLocality::Local] = &localStream;
414     // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
415     // out-of-order. But for the time being, it will be less disruptive to keep them.
416     if (nb->bUseTwoStreams)
417     {
418         init_plist(nb->plist[InteractionLocality::NonLocal]);
419
420         GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
421                            "Non-local non-bonded stream should be initialized to use GPU for "
422                            "non-bonded with domain decomposition.");
423         nb->deviceStreams[InteractionLocality::NonLocal] =
424                 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
425     }
426
427     const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
428     const DeviceContext&            deviceContext = *nb->deviceContext_;
429
430     initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
431     initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
432
433     gpu_init_platform_specific(nb);
434
435     if (debug)
436     {
437         fprintf(debug, "Initialized NBNXM GPU data structures.\n");
438     }
439
440     return nb;
441 }
442
443 //! This function is documented in the header file
444 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
445 {
446     char sbuf[STRLEN];
447     // Timing accumulation should happen only if there was work to do
448     // because getLastRangeTime() gets skipped with empty lists later
449     // which leads to the counter not being reset.
450     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
451     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
452     gpu_plist*          d_plist      = nb->plist[iloc];
453
454     if (d_plist->na_c < 0)
455     {
456         d_plist->na_c = h_plist->na_ci;
457     }
458     else
459     {
460         if (d_plist->na_c != h_plist->na_ci)
461         {
462             sprintf(sbuf,
463                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
464                     d_plist->na_c,
465                     h_plist->na_ci);
466             gmx_incons(sbuf);
467         }
468     }
469
470     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
471
472     if (bDoTime)
473     {
474         iTimers.pl_h2d.openTimingRegion(deviceStream);
475         iTimers.didPairlistH2D = true;
476     }
477
478     // TODO most of this function is the same in CUDA and OpenCL, move it into the header
479     const DeviceContext& deviceContext = *nb->deviceContext_;
480
481     reallocateDeviceBuffer(
482             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
483     copyToDeviceBuffer(&d_plist->sci,
484                        h_plist->sci.data(),
485                        0,
486                        h_plist->sci.size(),
487                        deviceStream,
488                        GpuApiCallBehavior::Async,
489                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
490
491     reallocateDeviceBuffer(
492             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
493     copyToDeviceBuffer(&d_plist->cj4,
494                        h_plist->cj4.data(),
495                        0,
496                        h_plist->cj4.size(),
497                        deviceStream,
498                        GpuApiCallBehavior::Async,
499                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
500
501     reallocateDeviceBuffer(&d_plist->imask,
502                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
503                            &d_plist->nimask,
504                            &d_plist->imask_nalloc,
505                            deviceContext);
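    /* Note: unlike sci/cj4/excl, no host data is copied into imask here; the buffer is
       only (re)allocated. */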
506
507     reallocateDeviceBuffer(
508             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
509     copyToDeviceBuffer(&d_plist->excl,
510                        h_plist->excl.data(),
511                        0,
512                        h_plist->excl.size(),
513                        deviceStream,
514                        GpuApiCallBehavior::Async,
515                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
516
517     if (bDoTime)
518     {
519         iTimers.pl_h2d.closeTimingRegion(deviceStream);
520     }
521
522     /* need to prune the pair list during the next step */
523     d_plist->haveFreshList = true;
524 }
525
526 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
527 {
528     bool                 bDoTime       = nb->bDoTime;
529     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
530     NBAtomData*          atdat         = nb->atdat;
531     const DeviceContext& deviceContext = *nb->deviceContext_;
532     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
533
534     int  numAtoms  = nbat->numAtoms();
535     bool realloced = false;
536
537     if (bDoTime)
538     {
539         /* time async copy */
540         timers->atdat.openTimingRegion(localStream);
541     }
542
543     /* Reallocate if we have to copy more atoms than there is space for; only
544        allocate (without freeing first) if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
545     if (numAtoms > atdat->numAtomsAlloc)
546     {
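        /* Over-allocate so that moderate growth of the atom count does not trigger a
           new reallocation every time the atom count changes. */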
547         int numAlloc = over_alloc_small(numAtoms);
548
549         /* free up first if the arrays have already been initialized */
550         if (atdat->numAtomsAlloc != -1)
551         {
552             freeDeviceBuffer(&atdat->f);
553             freeDeviceBuffer(&atdat->xq);
554             if (useLjCombRule(nb->nbparam->vdwType))
555             {
556                 freeDeviceBuffer(&atdat->ljComb);
557             }
558             else
559             {
560                 freeDeviceBuffer(&atdat->atomTypes);
561             }
562         }
563
564
565         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
566         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
567
568         if (useLjCombRule(nb->nbparam->vdwType))
569         {
570             // Two Lennard-Jones parameters per atom
571             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
572         }
573         else
574         {
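            /* One atom-type index per atom; the LJ parameters are looked up from the
               nbfp table set up in initNbparam(). */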
575             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
576         }
577
578         atdat->numAtomsAlloc = numAlloc;
579         realloced            = true;
580     }
581
582     atdat->numAtoms      = numAtoms;
583     atdat->numAtomsLocal = nbat->natoms_local;
584
585     /* need to clear GPU f output if realloc happened */
586     if (realloced)
587     {
588         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
589     }
590
591     if (useLjCombRule(nb->nbparam->vdwType))
592     {
593         static_assert(
594                 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
595                 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
596         copyToDeviceBuffer(&atdat->ljComb,
597                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
598                            0,
599                            numAtoms,
600                            localStream,
601                            GpuApiCallBehavior::Async,
602                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
603     }
604     else
605     {
606         static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
607                       "Sizes of host- and device-side atom types should be the same.");
608         copyToDeviceBuffer(&atdat->atomTypes,
609                            nbat->params().type.data(),
610                            0,
611                            numAtoms,
612                            localStream,
613                            GpuApiCallBehavior::Async,
614                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
615     }
616
617     if (bDoTime)
618     {
619         timers->atdat.closeTimingRegion(localStream);
620     }
621
622     /* kick off the tasks enqueued above to ensure concurrency with the search */
623     issueClFlushInStream(localStream);
624 }
625
626 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
627 {
628     NBAtomData*         adat        = nb->atdat;
629     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
630     // Clear forces
631     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
632     // Clear shift force array and energies if the outputs were used in the current step
633     if (computeVirial)
634     {
635         clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
636         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
637         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
638     }
639     issueClFlushInStream(localStream);
640 }
641
642 //! This function is documented in the header file
643 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
644 {
645     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
646 }
647
648 //! This function is documented in the header file
649 void gpu_reset_timings(nonbonded_verlet_t* nbv)
650 {
651     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
652     {
653         init_timings(nbv->gpu_nbv->timings);
654     }
655 }
656
657 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
658 {
659     return ((nb->nbparam->elecType == ElecType::EwaldAna)
660             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
661 }
662
663 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
664                                                    const DeviceInformation&   deviceInfo)
665 {
666     if (ic.eeltype == CoulombInteractionType::Cut)
667     {
668         return ElecType::Cut;
669     }
670     else if (EEL_RF(ic.eeltype))
671     {
672         return ElecType::RF;
673     }
674     else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
675     {
676         return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
677     }
678     else
679     {
680         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
681         GMX_THROW(gmx::InconsistentInputError(
682                 gmx::formatString("The requested electrostatics type %s is not implemented in "
683                                   "the GPU accelerated kernels!",
684                                   enumValueToString(ic.eeltype))));
685     }
686 }
687
688
689 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
690 {
691     if (ic.vdwtype == VanDerWaalsType::Cut)
692     {
693         switch (ic.vdw_modifier)
694         {
695             case InteractionModifiers::None:
696             case InteractionModifiers::PotShift:
697                 switch (ljCombinationRule)
698                 {
699                     case LJCombinationRule::None: return VdwType::Cut;
700                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
701                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
702                     default:
703                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
704                                 "The requested LJ combination rule %s is not implemented in "
705                                 "the GPU accelerated kernels!",
706                                 enumValueToString(ljCombinationRule))));
707                 }
708             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
709             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
710             default:
711                 GMX_THROW(gmx::InconsistentInputError(
712                         gmx::formatString("The requested VdW interaction modifier %s is not "
713                                           "implemented in the GPU accelerated kernels!",
714                                           enumValueToString(ic.vdw_modifier))));
715         }
716     }
717     else if (ic.vdwtype == VanDerWaalsType::Pme)
718     {
719         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
720         {
721             assert(ljCombinationRule == LJCombinationRule::Geometric);
722             return VdwType::EwaldGeom;
723         }
724         else
725         {
726             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
727             return VdwType::EwaldLB;
728         }
729     }
730     else
731     {
732         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
733                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
734                 enumValueToString(ic.vdwtype))));
735     }
736 }
737
738 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
739 {
740     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
741
742     // There is short-range work if the pair list for the provided
743     // interaction locality contains entries or if there is any
744     // bonded work (as this is not split into local/nonlocal).
745     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
746                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
747 }
748
749 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
750 {
751     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
752
753     return nb->haveWork[interactionLocality];
754 }
755
756 /*! \brief
757  * Launch asynchronously the download of nonbonded forces from the GPU
758  * (and energies/shift forces if required).
759  */
760 void gpu_launch_cpyback(NbnxmGpu*                nb,
761                         struct nbnxn_atomdata_t* nbatom,
762                         const gmx::StepWorkload& stepWork,
763                         const AtomLocality       atomLocality)
764 {
765     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
766
767     /* determine interaction locality from atom locality */
768     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
769     GMX_ASSERT(iloc == InteractionLocality::Local
770                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
771                "Non-local stream is indicating that the copy back event is enqueued at the "
772                "beginning of the copy back function.");
773
774     /* extract the data */
775     NBAtomData*         adat         = nb->atdat;
776     Nbnxm::GpuTimers*   timers       = nb->timers;
777     bool                bDoTime      = nb->bDoTime;
778     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
779
780     /* don't launch non-local copy-back if there was no non-local work to do */
781     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
782     {
783         /* TODO An alternative way to signal that non-local work is
784            complete is to use a clEnqueueMarker+clEnqueueBarrier
785            pair. However, the use of bNonLocalStreamDoneMarked has the
786            advantage of being local to the host, so probably minimizes
787            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
788            test case, overall simulation performance was higher with
789            the API calls, but this has not been tested on AMD OpenCL,
790            so could be worth considering in future. */
791         nb->bNonLocalStreamDoneMarked = false;
792         return;
793     }
794
795     /* local/nonlocal offset and length used for xq and f */
796     auto atomsRange = getGpuAtomRange(adat, atomLocality);
797
798     /* beginning of timed D2H section */
799     if (bDoTime)
800     {
801         timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
802     }
803
804     /* With DD the local D2H transfer can only start after the non-local
805        has been launched. */
806     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
807     {
808         nb->nonlocal_done.enqueueWaitEvent(deviceStream);
809         nb->bNonLocalStreamDoneMarked = false;
810     }
811
812     /* DtoH f */
813     static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
814                   "The host force buffer should be in single precision to match device data size.");
815     copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
816                          &adat->f,
817                          atomsRange.begin(),
818                          atomsRange.size(),
819                          deviceStream,
820                          GpuApiCallBehavior::Async,
821                          bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
822
823     issueClFlushInStream(deviceStream);
824
825     /* After the non-local D2H is launched, the nonlocal_done event can be
826        recorded, which signals that the local D2H can proceed. This event is not
827        placed after the non-local kernel because we need the non-local
828        data back first. */
829     if (iloc == InteractionLocality::NonLocal)
830     {
831         nb->nonlocal_done.markEvent(deviceStream);
832         nb->bNonLocalStreamDoneMarked = true;
833     }
834
835     /* only transfer energies in the local stream */
836     if (iloc == InteractionLocality::Local)
837     {
838         /* DtoH fshift when virial is needed */
839         if (stepWork.computeVirial)
840         {
841             static_assert(
842                     sizeof(*nb->nbst.fShift) == sizeof(Float3),
843                     "Sizes of host- and device-side shift vector elements should be the same.");
844             copyFromDeviceBuffer(nb->nbst.fShift,
845                                  &adat->fShift,
846                                  0,
847                                  SHIFTS,
848                                  deviceStream,
849                                  GpuApiCallBehavior::Async,
850                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
851         }
852
853         /* DtoH energies */
854         if (stepWork.computeEnergy)
855         {
856             static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
857                           "Sizes of host- and device-side LJ energy terms should be the same.");
858             copyFromDeviceBuffer(nb->nbst.eLJ,
859                                  &adat->eLJ,
860                                  0,
861                                  1,
862                                  deviceStream,
863                                  GpuApiCallBehavior::Async,
864                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
865             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
866                           "Sizes of host- and device-side electrostatic energy terms should be the "
867                           "same.");
868             copyFromDeviceBuffer(nb->nbst.eElec,
869                                  &adat->eElec,
870                                  0,
871                                  1,
872                                  deviceStream,
873                                  GpuApiCallBehavior::Async,
874                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
875         }
876     }
877
878     if (bDoTime)
879     {
880         timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
881     }
882 }
883
884 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
885 {
886     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
887
888     /* When we get here all misc operations issued in the local stream as well as
889        the local xq H2D are done,
890        so we record that in the local stream and wait for it in the nonlocal one.
891        This wait needs to precede any PP tasks, bonded or nonbonded, that may
892        compute on interactions between local and nonlocal atoms.
893      */
894     if (nb->bUseTwoStreams)
895     {
896         if (interactionLocality == InteractionLocality::Local)
897         {
898             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
899             issueClFlushInStream(deviceStream);
900         }
901         else
902         {
903             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
904         }
905     }
906 }
907
908 /*! \brief Launch asynchronously the xq buffer host to device copy. */
909 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
910 {
911     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
912
913     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
914
915     NBAtomData*         adat         = nb->atdat;
916     gpu_plist*          plist        = nb->plist[iloc];
917     Nbnxm::GpuTimers*   timers       = nb->timers;
918     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
919
920     const bool bDoTime = nb->bDoTime;
921
922     /* Don't launch the non-local H2D copy if there is no dependent
923        work to do: neither non-local nor other (e.g. bonded) work
924        that has the nbnxn coordinates as input.
925        Doing the same for the local kernel is more complicated, since the
926        local part of the force array also depends on the non-local kernel.
927        So to avoid complicating the code and to reduce the risk of bugs,
928        we always call the local x+q copy (and the rest of the local
929        work in nbnxn_gpu_launch_kernel()).
930      */
931     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
932     {
933         plist->haveFreshList = false;
934
935         // The event is marked for Local interactions unconditionally,
936         // so it has to be released here because of the early return
937         // for NonLocal interactions.
938         nb->misc_ops_and_local_H2D_done.reset();
939
940         return;
941     }
942
943     /* local/nonlocal offset and length used for xq and f */
944     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
945
946     /* beginning of timed HtoD section */
947     if (bDoTime)
948     {
949         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
950     }
951
952     /* HtoD x, q */
953     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
954                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
955     copyToDeviceBuffer(&adat->xq,
956                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
957                        atomsRange.begin(),
958                        atomsRange.size(),
959                        deviceStream,
960                        GpuApiCallBehavior::Async,
961                        nullptr);
962
963     if (bDoTime)
964     {
965         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
966     }
967
968     /* When we get here all misc operations issued in the local stream as well as
969        the local xq H2D are done,
970        so we record that in the local stream and wait for it in the nonlocal one.
971        This wait needs to precede any PP tasks, bonded or nonbonded, that may
972        compute on interactions between local and nonlocal atoms.
973      */
974     nbnxnInsertNonlocalGpuDependency(nb, iloc);
975 }
976
977
978 /* Initialization for X buffer operations on GPU. */
979 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
980 {
981     const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
982     const bool          bDoTime       = gpu_nbv->bDoTime;
983     const int           maxNumColumns = gridSet.numColumnsMax();
984
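    /* Per-grid-column data used by the coordinate layout conversion (x -> nbat x):
       cxy_na holds the number of atoms in each column and cxy_ind the index of the
       first cell in each column. */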
985     reallocateDeviceBuffer(&gpu_nbv->cxy_na,
986                            maxNumColumns * gridSet.grids().size(),
987                            &gpu_nbv->ncxy_na,
988                            &gpu_nbv->ncxy_na_alloc,
989                            *gpu_nbv->deviceContext_);
990     reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
991                            maxNumColumns * gridSet.grids().size(),
992                            &gpu_nbv->ncxy_ind,
993                            &gpu_nbv->ncxy_ind_alloc,
994                            *gpu_nbv->deviceContext_);
995
996     for (unsigned int g = 0; g < gridSet.grids().size(); g++)
997     {
998         const Nbnxm::Grid& grid = gridSet.grids()[g];
999
1000         const int  numColumns      = grid.numColumns();
1001         const int* atomIndices     = gridSet.atomIndices().data();
1002         const int  atomIndicesSize = gridSet.atomIndices().size();
1003         const int* cxy_na          = grid.cxy_na().data();
1004         const int* cxy_ind         = grid.cxy_ind().data();
1005
1006         auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
1007
1008         reallocateDeviceBuffer(&gpu_nbv->atomIndices,
1009                                atomIndicesSize,
1010                                &gpu_nbv->atomIndicesSize,
1011                                &gpu_nbv->atomIndicesSize_alloc,
1012                                *gpu_nbv->deviceContext_);
1013
1014         if (atomIndicesSize > 0)
1015         {
1016             if (bDoTime)
1017             {
1018                 timerH2D->openTimingRegion(localStream);
1019             }
1020
1021             copyToDeviceBuffer(&gpu_nbv->atomIndices,
1022                                atomIndices,
1023                                0,
1024                                atomIndicesSize,
1025                                localStream,
1026                                GpuApiCallBehavior::Async,
1027                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1028
1029             if (bDoTime)
1030             {
1031                 timerH2D->closeTimingRegion(localStream);
1032             }
1033         }
1034
1035         if (numColumns > 0)
1036         {
1037             if (bDoTime)
1038             {
1039                 timerH2D->openTimingRegion(localStream);
1040             }
1041
1042             copyToDeviceBuffer(&gpu_nbv->cxy_na,
1043                                cxy_na,
1044                                maxNumColumns * g,
1045                                numColumns,
1046                                localStream,
1047                                GpuApiCallBehavior::Async,
1048                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1049
1050             if (bDoTime)
1051             {
1052                 timerH2D->closeTimingRegion(localStream);
1053             }
1054
1055             if (bDoTime)
1056             {
1057                 timerH2D->openTimingRegion(localStream);
1058             }
1059
1060             copyToDeviceBuffer(&gpu_nbv->cxy_ind,
1061                                cxy_ind,
1062                                maxNumColumns * g,
1063                                numColumns,
1064                                localStream,
1065                                GpuApiCallBehavior::Async,
1066                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1067
1068             if (bDoTime)
1069             {
1070                 timerH2D->closeTimingRegion(localStream);
1071             }
1072         }
1073     }
1074
1075     // The above data is transferred on the local stream but is a
1076     // dependency of the nonlocal stream (specifically the nonlocal X
1077     // buf ops kernel).  We therefore set a dependency to ensure
1078     // that the nonlocal stream waits on the local stream here.
1079     // This call records an event in the local stream:
1080     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
1081     // ...and this call instructs the nonlocal stream to wait on that event:
1082     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1083 }
1084
1085 } // namespace Nbnxm