Remove incorrect assertion for GPU timing in SYCL
[alexxy/gromacs.git] / src / gromacs / nbnxm / nbnxm_gpu_data_mgmt.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

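/*! \brief On OpenCL, flush the given stream so that an event marked in it can be waited on from other streams; no-op for the CUDA and SYCL backends. */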
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on section 5.13 of the OpenCL v1.2 spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}

void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}

enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if the cut-offs differ (bTwinCut) or if the
       env. var. forces it (the latter is for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic.ewaldcoeff_q;
    nbp->sh_ewald          = ic.sh_ewald;
    nbp->epsfac            = ic.epsfac;
    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
    nbp->c_rf              = ic.reactionFieldShift;
    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;

    nbp->rvdw_switch      = ic.rvdw_switch;
    nbp->dispersion_shift = ic.dispersion_shift;
    nbp->repulsion_shift  = ic.repulsion_shift;
    nbp->vdw_switch       = ic.vdw_switch;
}

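/*! \brief Update the device cut-off parameters, Ewald kernel flavor and Coulomb correction table after PME load balancing changed the Coulomb cut-off. */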
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }
    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
}

void init_plist(gpu_plist* pl)
{
    /* Initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation in nbnxn_gpu_init_pairlist. */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c                   = -1;
    pl->nsci                   = -1;
    pl->sci_nalloc             = -1;
    pl->ncj4                   = -1;
    pl->cj4_nalloc             = -1;
    pl->nimask                 = -1;
    pl->imask_nalloc           = -1;
    pl->nexcl                  = -1;
    pl->excl_nalloc            = -1;
    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

/*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
static void initAtomdataFirst(NBAtomData*          atomdata,
                              int                  numTypes,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream)
{
    atomdata->numTypes = numTypes;
    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
    atomdata->shiftVecUploaded = false;

    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);

    /* Initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation later. */
    atomdata->xq = nullptr;
    atomdata->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    atomdata->numAtoms      = -1;
    atomdata->numAtomsAlloc = -1;
}

/*! \brief Initialize the nonbonded parameter data structure. */
static void initNbparam(NBParamGpu*                     nbp,
                        const interaction_const_t&      ic,
                        const PairlistParams&           listParams,
                        const nbnxn_atomdata_t::Params& nbatParams,
                        const DeviceContext&            deviceContext)
{
    const int numTypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);

    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());

    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
                       "Combination rule mismatch!");
        }
        else
        {
            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
                       "Combination rule mismatch!");
        }
    }

    /* generate table for PME */
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
    }

    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             numTypes * numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
    }

    /* set up LJ-PME parameter lookup table */
    if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             numTypes,
                             deviceContext);
    }
    else
    {
        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
        allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
    }
}

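/*! \brief Allocate and initialize the NBNXM GPU data structures: parameters, pair list(s), timers, staging buffers and per-locality streams. */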
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   const bool                      bLocalAndNonlocal)
{
    auto* nb                              = new NbnxmGpu();
    nb->deviceContext_                    = &deviceStreamManager.context();
    nb->atdat                             = new NBAtomData;
    nb->nbparam                           = new NBParamGpu;
    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
    if (bLocalAndNonlocal)
    {
        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;

    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);

    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     * They can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
     * TODO: Consider turning on by default when we can detect the number of streams.
     *
     * OpenCL timing is enabled by default and can be disabled with the
     * GMX_DISABLE_GPU_TIMING environment variable.
     *
     * Timing is disabled in SYCL.
     */
    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }

    /* init nbst */
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));

    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
    // out-of-order. But for the time being, it will be less disruptive to keep them.
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);

    gpu_init_platform_specific(nb);

    if (debug)
    {
        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
    }

    return nb;
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do,
    // because getLastRangeTime() gets skipped with empty lists later,
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

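/*! \brief Set up (and reallocate when needed) the device atom-data buffers and upload the atom types or per-atom LJ combination parameters. */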
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available; only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            if (useLjCombRule(nb->nbparam->vdwType))
            {
                freeDeviceBuffer(&atdat->ljComb);
            }
            else
            {
                freeDeviceBuffer(&atdat->atomTypes);
            }
        }


        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

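/*! \brief Clear the device force output buffer and, when the virial is computed, also the shift-force and energy accumulators. */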
void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
    // Clear forces
    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
    // Clear shift force array and energies if the outputs were used in the current step
    if (computeVirial)
    {
        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
    }
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic.eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic.eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic.eeltype))));
    }
}


enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
{
    if (ic.vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic.vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic.vdw_modifier))));
        }
    }
    else if (ic.vdwtype == VanDerWaalsType::Pme)
    {
        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic.vdwtype))));
    }
}

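/*! \brief Record whether the given interaction locality has any short-range work for the GPU: pair-list entries or (unsplit) bonded interactions. */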
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}

bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return nb->haveWork[interactionLocality];
}

/*! \brief
 * Launch asynchronously the download of nonbonded forces from the GPU
 * (and energies/shift forces if required).
 */
void gpu_launch_cpyback(NbnxmGpu*                nb,
                        struct nbnxn_atomdata_t* nbatom,
                        const gmx::StepWorkload& stepWork,
                        const AtomLocality       atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    /* determine interaction locality from atom locality */
    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
    GMX_ASSERT(iloc == InteractionLocality::Local
                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
               "Non-local stream is indicating that the copy back event is enqueued at the "
               "beginning of the copy back function.");

    /* extract the data */
    NBAtomData*         adat         = nb->atdat;
    Nbnxm::GpuTimers*   timers       = nb->timers;
    bool                bDoTime      = nb->bDoTime;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    /* don't launch non-local copy-back if there was no non-local work to do */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        /* TODO An alternative way to signal that non-local work is
           complete is to use a clEnqueueMarker+clEnqueueBarrier
           pair. However, the use of bNonLocalStreamDoneMarked has the
           advantage of being local to the host, so probably minimizes
           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
           test case, overall simulation performance was higher with
           the API calls, but this has not been tested on AMD OpenCL,
           so could be worth considering in future. */
        nb->bNonLocalStreamDoneMarked = false;
        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
    {
        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = false;
    }

    /* DtoH f */
    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                  "The host force buffer should be in single precision to match device data size.");
    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                         &adat->f,
                         atomsRange.begin(),
                         atomsRange.size(),
                         deviceStream,
                         GpuApiCallBehavior::Async,
                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);

    issueClFlushInStream(deviceStream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we need the non-local data
       back first. */
    if (iloc == InteractionLocality::NonLocal)
    {
        nb->nonlocal_done.markEvent(deviceStream);
        nb->bNonLocalStreamDoneMarked = true;
    }

    /* only transfer energies in the local stream */
    if (iloc == InteractionLocality::Local)
    {
        /* DtoH fshift when virial is needed */
        if (stepWork.computeVirial)
        {
            static_assert(
                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
                    "Sizes of host- and device-side shift vector elements should be the same.");
            copyFromDeviceBuffer(nb->nbst.fShift,
                                 &adat->fShift,
                                 0,
                                 SHIFTS,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }

        /* DtoH energies */
        if (stepWork.computeEnergy)
        {
            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
                          "Sizes of host- and device-side LJ energy terms should be the same.");
            copyFromDeviceBuffer(nb->nbst.eLJ,
                                 &adat->eLJ,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                          "Sizes of host- and device-side electrostatic energy terms should be the "
                          "same.");
            copyFromDeviceBuffer(nb->nbst.eElec,
                                 &adat->eElec,
                                 0,
                                 1,
                                 deviceStream,
                                 GpuApiCallBehavior::Async,
                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
        }
    }

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
    }
}

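/*! \brief Record an event in the local stream after the misc ops and local xq H2D copy and make the non-local stream wait on it; only active when two streams are in use. */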
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = atomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       that has the nbnxn coordinates as input.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done,
       so we record that in the local stream and wait for it in the nonlocal one.
       This wait needs to precede any PP tasks, bonded or nonbonded, that may
       compute on interactions between local and nonlocal atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

} // namespace Nbnxm