Unify gpu_upload_shiftvec(...) function in NBNXM
[alexxy/gromacs.git] / src / gromacs / nbnxm / nbnxm_gpu_data_mgmt.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 /*! \internal \file
37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
38  *
39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
42  *  \author Szilárd Páll <pall.szilard@gmail.com>
43  *  \author Artem Zhmurov <zhmurov@gmail.com>
44  *
45  *  \ingroup module_nbnxm
46  */
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #if GMX_GPU_CUDA
52 #    include "cuda/nbnxm_cuda_types.h"
53 #endif
54
55 #if GMX_GPU_OPENCL
56 #    include "opencl/nbnxm_ocl_types.h"
57 #endif
58
59 #if GMX_GPU_SYCL
60 #    include "sycl/nbnxm_sycl_types.h"
61 #endif
62
63 #include "nbnxm_gpu_data_mgmt.h"
64
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/gputraits.h"
67 #include "gromacs/gpu_utils/pmalloc.h"
68 #include "gromacs/hardware/device_information.h"
69 #include "gromacs/mdtypes/interaction_const.h"
70 #include "gromacs/mdtypes/simulation_workload.h"
71 #include "gromacs/nbnxm/gpu_common_utils.h"
72 #include "gromacs/nbnxm/gpu_data_mgmt.h"
73 #include "gromacs/nbnxm/gridset.h"
74 #include "gromacs/pbcutil/ishift.h"
75 #include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/pbcutil/ishift.h"
77 #include "gromacs/utility/cstringutil.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
80
81 #include "nbnxm_gpu.h"
82 #include "pairlistsets.h"
83
84 namespace Nbnxm
85 {
86
87 inline void issueClFlushInStream(const DeviceStream& deviceStream)
88 {
89 #if GMX_GPU_OPENCL
90     /* Based on section 5.13 of the OpenCL v1.2 spec, a flush is needed
91      * in the stream after marking an event in it, so that another stream
92      * is able to synchronize with that event.
93      */
94     cl_int cl_error = clFlush(deviceStream.stream());
95     if (cl_error != CL_SUCCESS)
96     {
97         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
98     }
99 #else
100     GMX_UNUSED_VALUE(deviceStream);
101 #endif
102 }
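
/* Usage sketch (illustrative only, mirroring the pattern used later in this file):
 * the flush is issued right after an event is marked in a stream that another
 * stream will later synchronize with, e.g.
 *
 *     nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
 *     issueClFlushInStream(deviceStream);
 *
 * With CUDA and SYCL the function body is empty apart from swallowing the argument.
 */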
103
104 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
105                                     NBParamGpu*                  nbp,
106                                     const DeviceContext&         deviceContext)
107 {
108     if (nbp->coulomb_tab)
109     {
110         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
111     }
112
113     nbp->coulomb_tab_scale = tables.scale;
114     initParamLookupTable(
115             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
116 }
117
118 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
119                                                const DeviceInformation gmx_unused& deviceInfo)
120 {
121     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
122
123     /* Benchmarking/development environment variables to force the use of
124        analytical or tabulated Ewald kernel. */
125     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
126     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
127     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
128
129     if (forceAnalyticalEwald && forceTabulatedEwald)
130     {
131         gmx_incons(
132                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
133                 "requested through environment variables.");
134     }
135
136     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
137      */
138     const bool c_useTabulatedEwaldDefault =
139 #if GMX_GPU_CUDA
140             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
141             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
142 #else
143             false;
144 #endif
145     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
146     if (forceAnalyticalEwald)
147     {
148         bUseAnalyticalEwald = true;
149         if (debug)
150         {
151             fprintf(debug, "Using analytical Ewald GPU kernels\n");
152         }
153     }
154     else if (forceTabulatedEwald)
155     {
156         bUseAnalyticalEwald = false;
157
158         if (debug)
159         {
160             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
161         }
162     }
163
164     /* Use twin cut-off kernels if bTwinCut requires it (rcoulomb != rvdw) or the
165        env. var. forces it (the latter is for debugging/benchmarking only). */
166     if (!bTwinCut && !forceTwinCutoffEwald)
167     {
168         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
169     }
170     else
171     {
172         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
173     }
174 }
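
/* Summary of the selection above (twin cut-off means rcoulomb != rvdw, or the
 * corresponding env. var. is set):
 *
 *                        single cut-off          twin cut-off
 *   analytical Ewald     ElecType::EwaldAna      ElecType::EwaldAnaTwin
 *   tabulated Ewald      ElecType::EwaldTab      ElecType::EwaldTabTwin
 */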
175
176 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
177 {
178     nbp->ewald_beta        = ic.ewaldcoeff_q;
179     nbp->sh_ewald          = ic.sh_ewald;
180     nbp->epsfac            = ic.epsfac;
181     nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
182     nbp->c_rf              = ic.reactionFieldShift;
183     nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
184     nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
185     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
186     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
187     nbp->useDynamicPruning = listParams.useDynamicPruning;
188
189     nbp->sh_lj_ewald   = ic.sh_lj_ewald;
190     nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
191
192     nbp->rvdw_switch      = ic.rvdw_switch;
193     nbp->dispersion_shift = ic.dispersion_shift;
194     nbp->repulsion_shift  = ic.repulsion_shift;
195     nbp->vdw_switch       = ic.vdw_switch;
196 }
197
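/* Note: PME load balancing can change the Coulomb cut-off and the Ewald coefficient,
 * so the device-side cut-off constants, the electrostatics kernel choice and the
 * tabulated Ewald correction are all refreshed here.
 */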
198 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
199 {
200     if (!nbv || !nbv->useGpu())
201     {
202         return;
203     }
204     NbnxmGpu*   nb  = nbv->gpu_nbv;
205     NBParamGpu* nbp = nb->nbparam;
206
207     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
208
209     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
210
211     GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
212     init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
213 }
214
215 void init_plist(gpu_plist* pl)
216 {
217     /* Initialize to nullptr the pointers to data that are not allocated here and
218        will need reallocation in gpu_init_pairlist */
219     pl->sci   = nullptr;
220     pl->cj4   = nullptr;
221     pl->imask = nullptr;
222     pl->excl  = nullptr;
223
224     /* size -1 indicates that the respective array hasn't been initialized yet */
225     pl->na_c                   = -1;
226     pl->nsci                   = -1;
227     pl->sci_nalloc             = -1;
228     pl->ncj4                   = -1;
229     pl->cj4_nalloc             = -1;
230     pl->nimask                 = -1;
231     pl->imask_nalloc           = -1;
232     pl->nexcl                  = -1;
233     pl->excl_nalloc            = -1;
234     pl->haveFreshList          = false;
235     pl->rollingPruningNumParts = 0;
236     pl->rollingPruningPart     = 0;
237 }
238
239 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
240 {
241     t->nb_h2d_t = 0.0;
242     t->nb_d2h_t = 0.0;
243     t->nb_c     = 0;
244     t->pl_h2d_t = 0.0;
245     t->pl_h2d_c = 0;
246     for (int i = 0; i < 2; i++)
247     {
248         for (int j = 0; j < 2; j++)
249         {
250             t->ktime[i][j].t = 0.0;
251             t->ktime[i][j].c = 0;
252         }
253     }
254     t->pruneTime.c        = 0;
255     t->pruneTime.t        = 0.0;
256     t->dynamicPruneTime.c = 0;
257     t->dynamicPruneTime.t = 0.0;
258 }
259
260 /*! \brief Initialize \p atomdata the first time; it only gets filled at pair-search. */
261 static void initAtomdataFirst(NBAtomData*          atomdata,
262                               int                  numTypes,
263                               const DeviceContext& deviceContext,
264                               const DeviceStream&  localStream)
265 {
266     atomdata->numTypes = numTypes;
267     allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
268     atomdata->shiftVecUploaded = false;
269
270     allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
271     allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
272     allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
273
274     clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
275     clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
276     clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
277
278     /* Initialize to nullptr the pointers to data that are not allocated here and
279        will need reallocation later */
280     atomdata->xq = nullptr;
281     atomdata->f  = nullptr;
282
283     /* size -1 indicates that the respective array hasn't been initialized yet */
284     atomdata->numAtoms      = -1;
285     atomdata->numAtomsAlloc = -1;
286 }
287
288 /*! \brief Initialize the nonbonded parameter data structure. */
289 static void initNbparam(NBParamGpu*                     nbp,
290                         const interaction_const_t&      ic,
291                         const PairlistParams&           listParams,
292                         const nbnxn_atomdata_t::Params& nbatParams,
293                         const DeviceContext&            deviceContext)
294 {
295     const int numTypes = nbatParams.numTypes;
296
297     set_cutoff_parameters(nbp, ic, listParams);
298
299     nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
300     nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
301
302     if (ic.vdwtype == VanDerWaalsType::Pme)
303     {
304         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
305         {
306             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
307                        "Combination rule mismatch!");
308         }
309         else
310         {
311             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
312                        "Combination rule mismatch!");
313         }
314     }
315
316     /* generate table for PME */
317     if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
318     {
319         GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
320         init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
321     }
322     else
323     {
324         // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
325         allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
326     }
327
328     /* set up LJ parameter lookup table */
329     if (!useLjCombRule(nbp->vdwType))
330     {
331         static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
332                       "Mismatch in the size of host / device data types");
333         initParamLookupTable(&nbp->nbfp,
334                              &nbp->nbfp_texobj,
335                              reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
336                              numTypes * numTypes,
337                              deviceContext);
338     }
339     else
340     {
341         // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
342         allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
343     }
344
345     /* set up LJ-PME parameter lookup table */
346     if (ic.vdwtype == VanDerWaalsType::Pme)
347     {
348         static_assert(sizeof(decltype(nbp->nbfp_comb))
349                               == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
350                       "Mismatch in the size of host / device data types");
351         initParamLookupTable(&nbp->nbfp_comb,
352                              &nbp->nbfp_comb_texobj,
353                              reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
354                              numTypes,
355                              deviceContext);
356     }
357     else
358     {
359         // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
360         allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
361     }
362 }
363
364 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
365                    const interaction_const_t*      ic,
366                    const PairlistParams&           listParams,
367                    const nbnxn_atomdata_t*         nbat,
368                    const bool                      bLocalAndNonlocal)
369 {
370     auto* nb                              = new NbnxmGpu();
371     nb->deviceContext_                    = &deviceStreamManager.context();
372     nb->atdat                             = new NBAtomData;
373     nb->nbparam                           = new NBParamGpu;
374     nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
375     if (bLocalAndNonlocal)
376     {
377         nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
378     }
379
380     nb->bUseTwoStreams = bLocalAndNonlocal;
381
382     nb->timers = new Nbnxm::GpuTimers();
383     snew(nb->timings, 1);
384
385     /* WARNING: CUDA timings are incorrect with multiple streams.
386      * This is the main reason why they are disabled by default.
387      * They can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
388      * TODO: Consider turning them on by default when we can detect the number of streams.
389      *
390      * OpenCL timing is enabled by default and can be disabled with the
391      * GMX_DISABLE_GPU_TIMING environment variable.
392      *
393      * Timing is disabled in SYCL.
394      */
395     nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
396                   || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
397
398     if (nb->bDoTime)
399     {
400         init_timings(nb->timings);
401     }
402
403     /* init nbst */
404     pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
405     pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
406     pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
407
408     init_plist(nb->plist[InteractionLocality::Local]);
409
410     /* local/non-local GPU streams */
411     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
412                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
413     const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
414     nb->deviceStreams[InteractionLocality::Local] = &localStream;
415     // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
416     // out-of-order. But for the time being, it will be less disruptive to keep them.
417     if (nb->bUseTwoStreams)
418     {
419         init_plist(nb->plist[InteractionLocality::NonLocal]);
420
421         GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
422                            "Non-local non-bonded stream should be initialized to use GPU for "
423                            "non-bonded with domain decomposition.");
424         nb->deviceStreams[InteractionLocality::NonLocal] =
425                 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
426     }
427
428     const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
429     const DeviceContext&            deviceContext = *nb->deviceContext_;
430
431     initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
432     initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
433
434     gpu_init_platform_specific(nb);
435
436     if (debug)
437     {
438         fprintf(debug, "Initialized NBNXM GPU data structures.\n");
439     }
440
441     return nb;
442 }
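
/* A rough call-ordering sketch (hypothetical driver code; variable names are
 * placeholders and the real call sites live in the nonbonded_verlet_t machinery):
 *
 *     NbnxmGpu* nb = gpu_init(deviceStreamManager, &ic, listParams, nbat, haveNonlocalWork);
 *     // at every pair-search step:
 *     gpu_init_pairlist(nb, h_plist, InteractionLocality::Local);
 *     gpu_init_atomdata(nb, nbat);
 *     gpu_upload_shiftvec(nb, nbat);
 *     // at every MD step:
 *     gpu_copy_xq_to_gpu(nb, nbat, AtomLocality::Local);
 *     // (the actual kernel launches are defined elsewhere)
 *     gpu_launch_cpyback(nb, nbat, stepWork, AtomLocality::Local);
 *     gpu_clear_outputs(nb, stepWork.computeVirial);
 */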
443
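/* Note: with a static box the shift vectors are uploaded only once (guarded by
 * shiftVecUploaded); with a dynamic box they are re-uploaded on every call. The
 * buffer holds SHIFTS elements, matching the allocation in initAtomdataFirst().
 */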
444 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
445 {
446     NBAtomData*         adat        = nb->atdat;
447     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
448
449     /* Upload only if we have a dynamic box or the shift vectors have not been uploaded yet */
450     if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
451     {
452         copyToDeviceBuffer(&adat->shiftVec,
453                            gmx::asGenericFloat3Pointer(nbatom->shift_vec),
454                            0,
455                            SHIFTS,
456                            localStream,
457                            GpuApiCallBehavior::Async,
458                            nullptr);
459         adat->shiftVecUploaded = true;
460     }
461 }
462
463 //! This function is documented in the header file
464 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
465 {
466     char sbuf[STRLEN];
467     // Timing accumulation should happen only if there was work to do,
468     // because getLastRangeTime() gets skipped later with empty lists,
469     // which leads to the counter not being reset.
470     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
471     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
472     gpu_plist*          d_plist      = nb->plist[iloc];
473
474     if (d_plist->na_c < 0)
475     {
476         d_plist->na_c = h_plist->na_ci;
477     }
478     else
479     {
480         if (d_plist->na_c != h_plist->na_ci)
481         {
482             sprintf(sbuf,
483                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
484                     d_plist->na_c,
485                     h_plist->na_ci);
486             gmx_incons(sbuf);
487         }
488     }
489
490     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
491
492     if (bDoTime)
493     {
494         iTimers.pl_h2d.openTimingRegion(deviceStream);
495         iTimers.didPairlistH2D = true;
496     }
497
498     // TODO most of this function is the same in CUDA and OpenCL, move it into the header
499     const DeviceContext& deviceContext = *nb->deviceContext_;
500
501     reallocateDeviceBuffer(
502             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
503     copyToDeviceBuffer(&d_plist->sci,
504                        h_plist->sci.data(),
505                        0,
506                        h_plist->sci.size(),
507                        deviceStream,
508                        GpuApiCallBehavior::Async,
509                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
510
511     reallocateDeviceBuffer(
512             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
513     copyToDeviceBuffer(&d_plist->cj4,
514                        h_plist->cj4.data(),
515                        0,
516                        h_plist->cj4.size(),
517                        deviceStream,
518                        GpuApiCallBehavior::Async,
519                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
520
521     reallocateDeviceBuffer(&d_plist->imask,
522                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
523                            &d_plist->nimask,
524                            &d_plist->imask_nalloc,
525                            deviceContext);
526
527     reallocateDeviceBuffer(
528             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
529     copyToDeviceBuffer(&d_plist->excl,
530                        h_plist->excl.data(),
531                        0,
532                        h_plist->excl.size(),
533                        deviceStream,
534                        GpuApiCallBehavior::Async,
535                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
536
537     if (bDoTime)
538     {
539         iTimers.pl_h2d.closeTimingRegion(deviceStream);
540     }
541
542     /* need to prune the pair list during the next step */
543     d_plist->haveFreshList = true;
544 }
545
546 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
547 {
548     bool                 bDoTime       = nb->bDoTime;
549     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
550     NBAtomData*          atdat         = nb->atdat;
551     const DeviceContext& deviceContext = *nb->deviceContext_;
552     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
553
554     int  numAtoms  = nbat->numAtoms();
555     bool realloced = false;
556
557     if (bDoTime)
558     {
559         /* time async copy */
560         timers->atdat.openTimingRegion(localStream);
561     }
562
563     /* We need to reallocate if we have to copy more atoms than there is space for;
564        the old buffers are freed first only if already initialized, i.e. atdat->numAtomsAlloc != -1 */
565     if (numAtoms > atdat->numAtomsAlloc)
566     {
567         int numAlloc = over_alloc_small(numAtoms);
568
569         /* free up first if the arrays have already been initialized */
570         if (atdat->numAtomsAlloc != -1)
571         {
572             freeDeviceBuffer(&atdat->f);
573             freeDeviceBuffer(&atdat->xq);
574             if (useLjCombRule(nb->nbparam->vdwType))
575             {
576                 freeDeviceBuffer(&atdat->ljComb);
577             }
578             else
579             {
580                 freeDeviceBuffer(&atdat->atomTypes);
581             }
582         }
583
584
585         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
586         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
587
588         if (useLjCombRule(nb->nbparam->vdwType))
589         {
590             // Two Lennard-Jones parameters per atom
591             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
592         }
593         else
594         {
595             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
596         }
597
598         atdat->numAtomsAlloc = numAlloc;
599         realloced            = true;
600     }
601
602     atdat->numAtoms      = numAtoms;
603     atdat->numAtomsLocal = nbat->natoms_local;
604
605     /* need to clear GPU f output if realloc happened */
606     if (realloced)
607     {
608         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
609     }
610
611     if (useLjCombRule(nb->nbparam->vdwType))
612     {
613         static_assert(
614                 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
615                 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
616         copyToDeviceBuffer(&atdat->ljComb,
617                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
618                            0,
619                            numAtoms,
620                            localStream,
621                            GpuApiCallBehavior::Async,
622                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
623     }
624     else
625     {
626         static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
627                       "Sizes of host- and device-side atom types should be the same.");
628         copyToDeviceBuffer(&atdat->atomTypes,
629                            nbat->params().type.data(),
630                            0,
631                            numAtoms,
632                            localStream,
633                            GpuApiCallBehavior::Async,
634                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
635     }
636
637     if (bDoTime)
638     {
639         timers->atdat.closeTimingRegion(localStream);
640     }
641
642     /* kick off the tasks enqueued above to ensure concurrency with the search */
643     issueClFlushInStream(localStream);
644 }
645
646 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
647 {
648     NBAtomData*         adat        = nb->atdat;
649     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
650     // Clear forces
651     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
652     // Clear shift force array and energies if the outputs were used in the current step
653     if (computeVirial)
654     {
655         clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
656         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
657         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
658     }
659     issueClFlushInStream(localStream);
660 }
661
662 //! This function is documented in the header file
663 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
664 {
665     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
666 }
667
668 //! This function is documented in the header file
669 void gpu_reset_timings(nonbonded_verlet_t* nbv)
670 {
671     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
672     {
673         init_timings(nbv->gpu_nbv->timings);
674     }
675 }
676
677 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
678 {
679     return ((nb->nbparam->elecType == ElecType::EwaldAna)
680             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
681 }
682
683 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
684                                                    const DeviceInformation&   deviceInfo)
685 {
686     if (ic.eeltype == CoulombInteractionType::Cut)
687     {
688         return ElecType::Cut;
689     }
690     else if (EEL_RF(ic.eeltype))
691     {
692         return ElecType::RF;
693     }
694     else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
695     {
696         return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
697     }
698     else
699     {
700         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
701         GMX_THROW(gmx::InconsistentInputError(
702                 gmx::formatString("The requested electrostatics type %s is not implemented in "
703                                   "the GPU accelerated kernels!",
704                                   enumValueToString(ic.eeltype))));
705     }
706 }
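
/* Summary: CoulombInteractionType::Cut maps to ElecType::Cut, reaction-field types
 * map to ElecType::RF, and all Ewald/PME types are forwarded to
 * nbnxn_gpu_pick_ewald_kernel_type() above, which chooses between the analytical or
 * tabulated and the single or twin cut-off flavours.
 */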
707
708
709 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
710 {
711     if (ic.vdwtype == VanDerWaalsType::Cut)
712     {
713         switch (ic.vdw_modifier)
714         {
715             case InteractionModifiers::None:
716             case InteractionModifiers::PotShift:
717                 switch (ljCombinationRule)
718                 {
719                     case LJCombinationRule::None: return VdwType::Cut;
720                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
721                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
722                     default:
723                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
724                                 "The requested LJ combination rule %s is not implemented in "
725                                 "the GPU accelerated kernels!",
726                                 enumValueToString(ljCombinationRule))));
727                 }
728             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
729             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
730             default:
731                 GMX_THROW(gmx::InconsistentInputError(
732                         gmx::formatString("The requested VdW interaction modifier %s is not "
733                                           "implemented in the GPU accelerated kernels!",
734                                           enumValueToString(ic.vdw_modifier))));
735         }
736     }
737     else if (ic.vdwtype == VanDerWaalsType::Pme)
738     {
739         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
740         {
741             assert(ljCombinationRule == LJCombinationRule::Geometric);
742             return VdwType::EwaldGeom;
743         }
744         else
745         {
746             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
747             return VdwType::EwaldLB;
748         }
749     }
750     else
751     {
752         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
753                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
754                 enumValueToString(ic.vdwtype))));
755     }
756 }
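
/* Summary of the mapping above:
 *
 *   vdwtype   vdw_modifier       combination rule              VdwType
 *   Cut       None/PotShift      none                          Cut
 *   Cut       None/PotShift      geometric                     CutCombGeom
 *   Cut       None/PotShift      Lorentz-Berthelot             CutCombLB
 *   Cut       ForceSwitch        (any)                         FSwitch
 *   Cut       PotSwitch          (any)                         PSwitch
 *   Pme       (any)              LJ-PME geometric              EwaldGeom
 *   Pme       (any)              LJ-PME Lorentz-Berthelot      EwaldLB
 */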
757
758 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
759 {
760     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
761
762     // There is short-range work if the pair list for the provided
763     // interaction locality contains entries or if there is any
764     // bonded work (as this is not split into local/nonlocal).
765     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
766                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
767 }
768
769 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
770 {
771     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
772
773     return nb->haveWork[interactionLocality];
774 }
775
776 /*! \brief
777  * Asynchronously launch the download of nonbonded forces from the GPU
778  * (and of energies/shift forces if required).
779  */
780 void gpu_launch_cpyback(NbnxmGpu*                nb,
781                         struct nbnxn_atomdata_t* nbatom,
782                         const gmx::StepWorkload& stepWork,
783                         const AtomLocality       atomLocality)
784 {
785     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
786
787     /* determine interaction locality from atom locality */
788     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
789     GMX_ASSERT(iloc == InteractionLocality::Local
790                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
791                "Non-local stream is indicating that the copy back event is enqueued at the "
792                "beginning of the copy back function.");
793
794     /* extract the data */
795     NBAtomData*         adat         = nb->atdat;
796     Nbnxm::GpuTimers*   timers       = nb->timers;
797     bool                bDoTime      = nb->bDoTime;
798     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
799
800     /* don't launch non-local copy-back if there was no non-local work to do */
801     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
802     {
803         /* TODO An alternative way to signal that non-local work is
804            complete is to use a clEnqueueMarker+clEnqueueBarrier
805            pair. However, the use of bNonLocalStreamDoneMarked has the
806            advantage of being local to the host, so probably minimizes
807            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
808            test case, overall simulation performance was higher with
809            the API calls, but this has not been tested on AMD OpenCL,
810            so could be worth considering in future. */
811         nb->bNonLocalStreamDoneMarked = false;
812         return;
813     }
814
815     /* local/nonlocal offset and length used for xq and f */
816     auto atomsRange = getGpuAtomRange(adat, atomLocality);
817
818     /* beginning of timed D2H section */
819     if (bDoTime)
820     {
821         timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
822     }
823
824     /* With DD the local D2H transfer can only start after the non-local
825        has been launched. */
826     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
827     {
828         nb->nonlocal_done.enqueueWaitEvent(deviceStream);
829         nb->bNonLocalStreamDoneMarked = false;
830     }
831
832     /* DtoH f */
833     static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
834                   "The host force buffer should be in single precision to match device data size.");
835     copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
836                          &adat->f,
837                          atomsRange.begin(),
838                          atomsRange.size(),
839                          deviceStream,
840                          GpuApiCallBehavior::Async,
841                          bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
842
843     issueClFlushInStream(deviceStream);
844
845     /* After the non-local D2H is launched the nonlocal_done event can be
846        recorded, which signals that the local D2H can proceed. This event is not
847        placed after the non-local kernel because we need the non-local
848        data back first. */
849     if (iloc == InteractionLocality::NonLocal)
850     {
851         nb->nonlocal_done.markEvent(deviceStream);
852         nb->bNonLocalStreamDoneMarked = true;
853     }
854
855     /* only transfer energies in the local stream */
856     if (iloc == InteractionLocality::Local)
857     {
858         /* DtoH fshift when virial is needed */
859         if (stepWork.computeVirial)
860         {
861             static_assert(
862                     sizeof(*nb->nbst.fShift) == sizeof(Float3),
863                     "Sizes of host- and device-side shift vector elements should be the same.");
864             copyFromDeviceBuffer(nb->nbst.fShift,
865                                  &adat->fShift,
866                                  0,
867                                  SHIFTS,
868                                  deviceStream,
869                                  GpuApiCallBehavior::Async,
870                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
871         }
872
873         /* DtoH energies */
874         if (stepWork.computeEnergy)
875         {
876             static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
877                           "Sizes of host- and device-side LJ energy terms should be the same.");
878             copyFromDeviceBuffer(nb->nbst.eLJ,
879                                  &adat->eLJ,
880                                  0,
881                                  1,
882                                  deviceStream,
883                                  GpuApiCallBehavior::Async,
884                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
885             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
886                           "Sizes of host- and device-side electrostatic energy terms should be the "
887                           "same.");
888             copyFromDeviceBuffer(nb->nbst.eElec,
889                                  &adat->eElec,
890                                  0,
891                                  1,
892                                  deviceStream,
893                                  GpuApiCallBehavior::Async,
894                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
895         }
896     }
897
898     if (bDoTime)
899     {
900         timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
901     }
902 }
903
904 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
905 {
906     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
907
908     /* When we get here all misc operations issued in the local stream as well as
909        the local xq H2D are done,
910        so we record that in the local stream and wait for it in the nonlocal one.
911        This wait needs to precede any PP tasks, bonded or nonbonded, that may
912        compute on interactions between local and nonlocal atoms.
913      */
914     if (nb->bUseTwoStreams)
915     {
916         if (interactionLocality == InteractionLocality::Local)
917         {
918             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
919             issueClFlushInStream(deviceStream);
920         }
921         else
922         {
923             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
924         }
925     }
926 }
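
/* Typical pairing (as done at the end of nbnxn_gpu_init_x_to_nbat_x() below): one
 * call with the Local locality records the event in the local stream, and one call
 * with the NonLocal locality makes the non-local stream wait on it:
 *
 *     nbnxnInsertNonlocalGpuDependency(nb, InteractionLocality::Local);    // markEvent
 *     nbnxnInsertNonlocalGpuDependency(nb, InteractionLocality::NonLocal); // enqueueWaitEvent
 */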
927
928 /*! \brief Asynchronously launch the xq buffer host-to-device copy. */
929 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
930 {
931     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
932
933     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
934
935     NBAtomData*         adat         = nb->atdat;
936     gpu_plist*          plist        = nb->plist[iloc];
937     Nbnxm::GpuTimers*   timers       = nb->timers;
938     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
939
940     const bool bDoTime = nb->bDoTime;
941
942     /* Don't launch the non-local H2D copy if there is no dependent
943        work to do: neither non-local nor other (e.g. bonded) work
944        that has the nbnxn coordinates as input.
945        Doing the same for the local kernel is more complicated, since the
946        local part of the force array also depends on the non-local kernel.
947        So to avoid complicating the code and to reduce the risk of bugs,
948        we always call the local x+q copy (and the rest of the local
949        work) in nbnxn_gpu_launch_kernel().
950      */
951     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
952     {
953         plist->haveFreshList = false;
954
955         // The event is marked for Local interactions unconditionally,
956         // so it has to be released here because of the early return
957         // for NonLocal interactions.
958         nb->misc_ops_and_local_H2D_done.reset();
959
960         return;
961     }
962
963     /* local/nonlocal offset and length used for xq and f */
964     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
965
966     /* beginning of timed HtoD section */
967     if (bDoTime)
968     {
969         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
970     }
971
972     /* HtoD x, q */
973     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
974                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
975     copyToDeviceBuffer(&adat->xq,
976                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
977                        atomsRange.begin(),
978                        atomsRange.size(),
979                        deviceStream,
980                        GpuApiCallBehavior::Async,
981                        nullptr);
982
983     if (bDoTime)
984     {
985         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
986     }
987
988     /* Record the event in the local stream (for Local) or wait on it in the non-local
989        stream (for NonLocal); see the comment in nbnxnInsertNonlocalGpuDependency() above. */
994     nbnxnInsertNonlocalGpuDependency(nb, iloc);
995 }
996
997
998 /* Initialization for X buffer operations on GPU. */
999 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
1000 {
1001     const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
1002     const bool          bDoTime       = gpu_nbv->bDoTime;
1003     const int           maxNumColumns = gridSet.numColumnsMax();
1004
1005     reallocateDeviceBuffer(&gpu_nbv->cxy_na,
1006                            maxNumColumns * gridSet.grids().size(),
1007                            &gpu_nbv->ncxy_na,
1008                            &gpu_nbv->ncxy_na_alloc,
1009                            *gpu_nbv->deviceContext_);
1010     reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
1011                            maxNumColumns * gridSet.grids().size(),
1012                            &gpu_nbv->ncxy_ind,
1013                            &gpu_nbv->ncxy_ind_alloc,
1014                            *gpu_nbv->deviceContext_);
1015
1016     for (unsigned int g = 0; g < gridSet.grids().size(); g++)
1017     {
1018         const Nbnxm::Grid& grid = gridSet.grids()[g];
1019
1020         const int  numColumns      = grid.numColumns();
1021         const int* atomIndices     = gridSet.atomIndices().data();
1022         const int  atomIndicesSize = gridSet.atomIndices().size();
1023         const int* cxy_na          = grid.cxy_na().data();
1024         const int* cxy_ind         = grid.cxy_ind().data();
1025
1026         auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
1027
1028         reallocateDeviceBuffer(&gpu_nbv->atomIndices,
1029                                atomIndicesSize,
1030                                &gpu_nbv->atomIndicesSize,
1031                                &gpu_nbv->atomIndicesSize_alloc,
1032                                *gpu_nbv->deviceContext_);
1033
1034         if (atomIndicesSize > 0)
1035         {
1036             if (bDoTime)
1037             {
1038                 timerH2D->openTimingRegion(localStream);
1039             }
1040
1041             copyToDeviceBuffer(&gpu_nbv->atomIndices,
1042                                atomIndices,
1043                                0,
1044                                atomIndicesSize,
1045                                localStream,
1046                                GpuApiCallBehavior::Async,
1047                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1048
1049             if (bDoTime)
1050             {
1051                 timerH2D->closeTimingRegion(localStream);
1052             }
1053         }
1054
1055         if (numColumns > 0)
1056         {
1057             if (bDoTime)
1058             {
1059                 timerH2D->openTimingRegion(localStream);
1060             }
1061
1062             copyToDeviceBuffer(&gpu_nbv->cxy_na,
1063                                cxy_na,
1064                                maxNumColumns * g,
1065                                numColumns,
1066                                localStream,
1067                                GpuApiCallBehavior::Async,
1068                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1069
1070             if (bDoTime)
1071             {
1072                 timerH2D->closeTimingRegion(localStream);
1073             }
1074
1075             if (bDoTime)
1076             {
1077                 timerH2D->openTimingRegion(localStream);
1078             }
1079
1080             copyToDeviceBuffer(&gpu_nbv->cxy_ind,
1081                                cxy_ind,
1082                                maxNumColumns * g,
1083                                numColumns,
1084                                localStream,
1085                                GpuApiCallBehavior::Async,
1086                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1087
1088             if (bDoTime)
1089             {
1090                 timerH2D->closeTimingRegion(localStream);
1091             }
1092         }
1093     }
1094
1095     // The above data is transferred on the local stream but is a
1096     // dependency of the nonlocal stream (specifically the nonlocal X
1097     // buf ops kernel).  We therefore set a dependency to ensure
1098     // that the nonlocal stream waits on the local stream here.
1099     // This call records an event in the local stream:
1100     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
1101     // ...and this call instructs the nonlocal stream to wait on that event:
1102     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1103 }
1104
1105 } // namespace Nbnxm