/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{
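
/*! \brief On OpenCL, flushes \p deviceStream so that events marked in it can be
 * waited on from other streams; no-op for the CUDA and SYCL backends.
 */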
inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
#if GMX_GPU_OPENCL
    /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
     * in the stream after marking an event in it in order to be able to sync with
     * the event from another stream.
     */
    cl_int cl_error = clFlush(deviceStream.stream());
    if (cl_error != CL_SUCCESS)
    {
        GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
    }
#else
    GMX_UNUSED_VALUE(deviceStream);
#endif
}
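
/*! \brief Initializes the device table used for the tabulated Ewald Coulomb
 * correction, destroying any previously set up table first.
 */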
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
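
/*! \brief Picks the Ewald GPU kernel flavor (analytical/tabulated, single/twin cut-off).
 *
 * The choice is based on the interaction settings and the device; for benchmarking
 * and development, the GMX_GPU_NB_ANA_EWALD, GMX_GPU_NB_TAB_EWALD, and
 * GMX_GPU_NB_EWALD_TWINCUT environment variables override the defaults.
 */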
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;
        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels if bTwinCut requests them or the env. var.
       forces it (use the latter for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}
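
/*! \brief Copies the cut-off, switching, and Ewald parameters from the host-side
 * interaction constants and pairlist parameters into the device parameter struct.
 */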
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic->ewaldcoeff_q;
    nbp->sh_ewald          = ic->sh_ewald;
    nbp->epsfac            = ic->epsfac;
    nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
    nbp->c_rf              = ic->reactionFieldShift;
    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}
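
/*! \brief Updates device parameters after PME load balancing changed the Coulomb
 * cut-off: re-sets the cut-off parameters, re-picks the Ewald kernel flavor, and
 * re-initializes the Ewald correction table.
 */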
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }

    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}
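
/*! \brief Initializes a device pairlist: null pointers for buffers not allocated
 * here and -1 sizes, marking the buffers for (re)allocation in gpu_init_pairlist.
 */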
void init_plist(gpu_plist* pl)
{
    /* initialize to nullptr pointers to data that is not allocated here and will
       need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c         = -1;
    pl->nsci         = -1;
    pl->sci_nalloc   = -1;
    pl->ncj4         = -1;
    pl->cj4_nalloc   = -1;
    pl->nimask       = -1;
    pl->imask_nalloc = -1;
    pl->nexcl        = -1;
    pl->excl_nalloc  = -1;

    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}
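
//! Zeroes all counters and timings of the GPU non-bonded timing data structure.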
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];

    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is same in CUDA and OpenCL, move into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}
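
/*! \brief Initializes the device atom data: (re)allocates the coordinate+charge,
 * force, and LJ-parameter or atom-type buffers as needed and copies the per-atom
 * parameters to the device, asynchronously in the local stream.
 */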
void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
{
    bool                 bDoTime       = nb->bDoTime;
    Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
    NBAtomData*          atdat         = nb->atdat;
    const DeviceContext& deviceContext = *nb->deviceContext_;
    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];

    int  numAtoms  = nbat->numAtoms();
    bool realloced = false;

    if (bDoTime)
    {
        /* time async copy */
        timers->atdat.openTimingRegion(localStream);
    }

    /* need to reallocate if we have to copy more atoms than the amount of space
       available and only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
    if (numAtoms > atdat->numAtomsAlloc)
    {
        int numAlloc = over_alloc_small(numAtoms);

        /* free up first if the arrays have already been initialized */
        if (atdat->numAtomsAlloc != -1)
        {
            freeDeviceBuffer(&atdat->f);
            freeDeviceBuffer(&atdat->xq);
            freeDeviceBuffer(&atdat->ljComb);
            freeDeviceBuffer(&atdat->atomTypes);
        }

        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);

        if (useLjCombRule(nb->nbparam->vdwType))
        {
            // Two Lennard-Jones parameters per atom
            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
        }
        else
        {
            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
        }

        atdat->numAtomsAlloc = numAlloc;
        realloced            = true;
    }

    atdat->numAtoms      = numAtoms;
    atdat->numAtomsLocal = nbat->natoms_local;

    /* need to clear GPU f output if realloc happened */
    if (realloced)
    {
        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
    }

    if (useLjCombRule(nb->nbparam->vdwType))
    {
        static_assert(
                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
        copyToDeviceBuffer(&atdat->ljComb,
                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }
    else
    {
        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
                      "Sizes of host- and device-side atom types should be the same.");
        copyToDeviceBuffer(&atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           numAtoms,
                           localStream,
                           GpuApiCallBehavior::Async,
                           bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
    }

    if (bDoTime)
    {
        timers->atdat.closeTimingRegion(localStream);
    }

    /* kick off the tasks enqueued above to ensure concurrency with the search */
    issueClFlushInStream(localStream);
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}
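
/*! \brief Maps the host-side electrostatics interaction type to the corresponding
 * GPU kernel flavor, throwing for types without a GPU implementation.
 */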
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic->eeltype == CoulombInteractionType::Cut)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic->eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s is not implemented in "
                                  "the GPU accelerated kernels!",
                                  enumValueToString(ic->eeltype))));
    }
}
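
/*! \brief Maps the host-side Van der Waals type, interaction modifier, and LJ
 * combination rule to the corresponding GPU kernel flavor, throwing for
 * combinations without a GPU implementation.
 */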
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
{
    if (ic->vdwtype == VanDerWaalsType::Cut)
    {
        switch (ic->vdw_modifier)
        {
            case InteractionModifiers::None:
            case InteractionModifiers::PotShift:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s is not "
                                          "implemented in the GPU accelerated kernels!",
                                          enumValueToString(ic->vdw_modifier))));
        }
    }
    else if (ic->vdwtype == VanDerWaalsType::Pme)
    {
        if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
                enumValueToString(ic->vdwtype))));
    }
}
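
/*! \brief Records whether there is short-range GPU work for the given interaction
 * locality: a non-empty pair list or any bonded GPU work (which is not split into
 * local/nonlocal).
 */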
void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    // There is short-range work if the pair list for the provided
    // interaction locality contains entries or if there is any
    // bonded work (as this is not split into local/nonlocal).
    nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
}
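
/*! \brief Returns whether there is GPU short-range work for the interaction
 * locality corresponding to the given atom locality.
 */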
bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
}
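
/*! \brief When two streams are in use, makes the nonlocal stream wait for the
 * local-stream work (misc. operations and the local xq H2D copy) to complete.
 */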
void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks, bonded
       or nonbonded, that may compute on interactions between local and nonlocal
       atoms.
     */
    if (nb->bUseTwoStreams)
    {
        if (interactionLocality == InteractionLocality::Local)
        {
            nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
            issueClFlushInStream(deviceStream);
        }
        else
        {
            nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
        }
    }
}

/*! \brief Launch asynchronously the xq buffer host to device copy. */
void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
{
    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);

    NBAtomData*         adat         = nb->atdat;
    gpu_plist*          plist        = nb->plist[iloc];
    Nbnxm::GpuTimers*   timers       = nb->timers;
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];

    const bool bDoTime = nb->bDoTime;

    /* Don't launch the non-local H2D copy if there is no dependent
       work to do: neither non-local nor other (e.g. bonded) work
       to do that has as input the nbnxn coordinates.
       Doing the same for the local kernel is more complicated, since the
       local part of the force array also depends on the non-local kernel.
       So to avoid complicating the code and to reduce the risk of bugs,
       we always call the local x+q copy (and the rest of the local
       work in nbnxn_gpu_launch_kernel()).
     */
    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
    {
        plist->haveFreshList = false;

        // The event is marked for Local interactions unconditionally,
        // so it has to be released here because of the early return
        // for NonLocal interactions.
        nb->misc_ops_and_local_H2D_done.reset();

        return;
    }

    /* local/nonlocal offset and length used for xq and f */
    const auto atomsRange = getGpuAtomRange(adat, atomLocality);

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
    }

    /* HtoD x, q */
    GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
               "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
    copyToDeviceBuffer(&adat->xq,
                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
                       atomsRange.begin(),
                       atomsRange.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       nullptr);

    if (bDoTime)
    {
        timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
    }

    /* When we get here all misc operations issued in the local stream as well as
       the local xq H2D are done, so we record that in the local stream and wait
       for it in the nonlocal one. This wait needs to precede any PP tasks, bonded
       or nonbonded, that may compute on interactions between local and nonlocal
       atoms.
     */
    nbnxnInsertNonlocalGpuDependency(nb, iloc);
}

} // namespace Nbnxm