src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  *  \ingroup module_nbnxm
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "config.h"
  50
  51 #if GMX_GPU_CUDA
  52 #    include "cuda/nbnxm_cuda_types.h"
  53 #endif
  54
  55 #if GMX_GPU_OPENCL
  56 #    include "opencl/nbnxm_ocl_types.h"
  57 #endif
  58
  59 #if GMX_GPU_SYCL
  60 #    include "sycl/nbnxm_sycl_types.h"
  61 #endif
  62
  63 #include "nbnxm_gpu_data_mgmt.h"
  64
  65 #include "gromacs/hardware/device_information.h"
  66 #include "gromacs/mdtypes/interaction_const.h"
  67 #include "gromacs/mdtypes/simulation_workload.h"
  68 #include "gromacs/nbnxm/gpu_common_utils.h"
  69 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  70 #include "gromacs/pbcutil/ishift.h"
  71 #include "gromacs/timing/gpu_timing.h"
  72 #include "gromacs/utility/cstringutil.h"
  73 #include "gromacs/utility/exceptions.h"
  74 #include "gromacs/utility/fatalerror.h"
  75
  76 #include "nbnxm_gpu.h"
  77 #include "pairlistsets.h"
  78
  79 namespace Nbnxm
  80 {
  81
  82 inline void issueClFlushInStream(const DeviceStream& deviceStream)
  83 {
  84 #if GMX_GPU_OPENCL
  85     /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
  86      * in the stream after marking an event in it in order to be able to sync with
  87      * the event from another stream.
  88      */
  89     cl_int cl_error = clFlush(deviceStream.stream());
  90     if (cl_error != CL_SUCCESS)
  91     {
  92         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
  93     }
  94 #else
  95     GMX_UNUSED_VALUE(deviceStream);
  96 #endif
  97 }
  98
  99 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
 100                                     NBParamGpu*                  nbp,
 101                                     const DeviceContext&         deviceContext)
 102 {
 103     if (nbp->coulomb_tab)
 104     {
 105         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
 106     }
 107
 108     nbp->coulomb_tab_scale = tables.scale;
 109     initParamLookupTable(
 110             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
 111 }
 112
 113 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
 114                                                const DeviceInformation gmx_unused& deviceInfo)
 115 {
 116     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
 117
 118     /* Benchmarking/development environment variables to force the use of
 119        analytical or tabulated Ewald kernel. */
 120     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
 121     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
 122     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
 123
 124     if (forceAnalyticalEwald && forceTabulatedEwald)
 125     {
 126         gmx_incons(
 127                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
 128                 "requested through environment variables.");
 129     }
 130
 131     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
 132      */
 133     const bool c_useTabulatedEwaldDefault =
 134 #if GMX_GPU_CUDA
 135             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
 136             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
 137 #else
 138             false;
 139 #endif
 140     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
 141     if (forceAnalyticalEwald)
 142     {
 143         bUseAnalyticalEwald = true;
 144         if (debug)
 145         {
 146             fprintf(debug, "Using analytical Ewald GPU kernels\n");
 147         }
 148     }
 149     else if (forceTabulatedEwald)
 150     {
 151         bUseAnalyticalEwald = false;
 152
 153         if (debug)
 154         {
 155             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
 156         }
 157     }
 158
 159     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 160        forces it (use it for debugging/benchmarking only). */
 161     if (!bTwinCut && !forceTwinCutoffEwald)
 162     {
 163         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
 164     }
 165     else
 166     {
 167         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
 168     }
 169 }
 170
 171 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
 172 {
 173     nbp->ewald_beta        = ic->ewaldcoeff_q;
 174     nbp->sh_ewald          = ic->sh_ewald;
 175     nbp->epsfac            = ic->epsfac;
 176     nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
 177     nbp->c_rf              = ic->reactionFieldShift;
 178     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 179     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 180     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
 181     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
 182     nbp->useDynamicPruning = listParams.useDynamicPruning;
 183
 184     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
 185     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
 186
 187     nbp->rvdw_switch      = ic->rvdw_switch;
 188     nbp->dispersion_shift = ic->dispersion_shift;
 189     nbp->repulsion_shift  = ic->repulsion_shift;
 190     nbp->vdw_switch       = ic->vdw_switch;
 191 }
 192
 193 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
 194 {
 195     if (!nbv || !nbv->useGpu())
 196     {
 197         return;
 198     }
 199     NbnxmGpu*   nb  = nbv->gpu_nbv;
 200     NBParamGpu* nbp = nb->nbparam;
 201
 202     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 203
 204     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
 205
 206     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 207     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 208 }
 209
 210 void init_plist(gpu_plist* pl)
 211 {
 212     /* initialize to nullptr pointers to data that is not allocated here and will
 213        need reallocation in nbnxn_gpu_init_pairlist */
 214     pl->sci   = nullptr;
 215     pl->cj4   = nullptr;
 216     pl->imask = nullptr;
 217     pl->excl  = nullptr;
 218
 219     /* size -1 indicates that the respective array hasn't been initialized yet */
 220     pl->na_c                   = -1;
 221     pl->nsci                   = -1;
 222     pl->sci_nalloc             = -1;
 223     pl->ncj4                   = -1;
 224     pl->cj4_nalloc             = -1;
 225     pl->nimask                 = -1;
 226     pl->imask_nalloc           = -1;
 227     pl->nexcl                  = -1;
 228     pl->excl_nalloc            = -1;
 229     pl->haveFreshList          = false;
 230     pl->rollingPruningNumParts = 0;
 231     pl->rollingPruningPart     = 0;
 232 }
 233
 234 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 235 {
 236     t->nb_h2d_t = 0.0;
 237     t->nb_d2h_t = 0.0;
 238     t->nb_c     = 0;
 239     t->pl_h2d_t = 0.0;
 240     t->pl_h2d_c = 0;
 241     for (int i = 0; i < 2; i++)
 242     {
 243         for (int j = 0; j < 2; j++)
 244         {
 245             t->ktime[i][j].t = 0.0;
 246             t->ktime[i][j].c = 0;
 247         }
 248     }
 249     t->pruneTime.c        = 0;
 250     t->pruneTime.t        = 0.0;
 251     t->dynamicPruneTime.c = 0;
 252     t->dynamicPruneTime.t = 0.0;
 253 }
 254
 255 //! This function is documented in the header file
 256 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 257 {
 258     char sbuf[STRLEN];
 259     // Timing accumulation should happen only if there was work to do
 260     // because getLastRangeTime() gets skipped with empty lists later
 261     // which leads to the counter not being reset.
 262     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
 263     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 264     gpu_plist*          d_plist      = nb->plist[iloc];
 265
 266     if (d_plist->na_c < 0)
 267     {
 268         d_plist->na_c = h_plist->na_ci;
 269     }
 270     else
 271     {
 272         if (d_plist->na_c != h_plist->na_ci)
 273         {
 274             sprintf(sbuf,
 275                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
 276                     d_plist->na_c,
 277                     h_plist->na_ci);
 278             gmx_incons(sbuf);
 279         }
 280     }
 281
 282     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
 283
 284     if (bDoTime)
 285     {
 286         iTimers.pl_h2d.openTimingRegion(deviceStream);
 287         iTimers.didPairlistH2D = true;
 288     }
 289
 290     // TODO most of this function is same in CUDA and OpenCL, move into the header
 291     const DeviceContext& deviceContext = *nb->deviceContext_;
 292
 293     reallocateDeviceBuffer(
 294             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
 295     copyToDeviceBuffer(&d_plist->sci,
 296                        h_plist->sci.data(),
 297                        0,
 298                        h_plist->sci.size(),
 299                        deviceStream,
 300                        GpuApiCallBehavior::Async,
 301                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 302
 303     reallocateDeviceBuffer(
 304             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
 305     copyToDeviceBuffer(&d_plist->cj4,
 306                        h_plist->cj4.data(),
 307                        0,
 308                        h_plist->cj4.size(),
 309                        deviceStream,
 310                        GpuApiCallBehavior::Async,
 311                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 312
 313     reallocateDeviceBuffer(&d_plist->imask,
 314                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
 315                            &d_plist->nimask,
 316                            &d_plist->imask_nalloc,
 317                            deviceContext);
 318
 319     reallocateDeviceBuffer(
 320             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
 321     copyToDeviceBuffer(&d_plist->excl,
 322                        h_plist->excl.data(),
 323                        0,
 324                        h_plist->excl.size(),
 325                        deviceStream,
 326                        GpuApiCallBehavior::Async,
 327                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 328
 329     if (bDoTime)
 330     {
 331         iTimers.pl_h2d.closeTimingRegion(deviceStream);
 332     }
 333
 334     /* need to prune the pair list during the next step */
 335     d_plist->haveFreshList = true;
 336 }
 337
 338 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 339 {
 340     bool                 bDoTime       = nb->bDoTime;
 341     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
 342     NBAtomData*          atdat         = nb->atdat;
 343     const DeviceContext& deviceContext = *nb->deviceContext_;
 344     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
 345
 346     int  numAtoms  = nbat->numAtoms();
 347     bool realloced = false;
 348
 349     if (bDoTime)
 350     {
 351         /* time async copy */
 352         timers->atdat.openTimingRegion(localStream);
 353     }
 354
 355     /* need to reallocate if we have to copy more atoms than the amount of space
 356        available and only allocate if we haven't initialized yet, i.e atdat->natoms == -1 */
 357     if (numAtoms > atdat->numAtomsAlloc)
 358     {
 359         int numAlloc = over_alloc_small(numAtoms);
 360
 361         /* free up first if the arrays have already been initialized */
 362         if (atdat->numAtomsAlloc != -1)
 363         {
 364             freeDeviceBuffer(&atdat->f);
 365             freeDeviceBuffer(&atdat->xq);
 366             freeDeviceBuffer(&atdat->ljComb);
 367             freeDeviceBuffer(&atdat->atomTypes);
 368         }
 369
 370
 371         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
 372         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
 373
 374         if (useLjCombRule(nb->nbparam->vdwType))
 375         {
 376             // Two Lennard-Jones parameters per atom
 377             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
 378         }
 379         else
 380         {
 381             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
 382         }
 383
 384         atdat->numAtomsAlloc = numAlloc;
 385         realloced            = true;
 386     }
 387
 388     atdat->numAtoms      = numAtoms;
 389     atdat->numAtomsLocal = nbat->natoms_local;
 390
 391     /* need to clear GPU f output if realloc happened */
 392     if (realloced)
 393     {
 394         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
 395     }
 396
 397     if (useLjCombRule(nb->nbparam->vdwType))
 398     {
 399         static_assert(
 400                 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
 401                 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
 402         copyToDeviceBuffer(&atdat->ljComb,
 403                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
 404                            0,
 405                            numAtoms,
 406                            localStream,
 407                            GpuApiCallBehavior::Async,
 408                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 409     }
 410     else
 411     {
 412         static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
 413                       "Sizes of host- and device-side atom types should be the same.");
 414         copyToDeviceBuffer(&atdat->atomTypes,
 415                            nbat->params().type.data(),
 416                            0,
 417                            numAtoms,
 418                            localStream,
 419                            GpuApiCallBehavior::Async,
 420                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 421     }
 422
 423     if (bDoTime)
 424     {
 425         timers->atdat.closeTimingRegion(localStream);
 426     }
 427
 428     /* kick off the tasks enqueued above to ensure concurrency with the search */
 429     issueClFlushInStream(localStream);
 430 }
 431
 432 //! This function is documented in the header file
 433 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 434 {
 435     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 436 }
 437
 438 //! This function is documented in the header file
 439 void gpu_reset_timings(nonbonded_verlet_t* nbv)
 440 {
 441     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
 442     {
 443         init_timings(nbv->gpu_nbv->timings);
 444     }
 445 }
 446
 447 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
 448 {
 449     return ((nb->nbparam->elecType == ElecType::EwaldAna)
 450             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
 451 }
 452
 453 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
 454                                                    const DeviceInformation&   deviceInfo)
 455 {
 456     if (ic->eeltype == CoulombInteractionType::Cut)
 457     {
 458         return ElecType::Cut;
 459     }
 460     else if (EEL_RF(ic->eeltype))
 461     {
 462         return ElecType::RF;
 463     }
 464     else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
 465     {
 466         return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
 467     }
 468     else
 469     {
 470         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 471         GMX_THROW(gmx::InconsistentInputError(
 472                 gmx::formatString("The requested electrostatics type %s is not implemented in "
 473                                   "the GPU accelerated kernels!",
 474                                   enumValueToString(ic->eeltype))));
 475     }
 476 }
 477
 478
 479 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
 480 {
 481     if (ic->vdwtype == VanDerWaalsType::Cut)
 482     {
 483         switch (ic->vdw_modifier)
 484         {
 485             case InteractionModifiers::None:
 486             case InteractionModifiers::PotShift:
 487                 switch (ljCombinationRule)
 488                 {
 489                     case LJCombinationRule::None: return VdwType::Cut;
 490                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
 491                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
 492                     default:
 493                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 494                                 "The requested LJ combination rule %s is not implemented in "
 495                                 "the GPU accelerated kernels!",
 496                                 enumValueToString(ljCombinationRule))));
 497                 }
 498             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
 499             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
 500             default:
 501                 GMX_THROW(gmx::InconsistentInputError(
 502                         gmx::formatString("The requested VdW interaction modifier %s is not "
 503                                           "implemented in the GPU accelerated kernels!",
 504                                           enumValueToString(ic->vdw_modifier))));
 505         }
 506     }
 507     else if (ic->vdwtype == VanDerWaalsType::Pme)
 508     {
 509         if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
 510         {
 511             assert(ljCombinationRule == LJCombinationRule::Geometric);
 512             return VdwType::EwaldGeom;
 513         }
 514         else
 515         {
 516             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
 517             return VdwType::EwaldLB;
 518         }
 519     }
 520     else
 521     {
 522         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 523                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
 524                 enumValueToString(ic->vdwtype))));
 525     }
 526 }
 527
 528 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
 529 {
 530     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 531
 532     // There is short-range work if the pair list for the provided
 533     // interaction locality contains entries or if there is any
 534     // bonded work (as this is not split into local/nonlocal).
 535     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
 536                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 537 }
 538
 539 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
 540 {
 541     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 542
 543     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 544 }
 545
 546 /*! \brief
 547  * Launch asynchronously the download of nonbonded forces from the GPU
 548  * (and energies/shift forces if required).
 549  */
 550 void gpu_launch_cpyback(NbnxmGpu*                nb,
 551                         struct nbnxn_atomdata_t* nbatom,
 552                         const gmx::StepWorkload& stepWork,
 553                         const AtomLocality       atomLocality)
 554 {
 555     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 556
 557     /* determine interaction locality from atom locality */
 558     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 559     GMX_ASSERT(iloc == InteractionLocality::Local
 560                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
 561                "Non-local stream is indicating that the copy back event is enqueued at the "
 562                "beginning of the copy back function.");
 563
 564     /* extract the data */
 565     NBAtomData*         adat         = nb->atdat;
 566     Nbnxm::GpuTimers*   timers       = nb->timers;
 567     bool                bDoTime      = nb->bDoTime;
 568     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 569
 570     /* don't launch non-local copy-back if there was no non-local work to do */
 571     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 572     {
 573         /* TODO An alternative way to signal that non-local work is
 574            complete is to use a clEnqueueMarker+clEnqueueBarrier
 575            pair. However, the use of bNonLocalStreamDoneMarked has the
 576            advantage of being local to the host, so probably minimizes
 577            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
 578            test case, overall simulation performance was higher with
 579            the API calls, but this has not been tested on AMD OpenCL,
 580            so could be worth considering in future. */
 581         nb->bNonLocalStreamDoneMarked = false;
 582         return;
 583     }
 584
 585     /* local/nonlocal offset and length used for xq and f */
 586     auto atomsRange = getGpuAtomRange(adat, atomLocality);
 587
 588     /* beginning of timed D2H section */
 589     if (bDoTime)
 590     {
 591         timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
 592     }
 593
 594     /* With DD the local D2H transfer can only start after the non-local
 595        has been launched. */
 596     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
 597     {
 598         nb->nonlocal_done.enqueueWaitEvent(deviceStream);
 599         nb->bNonLocalStreamDoneMarked = false;
 600     }
 601
 602     /* DtoH f */
 603     static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
 604                   "The host force buffer should be in single precision to match device data size.");
 605     copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
 606                          &adat->f,
 607                          atomsRange.begin(),
 608                          atomsRange.size(),
 609                          deviceStream,
 610                          GpuApiCallBehavior::Async,
 611                          bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 612
 613     issueClFlushInStream(deviceStream);
 614
 615     /* After the non-local D2H is launched the nonlocal_done event can be
 616        recorded which signals that the local D2H can proceed. This event is not
 617        placed after the non-local kernel because we first need the non-local
 618        data back first. */
 619     if (iloc == InteractionLocality::NonLocal)
 620     {
 621         nb->nonlocal_done.markEvent(deviceStream);
 622         nb->bNonLocalStreamDoneMarked = true;
 623     }
 624
 625     /* only transfer energies in the local stream */
 626     if (iloc == InteractionLocality::Local)
 627     {
 628         /* DtoH fshift when virial is needed */
 629         if (stepWork.computeVirial)
 630         {
 631             static_assert(
 632                     sizeof(*nb->nbst.fShift) == sizeof(Float3),
 633                     "Sizes of host- and device-side shift vector elements should be the same.");
 634             copyFromDeviceBuffer(nb->nbst.fShift,
 635                                  &adat->fShift,
 636                                  0,
 637                                  SHIFTS,
 638                                  deviceStream,
 639                                  GpuApiCallBehavior::Async,
 640                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 641         }
 642
 643         /* DtoH energies */
 644         if (stepWork.computeEnergy)
 645         {
 646             static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
 647                           "Sizes of host- and device-side LJ energy terms should be the same.");
 648             copyFromDeviceBuffer(nb->nbst.eLJ,
 649                                  &adat->eLJ,
 650                                  0,
 651                                  1,
 652                                  deviceStream,
 653                                  GpuApiCallBehavior::Async,
 654                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 655             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
 656                           "Sizes of host- and device-side electrostatic energy terms should be the "
 657                           "same.");
 658             copyFromDeviceBuffer(nb->nbst.eElec,
 659                                  &adat->eElec,
 660                                  0,
 661                                  1,
 662                                  deviceStream,
 663                                  GpuApiCallBehavior::Async,
 664                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 665         }
 666     }
 667
 668     if (bDoTime)
 669     {
 670         timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
 671     }
 672 }
 673
 674 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 675 {
 676     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 677
 678     /* When we get here all misc operations issued in the local stream as well as
 679        the local xq H2D are done,
 680        so we record that in the local stream and wait for it in the nonlocal one.
 681        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 682        compute on interactions between local and nonlocal atoms.
 683      */
 684     if (nb->bUseTwoStreams)
 685     {
 686         if (interactionLocality == InteractionLocality::Local)
 687         {
 688             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
 689             issueClFlushInStream(deviceStream);
 690         }
 691         else
 692         {
 693             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
 694         }
 695     }
 696 }
 697
 698 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 699 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 700 {
 701     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 702
 703     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 704
 705     NBAtomData*         adat         = nb->atdat;
 706     gpu_plist*          plist        = nb->plist[iloc];
 707     Nbnxm::GpuTimers*   timers       = nb->timers;
 708     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 709
 710     const bool bDoTime = nb->bDoTime;
 711
 712     /* Don't launch the non-local H2D copy if there is no dependent
 713        work to do: neither non-local nor other (e.g. bonded) work
 714        to do that has as input the nbnxn coordaintes.
 715        Doing the same for the local kernel is more complicated, since the
 716        local part of the force array also depends on the non-local kernel.
 717        So to avoid complicating the code and to reduce the risk of bugs,
 718        we always call the local local x+q copy (and the rest of the local
 719        work in nbnxn_gpu_launch_kernel().
 720      */
 721     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 722     {
 723         plist->haveFreshList = false;
 724
 725         // The event is marked for Local interactions unconditionally,
 726         // so it has to be released here because of the early return
 727         // for NonLocal interactions.
 728         nb->misc_ops_and_local_H2D_done.reset();
 729
 730         return;
 731     }
 732
 733     /* local/nonlocal offset and length used for xq and f */
 734     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
 735
 736     /* beginning of timed HtoD section */
 737     if (bDoTime)
 738     {
 739         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
 740     }
 741
 742     /* HtoD x, q */
 743     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
 744                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
 745     copyToDeviceBuffer(&adat->xq,
 746                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
 747                        atomsRange.begin(),
 748                        atomsRange.size(),
 749                        deviceStream,
 750                        GpuApiCallBehavior::Async,
 751                        nullptr);
 752
 753     if (bDoTime)
 754     {
 755         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
 756     }
 757
 758     /* When we get here all misc operations issued in the local stream as well as
 759        the local xq H2D are done,
 760        so we record that in the local stream and wait for it in the nonlocal one.
 761        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 762        compute on interactions between local and nonlocal atoms.
 763      */
 764     nbnxnInsertNonlocalGpuDependency(nb, iloc);
 765 }
 766
 767 } // namespace Nbnxm