src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  *  \ingroup module_nbnxm
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "config.h"
  50
  51 #if GMX_GPU_CUDA
  52 #    include "cuda/nbnxm_cuda_types.h"
  53 #endif
  54
  55 #if GMX_GPU_OPENCL
  56 #    include "opencl/nbnxm_ocl_types.h"
  57 #endif
  58
  59 #if GMX_GPU_SYCL
  60 #    include "sycl/nbnxm_sycl_types.h"
  61 #endif
  62
  63 #include "nbnxm_gpu_data_mgmt.h"
  64
  65 #include "gromacs/hardware/device_information.h"
  66 #include "gromacs/mdtypes/interaction_const.h"
  67 #include "gromacs/mdtypes/simulation_workload.h"
  68 #include "gromacs/nbnxm/gpu_common_utils.h"
  69 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  70 #include "gromacs/pbcutil/ishift.h"
  71 #include "gromacs/timing/gpu_timing.h"
  72 #include "gromacs/pbcutil/ishift.h"
  73 #include "gromacs/utility/cstringutil.h"
  74 #include "gromacs/utility/exceptions.h"
  75 #include "gromacs/utility/fatalerror.h"
  76
  77 #include "nbnxm_gpu.h"
  78 #include "pairlistsets.h"
  79
  80 namespace Nbnxm
  81 {
  82
  83 inline void issueClFlushInStream(const DeviceStream& deviceStream)
  84 {
  85 #if GMX_GPU_OPENCL
  86     /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
  87      * in the stream after marking an event in it in order to be able to sync with
  88      * the event from another stream.
  89      */
  90     cl_int cl_error = clFlush(deviceStream.stream());
  91     if (cl_error != CL_SUCCESS)
  92     {
  93         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
  94     }
  95 #else
  96     GMX_UNUSED_VALUE(deviceStream);
  97 #endif
  98 }
  99
 100 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
 101                                     NBParamGpu*                  nbp,
 102                                     const DeviceContext&         deviceContext)
 103 {
 104     if (nbp->coulomb_tab)
 105     {
 106         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
 107     }
 108
 109     nbp->coulomb_tab_scale = tables.scale;
 110     initParamLookupTable(
 111             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
 112 }
 113
 114 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
 115                                                const DeviceInformation gmx_unused& deviceInfo)
 116 {
 117     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
 118
 119     /* Benchmarking/development environment variables to force the use of
 120        analytical or tabulated Ewald kernel. */
 121     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
 122     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
 123     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
 124
 125     if (forceAnalyticalEwald && forceTabulatedEwald)
 126     {
 127         gmx_incons(
 128                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
 129                 "requested through environment variables.");
 130     }
 131
 132     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
 133      */
 134     const bool c_useTabulatedEwaldDefault =
 135 #if GMX_GPU_CUDA
 136             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
 137             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
 138 #else
 139             false;
 140 #endif
 141     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
 142     if (forceAnalyticalEwald)
 143     {
 144         bUseAnalyticalEwald = true;
 145         if (debug)
 146         {
 147             fprintf(debug, "Using analytical Ewald GPU kernels\n");
 148         }
 149     }
 150     else if (forceTabulatedEwald)
 151     {
 152         bUseAnalyticalEwald = false;
 153
 154         if (debug)
 155         {
 156             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
 157         }
 158     }
 159
 160     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 161        forces it (use it for debugging/benchmarking only). */
 162     if (!bTwinCut && !forceTwinCutoffEwald)
 163     {
 164         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
 165     }
 166     else
 167     {
 168         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
 169     }
 170 }
 171
 172 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
 173 {
 174     nbp->ewald_beta        = ic->ewaldcoeff_q;
 175     nbp->sh_ewald          = ic->sh_ewald;
 176     nbp->epsfac            = ic->epsfac;
 177     nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
 178     nbp->c_rf              = ic->reactionFieldShift;
 179     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 180     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 181     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
 182     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
 183     nbp->useDynamicPruning = listParams.useDynamicPruning;
 184
 185     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
 186     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
 187
 188     nbp->rvdw_switch      = ic->rvdw_switch;
 189     nbp->dispersion_shift = ic->dispersion_shift;
 190     nbp->repulsion_shift  = ic->repulsion_shift;
 191     nbp->vdw_switch       = ic->vdw_switch;
 192 }
 193
 194 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
 195 {
 196     if (!nbv || !nbv->useGpu())
 197     {
 198         return;
 199     }
 200     NbnxmGpu*   nb  = nbv->gpu_nbv;
 201     NBParamGpu* nbp = nb->nbparam;
 202
 203     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 204
 205     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
 206
 207     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 208     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 209 }
 210
 211 void init_plist(gpu_plist* pl)
 212 {
 213     /* initialize to nullptr pointers to data that is not allocated here and will
 214        need reallocation in nbnxn_gpu_init_pairlist */
 215     pl->sci   = nullptr;
 216     pl->cj4   = nullptr;
 217     pl->imask = nullptr;
 218     pl->excl  = nullptr;
 219
 220     /* size -1 indicates that the respective array hasn't been initialized yet */
 221     pl->na_c                   = -1;
 222     pl->nsci                   = -1;
 223     pl->sci_nalloc             = -1;
 224     pl->ncj4                   = -1;
 225     pl->cj4_nalloc             = -1;
 226     pl->nimask                 = -1;
 227     pl->imask_nalloc           = -1;
 228     pl->nexcl                  = -1;
 229     pl->excl_nalloc            = -1;
 230     pl->haveFreshList          = false;
 231     pl->rollingPruningNumParts = 0;
 232     pl->rollingPruningPart     = 0;
 233 }
 234
 235 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 236 {
 237     t->nb_h2d_t = 0.0;
 238     t->nb_d2h_t = 0.0;
 239     t->nb_c     = 0;
 240     t->pl_h2d_t = 0.0;
 241     t->pl_h2d_c = 0;
 242     for (int i = 0; i < 2; i++)
 243     {
 244         for (int j = 0; j < 2; j++)
 245         {
 246             t->ktime[i][j].t = 0.0;
 247             t->ktime[i][j].c = 0;
 248         }
 249     }
 250     t->pruneTime.c        = 0;
 251     t->pruneTime.t        = 0.0;
 252     t->dynamicPruneTime.c = 0;
 253     t->dynamicPruneTime.t = 0.0;
 254 }
 255
 256 //! This function is documented in the header file
 257 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 258 {
 259     char sbuf[STRLEN];
 260     // Timing accumulation should happen only if there was work to do
 261     // because getLastRangeTime() gets skipped with empty lists later
 262     // which leads to the counter not being reset.
 263     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
 264     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 265     gpu_plist*          d_plist      = nb->plist[iloc];
 266
 267     if (d_plist->na_c < 0)
 268     {
 269         d_plist->na_c = h_plist->na_ci;
 270     }
 271     else
 272     {
 273         if (d_plist->na_c != h_plist->na_ci)
 274         {
 275             sprintf(sbuf,
 276                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
 277                     d_plist->na_c,
 278                     h_plist->na_ci);
 279             gmx_incons(sbuf);
 280         }
 281     }
 282
 283     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
 284
 285     if (bDoTime)
 286     {
 287         iTimers.pl_h2d.openTimingRegion(deviceStream);
 288         iTimers.didPairlistH2D = true;
 289     }
 290
 291     // TODO most of this function is same in CUDA and OpenCL, move into the header
 292     const DeviceContext& deviceContext = *nb->deviceContext_;
 293
 294     reallocateDeviceBuffer(
 295             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
 296     copyToDeviceBuffer(&d_plist->sci,
 297                        h_plist->sci.data(),
 298                        0,
 299                        h_plist->sci.size(),
 300                        deviceStream,
 301                        GpuApiCallBehavior::Async,
 302                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 303
 304     reallocateDeviceBuffer(
 305             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
 306     copyToDeviceBuffer(&d_plist->cj4,
 307                        h_plist->cj4.data(),
 308                        0,
 309                        h_plist->cj4.size(),
 310                        deviceStream,
 311                        GpuApiCallBehavior::Async,
 312                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 313
 314     reallocateDeviceBuffer(&d_plist->imask,
 315                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
 316                            &d_plist->nimask,
 317                            &d_plist->imask_nalloc,
 318                            deviceContext);
 319
 320     reallocateDeviceBuffer(
 321             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
 322     copyToDeviceBuffer(&d_plist->excl,
 323                        h_plist->excl.data(),
 324                        0,
 325                        h_plist->excl.size(),
 326                        deviceStream,
 327                        GpuApiCallBehavior::Async,
 328                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 329
 330     if (bDoTime)
 331     {
 332         iTimers.pl_h2d.closeTimingRegion(deviceStream);
 333     }
 334
 335     /* need to prune the pair list during the next step */
 336     d_plist->haveFreshList = true;
 337 }
 338
 339 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 340 {
 341     bool                 bDoTime       = nb->bDoTime;
 342     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
 343     NBAtomData*          atdat         = nb->atdat;
 344     const DeviceContext& deviceContext = *nb->deviceContext_;
 345     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
 346
 347     int  numAtoms  = nbat->numAtoms();
 348     bool realloced = false;
 349
 350     if (bDoTime)
 351     {
 352         /* time async copy */
 353         timers->atdat.openTimingRegion(localStream);
 354     }
 355
 356     /* need to reallocate if we have to copy more atoms than the amount of space
 357        available and only allocate if we haven't initialized yet, i.e atdat->natoms == -1 */
 358     if (numAtoms > atdat->numAtomsAlloc)
 359     {
 360         int numAlloc = over_alloc_small(numAtoms);
 361
 362         /* free up first if the arrays have already been initialized */
 363         if (atdat->numAtomsAlloc != -1)
 364         {
 365             freeDeviceBuffer(&atdat->f);
 366             freeDeviceBuffer(&atdat->xq);
 367             freeDeviceBuffer(&atdat->ljComb);
 368             freeDeviceBuffer(&atdat->atomTypes);
 369         }
 370
 371
 372         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
 373         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
 374
 375         if (useLjCombRule(nb->nbparam->vdwType))
 376         {
 377             // Two Lennard-Jones parameters per atom
 378             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
 379         }
 380         else
 381         {
 382             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
 383         }
 384
 385         atdat->numAtomsAlloc = numAlloc;
 386         realloced            = true;
 387     }
 388
 389     atdat->numAtoms      = numAtoms;
 390     atdat->numAtomsLocal = nbat->natoms_local;
 391
 392     /* need to clear GPU f output if realloc happened */
 393     if (realloced)
 394     {
 395         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
 396     }
 397
 398     if (useLjCombRule(nb->nbparam->vdwType))
 399     {
 400         static_assert(
 401                 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
 402                 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
 403         copyToDeviceBuffer(&atdat->ljComb,
 404                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
 405                            0,
 406                            numAtoms,
 407                            localStream,
 408                            GpuApiCallBehavior::Async,
 409                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 410     }
 411     else
 412     {
 413         static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
 414                       "Sizes of host- and device-side atom types should be the same.");
 415         copyToDeviceBuffer(&atdat->atomTypes,
 416                            nbat->params().type.data(),
 417                            0,
 418                            numAtoms,
 419                            localStream,
 420                            GpuApiCallBehavior::Async,
 421                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
 422     }
 423
 424     if (bDoTime)
 425     {
 426         timers->atdat.closeTimingRegion(localStream);
 427     }
 428
 429     /* kick off the tasks enqueued above to ensure concurrency with the search */
 430     issueClFlushInStream(localStream);
 431 }
 432
 433 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 434 {
 435     NBAtomData*         adat        = nb->atdat;
 436     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 437     // Clear forces
 438     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
 439     // Clear shift force array and energies if the outputs were used in the current step
 440     if (computeVirial)
 441     {
 442         clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
 443         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
 444         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
 445     }
 446     issueClFlushInStream(localStream);
 447 }
 448
 449 //! This function is documented in the header file
 450 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 451 {
 452     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 453 }
 454
 455 //! This function is documented in the header file
 456 void gpu_reset_timings(nonbonded_verlet_t* nbv)
 457 {
 458     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
 459     {
 460         init_timings(nbv->gpu_nbv->timings);
 461     }
 462 }
 463
 464 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
 465 {
 466     return ((nb->nbparam->elecType == ElecType::EwaldAna)
 467             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
 468 }
 469
 470 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
 471                                                    const DeviceInformation&   deviceInfo)
 472 {
 473     if (ic->eeltype == CoulombInteractionType::Cut)
 474     {
 475         return ElecType::Cut;
 476     }
 477     else if (EEL_RF(ic->eeltype))
 478     {
 479         return ElecType::RF;
 480     }
 481     else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
 482     {
 483         return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
 484     }
 485     else
 486     {
 487         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 488         GMX_THROW(gmx::InconsistentInputError(
 489                 gmx::formatString("The requested electrostatics type %s is not implemented in "
 490                                   "the GPU accelerated kernels!",
 491                                   enumValueToString(ic->eeltype))));
 492     }
 493 }
 494
 495
 496 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
 497 {
 498     if (ic->vdwtype == VanDerWaalsType::Cut)
 499     {
 500         switch (ic->vdw_modifier)
 501         {
 502             case InteractionModifiers::None:
 503             case InteractionModifiers::PotShift:
 504                 switch (ljCombinationRule)
 505                 {
 506                     case LJCombinationRule::None: return VdwType::Cut;
 507                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
 508                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
 509                     default:
 510                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 511                                 "The requested LJ combination rule %s is not implemented in "
 512                                 "the GPU accelerated kernels!",
 513                                 enumValueToString(ljCombinationRule))));
 514                 }
 515             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
 516             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
 517             default:
 518                 GMX_THROW(gmx::InconsistentInputError(
 519                         gmx::formatString("The requested VdW interaction modifier %s is not "
 520                                           "implemented in the GPU accelerated kernels!",
 521                                           enumValueToString(ic->vdw_modifier))));
 522         }
 523     }
 524     else if (ic->vdwtype == VanDerWaalsType::Pme)
 525     {
 526         if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
 527         {
 528             assert(ljCombinationRule == LJCombinationRule::Geometric);
 529             return VdwType::EwaldGeom;
 530         }
 531         else
 532         {
 533             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
 534             return VdwType::EwaldLB;
 535         }
 536     }
 537     else
 538     {
 539         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 540                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
 541                 enumValueToString(ic->vdwtype))));
 542     }
 543 }
 544
 545 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
 546 {
 547     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 548
 549     // There is short-range work if the pair list for the provided
 550     // interaction locality contains entries or if there is any
 551     // bonded work (as this is not split into local/nonlocal).
 552     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
 553                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 554 }
 555
 556 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
 557 {
 558     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 559
 560     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 561 }
 562
 563 /*! \brief
 564  * Launch asynchronously the download of nonbonded forces from the GPU
 565  * (and energies/shift forces if required).
 566  */
 567 void gpu_launch_cpyback(NbnxmGpu*                nb,
 568                         struct nbnxn_atomdata_t* nbatom,
 569                         const gmx::StepWorkload& stepWork,
 570                         const AtomLocality       atomLocality)
 571 {
 572     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 573
 574     /* determine interaction locality from atom locality */
 575     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 576     GMX_ASSERT(iloc == InteractionLocality::Local
 577                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
 578                "Non-local stream is indicating that the copy back event is enqueued at the "
 579                "beginning of the copy back function.");
 580
 581     /* extract the data */
 582     NBAtomData*         adat         = nb->atdat;
 583     Nbnxm::GpuTimers*   timers       = nb->timers;
 584     bool                bDoTime      = nb->bDoTime;
 585     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 586
 587     /* don't launch non-local copy-back if there was no non-local work to do */
 588     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 589     {
 590         /* TODO An alternative way to signal that non-local work is
 591            complete is to use a clEnqueueMarker+clEnqueueBarrier
 592            pair. However, the use of bNonLocalStreamDoneMarked has the
 593            advantage of being local to the host, so probably minimizes
 594            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
 595            test case, overall simulation performance was higher with
 596            the API calls, but this has not been tested on AMD OpenCL,
 597            so could be worth considering in future. */
 598         nb->bNonLocalStreamDoneMarked = false;
 599         return;
 600     }
 601
 602     /* local/nonlocal offset and length used for xq and f */
 603     auto atomsRange = getGpuAtomRange(adat, atomLocality);
 604
 605     /* beginning of timed D2H section */
 606     if (bDoTime)
 607     {
 608         timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
 609     }
 610
 611     /* With DD the local D2H transfer can only start after the non-local
 612        has been launched. */
 613     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
 614     {
 615         nb->nonlocal_done.enqueueWaitEvent(deviceStream);
 616         nb->bNonLocalStreamDoneMarked = false;
 617     }
 618
 619     /* DtoH f */
 620     static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
 621                   "The host force buffer should be in single precision to match device data size.");
 622     copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
 623                          &adat->f,
 624                          atomsRange.begin(),
 625                          atomsRange.size(),
 626                          deviceStream,
 627                          GpuApiCallBehavior::Async,
 628                          bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 629
 630     issueClFlushInStream(deviceStream);
 631
 632     /* After the non-local D2H is launched the nonlocal_done event can be
 633        recorded which signals that the local D2H can proceed. This event is not
 634        placed after the non-local kernel because we first need the non-local
 635        data back first. */
 636     if (iloc == InteractionLocality::NonLocal)
 637     {
 638         nb->nonlocal_done.markEvent(deviceStream);
 639         nb->bNonLocalStreamDoneMarked = true;
 640     }
 641
 642     /* only transfer energies in the local stream */
 643     if (iloc == InteractionLocality::Local)
 644     {
 645         /* DtoH fshift when virial is needed */
 646         if (stepWork.computeVirial)
 647         {
 648             static_assert(
 649                     sizeof(*nb->nbst.fShift) == sizeof(Float3),
 650                     "Sizes of host- and device-side shift vector elements should be the same.");
 651             copyFromDeviceBuffer(nb->nbst.fShift,
 652                                  &adat->fShift,
 653                                  0,
 654                                  SHIFTS,
 655                                  deviceStream,
 656                                  GpuApiCallBehavior::Async,
 657                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 658         }
 659
 660         /* DtoH energies */
 661         if (stepWork.computeEnergy)
 662         {
 663             static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
 664                           "Sizes of host- and device-side LJ energy terms should be the same.");
 665             copyFromDeviceBuffer(nb->nbst.eLJ,
 666                                  &adat->eLJ,
 667                                  0,
 668                                  1,
 669                                  deviceStream,
 670                                  GpuApiCallBehavior::Async,
 671                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 672             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
 673                           "Sizes of host- and device-side electrostatic energy terms should be the "
 674                           "same.");
 675             copyFromDeviceBuffer(nb->nbst.eElec,
 676                                  &adat->eElec,
 677                                  0,
 678                                  1,
 679                                  deviceStream,
 680                                  GpuApiCallBehavior::Async,
 681                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 682         }
 683     }
 684
 685     if (bDoTime)
 686     {
 687         timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
 688     }
 689 }
 690
 691 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 692 {
 693     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 694
 695     /* When we get here all misc operations issued in the local stream as well as
 696        the local xq H2D are done,
 697        so we record that in the local stream and wait for it in the nonlocal one.
 698        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 699        compute on interactions between local and nonlocal atoms.
 700      */
 701     if (nb->bUseTwoStreams)
 702     {
 703         if (interactionLocality == InteractionLocality::Local)
 704         {
 705             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
 706             issueClFlushInStream(deviceStream);
 707         }
 708         else
 709         {
 710             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
 711         }
 712     }
 713 }
 714
 715 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 716 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 717 {
 718     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 719
 720     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 721
 722     NBAtomData*         adat         = nb->atdat;
 723     gpu_plist*          plist        = nb->plist[iloc];
 724     Nbnxm::GpuTimers*   timers       = nb->timers;
 725     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 726
 727     const bool bDoTime = nb->bDoTime;
 728
 729     /* Don't launch the non-local H2D copy if there is no dependent
 730        work to do: neither non-local nor other (e.g. bonded) work
 731        to do that has as input the nbnxn coordaintes.
 732        Doing the same for the local kernel is more complicated, since the
 733        local part of the force array also depends on the non-local kernel.
 734        So to avoid complicating the code and to reduce the risk of bugs,
 735        we always call the local local x+q copy (and the rest of the local
 736        work in nbnxn_gpu_launch_kernel().
 737      */
 738     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 739     {
 740         plist->haveFreshList = false;
 741
 742         // The event is marked for Local interactions unconditionally,
 743         // so it has to be released here because of the early return
 744         // for NonLocal interactions.
 745         nb->misc_ops_and_local_H2D_done.reset();
 746
 747         return;
 748     }
 749
 750     /* local/nonlocal offset and length used for xq and f */
 751     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
 752
 753     /* beginning of timed HtoD section */
 754     if (bDoTime)
 755     {
 756         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
 757     }
 758
 759     /* HtoD x, q */
 760     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
 761                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
 762     copyToDeviceBuffer(&adat->xq,
 763                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
 764                        atomsRange.begin(),
 765                        atomsRange.size(),
 766                        deviceStream,
 767                        GpuApiCallBehavior::Async,
 768                        nullptr);
 769
 770     if (bDoTime)
 771     {
 772         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
 773     }
 774
 775     /* When we get here all misc operations issued in the local stream as well as
 776        the local xq H2D are done,
 777        so we record that in the local stream and wait for it in the nonlocal one.
 778        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 779        compute on interactions between local and nonlocal atoms.
 780      */
 781     nbnxnInsertNonlocalGpuDependency(nb, iloc);
 782 }
 783
 784 } // namespace Nbnxm