src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp (from alexxy/gromacs.git)
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 /*! \internal \file
37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
38  *
39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
42  *  \author Szilárd Páll <pall.szilard@gmail.com>
43  *  \author Artem Zhmurov <zhmurov@gmail.com>
44  *
45  *  \ingroup module_nbnxm
46  */
47 #include "gmxpre.h"
48
49 #include "config.h"
50
51 #if GMX_GPU_CUDA
52 #    include "cuda/nbnxm_cuda_types.h"
53 #endif
54
55 #if GMX_GPU_OPENCL
56 #    include "opencl/nbnxm_ocl_types.h"
57 #endif
58
59 #if GMX_GPU_SYCL
60 #    include "sycl/nbnxm_sycl_types.h"
61 #endif
62
63 #include "nbnxm_gpu_data_mgmt.h"
64
65 #include "gromacs/gpu_utils/device_stream_manager.h"
66 #include "gromacs/gpu_utils/gputraits.h"
67 #include "gromacs/gpu_utils/pmalloc.h"
68 #include "gromacs/hardware/device_information.h"
69 #include "gromacs/mdtypes/interaction_const.h"
70 #include "gromacs/mdtypes/simulation_workload.h"
71 #include "gromacs/nbnxm/gpu_common_utils.h"
72 #include "gromacs/nbnxm/gpu_data_mgmt.h"
73 #include "gromacs/nbnxm/gridset.h"
74 #include "gromacs/pbcutil/ishift.h"
75 #include "gromacs/timing/gpu_timing.h"
76 #include "gromacs/pbcutil/ishift.h"
77 #include "gromacs/utility/cstringutil.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/fatalerror.h"
80
81 #include "nbnxm_gpu.h"
82 #include "pairlistsets.h"
83
84 namespace Nbnxm
85 {
86
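// Rough overview of how the functions below are typically used (a simplified sketch, not a
// normative call sequence): gpu_init() sets up the device-side data structures,
// gpu_init_pairlist() and gpu_init_atomdata() upload the pair-search results and atom data,
// gpu_copy_xq_to_gpu() and gpu_launch_cpyback() handle the per-step H2D/D2H transfers,
// and gpu_clear_outputs() resets the force/energy outputs.
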
87 inline void issueClFlushInStream(const DeviceStream& deviceStream)
88 {
89 #if GMX_GPU_OPENCL
90     /* Based on section 5.13 of the OpenCL 1.2 specification, a flush is needed
91      * in the stream after marking an event in it, in order to be able to synchronize
92      * with that event from another stream.
93      */
94     cl_int cl_error = clFlush(deviceStream.stream());
95     if (cl_error != CL_SUCCESS)
96     {
97         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
98     }
99 #else
100     GMX_UNUSED_VALUE(deviceStream);
101 #endif
102 }
103
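/*! \brief (Re-)initialize the device-side Ewald Coulomb correction force table.
 *
 * Destroys any existing table and uploads the tabulated forces from \p tables
 * as a device parameter lookup table.
 */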
104 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
105                                     NBParamGpu*                  nbp,
106                                     const DeviceContext&         deviceContext)
107 {
108     if (nbp->coulomb_tab)
109     {
110         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
111     }
112
113     nbp->coulomb_tab_scale = tables.scale;
114     initParamLookupTable(
115             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
116 }
117
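/*! \brief Pick the Ewald kernel flavor (analytical vs. tabulated, single vs. twin cut-off)
 *  based on the interaction settings, the device, and debug environment variables.
 */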
118 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
119                                                const DeviceInformation gmx_unused& deviceInfo)
120 {
121     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
122
123     /* Benchmarking/development environment variables to force the use of
124        analytical or tabulated Ewald kernel. */
125     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
126     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
127     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
128
129     if (forceAnalyticalEwald && forceTabulatedEwald)
130     {
131         gmx_incons(
132                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
133                 "requested through environment variables.");
134     }
135
136     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
137      */
138     const bool c_useTabulatedEwaldDefault =
139 #if GMX_GPU_CUDA
140             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
141             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
142 #else
143             false;
144 #endif
145     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
146     if (forceAnalyticalEwald)
147     {
148         bUseAnalyticalEwald = true;
149         if (debug)
150         {
151             fprintf(debug, "Using analytical Ewald GPU kernels\n");
152         }
153     }
154     else if (forceTabulatedEwald)
155     {
156         bUseAnalyticalEwald = false;
157
158         if (debug)
159         {
160             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
161         }
162     }
163
164     /* Use twin cut-off kernels if bTwinCut requires it or if the env. var.
165        forces it (the latter is intended for debugging/benchmarking only). */
166     if (!bTwinCut && !forceTwinCutoffEwald)
167     {
168         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
169     }
170     else
171     {
172         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
173     }
174 }
175
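//! Copy the cut-off, switching and Ewald parameters from \p ic and \p listParams to the device parameter struct.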
176 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
177 {
178     nbp->ewald_beta        = ic.ewaldcoeff_q;
179     nbp->sh_ewald          = ic.sh_ewald;
180     nbp->epsfac            = ic.epsfac;
181     nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
182     nbp->c_rf              = ic.reactionFieldShift;
183     nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
184     nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
185     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
186     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
187     nbp->useDynamicPruning = listParams.useDynamicPruning;
188
189     nbp->sh_lj_ewald   = ic.sh_lj_ewald;
190     nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
191
192     nbp->rvdw_switch      = ic.rvdw_switch;
193     nbp->dispersion_shift = ic.dispersion_shift;
194     nbp->repulsion_shift  = ic.repulsion_shift;
195     nbp->vdw_switch       = ic.vdw_switch;
196 }
197
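/*! \brief Update the device interaction parameters during PME load balancing: refresh the
 *  cut-off parameters, re-pick the Ewald kernel type and re-upload the Ewald correction table.
 */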
198 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
199 {
200     if (!nbv || !nbv->useGpu())
201     {
202         return;
203     }
204     NbnxmGpu*   nb  = nbv->gpu_nbv;
205     NBParamGpu* nbp = nb->nbparam;
206
207     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
208
209     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
210
211     GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
212     init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
213 }
214
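//! Set all pair-list device pointers and sizes to their "not yet allocated" state.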
215 void init_plist(gpu_plist* pl)
216 {
217     /* Initialize to nullptr the pointers to data that is not allocated here and
218        will need allocation in gpu_init_pairlist */
219     pl->sci   = nullptr;
220     pl->cj4   = nullptr;
221     pl->imask = nullptr;
222     pl->excl  = nullptr;
223
224     /* size -1 indicates that the respective array hasn't been initialized yet */
225     pl->na_c                   = -1;
226     pl->nsci                   = -1;
227     pl->sci_nalloc             = -1;
228     pl->ncj4                   = -1;
229     pl->cj4_nalloc             = -1;
230     pl->nimask                 = -1;
231     pl->imask_nalloc           = -1;
232     pl->nexcl                  = -1;
233     pl->excl_nalloc            = -1;
234     pl->haveFreshList          = false;
235     pl->rollingPruningNumParts = 0;
236     pl->rollingPruningPart     = 0;
237 }
238
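//! Zero all GPU non-bonded timing accumulators and counters.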
239 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
240 {
241     t->nb_h2d_t = 0.0;
242     t->nb_d2h_t = 0.0;
243     t->nb_c     = 0;
244     t->pl_h2d_t = 0.0;
245     t->pl_h2d_c = 0;
246     for (int i = 0; i < 2; i++)
247     {
248         for (int j = 0; j < 2; j++)
249         {
250             t->ktime[i][j].t = 0.0;
251             t->ktime[i][j].c = 0;
252         }
253     }
254     t->pruneTime.c        = 0;
255     t->pruneTime.t        = 0.0;
256     t->dynamicPruneTime.c = 0;
257     t->dynamicPruneTime.t = 0.0;
258 }
259
260 /*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
261 static void initAtomdataFirst(NBAtomData*          atomdata,
262                               int                  numTypes,
263                               const DeviceContext& deviceContext,
264                               const DeviceStream&  localStream)
265 {
266     atomdata->numTypes = numTypes;
267     allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
268     atomdata->shiftVecUploaded = false;
269
270     allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
271     allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
272     allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
273
274     clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
275     clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
276     clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
277
278     /* Initialize to nullptr the pointers to data that is not allocated here and
279        will need allocation later */
280     atomdata->xq = nullptr;
281     atomdata->f  = nullptr;
282
283     /* size -1 indicates that the respective array hasn't been initialized yet */
284     atomdata->numAtoms      = -1;
285     atomdata->numAtomsAlloc = -1;
286 }
287
288 /*! \brief Initialize the nonbonded parameter data structure. */
289 static void initNbparam(NBParamGpu*                     nbp,
290                         const interaction_const_t&      ic,
291                         const PairlistParams&           listParams,
292                         const nbnxn_atomdata_t::Params& nbatParams,
293                         const DeviceContext&            deviceContext)
294 {
295     const int numTypes = nbatParams.numTypes;
296
297     set_cutoff_parameters(nbp, ic, listParams);
298
299     nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
300     nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
301
302     if (ic.vdwtype == VanDerWaalsType::Pme)
303     {
304         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
305         {
306             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
307                        "Combination rule mismatch!");
308         }
309         else
310         {
311             GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
312                        "Combination rule mismatch!");
313         }
314     }
315
316     /* generate table for PME */
317     if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
318     {
319         GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
320         init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
321     }
322
323     /* set up LJ parameter lookup table */
324     if (!useLjCombRule(nbp->vdwType))
325     {
326         static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
327                       "Mismatch in the size of host / device data types");
328         initParamLookupTable(&nbp->nbfp,
329                              &nbp->nbfp_texobj,
330                              reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
331                              numTypes * numTypes,
332                              deviceContext);
333     }
334
335     /* set up LJ-PME parameter lookup table */
336     if (ic.vdwtype == VanDerWaalsType::Pme)
337     {
338         static_assert(sizeof(decltype(nbp->nbfp_comb))
339                               == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
340                       "Mismatch in the size of host / device data types");
341         initParamLookupTable(&nbp->nbfp_comb,
342                              &nbp->nbfp_comb_texobj,
343                              reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
344                              numTypes,
345                              deviceContext);
346     }
347 }
348
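/*! \brief Allocate and initialize the NbnxmGpu data structures: parameters, atom data,
 *  pair list(s), streams and, when timing is enabled, the timers.
 */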
349 NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
350                    const interaction_const_t*      ic,
351                    const PairlistParams&           listParams,
352                    const nbnxn_atomdata_t*         nbat,
353                    const bool                      bLocalAndNonlocal)
354 {
355     auto* nb                              = new NbnxmGpu();
356     nb->deviceContext_                    = &deviceStreamManager.context();
357     nb->atdat                             = new NBAtomData;
358     nb->nbparam                           = new NBParamGpu;
359     nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
360     if (bLocalAndNonlocal)
361     {
362         nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
363     }
364
365     nb->bUseTwoStreams = bLocalAndNonlocal;
366
367     nb->timers = new Nbnxm::GpuTimers();
368     snew(nb->timings, 1);
369
370     /* WARNING: CUDA timings are incorrect with multiple streams.
371      * This is the main reason why they are disabled by default.
372      * Can be enabled by setting the GMX_ENABLE_GPU_TIMING environment variable.
373      * TODO: Consider turning on by default when we can detect the number of streams.
374      *
375      * OpenCL timing is enabled by default and can be disabled with the
376      * GMX_DISABLE_GPU_TIMING environment variable.
377      *
378      * Timing is disabled in SYCL.
379      */
380     nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
381                   || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
382
383     if (nb->bDoTime)
384     {
385         init_timings(nb->timings);
386     }
387
388     /* init nbst */
389     pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
390     pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
391     pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
392
393     init_plist(nb->plist[InteractionLocality::Local]);
394
395     /* local/non-local GPU streams */
396     GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
397                        "Local non-bonded stream should be initialized to use GPU for non-bonded.");
398     const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
399     nb->deviceStreams[InteractionLocality::Local] = &localStream;
400     // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
401     // out-of-order. But for the time being, it will be less disruptive to keep them.
402     if (nb->bUseTwoStreams)
403     {
404         init_plist(nb->plist[InteractionLocality::NonLocal]);
405
406         GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
407                            "Non-local non-bonded stream should be initialized to use GPU for "
408                            "non-bonded with domain decomposition.");
409         nb->deviceStreams[InteractionLocality::NonLocal] =
410                 &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
411     }
412
413     const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
414     const DeviceContext&            deviceContext = *nb->deviceContext_;
415
416     initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
417     initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
418
419     gpu_init_platform_specific(nb);
420
421     if (debug)
422     {
423         fprintf(debug, "Initialized NBNXM GPU data structures.\n");
424     }
425
426     return nb;
427 }
428
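//! Upload the shift vectors to the device; skipped when the box is static and they were already uploaded.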
429 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
430 {
431     NBAtomData*         adat        = nb->atdat;
432     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
433
434     /* upload only if we have a dynamic box or the shift vectors have not been uploaded yet */
435     if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
436     {
437         copyToDeviceBuffer(&adat->shiftVec,
438                            gmx::asGenericFloat3Pointer(nbatom->shift_vec),
439                            0,
440                            SHIFTS,
441                            localStream,
442                            GpuApiCallBehavior::Async,
443                            nullptr);
444         adat->shiftVecUploaded = true;
445     }
446 }
447
448 //! This function is documented in the header file
449 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
450 {
451     char sbuf[STRLEN];
452     // Timing accumulation should happen only if there was work to do
453     // because getLastRangeTime() gets skipped with empty lists later
454     // which leads to the counter not being reset.
455     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
456     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
457     gpu_plist*          d_plist      = nb->plist[iloc];
458
459     if (d_plist->na_c < 0)
460     {
461         d_plist->na_c = h_plist->na_ci;
462     }
463     else
464     {
465         if (d_plist->na_c != h_plist->na_ci)
466         {
467             sprintf(sbuf,
468                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
469                     d_plist->na_c,
470                     h_plist->na_ci);
471             gmx_incons(sbuf);
472         }
473     }
474
475     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
476
477     if (bDoTime)
478     {
479         iTimers.pl_h2d.openTimingRegion(deviceStream);
480         iTimers.didPairlistH2D = true;
481     }
482
483     // TODO most of this function is the same in CUDA and OpenCL; move it into the header
484     const DeviceContext& deviceContext = *nb->deviceContext_;
485
486     reallocateDeviceBuffer(
487             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
488     copyToDeviceBuffer(&d_plist->sci,
489                        h_plist->sci.data(),
490                        0,
491                        h_plist->sci.size(),
492                        deviceStream,
493                        GpuApiCallBehavior::Async,
494                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
495
496     reallocateDeviceBuffer(
497             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
498     copyToDeviceBuffer(&d_plist->cj4,
499                        h_plist->cj4.data(),
500                        0,
501                        h_plist->cj4.size(),
502                        deviceStream,
503                        GpuApiCallBehavior::Async,
504                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
505
506     reallocateDeviceBuffer(&d_plist->imask,
507                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
508                            &d_plist->nimask,
509                            &d_plist->imask_nalloc,
510                            deviceContext);
511
512     reallocateDeviceBuffer(
513             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
514     copyToDeviceBuffer(&d_plist->excl,
515                        h_plist->excl.data(),
516                        0,
517                        h_plist->excl.size(),
518                        deviceStream,
519                        GpuApiCallBehavior::Async,
520                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
521
522     if (bDoTime)
523     {
524         iTimers.pl_h2d.closeTimingRegion(deviceStream);
525     }
526
527     /* need to prune the pair list during the next step */
528     d_plist->haveFreshList = true;
529 }
530
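/*! \brief (Re-)allocate the device atom-data buffers as needed and upload the atom types
 *  or the per-atom LJ combination parameters.
 */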
531 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
532 {
533     bool                 bDoTime       = nb->bDoTime;
534     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
535     NBAtomData*          atdat         = nb->atdat;
536     const DeviceContext& deviceContext = *nb->deviceContext_;
537     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
538
539     int  numAtoms  = nbat->numAtoms();
540     bool realloced = false;
541
542     if (bDoTime)
543     {
544         /* time async copy */
545         timers->atdat.openTimingRegion(localStream);
546     }
547
548     /* We need to reallocate if we have to copy more atoms than there is space for;
549        free the old buffers first, unless nothing has been allocated yet, i.e. atdat->numAtomsAlloc == -1 */
550     if (numAtoms > atdat->numAtomsAlloc)
551     {
552         int numAlloc = over_alloc_small(numAtoms);
553
554         /* free up first if the arrays have already been initialized */
555         if (atdat->numAtomsAlloc != -1)
556         {
557             freeDeviceBuffer(&atdat->f);
558             freeDeviceBuffer(&atdat->xq);
559             if (useLjCombRule(nb->nbparam->vdwType))
560             {
561                 freeDeviceBuffer(&atdat->ljComb);
562             }
563             else
564             {
565                 freeDeviceBuffer(&atdat->atomTypes);
566             }
567         }
568
569
570         allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
571         allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
572
573         if (useLjCombRule(nb->nbparam->vdwType))
574         {
575             // Two Lennard-Jones parameters per atom
576             allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
577         }
578         else
579         {
580             allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
581         }
582
583         atdat->numAtomsAlloc = numAlloc;
584         realloced            = true;
585     }
586
587     atdat->numAtoms      = numAtoms;
588     atdat->numAtomsLocal = nbat->natoms_local;
589
590     /* need to clear GPU f output if realloc happened */
591     if (realloced)
592     {
593         clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
594     }
595
596     if (useLjCombRule(nb->nbparam->vdwType))
597     {
598         static_assert(
599                 sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
600                 "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
601         copyToDeviceBuffer(&atdat->ljComb,
602                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
603                            0,
604                            numAtoms,
605                            localStream,
606                            GpuApiCallBehavior::Async,
607                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
608     }
609     else
610     {
611         static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
612                       "Sizes of host- and device-side atom types should be the same.");
613         copyToDeviceBuffer(&atdat->atomTypes,
614                            nbat->params().type.data(),
615                            0,
616                            numAtoms,
617                            localStream,
618                            GpuApiCallBehavior::Async,
619                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
620     }
621
622     if (bDoTime)
623     {
624         timers->atdat.closeTimingRegion(localStream);
625     }
626
627     /* kick off the tasks enqueued above to ensure concurrency with the search */
628     issueClFlushInStream(localStream);
629 }
630
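//! Asynchronously clear the force output buffer and, when the virial is computed, the shift-force and energy outputs.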
631 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
632 {
633     NBAtomData*         adat        = nb->atdat;
634     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
635     // Clear forces
636     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
637     // Clear shift force array and energies if the outputs were used in the current step
638     if (computeVirial)
639     {
640         clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
641         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
642         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
643     }
644     issueClFlushInStream(localStream);
645 }
646
647 //! This function is documented in the header file
648 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
649 {
650     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
651 }
652
653 //! This function is documented in the header file
654 void gpu_reset_timings(nonbonded_verlet_t* nbv)
655 {
656     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
657     {
658         init_timings(nbv->gpu_nbv->timings);
659     }
660 }
661
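//! Return whether an analytical (rather than tabulated) Ewald GPU kernel flavor is in use.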
662 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
663 {
664     return ((nb->nbparam->elecType == ElecType::EwaldAna)
665             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
666 }
667
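//! Map the requested Coulomb interaction type onto the matching GPU electrostatics kernel type.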
668 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
669                                                    const DeviceInformation&   deviceInfo)
670 {
671     if (ic.eeltype == CoulombInteractionType::Cut)
672     {
673         return ElecType::Cut;
674     }
675     else if (EEL_RF(ic.eeltype))
676     {
677         return ElecType::RF;
678     }
679     else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
680     {
681         return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
682     }
683     else
684     {
685         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
686         GMX_THROW(gmx::InconsistentInputError(
687                 gmx::formatString("The requested electrostatics type %s is not implemented in "
688                                   "the GPU accelerated kernels!",
689                                   enumValueToString(ic.eeltype))));
690     }
691 }
692
693
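/*! \brief Map the requested VdW type, interaction modifier and LJ combination rule onto the
 *  matching GPU VdW kernel type (e.g. plain cut-off LJ with a geometric combination rule
 *  maps to VdwType::CutCombGeom).
 */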
694 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
695 {
696     if (ic.vdwtype == VanDerWaalsType::Cut)
697     {
698         switch (ic.vdw_modifier)
699         {
700             case InteractionModifiers::None:
701             case InteractionModifiers::PotShift:
702                 switch (ljCombinationRule)
703                 {
704                     case LJCombinationRule::None: return VdwType::Cut;
705                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
706                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
707                     default:
708                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
709                                 "The requested LJ combination rule %s is not implemented in "
710                                 "the GPU accelerated kernels!",
711                                 enumValueToString(ljCombinationRule))));
712                 }
713             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
714             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
715             default:
716                 GMX_THROW(gmx::InconsistentInputError(
717                         gmx::formatString("The requested VdW interaction modifier %s is not "
718                                           "implemented in the GPU accelerated kernels!",
719                                           enumValueToString(ic.vdw_modifier))));
720         }
721     }
722     else if (ic.vdwtype == VanDerWaalsType::Pme)
723     {
724         if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
725         {
726             assert(ljCombinationRule == LJCombinationRule::Geometric);
727             return VdwType::EwaldGeom;
728         }
729         else
730         {
731             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
732             return VdwType::EwaldLB;
733         }
734     }
735     else
736     {
737         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
738                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
739                 enumValueToString(ic.vdwtype))));
740     }
741 }
742
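/*! \brief Record whether there is any short-range GPU work (pair-list entries or bonded
 *  interactions) for the given interaction locality.
 */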
743 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
744 {
745     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
746
747     // There is short-range work if the pair list for the provided
748     // interaction locality contains entries or if there is any
749     // bonded work (as this is not split into local/nonlocal).
750     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
751                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
752 }
753
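//! Return whether short-range GPU work was recorded for the given interaction locality.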
754 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
755 {
756     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
757
758     return nb->haveWork[interactionLocality];
759 }
760
761 /*! \brief
762  * Asynchronously launch the download of nonbonded forces from the GPU
763  * (and energies/shift forces if required).
764  */
765 void gpu_launch_cpyback(NbnxmGpu*                nb,
766                         struct nbnxn_atomdata_t* nbatom,
767                         const gmx::StepWorkload& stepWork,
768                         const AtomLocality       atomLocality)
769 {
770     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
771
772     /* determine interaction locality from atom locality */
773     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
774     GMX_ASSERT(iloc == InteractionLocality::Local
775                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
776                "Non-local stream is indicating that the copy back event is enqueued at the "
777                "beginning of the copy back function.");
778
779     /* extract the data */
780     NBAtomData*         adat         = nb->atdat;
781     Nbnxm::GpuTimers*   timers       = nb->timers;
782     bool                bDoTime      = nb->bDoTime;
783     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
784
785     /* don't launch non-local copy-back if there was no non-local work to do */
786     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
787     {
788         /* TODO An alternative way to signal that non-local work is
789            complete is to use a clEnqueueMarker+clEnqueueBarrier
790            pair. However, the use of bNonLocalStreamDoneMarked has the
791            advantage of being local to the host, so probably minimizes
792            overhead. Curiously, for NVIDIA OpenCL with an empty-domain
793            test case, overall simulation performance was higher with
794            the API calls, but this has not been tested on AMD OpenCL,
795            so could be worth considering in future. */
796         nb->bNonLocalStreamDoneMarked = false;
797         return;
798     }
799
800     /* local/nonlocal offset and length used for xq and f */
801     auto atomsRange = getGpuAtomRange(adat, atomLocality);
802
803     /* beginning of timed D2H section */
804     if (bDoTime)
805     {
806         timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
807     }
808
809     /* With DD the local D2H transfer can only start after the non-local
810        has been launched. */
811     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
812     {
813         nb->nonlocal_done.enqueueWaitEvent(deviceStream);
814         nb->bNonLocalStreamDoneMarked = false;
815     }
816
817     /* DtoH f */
818     if (!stepWork.useGpuFBufferOps)
819     {
820         static_assert(
821                 sizeof(*nbatom->out[0].f.data()) == sizeof(float),
822                 "The host force buffer should be in single precision to match device data size.");
823         copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
824                              &adat->f,
825                              atomsRange.begin(),
826                              atomsRange.size(),
827                              deviceStream,
828                              GpuApiCallBehavior::Async,
829                              bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
830
831         issueClFlushInStream(deviceStream);
832     }
833
834     /* After the non-local D2H is launched the nonlocal_done event can be
835        recorded which signals that the local D2H can proceed. This event is not
836        placed after the non-local kernel because we need the non-local
837        data back first. */
838     if (iloc == InteractionLocality::NonLocal)
839     {
840         nb->nonlocal_done.markEvent(deviceStream);
841         nb->bNonLocalStreamDoneMarked = true;
842     }
843
844     /* only transfer energies in the local stream */
845     if (iloc == InteractionLocality::Local)
846     {
847         /* DtoH fshift when virial is needed */
848         if (stepWork.computeVirial)
849         {
850             static_assert(
851                     sizeof(*nb->nbst.fShift) == sizeof(Float3),
852                     "Sizes of host- and device-side shift vector elements should be the same.");
853             copyFromDeviceBuffer(nb->nbst.fShift,
854                                  &adat->fShift,
855                                  0,
856                                  SHIFTS,
857                                  deviceStream,
858                                  GpuApiCallBehavior::Async,
859                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
860         }
861
862         /* DtoH energies */
863         if (stepWork.computeEnergy)
864         {
865             static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
866                           "Sizes of host- and device-side LJ energy terms should be the same.");
867             copyFromDeviceBuffer(nb->nbst.eLJ,
868                                  &adat->eLJ,
869                                  0,
870                                  1,
871                                  deviceStream,
872                                  GpuApiCallBehavior::Async,
873                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
874             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
875                           "Sizes of host- and device-side electrostatic energy terms should be the "
876                           "same.");
877             copyFromDeviceBuffer(nb->nbst.eElec,
878                                  &adat->eElec,
879                                  0,
880                                  1,
881                                  deviceStream,
882                                  GpuApiCallBehavior::Async,
883                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
884         }
885     }
886
887     if (bDoTime)
888     {
889         timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
890     }
891 }
892
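/*! \brief Depending on the locality, either record the misc-ops and local x+q H2D completion
 *  event in the local stream or make the non-local stream wait on it (a no-op when only one
 *  stream is used).
 */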
893 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
894 {
895     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
896
897     /* When we get here all misc operations issued in the local stream as well as
898        the local xq H2D are done,
899        so we record that in the local stream and wait for it in the nonlocal one.
900        This wait needs to precede any PP tasks, bonded or nonbonded, that may
901        compute on interactions between local and nonlocal atoms.
902      */
903     if (nb->bUseTwoStreams)
904     {
905         if (interactionLocality == InteractionLocality::Local)
906         {
907             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
908             issueClFlushInStream(deviceStream);
909         }
910         else
911         {
912             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
913         }
914     }
915 }
916
917 /*! \brief Asynchronously launch the xq buffer host-to-device copy. */
918 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
919 {
920     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
921
922     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
923
924     NBAtomData*         adat         = nb->atdat;
925     gpu_plist*          plist        = nb->plist[iloc];
926     Nbnxm::GpuTimers*   timers       = nb->timers;
927     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
928
929     const bool bDoTime = nb->bDoTime;
930
931     /* Don't launch the non-local H2D copy if there is no dependent
932        work to do: neither non-local nor other (e.g. bonded) work
933        that has the nbnxn coordinates as input.
934        Doing the same for the local kernel is more complicated, since the
935        local part of the force array also depends on the non-local kernel.
936        So to avoid complicating the code and to reduce the risk of bugs,
937        we always call the local x+q copy (and the rest of the local
938        work in nbnxn_gpu_launch_kernel()).
939      */
940     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
941     {
942         plist->haveFreshList = false;
943
944         // The event is marked for Local interactions unconditionally,
945         // so it has to be released here because of the early return
946         // for NonLocal interactions.
947         nb->misc_ops_and_local_H2D_done.reset();
948
949         return;
950     }
951
952     /* local/nonlocal offset and length used for xq and f */
953     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
954
955     /* beginning of timed HtoD section */
956     if (bDoTime)
957     {
958         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
959     }
960
961     /* HtoD x, q */
962     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
963                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
964     copyToDeviceBuffer(&adat->xq,
965                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
966                        atomsRange.begin(),
967                        atomsRange.size(),
968                        deviceStream,
969                        GpuApiCallBehavior::Async,
970                        nullptr);
971
972     if (bDoTime)
973     {
974         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
975     }
976
977     /* Record in the local stream that the misc operations and the local x+q H2D issued
978        above are done, and make the non-local stream wait for that before any PP tasks
979        that may compute on interactions between local and non-local atoms
980        (see nbnxnInsertNonlocalGpuDependency for details).
981      */
983     nbnxnInsertNonlocalGpuDependency(nb, iloc);
984 }
985
986
987 /* Initialization for X buffer operations on GPU. */
988 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
989 {
990     const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
991     const bool          bDoTime       = gpu_nbv->bDoTime;
992     const int           maxNumColumns = gridSet.numColumnsMax();
993
994     reallocateDeviceBuffer(&gpu_nbv->cxy_na,
995                            maxNumColumns * gridSet.grids().size(),
996                            &gpu_nbv->ncxy_na,
997                            &gpu_nbv->ncxy_na_alloc,
998                            *gpu_nbv->deviceContext_);
999     reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
1000                            maxNumColumns * gridSet.grids().size(),
1001                            &gpu_nbv->ncxy_ind,
1002                            &gpu_nbv->ncxy_ind_alloc,
1003                            *gpu_nbv->deviceContext_);
1004
1005     for (unsigned int g = 0; g < gridSet.grids().size(); g++)
1006     {
1007         const Nbnxm::Grid& grid = gridSet.grids()[g];
1008
1009         const int  numColumns      = grid.numColumns();
1010         const int* atomIndices     = gridSet.atomIndices().data();
1011         const int  atomIndicesSize = gridSet.atomIndices().size();
1012         const int* cxy_na          = grid.cxy_na().data();
1013         const int* cxy_ind         = grid.cxy_ind().data();
1014
1015         auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
1016
1017         reallocateDeviceBuffer(&gpu_nbv->atomIndices,
1018                                atomIndicesSize,
1019                                &gpu_nbv->atomIndicesSize,
1020                                &gpu_nbv->atomIndicesSize_alloc,
1021                                *gpu_nbv->deviceContext_);
1022
1023         if (atomIndicesSize > 0)
1024         {
1025             if (bDoTime)
1026             {
1027                 timerH2D->openTimingRegion(localStream);
1028             }
1029
1030             copyToDeviceBuffer(&gpu_nbv->atomIndices,
1031                                atomIndices,
1032                                0,
1033                                atomIndicesSize,
1034                                localStream,
1035                                GpuApiCallBehavior::Async,
1036                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1037
1038             if (bDoTime)
1039             {
1040                 timerH2D->closeTimingRegion(localStream);
1041             }
1042         }
1043
1044         if (numColumns > 0)
1045         {
1046             if (bDoTime)
1047             {
1048                 timerH2D->openTimingRegion(localStream);
1049             }
1050
1051             copyToDeviceBuffer(&gpu_nbv->cxy_na,
1052                                cxy_na,
1053                                maxNumColumns * g,
1054                                numColumns,
1055                                localStream,
1056                                GpuApiCallBehavior::Async,
1057                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1058
1059             if (bDoTime)
1060             {
1061                 timerH2D->closeTimingRegion(localStream);
1062             }
1063
1064             if (bDoTime)
1065             {
1066                 timerH2D->openTimingRegion(localStream);
1067             }
1068
1069             copyToDeviceBuffer(&gpu_nbv->cxy_ind,
1070                                cxy_ind,
1071                                maxNumColumns * g,
1072                                numColumns,
1073                                localStream,
1074                                GpuApiCallBehavior::Async,
1075                                bDoTime ? timerH2D->fetchNextEvent() : nullptr);
1076
1077             if (bDoTime)
1078             {
1079                 timerH2D->closeTimingRegion(localStream);
1080             }
1081         }
1082     }
1083
1084     // The above data is transferred on the local stream but is a
1085     // dependency of the nonlocal stream (specifically the nonlocal X
1086     // buf ops kernel).  We therefore set a dependency to ensure
1087     // that the nonlocal stream waits on the local stream here.
1088     // This call records an event in the local stream:
1089     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
1090     // ...and this call instructs the nonlocal stream to wait on that event:
1091     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
1092 }
1093
1094 } // namespace Nbnxm