src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  *  \ingroup module_nbnxm
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "config.h"
  50
  51 #if GMX_GPU_CUDA
  52 #    include "cuda/nbnxm_cuda_types.h"
  53 #endif
  54
  55 #if GMX_GPU_OPENCL
  56 #    include "opencl/nbnxm_ocl_types.h"
  57 #endif
  58
  59 #if GMX_GPU_SYCL
  60 #    include "sycl/nbnxm_sycl_types.h"
  61 #endif
  62
  63 #include "nbnxm_gpu_data_mgmt.h"
  64
  65 #include "gromacs/hardware/device_information.h"
  66 #include "gromacs/mdtypes/interaction_const.h"
  67 #include "gromacs/nbnxm/gpu_common_utils.h"
  68 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  69 #include "gromacs/timing/gpu_timing.h"
  70 #include "gromacs/utility/cstringutil.h"
  71 #include "gromacs/utility/exceptions.h"
  72 #include "gromacs/utility/fatalerror.h"
  73
  74 #include "nbnxm_gpu.h"
  75 #include "pairlistsets.h"
  76
  77 namespace Nbnxm
  78 {
  79
  80 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
  81                                     NBParamGpu*                  nbp,
  82                                     const DeviceContext&         deviceContext)
  83 {
  84     if (nbp->coulomb_tab)
  85     {
  86         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
  87     }
  88
  89     nbp->coulomb_tab_scale = tables.scale;
  90     initParamLookupTable(
  91             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
  92 }
  93
  94 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
  95                                                const DeviceInformation gmx_unused& deviceInfo)
  96 {
  97     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
  98
  99     /* Benchmarking/development environment variables to force the use of
 100        analytical or tabulated Ewald kernel. */
 101     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
 102     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
 103     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
 104
 105     if (forceAnalyticalEwald && forceTabulatedEwald)
 106     {
 107         gmx_incons(
 108                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
 109                 "requested through environment variables.");
 110     }
 111
 112     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
 113      */
 114     const bool c_useTabulatedEwaldDefault =
 115 #if GMX_GPU_CUDA
 116             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
 117             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
 118 #else
 119             false;
 120 #endif
 121     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
 122     if (forceAnalyticalEwald)
 123     {
 124         bUseAnalyticalEwald = true;
 125         if (debug)
 126         {
 127             fprintf(debug, "Using analytical Ewald GPU kernels\n");
 128         }
 129     }
 130     else if (forceTabulatedEwald)
 131     {
 132         bUseAnalyticalEwald = false;
 133
 134         if (debug)
 135         {
 136             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
 137         }
 138     }
 139
 140     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 141        forces it (use it for debugging/benchmarking only). */
 142     if (!bTwinCut && !forceTwinCutoffEwald)
 143     {
 144         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
 145     }
 146     else
 147     {
 148         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
 149     }
 150 }
 151
 152 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
 153 {
 154     nbp->ewald_beta        = ic->ewaldcoeff_q;
 155     nbp->sh_ewald          = ic->sh_ewald;
 156     nbp->epsfac            = ic->epsfac;
 157     nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
 158     nbp->c_rf              = ic->reactionFieldShift;
 159     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 160     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 161     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
 162     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
 163     nbp->useDynamicPruning = listParams.useDynamicPruning;
 164
 165     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
 166     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
 167
 168     nbp->rvdw_switch      = ic->rvdw_switch;
 169     nbp->dispersion_shift = ic->dispersion_shift;
 170     nbp->repulsion_shift  = ic->repulsion_shift;
 171     nbp->vdw_switch       = ic->vdw_switch;
 172 }
 173
 174 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
 175 {
 176     if (!nbv || !nbv->useGpu())
 177     {
 178         return;
 179     }
 180     NbnxmGpu*   nb  = nbv->gpu_nbv;
 181     NBParamGpu* nbp = nb->nbparam;
 182
 183     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 184
 185     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
 186
 187     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 188     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 189 }
 190
 191 void init_plist(gpu_plist* pl)
 192 {
 193     /* initialize to nullptr pointers to data that is not allocated here and will
 194        need reallocation in nbnxn_gpu_init_pairlist */
 195     pl->sci   = nullptr;
 196     pl->cj4   = nullptr;
 197     pl->imask = nullptr;
 198     pl->excl  = nullptr;
 199
 200     /* size -1 indicates that the respective array hasn't been initialized yet */
 201     pl->na_c                   = -1;
 202     pl->nsci                   = -1;
 203     pl->sci_nalloc             = -1;
 204     pl->ncj4                   = -1;
 205     pl->cj4_nalloc             = -1;
 206     pl->nimask                 = -1;
 207     pl->imask_nalloc           = -1;
 208     pl->nexcl                  = -1;
 209     pl->excl_nalloc            = -1;
 210     pl->haveFreshList          = false;
 211     pl->rollingPruningNumParts = 0;
 212     pl->rollingPruningPart     = 0;
 213 }
 214
 215 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 216 {
 217     t->nb_h2d_t = 0.0;
 218     t->nb_d2h_t = 0.0;
 219     t->nb_c     = 0;
 220     t->pl_h2d_t = 0.0;
 221     t->pl_h2d_c = 0;
 222     for (int i = 0; i < 2; i++)
 223     {
 224         for (int j = 0; j < 2; j++)
 225         {
 226             t->ktime[i][j].t = 0.0;
 227             t->ktime[i][j].c = 0;
 228         }
 229     }
 230     t->pruneTime.c        = 0;
 231     t->pruneTime.t        = 0.0;
 232     t->dynamicPruneTime.c = 0;
 233     t->dynamicPruneTime.t = 0.0;
 234 }
 235
 236 //! This function is documented in the header file
 237 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 238 {
 239     char sbuf[STRLEN];
 240     // Timing accumulation should happen only if there was work to do
 241     // because getLastRangeTime() gets skipped with empty lists later
 242     // which leads to the counter not being reset.
 243     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
 244     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 245     gpu_plist*          d_plist      = nb->plist[iloc];
 246
 247     if (d_plist->na_c < 0)
 248     {
 249         d_plist->na_c = h_plist->na_ci;
 250     }
 251     else
 252     {
 253         if (d_plist->na_c != h_plist->na_ci)
 254         {
 255             sprintf(sbuf,
 256                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
 257                     d_plist->na_c,
 258                     h_plist->na_ci);
 259             gmx_incons(sbuf);
 260         }
 261     }
 262
 263     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
 264
 265     if (bDoTime)
 266     {
 267         iTimers.pl_h2d.openTimingRegion(deviceStream);
 268         iTimers.didPairlistH2D = true;
 269     }
 270
 271     // TODO most of this function is same in CUDA and OpenCL, move into the header
 272     const DeviceContext& deviceContext = *nb->deviceContext_;
 273
 274     reallocateDeviceBuffer(
 275             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
 276     copyToDeviceBuffer(&d_plist->sci,
 277                        h_plist->sci.data(),
 278                        0,
 279                        h_plist->sci.size(),
 280                        deviceStream,
 281                        GpuApiCallBehavior::Async,
 282                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 283
 284     reallocateDeviceBuffer(
 285             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
 286     copyToDeviceBuffer(&d_plist->cj4,
 287                        h_plist->cj4.data(),
 288                        0,
 289                        h_plist->cj4.size(),
 290                        deviceStream,
 291                        GpuApiCallBehavior::Async,
 292                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 293
 294     reallocateDeviceBuffer(&d_plist->imask,
 295                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
 296                            &d_plist->nimask,
 297                            &d_plist->imask_nalloc,
 298                            deviceContext);
 299
 300     reallocateDeviceBuffer(
 301             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
 302     copyToDeviceBuffer(&d_plist->excl,
 303                        h_plist->excl.data(),
 304                        0,
 305                        h_plist->excl.size(),
 306                        deviceStream,
 307                        GpuApiCallBehavior::Async,
 308                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 309
 310     if (bDoTime)
 311     {
 312         iTimers.pl_h2d.closeTimingRegion(deviceStream);
 313     }
 314
 315     /* need to prune the pair list during the next step */
 316     d_plist->haveFreshList = true;
 317 }
 318
 319 //! This function is documented in the header file
 320 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 321 {
 322     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 323 }
 324
 325 //! This function is documented in the header file
 326 void gpu_reset_timings(nonbonded_verlet_t* nbv)
 327 {
 328     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
 329     {
 330         init_timings(nbv->gpu_nbv->timings);
 331     }
 332 }
 333
 334 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
 335 {
 336     return ((nb->nbparam->elecType == ElecType::EwaldAna)
 337             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
 338 }
 339
 340 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
 341                                                    const DeviceInformation&   deviceInfo)
 342 {
 343     if (ic->eeltype == CoulombInteractionType::Cut)
 344     {
 345         return ElecType::Cut;
 346     }
 347     else if (EEL_RF(ic->eeltype))
 348     {
 349         return ElecType::RF;
 350     }
 351     else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
 352     {
 353         return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
 354     }
 355     else
 356     {
 357         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 358         GMX_THROW(gmx::InconsistentInputError(
 359                 gmx::formatString("The requested electrostatics type %s is not implemented in "
 360                                   "the GPU accelerated kernels!",
 361                                   enumValueToString(ic->eeltype))));
 362     }
 363 }
 364
 365
 366 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
 367 {
 368     if (ic->vdwtype == VanDerWaalsType::Cut)
 369     {
 370         switch (ic->vdw_modifier)
 371         {
 372             case InteractionModifiers::None:
 373             case InteractionModifiers::PotShift:
 374                 switch (ljCombinationRule)
 375                 {
 376                     case LJCombinationRule::None: return VdwType::Cut;
 377                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
 378                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
 379                     default:
 380                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 381                                 "The requested LJ combination rule %s is not implemented in "
 382                                 "the GPU accelerated kernels!",
 383                                 enumValueToString(ljCombinationRule))));
 384                 }
 385             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
 386             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
 387             default:
 388                 GMX_THROW(gmx::InconsistentInputError(
 389                         gmx::formatString("The requested VdW interaction modifier %s is not "
 390                                           "implemented in the GPU accelerated kernels!",
 391                                           enumValueToString(ic->vdw_modifier))));
 392         }
 393     }
 394     else if (ic->vdwtype == VanDerWaalsType::Pme)
 395     {
 396         if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
 397         {
 398             assert(ljCombinationRule == LJCombinationRule::Geometric);
 399             return VdwType::EwaldGeom;
 400         }
 401         else
 402         {
 403             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
 404             return VdwType::EwaldLB;
 405         }
 406     }
 407     else
 408     {
 409         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 410                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
 411                 enumValueToString(ic->vdwtype))));
 412     }
 413 }
 414
 415 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
 416 {
 417     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 418
 419     // There is short-range work if the pair list for the provided
 420     // interaction locality contains entries or if there is any
 421     // bonded work (as this is not split into local/nonlocal).
 422     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
 423                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 424 }
 425
 426 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
 427 {
 428     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 429
 430     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 431 }
 432
 433 inline void issueClFlushInStream(const DeviceStream& gmx_unused deviceStream)
 434 {
 435 #if GMX_GPU_OPENCL
 436     /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
 437      * in the stream after marking an event in it in order to be able to sync with
 438      * the event from another stream.
 439      */
 440     cl_int cl_error = clFlush(deviceStream.stream());
 441     if (cl_error != CL_SUCCESS)
 442     {
 443         GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
 444     }
 445 #endif
 446 }
 447
 448 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 449 {
 450     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 451
 452     /* When we get here all misc operations issued in the local stream as well as
 453        the local xq H2D are done,
 454        so we record that in the local stream and wait for it in the nonlocal one.
 455        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 456        compute on interactions between local and nonlocal atoms.
 457      */
 458     if (nb->bUseTwoStreams)
 459     {
 460         if (interactionLocality == InteractionLocality::Local)
 461         {
 462             nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
 463             issueClFlushInStream(deviceStream);
 464         }
 465         else
 466         {
 467             nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
 468         }
 469     }
 470 }
 471
 472 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 473 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 474 {
 475     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 476
 477     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 478
 479     NBAtomData*         adat         = nb->atdat;
 480     gpu_plist*          plist        = nb->plist[iloc];
 481     Nbnxm::GpuTimers*   timers       = nb->timers;
 482     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 483
 484     const bool bDoTime = nb->bDoTime;
 485
 486     /* Don't launch the non-local H2D copy if there is no dependent
 487        work to do: neither non-local nor other (e.g. bonded) work
 488        to do that has as input the nbnxn coordaintes.
 489        Doing the same for the local kernel is more complicated, since the
 490        local part of the force array also depends on the non-local kernel.
 491        So to avoid complicating the code and to reduce the risk of bugs,
 492        we always call the local local x+q copy (and the rest of the local
 493        work in nbnxn_gpu_launch_kernel().
 494      */
 495     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 496     {
 497         plist->haveFreshList = false;
 498
 499         // The event is marked for Local interactions unconditionally,
 500         // so it has to be released here because of the early return
 501         // for NonLocal interactions.
 502         nb->misc_ops_and_local_H2D_done.reset();
 503
 504         return;
 505     }
 506
 507     /* local/nonlocal offset and length used for xq and f */
 508     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
 509
 510     /* beginning of timed HtoD section */
 511     if (bDoTime)
 512     {
 513         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
 514     }
 515
 516     /* HtoD x, q */
 517     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
 518                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
 519     copyToDeviceBuffer(&adat->xq,
 520                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
 521                        atomsRange.begin(),
 522                        atomsRange.size(),
 523                        deviceStream,
 524                        GpuApiCallBehavior::Async,
 525                        nullptr);
 526
 527     if (bDoTime)
 528     {
 529         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
 530     }
 531
 532     /* When we get here all misc operations issued in the local stream as well as
 533        the local xq H2D are done,
 534        so we record that in the local stream and wait for it in the nonlocal one.
 535        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 536        compute on interactions between local and nonlocal atoms.
 537      */
 538     nbnxnInsertNonlocalGpuDependency(nb, iloc);
 539 }
 540
 541 } // namespace Nbnxm