src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  *  \ingroup module_nbnxm
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "config.h"
  50
  51 #if GMX_GPU_CUDA
  52 #    include "cuda/nbnxm_cuda_types.h"
  53 #endif
  54
  55 #if GMX_GPU_OPENCL
  56 #    include "opencl/nbnxm_ocl_types.h"
  57 #endif
  58
  59 #if GMX_GPU_SYCL
  60 #    include "sycl/nbnxm_sycl_types.h"
  61 #endif
  62
  63 #include "nbnxm_gpu_data_mgmt.h"
  64
  65 #include "gromacs/hardware/device_information.h"
  66 #include "gromacs/mdtypes/interaction_const.h"
  67 #include "gromacs/nbnxm/gpu_common_utils.h"
  68 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  69 #include "gromacs/timing/gpu_timing.h"
  70 #include "gromacs/utility/cstringutil.h"
  71 #include "gromacs/utility/fatalerror.h"
  72
  73 #include "nbnxm_gpu.h"
  74 #include "pairlistsets.h"
  75
  76 namespace Nbnxm
  77 {
  78
  79 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
  80                                     NBParamGpu*                  nbp,
  81                                     const DeviceContext&         deviceContext)
  82 {
  83     if (nbp->coulomb_tab)
  84     {
  85         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
  86     }
  87
  88     nbp->coulomb_tab_scale = tables.scale;
  89     initParamLookupTable(
  90             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
  91 }
  92
  93 enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
  94                                                const DeviceInformation gmx_unused& deviceInfo)
  95 {
  96     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
  97
  98     /* Benchmarking/development environment variables to force the use of
  99        analytical or tabulated Ewald kernel. */
 100     const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
 101     const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
 102     const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
 103
 104     if (forceAnalyticalEwald && forceTabulatedEwald)
 105     {
 106         gmx_incons(
 107                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
 108                 "requested through environment variables.");
 109     }
 110
 111     /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
 112      */
 113     const bool c_useTabulatedEwaldDefault =
 114 #if GMX_GPU_CUDA
 115             (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
 116             || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
 117 #else
 118             false;
 119 #endif
 120     bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
 121     if (forceAnalyticalEwald)
 122     {
 123         bUseAnalyticalEwald = true;
 124         if (debug)
 125         {
 126             fprintf(debug, "Using analytical Ewald GPU kernels\n");
 127         }
 128     }
 129     else if (forceTabulatedEwald)
 130     {
 131         bUseAnalyticalEwald = false;
 132
 133         if (debug)
 134         {
 135             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
 136         }
 137     }
 138
 139     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 140        forces it (use it for debugging/benchmarking only). */
 141     if (!bTwinCut && !forceTwinCutoffEwald)
 142     {
 143         return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
 144     }
 145     else
 146     {
 147         return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
 148     }
 149 }
 150
 151 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
 152 {
 153     nbp->ewald_beta        = ic->ewaldcoeff_q;
 154     nbp->sh_ewald          = ic->sh_ewald;
 155     nbp->epsfac            = ic->epsfac;
 156     nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
 157     nbp->c_rf              = ic->reactionFieldShift;
 158     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 159     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 160     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
 161     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
 162     nbp->useDynamicPruning = listParams.useDynamicPruning;
 163
 164     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
 165     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
 166
 167     nbp->rvdw_switch      = ic->rvdw_switch;
 168     nbp->dispersion_shift = ic->dispersion_shift;
 169     nbp->repulsion_shift  = ic->repulsion_shift;
 170     nbp->vdw_switch       = ic->vdw_switch;
 171 }
 172
 173 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
 174 {
 175     if (!nbv || !nbv->useGpu())
 176     {
 177         return;
 178     }
 179     NbnxmGpu*   nb  = nbv->gpu_nbv;
 180     NBParamGpu* nbp = nb->nbparam;
 181
 182     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 183
 184     nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
 185
 186     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 187     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 188 }
 189
 190 void init_plist(gpu_plist* pl)
 191 {
 192     /* initialize to nullptr pointers to data that is not allocated here and will
 193        need reallocation in nbnxn_gpu_init_pairlist */
 194     pl->sci   = nullptr;
 195     pl->cj4   = nullptr;
 196     pl->imask = nullptr;
 197     pl->excl  = nullptr;
 198
 199     /* size -1 indicates that the respective array hasn't been initialized yet */
 200     pl->na_c                   = -1;
 201     pl->nsci                   = -1;
 202     pl->sci_nalloc             = -1;
 203     pl->ncj4                   = -1;
 204     pl->cj4_nalloc             = -1;
 205     pl->nimask                 = -1;
 206     pl->imask_nalloc           = -1;
 207     pl->nexcl                  = -1;
 208     pl->excl_nalloc            = -1;
 209     pl->haveFreshList          = false;
 210     pl->rollingPruningNumParts = 0;
 211     pl->rollingPruningPart     = 0;
 212 }
 213
 214 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 215 {
 216     t->nb_h2d_t = 0.0;
 217     t->nb_d2h_t = 0.0;
 218     t->nb_c     = 0;
 219     t->pl_h2d_t = 0.0;
 220     t->pl_h2d_c = 0;
 221     for (int i = 0; i < 2; i++)
 222     {
 223         for (int j = 0; j < 2; j++)
 224         {
 225             t->ktime[i][j].t = 0.0;
 226             t->ktime[i][j].c = 0;
 227         }
 228     }
 229     t->pruneTime.c        = 0;
 230     t->pruneTime.t        = 0.0;
 231     t->dynamicPruneTime.c = 0;
 232     t->dynamicPruneTime.t = 0.0;
 233 }
 234
 235 //! This function is documented in the header file
 236 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 237 {
 238     char sbuf[STRLEN];
 239     // Timing accumulation should happen only if there was work to do
 240     // because getLastRangeTime() gets skipped with empty lists later
 241     // which leads to the counter not being reset.
 242     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
 243     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 244     gpu_plist*          d_plist      = nb->plist[iloc];
 245
 246     if (d_plist->na_c < 0)
 247     {
 248         d_plist->na_c = h_plist->na_ci;
 249     }
 250     else
 251     {
 252         if (d_plist->na_c != h_plist->na_ci)
 253         {
 254             sprintf(sbuf,
 255                     "In init_plist: the #atoms per cell has changed (from %d to %d)",
 256                     d_plist->na_c,
 257                     h_plist->na_ci);
 258             gmx_incons(sbuf);
 259         }
 260     }
 261
 262     GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
 263
 264     if (bDoTime)
 265     {
 266         iTimers.pl_h2d.openTimingRegion(deviceStream);
 267         iTimers.didPairlistH2D = true;
 268     }
 269
 270     // TODO most of this function is same in CUDA and OpenCL, move into the header
 271     const DeviceContext& deviceContext = *nb->deviceContext_;
 272
 273     reallocateDeviceBuffer(
 274             &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
 275     copyToDeviceBuffer(&d_plist->sci,
 276                        h_plist->sci.data(),
 277                        0,
 278                        h_plist->sci.size(),
 279                        deviceStream,
 280                        GpuApiCallBehavior::Async,
 281                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 282
 283     reallocateDeviceBuffer(
 284             &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
 285     copyToDeviceBuffer(&d_plist->cj4,
 286                        h_plist->cj4.data(),
 287                        0,
 288                        h_plist->cj4.size(),
 289                        deviceStream,
 290                        GpuApiCallBehavior::Async,
 291                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 292
 293     reallocateDeviceBuffer(&d_plist->imask,
 294                            h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
 295                            &d_plist->nimask,
 296                            &d_plist->imask_nalloc,
 297                            deviceContext);
 298
 299     reallocateDeviceBuffer(
 300             &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
 301     copyToDeviceBuffer(&d_plist->excl,
 302                        h_plist->excl.data(),
 303                        0,
 304                        h_plist->excl.size(),
 305                        deviceStream,
 306                        GpuApiCallBehavior::Async,
 307                        bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 308
 309     if (bDoTime)
 310     {
 311         iTimers.pl_h2d.closeTimingRegion(deviceStream);
 312     }
 313
 314     /* need to prune the pair list during the next step */
 315     d_plist->haveFreshList = true;
 316 }
 317
 318 //! This function is documented in the header file
 319 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 320 {
 321     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 322 }
 323
 324 //! This function is documented in the header file
 325 void gpu_reset_timings(nonbonded_verlet_t* nbv)
 326 {
 327     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
 328     {
 329         init_timings(nbv->gpu_nbv->timings);
 330     }
 331 }
 332
 333 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
 334 {
 335     return ((nb->nbparam->elecType == ElecType::EwaldAna)
 336             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
 337 }
 338
 339 enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
 340                                                    const DeviceInformation&   deviceInfo)
 341 {
 342     if (ic->eeltype == CoulombInteractionType::Cut)
 343     {
 344         return ElecType::Cut;
 345     }
 346     else if (EEL_RF(ic->eeltype))
 347     {
 348         return ElecType::RF;
 349     }
 350     else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
 351     {
 352         return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
 353     }
 354     else
 355     {
 356         /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
 357         GMX_THROW(gmx::InconsistentInputError(
 358                 gmx::formatString("The requested electrostatics type %s is not implemented in "
 359                                   "the GPU accelerated kernels!",
 360                                   enumValueToString(ic->eeltype))));
 361     }
 362 }
 363
 364
 365 enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
 366 {
 367     if (ic->vdwtype == VanDerWaalsType::Cut)
 368     {
 369         switch (ic->vdw_modifier)
 370         {
 371             case InteractionModifiers::None:
 372             case InteractionModifiers::PotShift:
 373                 switch (ljCombinationRule)
 374                 {
 375                     case LJCombinationRule::None: return VdwType::Cut;
 376                     case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
 377                     case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
 378                     default:
 379                         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 380                                 "The requested LJ combination rule %s is not implemented in "
 381                                 "the GPU accelerated kernels!",
 382                                 enumValueToString(ljCombinationRule))));
 383                 }
 384             case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
 385             case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
 386             default:
 387                 GMX_THROW(gmx::InconsistentInputError(
 388                         gmx::formatString("The requested VdW interaction modifier %s is not "
 389                                           "implemented in the GPU accelerated kernels!",
 390                                           enumValueToString(ic->vdw_modifier))));
 391         }
 392     }
 393     else if (ic->vdwtype == VanDerWaalsType::Pme)
 394     {
 395         if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
 396         {
 397             assert(ljCombinationRule == LJCombinationRule::Geometric);
 398             return VdwType::EwaldGeom;
 399         }
 400         else
 401         {
 402             assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
 403             return VdwType::EwaldLB;
 404         }
 405     }
 406     else
 407     {
 408         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
 409                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
 410                 enumValueToString(ic->vdwtype))));
 411     }
 412 }
 413
 414 void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
 415 {
 416     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 417
 418     // There is short-range work if the pair list for the provided
 419     // interaction locality contains entries or if there is any
 420     // bonded work (as this is not split into local/nonlocal).
 421     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
 422                                || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
 423 }
 424
 425 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality)
 426 {
 427     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 428
 429     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 430 }
 431
 432 /*! \brief Launch asynchronously the xq buffer host to device copy. */
 433 void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
 434 {
 435     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 436
 437     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 438
 439     NBAtomData*         adat         = nb->atdat;
 440     gpu_plist*          plist        = nb->plist[iloc];
 441     Nbnxm::GpuTimers*   timers       = nb->timers;
 442     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 443
 444     const bool bDoTime = nb->bDoTime;
 445
 446     /* Don't launch the non-local H2D copy if there is no dependent
 447        work to do: neither non-local nor other (e.g. bonded) work
 448        to do that has as input the nbnxn coordaintes.
 449        Doing the same for the local kernel is more complicated, since the
 450        local part of the force array also depends on the non-local kernel.
 451        So to avoid complicating the code and to reduce the risk of bugs,
 452        we always call the local local x+q copy (and the rest of the local
 453        work in nbnxn_gpu_launch_kernel().
 454      */
 455     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
 456     {
 457         plist->haveFreshList = false;
 458
 459         // The event is marked for Local interactions unconditionally,
 460         // so it has to be released here because of the early return
 461         // for NonLocal interactions.
 462         nb->misc_ops_and_local_H2D_done.reset();
 463
 464         return;
 465     }
 466
 467     /* local/nonlocal offset and length used for xq and f */
 468     const auto atomsRange = getGpuAtomRange(adat, atomLocality);
 469
 470     /* beginning of timed HtoD section */
 471     if (bDoTime)
 472     {
 473         timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
 474     }
 475
 476     /* HtoD x, q */
 477     GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
 478                "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
 479     copyToDeviceBuffer(&adat->xq,
 480                        reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
 481                        atomsRange.begin(),
 482                        atomsRange.size(),
 483                        deviceStream,
 484                        GpuApiCallBehavior::Async,
 485                        nullptr);
 486
 487     if (bDoTime)
 488     {
 489         timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
 490     }
 491
 492     /* When we get here all misc operations issued in the local stream as well as
 493        the local xq H2D are done,
 494        so we record that in the local stream and wait for it in the nonlocal one.
 495        This wait needs to precede any PP tasks, bonded or nonbonded, that may
 496        compute on interactions between local and nonlocal atoms.
 497      */
 498     nbnxnInsertNonlocalGpuDependency(nb, iloc);
 499 }
 500
 501 } // namespace Nbnxm