/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#if GMX_GPU_SYCL
#    include "sycl/nbnxm_sycl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

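/*! \brief (Re-)initialize the Ewald Coulomb correction force table on the device.
 *
 * Any previously allocated table is destroyed first, so this is safe to call
 * repeatedly, e.g. when PME load balancing changes the Ewald coefficient.
 */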
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}

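/*! \brief Select the flavour of Ewald GPU kernel to use: analytical vs.
 * tabulated, and single vs. twin cut-off.
 */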
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernels. */
    const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwald  = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
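    /* For example, running with GMX_GPU_NB_TAB_EWALD=1 in the environment
       selects the tabulated kernels even where the analytical ones are the
       default, and GMX_GPU_NB_ANA_EWALD=1 does the opposite; only the
       presence of the variable matters, not its value. */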

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0,
     * where the tabulated kernels perform better.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels when the Coulomb and VdW cut-offs differ
       (bTwinCut) or when the env. var. forces it (the latter is intended for
       debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

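/*! \brief Copy the cut-off, shift, and switching parameters from the CPU-side
 * interaction constants and pair-list parameters into the GPU kernel
 * parameter struct.
 */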
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic->ewaldcoeff_q;
    nbp->sh_ewald          = ic->sh_ewald;
    nbp->epsfac            = ic->epsfac;
    nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
    nbp->c_rf              = ic->reactionFieldShift;
    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}

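/*! \brief Update the GPU kernel parameters after PME load balancing has
 * changed the Coulomb cut-off and Ewald coefficient; a no-op when GPU
 * non-bondeds are not in use.
 */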
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }
    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}

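/*! \brief Initialize the GPU pair-list data structure to an empty,
 * unallocated state.
 */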
void init_plist(gpu_plist* pl)
{
    /* Initialize to nullptr the pointers to data that are not allocated here;
       they will be (re)allocated in gpu_init_pairlist. */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c                   = -1;
    pl->nsci                   = -1;
    pl->sci_nalloc             = -1;
    pl->ncj4                   = -1;
    pl->cj4_nalloc             = -1;
    pl->nimask                 = -1;
    pl->imask_nalloc           = -1;
    pl->nexcl                  = -1;
    pl->excl_nalloc            = -1;
    pl->haveFreshList          = false;
    pl->rollingPruningNumParts = 0;
    pl->rollingPruningPart     = 0;
}

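//! \brief Zero-initialize all GPU non-bonded timing counters and accumulators.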
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Timing accumulation should happen only if there was work to do
    // because getLastRangeTime() gets skipped with empty lists later
    // which leads to the counter not being reset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In init_plist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO most of this function is the same in CUDA and OpenCL, move it into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

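    /* Note that imask is only reallocated here; unlike the other buffers, no
       host data is copied into it, as it is populated on the device by the
       pair-list pruning kernels. */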
    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

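/*! \brief Map the CPU-side electrostatics interaction type onto the
 * corresponding GPU kernel flavour, deferring to
 * nbnxn_gpu_pick_ewald_kernel_type() for Ewald/PME electrostatics.
 */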
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic->eeltype == eelCUT)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic->eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s (%d) is not implemented in "
                                  "the GPU accelerated kernels!",
                                  EELTYPE(ic->eeltype),
                                  ic->eeltype)));
    }
}


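/*! \brief Map the CPU-side Van der Waals interaction type, interaction
 * modifier, and LJ combination rule onto the corresponding GPU kernel flavour.
 */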
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
{
    if (ic->vdwtype == evdwCUT)
    {
        switch (ic->vdw_modifier)
        {
            case eintmodNONE:
            case eintmodPOTSHIFT:
                switch (ljCombinationRule)
                {
                    case LJCombinationRule::None: return VdwType::Cut;
                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s is not implemented in "
                                "the GPU accelerated kernels!",
                                enumValueToString(ljCombinationRule))));
                }
            case eintmodFORCESWITCH: return VdwType::FSwitch;
            case eintmodPOTSWITCH: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s (%d) is not "
                                          "implemented in the GPU accelerated kernels!",
                                          INTMODIFIER(ic->vdw_modifier),
                                          ic->vdw_modifier)));
        }
    }
    else if (ic->vdwtype == evdwPME)
    {
        if (ic->ljpme_comb_rule == eljpmeGEOM)
        {
            assert(ljCombinationRule == LJCombinationRule::Geometric);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s (%d) is not implemented in the GPU accelerated kernels!",
                EVDWTYPE(ic->vdwtype),
                ic->vdwtype)));
    }
}

} // namespace Nbnxm