/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
 *
 *  \author Anca Hamuraru <anca@streamcomputing.eu>
 *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
 *  \author Teemu Virolainen <teemu@streamcomputing.eu>
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \author Artem Zhmurov <zhmurov@gmail.com>
 *
 *  \ingroup module_nbnxm
 */
#include "gmxpre.h"

#include "config.h"

#include <cassert>
#include <cstdio>
#include <cstdlib>

#include <string>

#if GMX_GPU_CUDA
#    include "cuda/nbnxm_cuda_types.h"
#endif

#if GMX_GPU_OPENCL
#    include "opencl/nbnxm_ocl_types.h"
#endif

#include "nbnxm_gpu_data_mgmt.h"

#include "gromacs/hardware/device_information.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"

#include "nbnxm_gpu.h"
#include "pairlistsets.h"

namespace Nbnxm
{

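/*! \brief Initializes the Ewald Coulomb correction force table on the device,
 *  releasing any previously uploaded table first.
 */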
void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
                                    NBParamGpu*                  nbp,
                                    const DeviceContext&         deviceContext)
{
    if (nbp->coulomb_tab)
    {
        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
    }

    nbp->coulomb_tab_scale = tables.scale;
    initParamLookupTable(
            &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}

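/*! \brief Warns on stderr when a deprecated GMX_CUDA_/GMX_OCL_ environment variable is set. */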
inline void printEnvironmentVariableDeprecationMessage(bool               isEnvironmentVariableSet,
                                                       const std::string& environmentVariableSuffix)
{
    if (isEnvironmentVariableSet)
    {
        fprintf(stderr,
                "Environment variables GMX_CUDA_%s and GMX_OCL_%s are deprecated and will be\n"
                "removed in release 2022, please use GMX_GPU_%s instead.\n",
                environmentVariableSuffix.c_str(),
                environmentVariableSuffix.c_str(),
                environmentVariableSuffix.c_str());
    }
}

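/*! \brief Picks the flavor of Ewald GPU kernel (analytical/tabulated, single/twin cut-off)
 *  based on the interaction settings, the device, and any override environment variables.
 */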
enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
                                               const DeviceInformation gmx_unused& deviceInfo)
{
    bool bTwinCut = (ic.rcoulomb != ic.rvdw);

    /* Benchmarking/development environment variables to force the use of
       analytical or tabulated Ewald kernel. */

    // Remove these once the deprecated GMX_CUDA_/GMX_OCL_ environment variables are removed
    const bool forceAnalyticalEwaldLegacy = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr)
                                            || (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
    const bool forceTabulatedEwaldLegacy = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr)
                                           || (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
    const bool forceTwinCutoffEwaldLegacy = (getenv("GMX_CUDA_NB_EWALD_TWINCUT") != nullptr)
                                            || (getenv("GMX_OCL_NB_EWALD_TWINCUT") != nullptr);

    printEnvironmentVariableDeprecationMessage(forceAnalyticalEwaldLegacy, "NB_ANA_EWALD");
    printEnvironmentVariableDeprecationMessage(forceTabulatedEwaldLegacy, "NB_TAB_EWALD");
    printEnvironmentVariableDeprecationMessage(forceTwinCutoffEwaldLegacy, "NB_EWALD_TWINCUT");

    const bool forceAnalyticalEwald =
            (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr) || forceAnalyticalEwaldLegacy;
    const bool forceTabulatedEwald =
            (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr) || forceTabulatedEwaldLegacy;
    const bool forceTwinCutoffEwald =
            (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr) || forceTwinCutoffEwaldLegacy;

    if (forceAnalyticalEwald && forceTabulatedEwald)
    {
        gmx_incons(
                "Both analytical and tabulated Ewald GPU non-bonded kernels "
                "requested through environment variables.");
    }

    /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
     */
    const bool c_useTabulatedEwaldDefault =
#if GMX_GPU_CUDA
            (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
            || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
#else
            false;
#endif
    bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
    if (forceAnalyticalEwald)
    {
        bUseAnalyticalEwald = true;
        if (debug)
        {
            fprintf(debug, "Using analytical Ewald GPU kernels\n");
        }
    }
    else if (forceTabulatedEwald)
    {
        bUseAnalyticalEwald = false;

        if (debug)
        {
            fprintf(debug, "Using tabulated Ewald GPU kernels\n");
        }
    }

    /* Use twin cut-off kernels when required by the cut-offs (bTwinCut) or when the
       env. var. forces it (use the latter for debugging/benchmarking only). */
    if (!bTwinCut && !forceTwinCutoffEwald)
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
    }
    else
    {
        return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
    }
}

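/*! \brief Copies the cut-off and potential-modifier parameters from the host interaction
 *  constants and pairlist parameters into the device parameter struct.
 */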
void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
{
    nbp->ewald_beta        = ic->ewaldcoeff_q;
    nbp->sh_ewald          = ic->sh_ewald;
    nbp->epsfac            = ic->epsfac;
    nbp->two_k_rf          = 2.0 * ic->k_rf;
    nbp->c_rf              = ic->c_rf;
    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
    nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
    nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
    nbp->useDynamicPruning = listParams.useDynamicPruning;

    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;

    nbp->rvdw_switch      = ic->rvdw_switch;
    nbp->dispersion_shift = ic->dispersion_shift;
    nbp->repulsion_shift  = ic->repulsion_shift;
    nbp->vdw_switch       = ic->vdw_switch;
}

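/*! \brief Updates the GPU electrostatics parameters after a PME load-balancing step:
 *  refreshes the cut-offs, re-picks the Ewald kernel flavor, and re-uploads the
 *  Coulomb correction table.
 */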
void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
{
    if (!nbv || !nbv->useGpu())
    {
        return;
    }
    NbnxmGpu*   nb  = nbv->gpu_nbv;
    NBParamGpu* nbp = nb->nbparam;

    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());

    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());

    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
}

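/*! \brief Resets a device pairlist to an empty, unallocated state. */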
void init_plist(gpu_plist* pl)
{
    /* Initialize to nullptr the pointers to data that are not allocated here and
       will need reallocation in nbnxn_gpu_init_pairlist */
    pl->sci   = nullptr;
    pl->cj4   = nullptr;
    pl->imask = nullptr;
    pl->excl  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    pl->na_c          = -1;
    pl->nsci          = -1;
    pl->sci_nalloc    = -1;
    pl->ncj4          = -1;
    pl->cj4_nalloc    = -1;
    pl->nimask        = -1;
    pl->imask_nalloc  = -1;
    pl->nexcl         = -1;
    pl->excl_nalloc   = -1;
    pl->haveFreshList = false;
}

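/*! \brief Zeroes all non-bonded GPU timing counters and accumulators. */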
void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
    t->nb_h2d_t = 0.0;
    t->nb_d2h_t = 0.0;
    t->nb_c     = 0;
    t->pl_h2d_t = 0.0;
    t->pl_h2d_c = 0;
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            t->ktime[i][j].t = 0.0;
            t->ktime[i][j].c = 0;
        }
    }
    t->pruneTime.c        = 0;
    t->pruneTime.t        = 0.0;
    t->dynamicPruneTime.c = 0;
    t->dynamicPruneTime.t = 0.0;
}

//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
    char sbuf[STRLEN];
    // Accumulate timings only when there was work to do: with an empty list,
    // getLastRangeTime() is skipped later, which would leave the counter unreset.
    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
    gpu_plist*          d_plist      = nb->plist[iloc];

    if (d_plist->na_c < 0)
    {
        d_plist->na_c = h_plist->na_ci;
    }
    else
    {
        if (d_plist->na_c != h_plist->na_ci)
        {
            sprintf(sbuf,
                    "In gpu_init_pairlist: the #atoms per cell has changed (from %d to %d)",
                    d_plist->na_c,
                    h_plist->na_ci);
            gmx_incons(sbuf);
        }
    }

    gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];

    if (bDoTime)
    {
        iTimers.pl_h2d.openTimingRegion(deviceStream);
        iTimers.didPairlistH2D = true;
    }

    // TODO: most of this function is the same in CUDA and OpenCL, move it into the header
    const DeviceContext& deviceContext = *nb->deviceContext_;

    reallocateDeviceBuffer(
            &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->sci,
                       h_plist->sci.data(),
                       0,
                       h_plist->sci.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(
            &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->cj4,
                       h_plist->cj4.data(),
                       0,
                       h_plist->cj4.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    reallocateDeviceBuffer(&d_plist->imask,
                           h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
                           &d_plist->nimask,
                           &d_plist->imask_nalloc,
                           deviceContext);

    reallocateDeviceBuffer(
            &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
    copyToDeviceBuffer(&d_plist->excl,
                       h_plist->excl.data(),
                       0,
                       h_plist->excl.size(),
                       deviceStream,
                       GpuApiCallBehavior::Async,
                       bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);

    if (bDoTime)
    {
        iTimers.pl_h2d.closeTimingRegion(deviceStream);
    }

    /* need to prune the pair list during the next step */
    d_plist->haveFreshList = true;
}

//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
    return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
}

//! This function is documented in the header file
void gpu_reset_timings(nonbonded_verlet_t* nbv)
{
    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
    {
        init_timings(nbv->gpu_nbv->timings);
    }
}

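//! Returns whether the analytical (rather than tabulated) Ewald GPU kernels are in use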
bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
    return ((nb->nbparam->elecType == ElecType::EwaldAna)
            || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
}

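/*! \brief Maps the host-side electrostatics interaction type to the GPU kernel flavor. */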
enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
                                                   const DeviceInformation&   deviceInfo)
{
    if (ic->eeltype == eelCUT)
    {
        return ElecType::Cut;
    }
    else if (EEL_RF(ic->eeltype))
    {
        return ElecType::RF;
    }
    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
    {
        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
    }
    else
    {
        /* Shouldn't happen, as this is checked when choosing the Verlet scheme */
        GMX_THROW(gmx::InconsistentInputError(
                gmx::formatString("The requested electrostatics type %s (%d) is not implemented in "
                                  "the GPU accelerated kernels!",
                                  EELTYPE(ic->eeltype),
                                  ic->eeltype)));
    }
}

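/*! \brief Maps the host-side VdW type, modifier, and combination rule to the GPU kernel flavor. */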
enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, int combRule)
{
    if (ic->vdwtype == evdwCUT)
    {
        switch (ic->vdw_modifier)
        {
            case eintmodNONE:
            case eintmodPOTSHIFT:
                switch (combRule)
                {
                    case ljcrNONE: return VdwType::Cut;
                    case ljcrGEOM: return VdwType::CutCombGeom;
                    case ljcrLB: return VdwType::CutCombLB;
                    default:
                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                                "The requested LJ combination rule %s (%d) is not implemented in "
                                "the GPU accelerated kernels!",
                                enum_name(combRule, ljcrNR, c_ljcrNames),
                                combRule)));
                }
            case eintmodFORCESWITCH: return VdwType::FSwitch;
            case eintmodPOTSWITCH: return VdwType::PSwitch;
            default:
                GMX_THROW(gmx::InconsistentInputError(
                        gmx::formatString("The requested VdW interaction modifier %s (%d) is not "
                                          "implemented in the GPU accelerated kernels!",
                                          INTMODIFIER(ic->vdw_modifier),
                                          ic->vdw_modifier)));
        }
    }
    else if (ic->vdwtype == evdwPME)
    {
        if (ic->ljpme_comb_rule == ljcrGEOM)
        {
            assert(combRule == ljcrGEOM);
            return VdwType::EwaldGeom;
        }
        else
        {
            assert(combRule == ljcrLB);
            return VdwType::EwaldLB;
        }
    }
    else
    {
        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                "The requested VdW type %s (%d) is not implemented in the GPU accelerated kernels!",
                EVDWTYPE(ic->vdwtype),
                ic->vdwtype)));
    }
}

} // namespace Nbnxm