src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief Define common implementation of nbnxm_gpu_data_mgmt.h
  38  *
  39  *  \author Anca Hamuraru <anca@streamcomputing.eu>
  40  *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
  41  *  \author Teemu Virolainen <teemu@streamcomputing.eu>
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  *  \ingroup module_nbnxm
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "config.h"
  50
  51 #if GMX_GPU_CUDA
  52 #    include "cuda/nbnxm_cuda_types.h"
  53 #endif
  54
  55 #if GMX_GPU_OPENCL
  56 #    include "opencl/nbnxm_ocl_types.h"
  57 #endif
  58
  59 #include "nbnxm_gpu_data_mgmt.h"
  60
  61 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  62 #include "gromacs/timing/gpu_timing.h"
  63 #include "gromacs/utility/cstringutil.h"
  64
  65 #include "nbnxm_gpu.h"
  66 #include "pairlistsets.h"
  67
  68 namespace Nbnxm
  69 {
  70
  71 void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
  72                                     NBParamGpu*                  nbp,
  73                                     const DeviceContext&         deviceContext)
  74 {
  75     if (nbp->coulomb_tab)
  76     {
  77         destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
  78     }
  79
  80     nbp->coulomb_tab_scale = tables.scale;
  81     initParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(),
  82                          tables.tableF.size(), deviceContext);
  83 }
  84
  85 void inline printEnvironmentVariableDeprecationMessage(bool               isEnvironmentVariableSet,
  86                                                        const std::string& environmentVariableSuffix)
  87 {
  88     if (isEnvironmentVariableSet)
  89     {
  90         fprintf(stderr,
  91                 "Environment variables GMX_CUDA_%s and GMX_OCL_%s are deprecated and will be\n"
  92                 "removed in release 2022, please use GMX_GPU_%s instead.",
  93                 environmentVariableSuffix.c_str(), environmentVariableSuffix.c_str(),
  94                 environmentVariableSuffix.c_str());
  95     }
  96 }
  97
  98 int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic)
  99 {
 100     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
 101     int  kernel_type;
 102
 103     /* Benchmarking/development environment variables to force the use of
 104        analytical or tabulated Ewald kernel. */
 105
 106     // Remove these when old environment variables are deprecated
 107     const bool forceAnalyticalEwaldLegacy = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr)
 108                                             || (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
 109     const bool forceTabulatedEwaldLegacy = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr)
 110                                            || (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
 111     const bool forceTwinCutoffEwaldLegacy = (getenv("GMX_CUDA_NB_EWALD_TWINCUT") != nullptr)
 112                                             || (getenv("GMX_OCL_NB_EWALD_TWINCUT") != nullptr);
 113
 114     printEnvironmentVariableDeprecationMessage(forceAnalyticalEwaldLegacy, "NB_ANA_EWALD");
 115     printEnvironmentVariableDeprecationMessage(forceTabulatedEwaldLegacy, "NB_TAB_EWALD");
 116     printEnvironmentVariableDeprecationMessage(forceTwinCutoffEwaldLegacy, "NB_EWALD_TWINCUT");
 117
 118     const bool forceAnalyticalEwald =
 119             (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr) || forceAnalyticalEwaldLegacy;
 120     const bool forceTabulatedEwald =
 121             (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr) || forceTabulatedEwaldLegacy;
 122     const bool forceTwinCutoffEwald =
 123             (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr) || forceTwinCutoffEwaldLegacy;
 124
 125     if (forceAnalyticalEwald && forceTabulatedEwald)
 126     {
 127         gmx_incons(
 128                 "Both analytical and tabulated Ewald GPU non-bonded kernels "
 129                 "requested through environment variables.");
 130     }
 131
 132     /* By default, use analytical Ewald
 133      * TODO: tabulated does not work in OpenCL, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
 134      *
 135      */
 136     bool bUseAnalyticalEwald = true;
 137     if (forceAnalyticalEwald)
 138     {
 139         if (debug)
 140         {
 141             fprintf(debug, "Using analytical Ewald GPU kernels\n");
 142         }
 143     }
 144     else if (forceTabulatedEwald)
 145     {
 146         bUseAnalyticalEwald = false;
 147
 148         if (debug)
 149         {
 150             fprintf(debug, "Using tabulated Ewald GPU kernels\n");
 151         }
 152     }
 153
 154     /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
 155        forces it (use it for debugging/benchmarking only). */
 156     if (!bTwinCut && !forceTwinCutoffEwald)
 157     {
 158         kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA : eelTypeEWALD_TAB;
 159     }
 160     else
 161     {
 162         kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA_TWIN : eelTypeEWALD_TAB_TWIN;
 163     }
 164
 165     return kernel_type;
 166 }
 167
 168 void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
 169 {
 170     nbp->ewald_beta        = ic->ewaldcoeff_q;
 171     nbp->sh_ewald          = ic->sh_ewald;
 172     nbp->epsfac            = ic->epsfac;
 173     nbp->two_k_rf          = 2.0 * ic->k_rf;
 174     nbp->c_rf              = ic->c_rf;
 175     nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
 176     nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
 177     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
 178     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
 179     nbp->useDynamicPruning = listParams.useDynamicPruning;
 180
 181     nbp->sh_lj_ewald   = ic->sh_lj_ewald;
 182     nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
 183
 184     nbp->rvdw_switch      = ic->rvdw_switch;
 185     nbp->dispersion_shift = ic->dispersion_shift;
 186     nbp->repulsion_shift  = ic->repulsion_shift;
 187     nbp->vdw_switch       = ic->vdw_switch;
 188 }
 189
 190 void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
 191 {
 192     if (!nbv || !nbv->useGpu())
 193     {
 194         return;
 195     }
 196     NbnxmGpu*   nb  = nbv->gpu_nbv;
 197     NBParamGpu* nbp = nb->nbparam;
 198
 199     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 200
 201     nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
 202
 203     GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
 204     init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
 205 }
 206
 207 void init_plist(gpu_plist* pl)
 208 {
 209     /* initialize to nullptr pointers to data that is not allocated here and will
 210        need reallocation in nbnxn_gpu_init_pairlist */
 211     pl->sci   = nullptr;
 212     pl->cj4   = nullptr;
 213     pl->imask = nullptr;
 214     pl->excl  = nullptr;
 215
 216     /* size -1 indicates that the respective array hasn't been initialized yet */
 217     pl->na_c          = -1;
 218     pl->nsci          = -1;
 219     pl->sci_nalloc    = -1;
 220     pl->ncj4          = -1;
 221     pl->cj4_nalloc    = -1;
 222     pl->nimask        = -1;
 223     pl->imask_nalloc  = -1;
 224     pl->nexcl         = -1;
 225     pl->excl_nalloc   = -1;
 226     pl->haveFreshList = false;
 227 }
 228
 229 void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 230 {
 231     int i, j;
 232
 233     t->nb_h2d_t = 0.0;
 234     t->nb_d2h_t = 0.0;
 235     t->nb_c     = 0;
 236     t->pl_h2d_t = 0.0;
 237     t->pl_h2d_c = 0;
 238     for (i = 0; i < 2; i++)
 239     {
 240         for (j = 0; j < 2; j++)
 241         {
 242             t->ktime[i][j].t = 0.0;
 243             t->ktime[i][j].c = 0;
 244         }
 245     }
 246     t->pruneTime.c        = 0;
 247     t->pruneTime.t        = 0.0;
 248     t->dynamicPruneTime.c = 0;
 249     t->dynamicPruneTime.t = 0.0;
 250 }
 251
 252 //! This function is documented in the header file
 253 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 254 {
 255     char sbuf[STRLEN];
 256     // Timing accumulation should happen only if there was work to do
 257     // because getLastRangeTime() gets skipped with empty lists later
 258     // which leads to the counter not being reset.
 259     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
 260     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 261     gpu_plist*          d_plist      = nb->plist[iloc];
 262
 263     if (d_plist->na_c < 0)
 264     {
 265         d_plist->na_c = h_plist->na_ci;
 266     }
 267     else
 268     {
 269         if (d_plist->na_c != h_plist->na_ci)
 270         {
 271             sprintf(sbuf, "In init_plist: the #atoms per cell has changed (from %d to %d)",
 272                     d_plist->na_c, h_plist->na_ci);
 273             gmx_incons(sbuf);
 274         }
 275     }
 276
 277     gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
 278
 279     if (bDoTime)
 280     {
 281         iTimers.pl_h2d.openTimingRegion(deviceStream);
 282         iTimers.didPairlistH2D = true;
 283     }
 284
 285     // TODO most of this function is same in CUDA and OpenCL, move into the header
 286     const DeviceContext& deviceContext = *nb->deviceContext_;
 287
 288     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
 289                            deviceContext);
 290     copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
 291                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 292
 293     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
 294                            deviceContext);
 295     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
 296                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 297
 298     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
 299                            &d_plist->nimask, &d_plist->imask_nalloc, deviceContext);
 300
 301     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
 302                            &d_plist->excl_nalloc, deviceContext);
 303     copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
 304                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 305
 306     if (bDoTime)
 307     {
 308         iTimers.pl_h2d.closeTimingRegion(deviceStream);
 309     }
 310
 311     /* need to prune the pair list during the next step */
 312     d_plist->haveFreshList = true;
 313 }
 314
 315 //! This function is documented in the header file
 316 gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
 317 {
 318     return (nb != nullptr && nb->bDoTime) ? nb->timings : nullptr;
 319 }
 320
 321 //! This function is documented in the header file
 322 void gpu_reset_timings(nonbonded_verlet_t* nbv)
 323 {
 324     if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
 325     {
 326         init_timings(nbv->gpu_nbv->timings);
 327     }
 328 }
 329
 330 bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
 331 {
 332     return ((nb->nbparam->eeltype == eelTypeEWALD_ANA) || (nb->nbparam->eeltype == eelTypeEWALD_ANA_TWIN));
 333 }
 334
 335 } // namespace Nbnxm