/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \file
 *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
 *
 *  \author Szilard Pall <pall.szilard@gmail.com>
 */
#include "gmxpre.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
// TODO We would like to move this down, but the way NbnxmGpu
// is currently declared means this has to be before gpu_types.h
#include "nbnxm_cuda_types.h"

// TODO Remove this comment when the above order issue is resolved
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/pmalloc.h"
#include "gromacs/hardware/device_information.h"
#include "gromacs/hardware/device_management.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h"
#include "gromacs/nbnxm/pairlistsets.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
#include "nbnxm_cuda.h"

namespace Nbnxm
{
/*! \brief Heuristic parameter, determined for the Kepler and Maxwell
 * architectures, for the minimum size of ci lists: the minimum is obtained
 * by multiplying this constant by the number of multiprocessors on the
 * current device.
 * Since the maximum number of blocks per multiprocessor is 16, the ideal
 * count for small systems is 32 or 48 blocks per multiprocessor. Because
 * the generated block counts fluctuate a bit, we use a target of 44
 * instead of the ideal value of 48.
 */
static unsigned int gpu_min_ci_balanced_factor = 44;
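/* Worked example (illustrative numbers only, not from any particular device):
 * on a GPU with 80 multiprocessors, gpu_min_ci_balanced() below returns
 * 44 * 80 = 3520, i.e. the pair search should aim for at least ~3520 ci list
 * entries so that every multiprocessor can be kept busy on small systems.
 */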
/*! Initializes the atomdata structure the first time; it only gets filled at
    pair-search. */
static void init_atomdata_first(NBAtomData*          ad,
                                int                  nTypes,
                                const DeviceContext& deviceContext,
                                const DeviceStream&  localStream)
{
    ad->numTypes = nTypes;
    allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
    ad->shiftVecUploaded = false;
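    /* Accumulation buffers: one force per periodic shift vector (SHIFTS, from
       pbcutil/ishift.h) plus single-element LJ and electrostatic energy
       accumulators; all are zeroed asynchronously on the local stream below. */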
    allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
    allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
    allocateDeviceBuffer(&ad->eElec, 1, deviceContext);

    clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
    clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
    clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);

    /* Initialize to nullptr the pointers to data that is not allocated here and
       will need reallocation in nbnxn_cuda_init_atomdata. */
    ad->xq = nullptr;
    ad->f  = nullptr;

    /* size -1 indicates that the respective array hasn't been initialized yet */
    ad->numAtoms      = -1;
    ad->numAtomsAlloc = -1;
}
/*! Initializes the nonbonded parameter data structure. */
static void init_nbparam(NBParamGpu*                     nbp,
                         const interaction_const_t*      ic,
                         const PairlistParams&           listParams,
                         const nbnxn_atomdata_t::Params& nbatParams,
                         const DeviceContext&            deviceContext)
{
    const int ntypes = nbatParams.numTypes;

    set_cutoff_parameters(nbp, ic, listParams);
    /* The kernel code supports LJ combination rules (geometric and LB) for
     * all kernel types, but we only generate the useful combination-rule
     * kernels. Currently we only use LJ combination rule kernels (geometric
     * and LB) for plain cut-off LJ. On Maxwell the force-only kernels speed
     * up by 15% with PME and 20% with RF; the other kernels speed up about
     * half as much. For LJ force-switch the geometric rule would give a 7%
     * speed-up, but this combination is rarely used. LJ force-switch with the
     * LB rule is more common, but gives only a 1% speed-up.
     */
    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
    /* generate table for PME */
    nbp->coulomb_tab = nullptr;
    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
    {
        GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
    }
    /* set up LJ parameter lookup table */
    if (!useLjCombRule(nbp->vdwType))
    {
        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp,
                             &nbp->nbfp_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
                             ntypes * ntypes,
                             deviceContext);
    }
    /* set up LJ-PME parameter lookup table */
    if (ic->vdwtype == VanDerWaalsType::Pme)
    {
        static_assert(sizeof(decltype(nbp->nbfp_comb))
                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
                      "Mismatch in the size of host / device data types");
        initParamLookupTable(&nbp->nbfp_comb,
                             &nbp->nbfp_comb_texobj,
                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
                             ntypes,
                             deviceContext);
    }
}
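/* Illustrative sketch (not code from the kernels themselves) of how the table
 * set up above is consumed: nbfp stores one Float2 = (C6, C12) per atom-type
 * pair, ntypes x ntypes entries in row-major order, so for atom types typeI
 * and typeJ a kernel conceptually fetches
 *
 *   const Float2 c6c12 = nbp->nbfp[typeI * ntypes + typeJ];
 *
 * with the actual fetch going through the texture object set up by
 * initParamLookupTable where the hardware supports it.
 */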
NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
                   const interaction_const_t*      ic,
                   const PairlistParams&           listParams,
                   const nbnxn_atomdata_t*         nbat,
                   bool                            bLocalAndNonlocal)
{
    auto nb            = new NbnxmGpu();
    nb->deviceContext_ = &deviceStreamManager.context();
    snew(nb->atdat, 1);
    snew(nb->nbparam, 1);
    snew(nb->plist[InteractionLocality::Local], 1);
    if (bLocalAndNonlocal)
    {
        snew(nb->plist[InteractionLocality::NonLocal], 1);
    }

    nb->bUseTwoStreams = bLocalAndNonlocal;
    nb->timers = new Nbnxm::GpuTimers();
    snew(nb->timings, 1);
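    /* Staging buffers for energies and shift forces: pmalloc() returns pinned
       (page-locked) host memory, which is required for the asynchronous
       device-to-host copies of these per-step results. */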
    pmalloc((void**)&nb->nbst.eLJ, sizeof(*nb->nbst.eLJ));
    pmalloc((void**)&nb->nbst.eElec, sizeof(*nb->nbst.eElec));
    pmalloc((void**)&nb->nbst.fShift, SHIFTS * sizeof(*nb->nbst.fShift));
    init_plist(nb->plist[InteractionLocality::Local]);

    /* local/non-local GPU streams */
    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
    nb->deviceStreams[InteractionLocality::Local] = &localStream;
    if (nb->bUseTwoStreams)
    {
        init_plist(nb->plist[InteractionLocality::NonLocal]);

        /* Note that the device we're running on does not have to support
         * priorities, because we are querying the priority range, which in
         * this case will be a single value.
         */
        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                           "Non-local non-bonded stream should be initialized to use GPU for "
                           "non-bonded with domain decomposition.");
        nb->deviceStreams[InteractionLocality::NonLocal] =
                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
    }
    /* WARNING: CUDA timings are incorrect with multiple streams.
     * This is the main reason why they are disabled by default.
     */
    // TODO: Consider turning on by default when we can detect nr of streams.
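    /* For example, running with "GMX_ENABLE_GPU_TIMING=1 gmx mdrun ..." enables
       the timers; only the presence of the variable is checked, not its value. */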
    nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);

    if (nb->bDoTime)
    {
        init_timings(nb->timings);
    }
    /* pick the L1 cache configuration for the nonbonded kernels on the current GPU */
    cuda_set_cacheconfig();

    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
    const DeviceContext&            deviceContext = *nb->deviceContext_;

    init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
    init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);

    nb->atomIndicesSize       = 0;
    nb->atomIndicesSize_alloc = 0;
    nb->ncxy_na               = 0;
    nb->ncxy_na_alloc         = 0;
    nb->ncxy_ind              = 0;
    nb->ncxy_ind_alloc        = 0;

    if (debug)
    {
        fprintf(debug, "Initialized CUDA data structures.\n");
    }

    return nb;
}
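/* Illustrative lifetime of the data managed here (hypothetical caller, shown
 * only to sketch the intended call sequence, not code from this module):
 *
 *   NbnxmGpu* nb = gpu_init(deviceStreamManager, ic, listParams, nbat, haveNonlocalWork);
 *   gpu_upload_shiftvec(nb, nbat);  // re-issued per step only for dynamic boxes
 *   // ... pair search fills the atom and pair-list data, kernels run ...
 *   gpu_free(nb);                   // releases all device buffers and pinned host memory
 */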
void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
{
    NBAtomData*         adat        = nb->atdat;
    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];

    /* only re-upload if we have a dynamic box, or on the first call */
    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
    {
        static_assert(sizeof(adat->shiftVec[0]) == sizeof(nbatom->shift_vec[0]),
                      "Sizes of host- and device-side shift vectors should be the same.");
        copyToDeviceBuffer(&adat->shiftVec,
                           reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
                           0,
                           SHIFTS,
                           localStream,
                           GpuApiCallBehavior::Async,
                           nullptr);
        adat->shiftVecUploaded = true;
    }
}
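/* Note: the shift vectors are linear combinations of the box vectors, so they
 * only change when the box does; for a static box a single upload suffices. */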
void gpu_free(NbnxmGpu* nb)
{
    if (nb == nullptr)
    {
        return;
    }

    NBAtomData* atdat   = nb->atdat;
    NBParamGpu* nbparam = nb->nbparam;

    /* Note: the table must be freed when it was allocated, i.e. when the
       pointer is non-null (the check was previously inverted). */
    if (nbparam->coulomb_tab
        && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
    {
        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
    }
    if (!useLjCombRule(nb->nbparam->vdwType))
    {
        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
    }

    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
    {
        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
    }
    freeDeviceBuffer(&atdat->shiftVec);
    freeDeviceBuffer(&atdat->fShift);

    freeDeviceBuffer(&atdat->eLJ);
    freeDeviceBuffer(&atdat->eElec);

    freeDeviceBuffer(&atdat->f);
    freeDeviceBuffer(&atdat->xq);
    freeDeviceBuffer(&atdat->atomTypes);
    freeDeviceBuffer(&atdat->ljComb);
    /* Free plist */
    auto* plist = nb->plist[InteractionLocality::Local];
    freeDeviceBuffer(&plist->sci);
    freeDeviceBuffer(&plist->cj4);
    freeDeviceBuffer(&plist->imask);
    freeDeviceBuffer(&plist->excl);
    sfree(plist);
    if (nb->bUseTwoStreams)
    {
        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
        freeDeviceBuffer(&plist_nl->sci);
        freeDeviceBuffer(&plist_nl->cj4);
        freeDeviceBuffer(&plist_nl->imask);
        freeDeviceBuffer(&plist_nl->excl);
        sfree(plist_nl);
    }
    /* Free nbst */
    pfree(nb->nbst.eLJ);
    nb->nbst.eLJ = nullptr;

    pfree(nb->nbst.eElec);
    nb->nbst.eElec = nullptr;

    pfree(nb->nbst.fShift);
    nb->nbst.fShift = nullptr;

    sfree(atdat);
    sfree(nbparam);
    sfree(nb->timings);
    delete nb->timers;
    delete nb;

    if (debug)
    {
        fprintf(debug, "Cleaned up CUDA data structures.\n");
    }
}
int gpu_min_ci_balanced(NbnxmGpu* nb)
{
    return nb != nullptr ? gpu_min_ci_balanced_factor * nb->deviceContext_->deviceInfo().prop.multiProcessorCount
                         : 0;
}
void* gpu_get_xq(NbnxmGpu* nb)
{
    assert(nb);

    return static_cast<void*>(nb->atdat->xq);
}
DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
{
    assert(nb);

    return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->f);
}
DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
{
    assert(nb);

    return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fShift);
}
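/* The getters above hand out raw device buffers so that other GPU code paths
 * (e.g. the X/F buffer ops initialized below) can enqueue work that consumes
 * the nonbonded coordinates and forces directly on the device. */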
/* Initialization for X buffer operations on GPU. */
/* TODO Remove explicit pinning from host arrays from here and manage in a more natural way. */
void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
{
    const DeviceStream& localStream   = *gpu_nbv->deviceStreams[InteractionLocality::Local];
    const bool          bDoTime       = gpu_nbv->bDoTime;
    const int           maxNumColumns = gridSet.numColumnsMax();
    reallocateDeviceBuffer(&gpu_nbv->cxy_na,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_na,
                           &gpu_nbv->ncxy_na_alloc,
                           *gpu_nbv->deviceContext_);
    reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
                           maxNumColumns * gridSet.grids().size(),
                           &gpu_nbv->ncxy_ind,
                           &gpu_nbv->ncxy_ind_alloc,
                           *gpu_nbv->deviceContext_);
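    /* Per-column data for all grids is stored back to back: grid g occupies
       entries [g * maxNumColumns, (g + 1) * maxNumColumns). cxy_na holds the
       number of atoms in each grid column, cxy_ind each column's first cell index. */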
    for (unsigned int g = 0; g < gridSet.grids().size(); g++)
    {
        const Nbnxm::Grid& grid = gridSet.grids()[g];

        const int  numColumns      = grid.numColumns();
        const int* atomIndices     = gridSet.atomIndices().data();
        const int  atomIndicesSize = gridSet.atomIndices().size();
        const int* cxy_na          = grid.cxy_na().data();
        const int* cxy_ind         = grid.cxy_ind().data();

        reallocateDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndicesSize,
                               &gpu_nbv->atomIndicesSize,
                               &gpu_nbv->atomIndicesSize_alloc,
                               *gpu_nbv->deviceContext_);
        if (atomIndicesSize > 0)
        {
            if (bDoTime)
            {
                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
            }

            copyToDeviceBuffer(&gpu_nbv->atomIndices,
                               atomIndices,
                               0,
                               atomIndicesSize,
                               localStream,
                               GpuApiCallBehavior::Async,
                               nullptr);

            if (bDoTime)
            {
                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
            }
        }
        if (numColumns > 0)
        {
            if (bDoTime)
            {
                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
            }

            int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
            copyToDeviceBuffer(
                    &destPtr, cxy_na, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr);

            if (bDoTime)
            {
                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
            }

            if (bDoTime)
            {
                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(localStream);
            }

            destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
            copyToDeviceBuffer(
                    &destPtr, cxy_ind, 0, numColumns, localStream, GpuApiCallBehavior::Async, nullptr);

            if (bDoTime)
            {
                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(localStream);
            }
        }
    }
    if (gpu_nbv->bUseTwoStreams)
    {
        // The above data is transferred on the local stream but is a
        // dependency of the nonlocal stream (specifically the nonlocal X
        // buffer ops kernel). We therefore set a dependency to ensure
        // that the nonlocal stream waits on the local stream here.
        // This call records an event in the local stream:
        gpu_nbv->misc_ops_and_local_H2D_done.markEvent(
                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]);
        // ...and this call instructs the nonlocal stream to wait on that event:
        gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent(
                *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]);
    }
}

} // namespace Nbnxm