/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements GPU bonded lists for CUDA
 *
 * \author Berk Hess <hess@kth.se>
 * \author Szilárd Páll <pall.szilard@gmail.com>
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
 *
 * \ingroup module_listed_forces
 */

#include "gmxpre.h"

#include "listed_forces_gpu_impl.h"

#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/forcefieldparameters.h"

struct t_forcerec;

namespace gmx
{
// Number of CUDA threads in a block
constexpr static int c_threadsPerBlock = 256;

// ---- ListedForcesGpu::Impl

ListedForcesGpu::Impl::Impl(const gmx_ffparams_t& ffparams,
                            const float           electrostaticsScaleFactor,
                            const DeviceContext&  deviceContext,
                            const DeviceStream&   deviceStream,
                            gmx_wallcycle*        wcycle) :
    deviceContext_(deviceContext), deviceStream_(deviceStream)
{
    GMX_RELEASE_ASSERT(deviceStream.isValid(),
                       "Can't run GPU version of bonded forces in stream that is not valid.");

    static_assert(
            c_threadsPerBlock >= c_numShiftVectors,
            "Threads per block in GPU bonded must be >= c_numShiftVectors for the virial kernel "
            "(calcVir=true)");

    wcycle_ = wcycle;

    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
    // This could be an async transfer (if the source is pinned), so
    // long as it uses the same stream as the kernels and we are happy
    // to consume additional pinned pages.
    copyToDeviceBuffer(&d_forceParams_,
                       ffparams.iparams.data(),
                       0,
                       ffparams.numTypes(),
                       deviceStream_,
                       GpuApiCallBehavior::Sync,
                       nullptr);
    vTot_.resize(F_NRE);
    allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);

    kernelParams_.electrostaticsScaleFactor = electrostaticsScaleFactor;
    kernelParams_.d_forceParams             = d_forceParams_;
    kernelParams_.d_xq                      = d_xq_;
    kernelParams_.d_f                       = d_f_;
    kernelParams_.d_fShift                  = d_fShift_;
    kernelParams_.d_vTot                    = d_vTot_;
    for (int i = 0; i < numFTypesOnGpu; i++)
    {
        kernelParams_.d_iatoms[i]        = nullptr;
        kernelParams_.fTypeRangeStart[i] = 0;
        kernelParams_.fTypeRangeEnd[i]   = -1;
    }

    int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];

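    // Descriptive note (added for clarity): the launch configuration below covers thread
    // indices 0..fTypeRangeEnd, i.e. gridSize[0] equals ceil((fTypeRangeEnd + 1) / c_threadsPerBlock).
    // The shared memory request is one float3 per shift vector plus three floats per warp;
    // how that workspace is used is defined by the bonded kernel itself.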
    kernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
    kernelLaunchConfig_.blockSize[1] = 1;
    kernelLaunchConfig_.blockSize[2] = 1;
    kernelLaunchConfig_.gridSize[0]  = (fTypeRangeEnd + c_threadsPerBlock) / c_threadsPerBlock;
    kernelLaunchConfig_.gridSize[1]  = 1;
    kernelLaunchConfig_.gridSize[2]  = 1;
    kernelLaunchConfig_.sharedMemorySize =
            c_numShiftVectors * sizeof(float3) + (c_threadsPerBlock / warp_size) * 3 * sizeof(float);
}

ListedForcesGpu::Impl::~Impl()
{
    for (int fType : fTypesOnGpu)
    {
        if (d_iLists_[fType].iatoms)
        {
            freeDeviceBuffer(&d_iLists_[fType].iatoms);
            d_iLists_[fType].iatoms = nullptr;
        }
    }

    freeDeviceBuffer(&d_forceParams_);
    freeDeviceBuffer(&d_vTot_);
}

//! Return whether function type \p fType in \p idef has perturbed interactions
static bool fTypeHasPerturbedEntries(const InteractionDefinitions& idef, int fType)
{
    GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
               "Perturbed interactions should be sorted here");

    const InteractionList& ilist = idef.il[fType];

    return (idef.ilsort != ilsortNO_FE && idef.numNonperturbedInteractions[fType] != ilist.size());
}

//! Converts \p src with atom indices in state order to \p dest in nbnxn order
static void convertIlistToNbnxnOrder(const InteractionList& src,
                                     HostInteractionList*   dest,
                                     int                    numAtomsPerInteraction,
                                     ArrayRef<const int>    nbnxnAtomOrder)
{
    GMX_ASSERT(src.empty() || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order");

    dest->iatoms.resize(src.size());

    // TODO use OpenMP to parallelise this loop
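    // Each interaction entry in iatoms is laid out as [parameter type, atom_0, ..., atom_(n-1)],
    // hence the stride of 1 + numAtomsPerInteraction below; only the atom indices are remapped.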
    for (int i = 0; i < src.size(); i += 1 + numAtomsPerInteraction)
    {
        dest->iatoms[i] = src.iatoms[i];
        for (int a = 0; a < numAtomsPerInteraction; a++)
        {
            dest->iatoms[i + 1 + a] = nbnxnAtomOrder[src.iatoms[i + 1 + a]];
        }
    }
}

//! Returns \p input rounded up to the closest multiple of \p factor.
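//! For example, roundUpToFactor(5, 32) returns 32, while roundUpToFactor(64, 32) returns 64.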
static inline int roundUpToFactor(const int input, const int factor)
{
    GMX_ASSERT(factor > 0, "The factor to round up to must be > 0.");

    int remainder = input % factor;

    if (remainder == 0)
    {
        return (input);
    }
    return (input + (factor - remainder));
}

// TODO Consider whether this function should be a factory method that
// makes an object that is the only one capable of the device
// operations needed for the lifetime of an interaction list. This
// would be harder to misuse than ListedForcesGpu, and exchange the problem
// of naming this method for the problem of what to name the
// BondedDeviceInteractionListHandler type.

/*! Divides the bonded interactions over GPU threads.
 *  The bonded interactions are assigned by interaction type to GPU threads. The interaction
 *  types are assigned in blocks sized as <warp_size>. The beginning and end (thread index) of each
 *  interaction type are stored in kernelParams_. Pointers to the relevant data structures on the
 *  GPU are also stored in kernelParams_.
 *
 * \todo Use DeviceBuffer for the d_xqPtr.
 */
void ListedForcesGpu::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int> nbnxnAtomOrder,
                                                                   const InteractionDefinitions& idef,
                                                                   void*              d_xqPtr,
                                                                   DeviceBuffer<RVec> d_fPtr,
                                                                   DeviceBuffer<RVec> d_fShiftPtr)
{
    // TODO wallcycle sub start
    haveInteractions_ = false;
    int fTypesCounter = 0;

    for (int fType : fTypesOnGpu)
    {
        auto& iList = iLists_[fType];

        /* Perturbation is not implemented in the GPU bonded kernels.
         * But instead of doing all interactions on the CPU, we can
         * still easily handle the types that have no perturbed
         * interactions on the GPU. */
        if (!idef.il[fType].empty() && !fTypeHasPerturbedEntries(idef, fType))
        {
            haveInteractions_ = true;

            convertIlistToNbnxnOrder(idef.il[fType], &iList, NRAL(fType), nbnxnAtomOrder);
        }
        else
        {
            iList.iatoms.clear();
        }

        // Update the device if necessary. This can leave some
        // allocations on the device when the host size decreases to
        // zero, which is OK, since we deallocate everything at the
        // end.
        if (iList.size() > 0)
        {
            t_ilist& d_iList = d_iLists_[fType];

            reallocateDeviceBuffer(
                    &d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, deviceContext_);

            copyToDeviceBuffer(&d_iList.iatoms,
                               iList.iatoms.data(),
                               0,
                               iList.size(),
                               deviceStream_,
                               GpuApiCallBehavior::Async,
                               nullptr);
        }
        kernelParams_.fTypesOnGpu[fTypesCounter]    = fType;
        kernelParams_.numFTypeIAtoms[fTypesCounter] = iList.size();
        int numBonds = iList.size() / (interaction_function[fType].nratoms + 1);
        kernelParams_.numFTypeBonds[fTypesCounter] = numBonds;
        kernelParams_.d_iatoms[fTypesCounter]      = d_iLists_[fType].iatoms;
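        // Each interaction type gets a contiguous, warp-aligned range of thread indices,
        // padded with roundUpToFactor below, so that no warp straddles two interaction
        // types; the asserts further down check this alignment.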
        if (fTypesCounter == 0)
        {
            kernelParams_.fTypeRangeStart[fTypesCounter] = 0;
        }
        else
        {
            kernelParams_.fTypeRangeStart[fTypesCounter] =
                    kernelParams_.fTypeRangeEnd[fTypesCounter - 1] + 1;
        }
        kernelParams_.fTypeRangeEnd[fTypesCounter] = kernelParams_.fTypeRangeStart[fTypesCounter]
                                                     + roundUpToFactor(numBonds, warp_size) - 1;

        GMX_ASSERT(numBonds > 0
                           || kernelParams_.fTypeRangeEnd[fTypesCounter]
                                      <= kernelParams_.fTypeRangeStart[fTypesCounter],
                   "Invalid GPU listed forces setup. numBonds must be > 0 if there are threads "
                   "allocated to do work on that interaction function type.");
        GMX_ASSERT(kernelParams_.fTypeRangeStart[fTypesCounter] % warp_size == 0
                           && (kernelParams_.fTypeRangeEnd[fTypesCounter] + 1) % warp_size == 0,
                   "The bonded interactions must be assigned to the GPU in blocks of warp size.");

        fTypesCounter++;
    }

    int fTypeRangeEnd               = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
    kernelLaunchConfig_.gridSize[0] = (fTypeRangeEnd + c_threadsPerBlock) / c_threadsPerBlock;

    d_xq_     = static_cast<float4*>(d_xqPtr);
    d_f_      = asFloat3(d_fPtr);
    d_fShift_ = asFloat3(d_fShiftPtr);

    kernelParams_.d_xq          = d_xq_;
    kernelParams_.d_f           = d_f_;
    kernelParams_.d_fShift      = d_fShift_;
    kernelParams_.d_forceParams = d_forceParams_;
    kernelParams_.d_vTot        = d_vTot_;

    // TODO wallcycle sub stop
}

void ListedForcesGpu::Impl::setPbc(PbcType pbcType, const matrix box, bool canMoleculeSpanPbc)
{
    PbcAiuc pbcAiuc;
    setPbcAiuc(canMoleculeSpanPbc ? numPbcDimensions(pbcType) : 0, box, &pbcAiuc);
    kernelParams_.pbcAiuc = pbcAiuc;
}

bool ListedForcesGpu::Impl::haveInteractions() const
{
    return haveInteractions_;
}

void ListedForcesGpu::Impl::launchEnergyTransfer()
{
    GMX_ASSERT(haveInteractions_,
               "No GPU bonded interactions, so no energies will be computed, so transfer should "
               "not be called");
    wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
    // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
    float* h_vTot = vTot_.data();
    copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr);
    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
}

void ListedForcesGpu::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
{
    GMX_ASSERT(haveInteractions_,
               "No GPU bonded interactions, so no energies will be computed or transferred, so "
               "accumulation should not occur");

    wallcycle_start(wcycle_, WallCycleCounter::WaitGpuBonded);
    cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream());
    CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
    wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuBonded);

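    // The 1-4 terms are not added to enerd->term here; they are accumulated into the
    // energy-group pair terms further below.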
    for (int fType : fTypesOnGpu)
    {
        if (fType != F_LJ14 && fType != F_COUL14)
        {
            enerd->term[fType] += vTot_[fType];
        }
    }

    // Note: We do not support energy groups here
    gmx_grppairener_t* grppener = &enerd->grpp;
    GMX_RELEASE_ASSERT(grppener->nener == 1, "No energy group support for bondeds on the GPU");
    grppener->energyGroupPairTerms[NonBondedEnergyTerms::LJ14][0] += vTot_[F_LJ14];
    grppener->energyGroupPairTerms[NonBondedEnergyTerms::Coulomb14][0] += vTot_[F_COUL14];
}

void ListedForcesGpu::Impl::clearEnergies()
{
    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
    wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}

// ---- ListedForcesGpu

ListedForcesGpu::ListedForcesGpu(const gmx_ffparams_t& ffparams,
                                 const float           electrostaticsScaleFactor,
                                 const DeviceContext&  deviceContext,
                                 const DeviceStream&   deviceStream,
                                 gmx_wallcycle*        wcycle) :
    impl_(new Impl(ffparams, electrostaticsScaleFactor, deviceContext, deviceStream, wcycle))
{
}

ListedForcesGpu::~ListedForcesGpu() = default;

void ListedForcesGpu::updateInteractionListsAndDeviceBuffers(ArrayRef<const int> nbnxnAtomOrder,
                                                             const InteractionDefinitions& idef,
                                                             void*                         d_xq,
                                                             DeviceBuffer<RVec>            d_f,
                                                             DeviceBuffer<RVec>            d_fShift)
{
    impl_->updateInteractionListsAndDeviceBuffers(nbnxnAtomOrder, idef, d_xq, d_f, d_fShift);
}

void ListedForcesGpu::setPbc(PbcType pbcType, const matrix box, bool canMoleculeSpanPbc)
{
    impl_->setPbc(pbcType, box, canMoleculeSpanPbc);
}

bool ListedForcesGpu::haveInteractions() const
{
    return impl_->haveInteractions();
}

void ListedForcesGpu::setPbcAndlaunchKernel(PbcType                  pbcType,
                                            const matrix             box,
                                            bool                     canMoleculeSpanPbc,
                                            const gmx::StepWorkload& stepWork)
{
    setPbc(pbcType, box, canMoleculeSpanPbc);
    launchKernel(stepWork);
}

void ListedForcesGpu::launchEnergyTransfer()
{
    impl_->launchEnergyTransfer();
}

void ListedForcesGpu::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
{
    impl_->waitAccumulateEnergyTerms(enerd);
}

void ListedForcesGpu::clearEnergies()
{
    impl_->clearEnergies();
}

} // namespace gmx