src/gromacs/listed_forces/gpubonded-impl.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements GPU bonded lists for CUDA
  38  *
  39  * \author Berk Hess <hess@kth.se>
  40  * \author Szilárd Páll <pall.szilard@gmail.com>
  41  * \author Mark Abraham <mark.j.abraham@gmail.com>
  42  *
  43  * \ingroup module_listed_forces
  44  */
  45
  46 #include "gmxpre.h"
  47
  48 #include "gpubonded-impl.h"
  49
  50 #include "gromacs/gpu_utils/cudautils.cuh"
  51 #include "gromacs/gpu_utils/devicebuffer.h"
  52 #include "gromacs/gpu_utils/gpu_vec.cuh"
  53 #include "gromacs/gpu_utils/gputraits.cuh"
  54 #include "gromacs/gpu_utils/hostallocator.h"
  55 #include "gromacs/listed_forces/gpubonded.h"
  56 #include "gromacs/mdtypes/enerdata.h"
  57 #include "gromacs/topology/forcefieldparameters.h"
  58 #include "gromacs/topology/idef.h"
  59
  60 struct t_forcerec;
  61
  62 namespace gmx
  63 {
  64
  65 // ---- GpuBonded::Impl
  66
  67 GpuBonded::Impl::Impl(const gmx_ffparams_t &ffparams,
  68                       void                 *streamPtr)
  69 {
  70     stream = *static_cast<CommandStream*>(streamPtr);
  71
  72     allocateDeviceBuffer(&forceparamsDevice, ffparams.numTypes(), nullptr);
  73     // This could be an async transfer (if the source is pinned), so
  74     // long as it uses the same stream as the kernels and we are happy
  75     // to consume additional pinned pages.
  76     copyToDeviceBuffer(&forceparamsDevice, ffparams.iparams.data(),
  77                        0, ffparams.numTypes(),
  78                        stream, GpuApiCallBehavior::Sync, nullptr);
  79     vtot.resize(F_NRE);
  80     allocateDeviceBuffer(&vtotDevice, F_NRE, nullptr);
  81     clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
  82
  83     for (int ftype = 0; ftype < F_NRE; ftype++)
  84     {
  85         iListsDevice[ftype].nr     = 0;
  86         iListsDevice[ftype].iatoms = nullptr;
  87         iListsDevice[ftype].nalloc = 0;
  88     }
  89 }
  90
  91 GpuBonded::Impl::~Impl()
  92 {
  93     for (int ftype : ftypesOnGpu)
  94     {
  95         if (iListsDevice[ftype].iatoms)
  96         {
  97             freeDeviceBuffer(&iListsDevice[ftype].iatoms);
  98             iListsDevice[ftype].iatoms = nullptr;
  99         }
 100     }
 101
 102     freeDeviceBuffer(&forceparamsDevice);
 103     freeDeviceBuffer(&vtotDevice);
 104 }
 105
 106 //! Return whether function type \p ftype in \p idef has perturbed interactions
 107 static bool ftypeHasPerturbedEntries(const t_idef  &idef,
 108                                      int            ftype)
 109 {
 110     GMX_ASSERT(idef.ilsort == ilsortNO_FE || idef.ilsort == ilsortFE_SORTED,
 111                "Perturbed interations should be sorted here");
 112
 113     const t_ilist &ilist = idef.il[ftype];
 114
 115     return (idef.ilsort != ilsortNO_FE && ilist.nr_nonperturbed != ilist.nr);
 116 }
 117
 118 //! Converts \p src with atom indices in state order to \p dest in nbnxn order
 119 static void convertIlistToNbnxnOrder(const t_ilist       &src,
 120                                      HostInteractionList *dest,
 121                                      int                  numAtomsPerInteraction,
 122                                      ArrayRef<const int>  nbnxnAtomOrder)
 123 {
 124     GMX_ASSERT(src.size() == 0 || !nbnxnAtomOrder.empty(), "We need the nbnxn atom order");
 125
 126     dest->iatoms.resize(src.size());
 127
 128     // TODO use OpenMP to parallelise this loop
 129     for (int i = 0; i < src.size(); i += 1 + numAtomsPerInteraction)
 130     {
 131         dest->iatoms[i] = src.iatoms[i];
 132         for (int a = 0; a < numAtomsPerInteraction; a++)
 133         {
 134             dest->iatoms[i + 1 + a] = nbnxnAtomOrder[src.iatoms[i + 1 + a]];
 135         }
 136     }
 137 }
 138
 139 // TODO Consider whether this function should be a factory method that
 140 // makes an object that is the only one capable of the device
 141 // operations needed for the lifetime of an interaction list. This
 142 // would be harder to misuse than GpuBonded, and exchange the problem
 143 // of naming this method for the problem of what to name the
 144 // BondedDeviceInteractionListHandler type.
 145
 146 //! Divides bonded interactions over threads and GPU
 147 void
 148 GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
 149                                                         const t_idef        &idef,
 150                                                         void                *xqDevicePtr,
 151                                                         void                *forceDevicePtr,
 152                                                         void                *fshiftDevicePtr)
 153 {
 154     // TODO wallcycle sub start
 155     haveInteractions_ = false;
 156
 157     for (int ftype : ftypesOnGpu)
 158     {
 159         auto &iList = iLists[ftype];
 160
 161         /* Perturbation is not implemented in the GPU bonded kernels.
 162          * But instead of doing all interactions on the CPU, we can
 163          * still easily handle the types that have no perturbed
 164          * interactions on the GPU. */
 165         if (idef.il[ftype].nr > 0 && !ftypeHasPerturbedEntries(idef, ftype))
 166         {
 167             haveInteractions_ = true;
 168
 169             convertIlistToNbnxnOrder(idef.il[ftype],
 170                                      &iList,
 171                                      NRAL(ftype), nbnxnAtomOrder);
 172         }
 173         else
 174         {
 175             iList.iatoms.clear();
 176         }
 177
 178         // Update the device if necessary. This can leave some
 179         // allocations on the device when the host size decreases to
 180         // zero, which is OK, since we deallocate everything at the
 181         // end.
 182         if (iList.size() > 0)
 183         {
 184             t_ilist &iListDevice = iListsDevice[ftype];
 185
 186             reallocateDeviceBuffer(&iListDevice.iatoms, iList.size(), &iListDevice.nr, &iListDevice.nalloc, nullptr);
 187
 188             copyToDeviceBuffer(&iListDevice.iatoms, iList.iatoms.data(),
 189                                0, iList.size(),
 190                                stream, GpuApiCallBehavior::Async, nullptr);
 191         }
 192     }
 193
 194     xqDevice     = static_cast<float4 *>(xqDevicePtr);
 195     forceDevice  = static_cast<fvec *>(forceDevicePtr);
 196     fshiftDevice = static_cast<fvec *>(fshiftDevicePtr);
 197     // TODO wallcycle sub stop
 198 }
 199
 200 bool
 201 GpuBonded::Impl::haveInteractions() const
 202 {
 203     return haveInteractions_;
 204 }
 205
 206 void
 207 GpuBonded::Impl::launchEnergyTransfer()
 208 {
 209     // TODO should wrap with ewcLAUNCH_GPU
 210     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed, so transfer should not be called");
 211
 212     float *vtot_h   = vtot.data();
 213     copyFromDeviceBuffer(vtot_h, &vtotDevice,
 214                          0, F_NRE,
 215                          stream, GpuApiCallBehavior::Async, nullptr);
 216 }
 217
 218 void
 219 GpuBonded::Impl::accumulateEnergyTerms(gmx_enerdata_t *enerd)
 220 {
 221     // TODO should wrap with some kind of wait counter, so not all
 222     // wait goes in to the "Rest" counter
 223     GMX_ASSERT(haveInteractions_, "No GPU bonded interactions, so no energies will be computed or transferred, so accumulation should not occur");
 224
 225     cudaError_t stat = cudaStreamSynchronize(stream);
 226     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
 227
 228     for (int ftype : ftypesOnGpu)
 229     {
 230         if (ftype != F_LJ14 && ftype != F_COUL14)
 231         {
 232             enerd->term[ftype] += vtot[ftype];
 233         }
 234     }
 235
 236     // Note: We do not support energy groups here
 237     gmx_grppairener_t *grppener = &enerd->grpp;
 238     GMX_RELEASE_ASSERT(grppener->nener == 1, "No energy group support for bondeds on the GPU");
 239     grppener->ener[egLJ14][0]   += vtot[F_LJ14];
 240     grppener->ener[egCOUL14][0] += vtot[F_COUL14];
 241 }
 242
 243 void
 244 GpuBonded::Impl::clearEnergies()
 245 {
 246     // TODO should wrap with ewcLAUNCH_GPU
 247     clearDeviceBufferAsync(&vtotDevice, 0, F_NRE, stream);
 248 }
 249
 250 // ---- GpuBonded
 251
 252 GpuBonded::GpuBonded(const gmx_ffparams_t &ffparams,
 253                      void                 *streamPtr)
 254     : impl_(new Impl(ffparams, streamPtr))
 255 {
 256 }
 257
 258 GpuBonded::~GpuBonded() = default;
 259
 260 void
 261 GpuBonded::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>  nbnxnAtomOrder,
 262                                                   const t_idef        &idef,
 263                                                   void                *xqDevice,
 264                                                   void                *forceDevice,
 265                                                   void                *fshiftDevice)
 266 {
 267     impl_->updateInteractionListsAndDeviceBuffers
 268         (nbnxnAtomOrder, idef, xqDevice, forceDevice, fshiftDevice);
 269 }
 270
 271 bool
 272 GpuBonded::haveInteractions() const
 273 {
 274     return impl_->haveInteractions();
 275 }
 276
 277 void
 278 GpuBonded::launchEnergyTransfer()
 279 {
 280     impl_->launchEnergyTransfer();
 281 }
 282
 283 void
 284 GpuBonded::accumulateEnergyTerms(gmx_enerdata_t *enerd)
 285 {
 286     impl_->accumulateEnergyTerms(enerd);
 287 }
 288
 289 void
 290 GpuBonded::clearEnergies()
 291 {
 292     impl_->clearEnergies();
 293 }
 294
 295 }   // namespace gmx