src/gromacs/mdlib/leapfrog_gpu.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements Leap-Frog using CUDA
  38  *
  39  * This file contains implementation of basic Leap-Frog integrator
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  * \ingroup module_mdlib
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "leapfrog_gpu.h"
  50
  51 #include <assert.h>
  52 #include <stdio.h>
  53
  54 #include <cmath>
  55
  56 #include <algorithm>
  57
  58 #include "gromacs/gpu_utils/cudautils.cuh"
  59 #include "gromacs/gpu_utils/devicebuffer.h"
  60 #include "gromacs/gpu_utils/typecasts.cuh"
  61 #include "gromacs/gpu_utils/vectype_ops.cuh"
  62 #include "gromacs/math/vec.h"
  63 #include "gromacs/mdtypes/group.h"
  64 #include "gromacs/pbcutil/pbc.h"
  65 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  66 #include "gromacs/utility/arrayref.h"
  67
  68 namespace gmx
  69 {
  70
  71 /*!\brief Number of CUDA threads in a block
  72  *
  73  * \todo Check if using smaller block size will lead to better performance.
  74  */
  75 constexpr static int c_threadsPerBlock = 256;
  76 //! Maximum number of threads in a block (for __launch_bounds__)
  77 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  78
  79 /*! \brief Main kernel for Leap-Frog integrator.
  80  *
  81  *  The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
  82  *   further use in constraints.
  83  *
  84  *  Each GPU thread works with a single particle. Empty declaration is needed to
  85  *  avoid "no previous prototype for function" clang warning.
  86  *
  87  *  \todo Check if the force should be set to zero here.
  88  *  \todo This kernel can also accumulate incidental temperatures for each atom.
  89  *
  90  * \tparam        numTempScaleValues               The number of different T-couple values.
  91  * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
  92  * \param[in]     numAtoms                         Total number of atoms.
  93  * \param[in,out] gm_x                             Coordinates to update upon integration.
  94  * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
  95  * \param[in,out] gm_v                             Velocities to update.
  96  * \param[in]     gm_f                             Atomic forces.
  97  * \param[in]     gm_inverseMasses                 Reciprocal masses.
  98  * \param[in]     dt                               Timestep.
  99  * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
 100  * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
 101  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
 102  */
 103 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 104 __launch_bounds__(c_maxThreadsPerBlock) __global__
 105         void leapfrog_kernel(const int numAtoms,
 106                              float3* __restrict__ gm_x,
 107                              float3* __restrict__ gm_xp,
 108                              float3* __restrict__ gm_v,
 109                              const float3* __restrict__ gm_f,
 110                              const float* __restrict__ gm_inverseMasses,
 111                              const float dt,
 112                              const float* __restrict__ gm_lambdas,
 113                              const unsigned short* __restrict__ gm_tempScaleGroups,
 114                              const float3 prVelocityScalingMatrixDiagonal);
 115
 116 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 117 __launch_bounds__(c_maxThreadsPerBlock) __global__
 118         void leapfrog_kernel(const int numAtoms,
 119                              float3* __restrict__ gm_x,
 120                              float3* __restrict__ gm_xp,
 121                              float3* __restrict__ gm_v,
 122                              const float3* __restrict__ gm_f,
 123                              const float* __restrict__ gm_inverseMasses,
 124                              const float dt,
 125                              const float* __restrict__ gm_lambdas,
 126                              const unsigned short* __restrict__ gm_tempScaleGroups,
 127                              const float3 prVelocityScalingMatrixDiagonal)
 128 {
 129     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 130     if (threadIndex < numAtoms)
 131     {
 132         float3 x    = gm_x[threadIndex];
 133         float3 v    = gm_v[threadIndex];
 134         float3 f    = gm_f[threadIndex];
 135         float  im   = gm_inverseMasses[threadIndex];
 136         float  imdt = im * dt;
 137
 138         // Swapping places for xp and x so that the x will contain the updated coordinates and xp - the
 139         // coordinates before update. This should be taken into account when (if) constraints are applied
 140         // after the update: x and xp have to be passed to constraints in the 'wrong' order.
 141         // TODO: Issue #3727
 142         gm_xp[threadIndex] = x;
 143
 144         if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
 145         {
 146             float3 vp = v;
 147
 148             if (numTempScaleValues != NumTempScaleValues::None)
 149             {
 150                 float lambda = 1.0F;
 151                 if (numTempScaleValues == NumTempScaleValues::Single)
 152                 {
 153                     lambda = gm_lambdas[0];
 154                 }
 155                 else if (numTempScaleValues == NumTempScaleValues::Multiple)
 156                 {
 157                     int tempScaleGroup = gm_tempScaleGroups[threadIndex];
 158                     lambda             = gm_lambdas[tempScaleGroup];
 159                 }
 160                 vp *= lambda;
 161             }
 162
 163             if (velocityScaling == VelocityScalingType::Diagonal)
 164             {
 165                 vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
 166                 vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
 167                 vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
 168             }
 169
 170             v = vp;
 171         }
 172
 173         v += f * imdt;
 174
 175         x += v * dt;
 176         gm_v[threadIndex] = v;
 177         gm_x[threadIndex] = x;
 178     }
 179     return;
 180 }
 181
 182 /*! \brief Select templated kernel.
 183  *
 184  * Returns pointer to a CUDA kernel based on the number of temperature coupling groups and
 185  * whether or not the temperature and(or) pressure coupling is enabled.
 186  *
 187  * \param[in]  doTemperatureScaling   If the kernel with temperature coupling velocity scaling
 188  *                                    should be selected.
 189  * \param[in]  numTempScaleValues     Number of temperature coupling groups in the system.
 190  * \param[in]  prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
 191  *
 192  * \retrun                         Pointer to CUDA kernel
 193  */
 194 inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
 195                                     int                 numTempScaleValues,
 196                                     VelocityScalingType prVelocityScalingType)
 197 {
 198     // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
 199     GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
 200                "Temperature coupling was requested with no temperature coupling groups.");
 201     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 202
 203     if (prVelocityScalingType == VelocityScalingType::None)
 204     {
 205         if (!doTemperatureScaling)
 206         {
 207             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 208         }
 209         else if (numTempScaleValues == 1)
 210         {
 211             kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::None>;
 212         }
 213         else if (numTempScaleValues > 1)
 214         {
 215             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
 216         }
 217     }
 218     else if (prVelocityScalingType == VelocityScalingType::Diagonal)
 219     {
 220         if (!doTemperatureScaling)
 221         {
 222             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
 223         }
 224         else if (numTempScaleValues == 1)
 225         {
 226             kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::Diagonal>;
 227         }
 228         else if (numTempScaleValues > 1)
 229         {
 230             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
 231         }
 232     }
 233     else
 234     {
 235         GMX_RELEASE_ASSERT(false,
 236                            "Only isotropic Parrinello-Rahman pressure coupling is supported.");
 237     }
 238     return kernelPtr;
 239 }
 240
 241 void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
 242                             DeviceBuffer<Float3>              d_xp,
 243                             DeviceBuffer<Float3>              d_v,
 244                             const DeviceBuffer<Float3>        d_f,
 245                             const real                        dt,
 246                             const bool                        doTemperatureScaling,
 247                             gmx::ArrayRef<const t_grp_tcstat> tcstat,
 248                             const bool                        doParrinelloRahman,
 249                             const float                       dtPressureCouple,
 250                             const matrix                      prVelocityScalingMatrix)
 251 {
 252
 253     ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
 254
 255     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 256     if (doTemperatureScaling || doParrinelloRahman)
 257     {
 258         if (doTemperatureScaling)
 259         {
 260             GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
 261                        "Number of temperature scaling factors changed since it was set for the "
 262                        "last time.");
 263             for (int i = 0; i < numTempScaleValues_; i++)
 264             {
 265                 h_lambdas_[i] = tcstat[i].lambda;
 266             }
 267             copyToDeviceBuffer(&d_lambdas_,
 268                                h_lambdas_.data(),
 269                                0,
 270                                numTempScaleValues_,
 271                                deviceStream_,
 272                                GpuApiCallBehavior::Async,
 273                                nullptr);
 274         }
 275         VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
 276         if (doParrinelloRahman)
 277         {
 278             prVelocityScalingType = VelocityScalingType::Diagonal;
 279             GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
 280                                && prVelocityScalingMatrix[ZZ][YY] == 0
 281                                && prVelocityScalingMatrix[XX][YY] == 0
 282                                && prVelocityScalingMatrix[XX][ZZ] == 0
 283                                && prVelocityScalingMatrix[YY][ZZ] == 0,
 284                        "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
 285                        "in GPU version of Leap-Frog integrator.");
 286             prVelocityScalingMatrixDiagonal_ =
 287                     Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
 288                             dtPressureCouple * prVelocityScalingMatrix[YY][YY],
 289                             dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
 290         }
 291         kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
 292     }
 293
 294     // Checking the buffer types against the kernel argument types
 295     static_assert(sizeof(*d_inverseMasses_) == sizeof(float), "Incompatible types");
 296     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
 297                                                       kernelLaunchConfig_,
 298                                                       &numAtoms_,
 299                                                       asFloat3Pointer(&d_x),
 300                                                       asFloat3Pointer(&d_xp),
 301                                                       asFloat3Pointer(&d_v),
 302                                                       asFloat3Pointer(&d_f),
 303                                                       &d_inverseMasses_,
 304                                                       &dt,
 305                                                       &d_lambdas_,
 306                                                       &d_tempScaleGroups_,
 307                                                       &prVelocityScalingMatrixDiagonal_);
 308     launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);
 309
 310     return;
 311 }
 312
 313 LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
 314                          const DeviceStream&  deviceStream,
 315                          const int            numTempScaleValues) :
 316     deviceContext_(deviceContext),
 317     deviceStream_(deviceStream),
 318     numTempScaleValues_(numTempScaleValues)
 319 {
 320     numAtoms_ = 0;
 321
 322     changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
 323
 324     kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
 325     kernelLaunchConfig_.blockSize[1]     = 1;
 326     kernelLaunchConfig_.blockSize[2]     = 1;
 327     kernelLaunchConfig_.sharedMemorySize = 0;
 328
 329     // If the temperature coupling is enabled, we need to make space for scaling factors
 330     if (numTempScaleValues_ > 0)
 331     {
 332         h_lambdas_.resize(numTempScaleValues_);
 333         reallocateDeviceBuffer(
 334                 &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
 335     }
 336 }
 337
 338 LeapFrogGpu::~LeapFrogGpu()
 339 {
 340     freeDeviceBuffer(&d_inverseMasses_);
 341 }
 342
 343 void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
 344 {
 345     numAtoms_                       = numAtoms;
 346     kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
 347
 348     reallocateDeviceBuffer(
 349             &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
 350     copyToDeviceBuffer(
 351             &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 352
 353     // Temperature scale group map only used if there are more then one group
 354     if (numTempScaleValues_ > 1)
 355     {
 356         reallocateDeviceBuffer(
 357                 &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
 358         copyToDeviceBuffer(
 359                 &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 360     }
 361 }
 362
 363 } // namespace gmx