src/gromacs/mdlib/leapfrog_gpu.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements Leap-Frog using CUDA
  38  *
  39  * This file contains implementation of basic Leap-Frog integrator
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  * \ingroup module_mdlib
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "leapfrog_gpu.h"
  50
  51 #include <assert.h>
  52 #include <stdio.h>
  53
  54 #include <cmath>
  55
  56 #include <algorithm>
  57
  58 #include "gromacs/gpu_utils/cudautils.cuh"
  59 #include "gromacs/gpu_utils/devicebuffer.h"
  60 #include "gromacs/gpu_utils/vectype_ops.cuh"
  61 #include "gromacs/math/vec.h"
  62 #include "gromacs/mdtypes/group.h"
  63 #include "gromacs/pbcutil/pbc.h"
  64 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  65 #include "gromacs/utility/arrayref.h"
  66
  67 namespace gmx
  68 {
  69
  70 /*!\brief Number of CUDA threads in a block
  71  *
  72  * \todo Check if using smaller block size will lead to better performance.
  73  */
  74 constexpr static int c_threadsPerBlock = 256;
  75 //! Maximum number of threads in a block (for __launch_bounds__)
  76 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  77
  78 /*! \brief Main kernel for Leap-Frog integrator.
  79  *
  80  *  The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
  81  *   further use in constraints.
  82  *
  83  *  Each GPU thread works with a single particle. Empty declaration is needed to
  84  *  avoid "no previous prototype for function" clang warning.
  85  *
  86  *  \todo Check if the force should be set to zero here.
  87  *  \todo This kernel can also accumulate incidental temperatures for each atom.
  88  *
  89  * \tparam        numTempScaleValues               The number of different T-couple values.
  90  * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
  91  * \param[in]     numAtoms                         Total number of atoms.
  92  * \param[in,out] gm_x                             Coordinates to update upon integration.
  93  * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
  94  * \param[in,out] gm_v                             Velocities to update.
  95  * \param[in]     gm_f                             Atomic forces.
  96  * \param[in]     gm_inverseMasses                 Reciprocal masses.
  97  * \param[in]     dt                               Timestep.
  98  * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
  99  * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
 100  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
 101  */
 102 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 103 __launch_bounds__(c_maxThreadsPerBlock) __global__
 104         void leapfrog_kernel(const int numAtoms,
 105                              float3* __restrict__ gm_x,
 106                              float3* __restrict__ gm_xp,
 107                              float3* __restrict__ gm_v,
 108                              const float3* __restrict__ gm_f,
 109                              const float* __restrict__ gm_inverseMasses,
 110                              const float dt,
 111                              const float* __restrict__ gm_lambdas,
 112                              const unsigned short* __restrict__ gm_tempScaleGroups,
 113                              const float3 prVelocityScalingMatrixDiagonal);
 114
 115 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 116 __launch_bounds__(c_maxThreadsPerBlock) __global__
 117         void leapfrog_kernel(const int numAtoms,
 118                              float3* __restrict__ gm_x,
 119                              float3* __restrict__ gm_xp,
 120                              float3* __restrict__ gm_v,
 121                              const float3* __restrict__ gm_f,
 122                              const float* __restrict__ gm_inverseMasses,
 123                              const float dt,
 124                              const float* __restrict__ gm_lambdas,
 125                              const unsigned short* __restrict__ gm_tempScaleGroups,
 126                              const float3 prVelocityScalingMatrixDiagonal)
 127 {
 128     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 129     if (threadIndex < numAtoms)
 130     {
 131         float3 x    = gm_x[threadIndex];
 132         float3 v    = gm_v[threadIndex];
 133         float3 f    = gm_f[threadIndex];
 134         float  im   = gm_inverseMasses[threadIndex];
 135         float  imdt = im * dt;
 136
 137         // Swapping places for xp and x so that the x will contain the updated coordinates and xp - the
 138         // coordinates before update. This should be taken into account when (if) constraints are applied
 139         // after the update: x and xp have to be passed to constraints in the 'wrong' order.
 140         // TODO: Issue #3727
 141         gm_xp[threadIndex] = x;
 142
 143         if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
 144         {
 145             float3 vp = v;
 146
 147             if (numTempScaleValues != NumTempScaleValues::None)
 148             {
 149                 float lambda = 1.0F;
 150                 if (numTempScaleValues == NumTempScaleValues::Single)
 151                 {
 152                     lambda = gm_lambdas[0];
 153                 }
 154                 else if (numTempScaleValues == NumTempScaleValues::Multiple)
 155                 {
 156                     int tempScaleGroup = gm_tempScaleGroups[threadIndex];
 157                     lambda             = gm_lambdas[tempScaleGroup];
 158                 }
 159                 vp *= lambda;
 160             }
 161
 162             if (velocityScaling == VelocityScalingType::Diagonal)
 163             {
 164                 vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
 165                 vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
 166                 vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
 167             }
 168
 169             v = vp;
 170         }
 171
 172         v += f * imdt;
 173
 174         x += v * dt;
 175         gm_v[threadIndex] = v;
 176         gm_x[threadIndex] = x;
 177     }
 178     return;
 179 }
 180
 181 /*! \brief Select templated kernel.
 182  *
 183  * Returns pointer to a CUDA kernel based on the number of temperature coupling groups and
 184  * whether or not the temperature and(or) pressure coupling is enabled.
 185  *
 186  * \param[in]  doTemperatureScaling   If the kernel with temperature coupling velocity scaling
 187  *                                    should be selected.
 188  * \param[in]  numTempScaleValues     Number of temperature coupling groups in the system.
 189  * \param[in]  prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
 190  *
 191  * \retrun                         Pointer to CUDA kernel
 192  */
 193 inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
 194                                     int                 numTempScaleValues,
 195                                     VelocityScalingType prVelocityScalingType)
 196 {
 197     // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
 198     GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
 199                "Temperature coupling was requested with no temperature coupling groups.");
 200     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 201
 202     if (prVelocityScalingType == VelocityScalingType::None)
 203     {
 204         if (!doTemperatureScaling)
 205         {
 206             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 207         }
 208         else if (numTempScaleValues == 1)
 209         {
 210             kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::None>;
 211         }
 212         else if (numTempScaleValues > 1)
 213         {
 214             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
 215         }
 216     }
 217     else if (prVelocityScalingType == VelocityScalingType::Diagonal)
 218     {
 219         if (!doTemperatureScaling)
 220         {
 221             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
 222         }
 223         else if (numTempScaleValues == 1)
 224         {
 225             kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::Diagonal>;
 226         }
 227         else if (numTempScaleValues > 1)
 228         {
 229             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
 230         }
 231     }
 232     else
 233     {
 234         GMX_RELEASE_ASSERT(false,
 235                            "Only isotropic Parrinello-Rahman pressure coupling is supported.");
 236     }
 237     return kernelPtr;
 238 }
 239
 240 void LeapFrogGpu::integrate(const DeviceBuffer<float3>        d_x,
 241                             DeviceBuffer<float3>              d_xp,
 242                             DeviceBuffer<float3>              d_v,
 243                             const DeviceBuffer<float3>        d_f,
 244                             const real                        dt,
 245                             const bool                        doTemperatureScaling,
 246                             gmx::ArrayRef<const t_grp_tcstat> tcstat,
 247                             const bool                        doParrinelloRahman,
 248                             const float                       dtPressureCouple,
 249                             const matrix                      prVelocityScalingMatrix)
 250 {
 251
 252     ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
 253
 254     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 255     if (doTemperatureScaling || doParrinelloRahman)
 256     {
 257         if (doTemperatureScaling)
 258         {
 259             GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
 260                        "Number of temperature scaling factors changed since it was set for the "
 261                        "last time.");
 262             for (int i = 0; i < numTempScaleValues_; i++)
 263             {
 264                 h_lambdas_[i] = tcstat[i].lambda;
 265             }
 266             copyToDeviceBuffer(&d_lambdas_,
 267                                h_lambdas_.data(),
 268                                0,
 269                                numTempScaleValues_,
 270                                deviceStream_,
 271                                GpuApiCallBehavior::Async,
 272                                nullptr);
 273         }
 274         VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
 275         if (doParrinelloRahman)
 276         {
 277             prVelocityScalingType = VelocityScalingType::Diagonal;
 278             GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
 279                                && prVelocityScalingMatrix[ZZ][YY] == 0
 280                                && prVelocityScalingMatrix[XX][YY] == 0
 281                                && prVelocityScalingMatrix[XX][ZZ] == 0
 282                                && prVelocityScalingMatrix[YY][ZZ] == 0,
 283                        "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
 284                        "in GPU version of Leap-Frog integrator.");
 285             prVelocityScalingMatrixDiagonal_ =
 286                     make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
 287                                 dtPressureCouple * prVelocityScalingMatrix[YY][YY],
 288                                 dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
 289         }
 290         kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
 291     }
 292
 293     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
 294                                                       kernelLaunchConfig_,
 295                                                       &numAtoms_,
 296                                                       &d_x,
 297                                                       &d_xp,
 298                                                       &d_v,
 299                                                       &d_f,
 300                                                       &d_inverseMasses_,
 301                                                       &dt,
 302                                                       &d_lambdas_,
 303                                                       &d_tempScaleGroups_,
 304                                                       &prVelocityScalingMatrixDiagonal_);
 305     launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);
 306
 307     return;
 308 }
 309
 310 LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
 311     deviceContext_(deviceContext),
 312     deviceStream_(deviceStream)
 313 {
 314     numAtoms_ = 0;
 315
 316     changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
 317
 318     kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
 319     kernelLaunchConfig_.blockSize[1]     = 1;
 320     kernelLaunchConfig_.blockSize[2]     = 1;
 321     kernelLaunchConfig_.sharedMemorySize = 0;
 322 }
 323
 324 LeapFrogGpu::~LeapFrogGpu()
 325 {
 326     freeDeviceBuffer(&d_inverseMasses_);
 327 }
 328
 329 void LeapFrogGpu::set(const int             numAtoms,
 330                       const real*           inverseMasses,
 331                       const int             numTempScaleValues,
 332                       const unsigned short* tempScaleGroups)
 333 {
 334     numAtoms_                       = numAtoms;
 335     kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
 336
 337     numTempScaleValues_ = numTempScaleValues;
 338
 339     reallocateDeviceBuffer(
 340             &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
 341     copyToDeviceBuffer(
 342             &d_inverseMasses_, (float*)inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 343
 344     // Temperature scale group map only used if there are more then one group
 345     if (numTempScaleValues > 1)
 346     {
 347         reallocateDeviceBuffer(
 348                 &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
 349         copyToDeviceBuffer(
 350                 &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 351     }
 352
 353     // If the temperature coupling is enabled, we need to make space for scaling factors
 354     if (numTempScaleValues_ > 0)
 355     {
 356         h_lambdas_.resize(numTempScaleValues);
 357         reallocateDeviceBuffer(
 358                 &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
 359     }
 360 }
 361
 362 } // namespace gmx