src/gromacs/mdlib/leapfrog_gpu.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements Leap-Frog using CUDA
  38  *
  39  * This file contains implementation of basic Leap-Frog integrator
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \author Artem Zhmurov <zhmurov@gmail.com>
  44  *
  45  * \ingroup module_mdlib
  46  */
  47 #include "gmxpre.h"
  48
  49 #include "leapfrog_gpu.cuh"
  50
  51 #include <assert.h>
  52 #include <stdio.h>
  53
  54 #include <cmath>
  55
  56 #include <algorithm>
  57
  58 #include "gromacs/gpu_utils/cudautils.cuh"
  59 #include "gromacs/gpu_utils/devicebuffer.h"
  60 #include "gromacs/gpu_utils/vectype_ops.cuh"
  61 #include "gromacs/math/vec.h"
  62 #include "gromacs/pbcutil/pbc.h"
  63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  64 #include "gromacs/utility/arrayref.h"
  65
  66 namespace gmx
  67 {
  68
  69 /*!\brief Number of CUDA threads in a block
  70  *
  71  * \todo Check if using smaller block size will lead to better prformance.
  72  */
  73 constexpr static int c_threadsPerBlock = 256;
  74 //! Maximum number of threads in a block (for __launch_bounds__)
  75 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  76
  77 /*! \brief Sets the number of different temperature coupling values
  78  *
  79  *  This is needed to template the kernel
  80  *  \todo Unify with similar enum in CPU update module
  81  */
  82 enum class NumTempScaleValues
  83 {
  84     None,    //!< No temperature coupling
  85     Single,  //!< Single T-scaling value (one group)
  86     Multiple //!< Multiple T-scaling values, need to use T-group indices
  87 };
  88
  89 /*! \brief Different variants of the Parrinello-Rahman velocity scaling
  90  *
  91  *  This is needed to template the kernel
  92  *  \todo Unify with similar enum in CPU update module
  93  */
  94 enum class VelocityScalingType
  95 {
  96     None,     //!< Do not apply velocity scaling (not a PR-coupling run or step)
  97     Diagonal, //!< Apply velocity scaling using a diagonal matrix
  98     Full      //!< Apply velocity scaling using a full matrix
  99 };
 100
 101 /*! \brief Main kernel for Leap-Frog integrator.
 102  *
 103  *  The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
 104  *   further use in constraints.
 105  *
 106  *  Each GPU thread works with a single particle. Empty declaration is needed to
 107  *  avoid "no previous prototype for function" clang warning.
 108  *
 109  *  \todo Check if the force should be set to zero here.
 110  *  \todo This kernel can also accumulate incidental temperatures for each atom.
 111  *
 112  * \tparam        numTempScaleValues               The number of different T-couple values.
 113  * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
 114  * \param[in]     numAtoms                         Total number of atoms.
 115  * \param[in,out] gm_x                             Coordinates to update upon integration.
 116  * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
 117  * \param[in,out] gm_v                             Velocities to update.
 118  * \param[in]     gm_f                             Atomic forces.
 119  * \param[in]     gm_inverseMasses                 Reciprocal masses.
 120  * \param[in]     dt                               Timestep.
 121  * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
 122  * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
 123  * \param[in]     dtPressureCouple                 Time step for pressure coupling
 124  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
 125  */
 126 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 127 __launch_bounds__(c_maxThreadsPerBlock) __global__
 128         void leapfrog_kernel(const int numAtoms,
 129                              float3* __restrict__ gm_x,
 130                              float3* __restrict__ gm_xp,
 131                              float3* __restrict__ gm_v,
 132                              const float3* __restrict__ gm_f,
 133                              const float* __restrict__ gm_inverseMasses,
 134                              const float dt,
 135                              const float* __restrict__ gm_lambdas,
 136                              const unsigned short* __restrict__ gm_tempScaleGroups,
 137                              const float3 prVelocityScalingMatrixDiagonal);
 138
 139 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 140 __launch_bounds__(c_maxThreadsPerBlock) __global__
 141         void leapfrog_kernel(const int numAtoms,
 142                              float3* __restrict__ gm_x,
 143                              float3* __restrict__ gm_xp,
 144                              float3* __restrict__ gm_v,
 145                              const float3* __restrict__ gm_f,
 146                              const float* __restrict__ gm_inverseMasses,
 147                              const float dt,
 148                              const float* __restrict__ gm_lambdas,
 149                              const unsigned short* __restrict__ gm_tempScaleGroups,
 150                              const float3 prVelocityScalingMatrixDiagonal)
 151 {
 152     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 153     if (threadIndex < numAtoms)
 154     {
 155         float3 x    = gm_x[threadIndex];
 156         float3 v    = gm_v[threadIndex];
 157         float3 f    = gm_f[threadIndex];
 158         float  im   = gm_inverseMasses[threadIndex];
 159         float  imdt = im * dt;
 160
 161         // Swapping places for xp and x so that the x will contain the updated coordinates and xp - the
 162         // coordinates before update. This should be taken into account when (if) constraints are applied
 163         // after the update: x and xp have to be passed to constraints in the 'wrong' order.
 164         gm_xp[threadIndex] = x;
 165
 166         if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
 167         {
 168             float3 vp = v;
 169
 170             if (numTempScaleValues != NumTempScaleValues::None)
 171             {
 172                 float lambda = 1.0F;
 173                 if (numTempScaleValues == NumTempScaleValues::Single)
 174                 {
 175                     lambda = gm_lambdas[0];
 176                 }
 177                 else if (numTempScaleValues == NumTempScaleValues::Multiple)
 178                 {
 179                     int tempScaleGroup = gm_tempScaleGroups[threadIndex];
 180                     lambda             = gm_lambdas[tempScaleGroup];
 181                 }
 182                 vp *= lambda;
 183             }
 184
 185             if (velocityScaling == VelocityScalingType::Diagonal)
 186             {
 187                 vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
 188                 vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
 189                 vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
 190             }
 191
 192             v = vp;
 193         }
 194
 195         v += f * imdt;
 196
 197         x += v * dt;
 198         gm_v[threadIndex] = v;
 199         gm_x[threadIndex] = x;
 200     }
 201     return;
 202 }
 203
 204 /*! \brief Select templated kernel.
 205  *
 206  * Returns pointer to a CUDA kernel based on the number of temperature coupling groups and
 207  * whether or not the temperature and(or) pressure coupling is enabled.
 208  *
 209  * \param[in]  doTemperatureScaling   If the kernel with temperature coupling velocity scaling
 210  *                                    should be selected.
 211  * \param[in]  numTempScaleValues     Number of temperature coupling groups in the system.
 212  * \param[in]  prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
 213  *
 214  * \retrun                         Pointer to CUDA kernel
 215  */
 216 inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
 217                                     int                 numTempScaleValues,
 218                                     VelocityScalingType prVelocityScalingType)
 219 {
 220     // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
 221     GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
 222                "Temperature coupling was requested with no temperature coupling groups.");
 223     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 224
 225     if (prVelocityScalingType == VelocityScalingType::None)
 226     {
 227         if (!doTemperatureScaling)
 228         {
 229             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 230         }
 231         else if (numTempScaleValues == 1)
 232         {
 233             kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::None>;
 234         }
 235         else if (numTempScaleValues > 1)
 236         {
 237             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
 238         }
 239     }
 240     else if (prVelocityScalingType == VelocityScalingType::Diagonal)
 241     {
 242         if (!doTemperatureScaling)
 243         {
 244             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
 245         }
 246         else if (numTempScaleValues == 1)
 247         {
 248             kernelPtr = leapfrog_kernel<NumTempScaleValues::Single, VelocityScalingType::Diagonal>;
 249         }
 250         else if (numTempScaleValues > 1)
 251         {
 252             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
 253         }
 254     }
 255     else
 256     {
 257         GMX_RELEASE_ASSERT(false,
 258                            "Only isotropic Parrinello-Rahman pressure coupling is supported.");
 259     }
 260     return kernelPtr;
 261 }
 262
 263 void LeapFrogGpu::integrate(const float3*                     d_x,
 264                             float3*                           d_xp,
 265                             float3*                           d_v,
 266                             const float3*                     d_f,
 267                             const real                        dt,
 268                             const bool                        doTemperatureScaling,
 269                             gmx::ArrayRef<const t_grp_tcstat> tcstat,
 270                             const bool                        doParrinelloRahman,
 271                             const float                       dtPressureCouple,
 272                             const matrix                      prVelocityScalingMatrix)
 273 {
 274
 275     ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
 276
 277     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 278     if (doTemperatureScaling || doParrinelloRahman)
 279     {
 280         if (doTemperatureScaling)
 281         {
 282             GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
 283                        "Number of temperature scaling factors changed since it was set for the "
 284                        "last time.");
 285             for (int i = 0; i < numTempScaleValues_; i++)
 286             {
 287                 h_lambdas_[i] = tcstat[i].lambda;
 288             }
 289             copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
 290                                commandStream_, GpuApiCallBehavior::Async, nullptr);
 291         }
 292         VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
 293         if (doParrinelloRahman)
 294         {
 295             prVelocityScalingType = VelocityScalingType::Diagonal;
 296             GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
 297                                && prVelocityScalingMatrix[ZZ][YY] == 0
 298                                && prVelocityScalingMatrix[XX][YY] == 0
 299                                && prVelocityScalingMatrix[XX][ZZ] == 0
 300                                && prVelocityScalingMatrix[YY][ZZ] == 0,
 301                        "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
 302                        "in GPU version of Leap-Frog integrator.");
 303             prVelocityScalingMatrixDiagonal_ =
 304                     make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
 305                                 dtPressureCouple * prVelocityScalingMatrix[YY][YY],
 306                                 dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
 307         }
 308         kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
 309     }
 310
 311     const auto kernelArgs = prepareGpuKernelArguments(
 312             kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
 313             &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
 314     launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
 315
 316     return;
 317 }
 318
 319 LeapFrogGpu::LeapFrogGpu(CommandStream commandStream) : commandStream_(commandStream)
 320 {
 321     numAtoms_ = 0;
 322
 323     changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
 324
 325     kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
 326     kernelLaunchConfig_.blockSize[1]     = 1;
 327     kernelLaunchConfig_.blockSize[2]     = 1;
 328     kernelLaunchConfig_.sharedMemorySize = 0;
 329     kernelLaunchConfig_.stream           = commandStream_;
 330 }
 331
 332 LeapFrogGpu::~LeapFrogGpu()
 333 {
 334     freeDeviceBuffer(&d_inverseMasses_);
 335 }
 336
 337 void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const unsigned short* tempScaleGroups)
 338 {
 339     numAtoms_                       = md.nr;
 340     kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
 341
 342     numTempScaleValues_ = numTempScaleValues;
 343
 344     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
 345                            &numInverseMassesAlloc_, nullptr);
 346     copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
 347                        GpuApiCallBehavior::Sync, nullptr);
 348
 349     // Temperature scale group map only used if there are more then one group
 350     if (numTempScaleValues > 1)
 351     {
 352         reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
 353                                &numTempScaleGroupsAlloc_, nullptr);
 354         copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
 355                            GpuApiCallBehavior::Sync, nullptr);
 356     }
 357
 358     // If the temperature coupling is enabled, we need to make space for scaling factors
 359     if (numTempScaleValues_ > 0)
 360     {
 361         h_lambdas_.resize(numTempScaleValues);
 362         reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, nullptr);
 363     }
 364 }
 365
 366 } // namespace gmx