src/gromacs/mdlib/leapfrog_gpu_sycl.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements Leap-Frog using SYCL
  38  *
  39  * This file contains implementation of basic Leap-Frog integrator
  40  * using SYCL, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \author Artem Zhmurov <zhmurov@gmail.com>
  44  * \author Andrey Alekseenko <al42and@gmail.com>
  45  *
  46  * \ingroup module_mdlib
  47  */
  48 #include "gmxpre.h"
  49
  50 #include "gromacs/gpu_utils/devicebuffer.h"
  51 #include "gromacs/gpu_utils/gmxsycl.h"
  52 #include "gromacs/math/vec.h"
  53 #include "gromacs/mdlib/leapfrog_gpu.h"
  54 #include "gromacs/mdtypes/group.h"
  55 #include "gromacs/utility/arrayref.h"
  56 #include "gromacs/utility/fatalerror.h"
  57 #include "gromacs/utility/template_mp.h"
  58
  59 namespace gmx
  60 {
  61
  62 using cl::sycl::access::mode;
  63
  64 /*! \brief Main kernel for the Leap-Frog integrator.
  65  *
  66  *  The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
  67  *   further use in constraints.
  68  *
  69  *  Each GPU thread works with a single particle.
  70  *
  71  * \tparam        numTempScaleValues               The number of different T-couple values.
  72  * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
  73  * \param         cgh                              SYCL's command group handler.
  74  * \param[in,out] a_x                              Coordinates to update upon integration.
  75  * \param[out]    a_xp                             A copy of the coordinates before the integration (for constraints).
  76  * \param[in,out] a_v                              Velocities to update.
  77  * \param[in]     a_f                              Atomic forces.
  78  * \param[in]     a_inverseMasses                  Reciprocal masses.
  79  * \param[in]     dt                               Timestep.
  80  * \param[in]     a_lambdas                        Temperature scaling factors (one per group).
  81  * \param[in]     a_tempScaleGroups                Mapping of atoms into groups.
  82  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
  83  */
  84 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  85 auto leapFrogKernel(
  86         cl::sycl::handler&                          cgh,
  87         DeviceAccessor<Float3, mode::read_write>    a_x,
  88         DeviceAccessor<Float3, mode::discard_write> a_xp,
  89         DeviceAccessor<Float3, mode::read_write>    a_v,
  90         DeviceAccessor<Float3, mode::read>          a_f,
  91         DeviceAccessor<float, mode::read>           a_inverseMasses,
  92         float                                       dt,
  93         OptionalAccessor<float, mode::read, numTempScaleValues != NumTempScaleValues::None> a_lambdas,
  94         OptionalAccessor<unsigned short, mode::read, numTempScaleValues == NumTempScaleValues::Multiple> a_tempScaleGroups,
  95         Float3 prVelocityScalingMatrixDiagonal)
  96 {
  97     cgh.require(a_x);
  98     cgh.require(a_xp);
  99     cgh.require(a_v);
 100     cgh.require(a_f);
 101     cgh.require(a_inverseMasses);
 102     if constexpr (numTempScaleValues != NumTempScaleValues::None)
 103     {
 104         cgh.require(a_lambdas);
 105     }
 106     if constexpr (numTempScaleValues == NumTempScaleValues::Multiple)
 107     {
 108         cgh.require(a_tempScaleGroups);
 109     }
 110
 111     return [=](cl::sycl::id<1> itemIdx) {
 112         const Float3 x    = a_x[itemIdx];
 113         const Float3 v    = a_v[itemIdx];
 114         const Float3 f    = a_f[itemIdx];
 115         const float  im   = a_inverseMasses[itemIdx];
 116         const float  imdt = im * dt;
 117
 118         // Swapping places for xp and x so that the x will contain the updated coordinates and xp -
 119         // the coordinates before update. This should be taken into account when (if) constraints
 120         // are applied after the update: x and xp have to be passed to constraints in the 'wrong'
 121         // order. See Issue #3727
 122         a_xp[itemIdx] = x;
 123
 124         const float lambda = [=]() {
 125             if constexpr (numTempScaleValues == NumTempScaleValues::None)
 126             {
 127                 return 1.0F;
 128             }
 129             else if constexpr (numTempScaleValues == NumTempScaleValues::Single)
 130             {
 131                 return a_lambdas[0];
 132             }
 133             else if constexpr (numTempScaleValues == NumTempScaleValues::Multiple)
 134             {
 135                 const int tempScaleGroup = a_tempScaleGroups[itemIdx];
 136                 return a_lambdas[tempScaleGroup];
 137             }
 138         }();
 139
 140         const Float3 prVelocityDelta = [=]() {
 141             if constexpr (velocityScaling == VelocityScalingType::Diagonal)
 142             {
 143                 return Float3{ prVelocityScalingMatrixDiagonal[0] * v[0],
 144                                prVelocityScalingMatrixDiagonal[1] * v[1],
 145                                prVelocityScalingMatrixDiagonal[2] * v[2] };
 146             }
 147             else if constexpr (velocityScaling == VelocityScalingType::None)
 148             {
 149                 return Float3{ 0, 0, 0 };
 150             }
 151         }();
 152
 153         const Float3 v_new = v * lambda - prVelocityDelta + f * imdt;
 154         a_v[itemIdx]       = v_new;
 155         a_x[itemIdx]       = x + v_new * dt;
 156     };
 157 }
 158
 159 // SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
 160 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 161 class LeapFrogKernelName;
 162
 163 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling, class... Args>
 164 static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args)
 165 {
 166     // Should not be needed for SYCL2020.
 167     using kernelNameType = LeapFrogKernelName<numTempScaleValues, velocityScaling>;
 168
 169     const cl::sycl::range<1> rangeAllAtoms(numAtoms);
 170     cl::sycl::queue          q = deviceStream.stream();
 171
 172     cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
 173         auto kernel =
 174                 leapFrogKernel<numTempScaleValues, velocityScaling>(cgh, std::forward<Args>(args)...);
 175         cgh.parallel_for<kernelNameType>(rangeAllAtoms, kernel);
 176     });
 177
 178     return e;
 179 }
 180
 181 static NumTempScaleValues getTempScalingType(bool doTemperatureScaling, int numTempScaleValues)
 182 {
 183     if (!doTemperatureScaling)
 184     {
 185         return NumTempScaleValues::None;
 186     }
 187     else if (numTempScaleValues == 1)
 188     {
 189         return NumTempScaleValues::Single;
 190     }
 191     else if (numTempScaleValues > 1)
 192     {
 193         return NumTempScaleValues::Multiple;
 194     }
 195     else
 196     {
 197         gmx_incons("Temperature coupling was requested with no temperature coupling groups.");
 198     }
 199 }
 200
 201 /*! \brief Select templated kernel and launch it. */
 202 template<class... Args>
 203 static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues  tempScalingType,
 204                                                    VelocityScalingType prVelocityScalingType,
 205                                                    Args&&... args)
 206 {
 207     GMX_ASSERT(prVelocityScalingType == VelocityScalingType::None
 208                        || prVelocityScalingType == VelocityScalingType::Diagonal,
 209                "Only isotropic Parrinello-Rahman pressure coupling is supported.");
 210
 211     return dispatchTemplatedFunction(
 212             [&](auto tempScalingType_, auto prScalingType_) {
 213                 return launchLeapFrogKernel<tempScalingType_, prScalingType_>(std::forward<Args>(args)...);
 214             },
 215             tempScalingType,
 216             prVelocityScalingType);
 217 }
 218
 219 void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
 220                             DeviceBuffer<Float3>              d_xp,
 221                             DeviceBuffer<Float3>              d_v,
 222                             DeviceBuffer<Float3>              d_f,
 223                             const real                        dt,
 224                             const bool                        doTemperatureScaling,
 225                             gmx::ArrayRef<const t_grp_tcstat> tcstat,
 226                             const bool                        doParrinelloRahman,
 227                             const float                       dtPressureCouple,
 228                             const matrix                      prVelocityScalingMatrix)
 229 {
 230     if (doTemperatureScaling)
 231     {
 232         GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
 233                    "Number of temperature scaling factors changed since it was set for the "
 234                    "last time.");
 235         GMX_RELEASE_ASSERT(gmx::ssize(h_lambdas_) == numTempScaleValues_,
 236                            "Number of temperature scaling factors changed since it was set for the "
 237                            "last time.");
 238         /* We could use host accessors here, without h_lambdas_.
 239          * According to a quick test, host accessor is slightly faster when using DPC++ and
 240          * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
 241          * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
 242
 243         for (int i = 0; i < numTempScaleValues_; i++)
 244         {
 245             h_lambdas_[i] = tcstat[i].lambda;
 246         }
 247         copyToDeviceBuffer(&d_lambdas_,
 248                            h_lambdas_.data(),
 249                            0,
 250                            numTempScaleValues_,
 251                            deviceStream_,
 252                            GpuApiCallBehavior::Async,
 253                            nullptr);
 254     }
 255     NumTempScaleValues tempVelocityScalingType =
 256             getTempScalingType(doTemperatureScaling, numTempScaleValues_);
 257
 258     VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
 259     if (doParrinelloRahman)
 260     {
 261         prVelocityScalingType = VelocityScalingType::Diagonal;
 262         GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
 263                            && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
 264                            && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
 265                    "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
 266                    "in GPU version of Leap-Frog integrator.");
 267         prVelocityScalingMatrixDiagonal_ = dtPressureCouple
 268                                            * Float3{ prVelocityScalingMatrix[XX][XX],
 269                                                      prVelocityScalingMatrix[YY][YY],
 270                                                      prVelocityScalingMatrix[ZZ][ZZ] };
 271     }
 272
 273     launchLeapFrogKernel(tempVelocityScalingType,
 274                          prVelocityScalingType,
 275                          deviceStream_,
 276                          numAtoms_,
 277                          d_x,
 278                          d_xp,
 279                          d_v,
 280                          d_f,
 281                          d_inverseMasses_,
 282                          dt,
 283                          d_lambdas_,
 284                          d_tempScaleGroups_,
 285                          prVelocityScalingMatrixDiagonal_);
 286 }
 287
 288 LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
 289                          const DeviceStream&  deviceStream,
 290                          const int            numTempScaleValues) :
 291     deviceContext_(deviceContext),
 292     deviceStream_(deviceStream),
 293     numAtoms_(0),
 294     numTempScaleValues_(numTempScaleValues)
 295 {
 296     // If the temperature coupling is enabled, we need to make space for scaling factors
 297     if (numTempScaleValues_ > 0)
 298     {
 299         h_lambdas_.resize(numTempScaleValues_);
 300         reallocateDeviceBuffer(
 301                 &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
 302     }
 303 }
 304
 305 LeapFrogGpu::~LeapFrogGpu()
 306 {
 307     freeDeviceBuffer(&d_inverseMasses_);
 308 }
 309
 310 void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
 311 {
 312     numAtoms_ = numAtoms;
 313
 314     reallocateDeviceBuffer(
 315             &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
 316     copyToDeviceBuffer(
 317             &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 318
 319     // Temperature scale group map only used if there are more then one group
 320     if (numTempScaleValues_ > 1)
 321     {
 322         reallocateDeviceBuffer(
 323                 &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
 324         copyToDeviceBuffer(
 325                 &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 326     }
 327 }
 328
 329 } // namespace gmx