src/gromacs/mdlib/lincs_gpu.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implementations of LINCS GPU class
  38  *
  39  * This file contains back-end agnostic implementation of LINCS GPU class.
  40  *
  41  * \author Artem Zhmurov <zhmurov@gmail.com>
  42  * \author Alan Gray <alang@nvidia.com>
  43  *
  44  * \ingroup module_mdlib
  45  */
  46 #include "gmxpre.h"
  47
  48 #include "lincs_gpu.h"
  49
  50 #include <assert.h>
  51 #include <stdio.h>
  52
  53 #include <cmath>
  54
  55 #include <algorithm>
  56
  57 #include "gromacs/gpu_utils/devicebuffer.h"
  58 #include "gromacs/gpu_utils/gputraits.h"
  59 #include "gromacs/math/functions.h"
  60 #include "gromacs/math/vec.h"
  61 #include "gromacs/mdlib/constr.h"
  62 #include "gromacs/mdlib/lincs_gpu_internal.h"
  63 #include "gromacs/pbcutil/pbc.h"
  64 #include "gromacs/topology/ifunc.h"
  65 #include "gromacs/topology/topology.h"
  66
  67 namespace gmx
  68 {
  69
  70 void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
  71                      DeviceBuffer<Float3>        d_xp,
  72                      const bool                  updateVelocities,
  73                      DeviceBuffer<Float3>        d_v,
  74                      const real                  invdt,
  75                      const bool                  computeVirial,
  76                      tensor                      virialScaled,
  77                      const PbcAiuc&              pbcAiuc)
  78 {
  79     // Early exit if no constraints
  80     if (kernelParams_.numConstraintsThreads == 0)
  81     {
  82         return;
  83     }
  84
  85     if (computeVirial)
  86     {
  87         // Fill with zeros so the values can be reduced to it
  88         // Only 6 values are needed because virial is symmetrical
  89         clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
  90     }
  91
  92     kernelParams_.pbcAiuc = pbcAiuc;
  93
  94     launchLincsGpuKernel(
  95             kernelParams_, d_x, d_xp, updateVelocities, d_v, invdt, computeVirial, deviceStream_);
  96
  97     if (computeVirial)
  98     {
  99         // Copy LINCS virial data and add it to the common virial
 100         copyFromDeviceBuffer(h_virialScaled_.data(),
 101                              &kernelParams_.d_virialScaled,
 102                              0,
 103                              6,
 104                              deviceStream_,
 105                              GpuApiCallBehavior::Sync,
 106                              nullptr);
 107
 108         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
 109         virialScaled[XX][XX] += h_virialScaled_[0];
 110         virialScaled[XX][YY] += h_virialScaled_[1];
 111         virialScaled[XX][ZZ] += h_virialScaled_[2];
 112
 113         virialScaled[YY][XX] += h_virialScaled_[1];
 114         virialScaled[YY][YY] += h_virialScaled_[3];
 115         virialScaled[YY][ZZ] += h_virialScaled_[4];
 116
 117         virialScaled[ZZ][XX] += h_virialScaled_[2];
 118         virialScaled[ZZ][YY] += h_virialScaled_[4];
 119         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
 120     }
 121 }
 122
 123 LincsGpu::LincsGpu(int                  numIterations,
 124                    int                  expansionOrder,
 125                    const DeviceContext& deviceContext,
 126                    const DeviceStream&  deviceStream) :
 127     deviceContext_(deviceContext), deviceStream_(deviceStream)
 128 {
 129     GMX_RELEASE_ASSERT(bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL),
 130                        "LINCS GPU is only implemented in CUDA and SYCL.");
 131     kernelParams_.numIterations  = numIterations;
 132     kernelParams_.expansionOrder = expansionOrder;
 133
 134     static_assert(sizeof(real) == sizeof(float),
 135                   "Real numbers should be in single precision in GPU code.");
 136     static_assert(
 137             gmx::isPowerOfTwo(c_threadsPerBlock),
 138             "Number of threads per block should be a power of two in order for reduction to work.");
 139
 140     allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
 141     h_virialScaled_.resize(6);
 142
 143     // The data arrays should be expanded/reallocated on first call of set() function.
 144     numConstraintsThreadsAlloc_ = 0;
 145     numAtomsAlloc_              = 0;
 146 }
 147
 148 LincsGpu::~LincsGpu()
 149 {
 150     freeDeviceBuffer(&kernelParams_.d_virialScaled);
 151
 152     if (numConstraintsThreadsAlloc_ > 0)
 153     {
 154         freeDeviceBuffer(&kernelParams_.d_constraints);
 155         freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 156
 157         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 158         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 159         freeDeviceBuffer(&kernelParams_.d_massFactors);
 160         freeDeviceBuffer(&kernelParams_.d_matrixA);
 161     }
 162     if (numAtomsAlloc_ > 0)
 163     {
 164         freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 165     }
 166 }
 167
 168 //! Helper type for discovering coupled constraints
 169 struct AtomsAdjacencyListElement
 170 {
 171     AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
 172                               const int indexOfConstraint,
 173                               const int signFactor) :
 174         indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
 175         indexOfConstraint_(indexOfConstraint),
 176         signFactor_(signFactor)
 177     {
 178     }
 179     //! The index of the other atom constrained to this atom.
 180     int indexOfSecondConstrainedAtom_;
 181     //! The index of this constraint in the container of constraints.
 182     int indexOfConstraint_;
 183     /*! \brief A multiplicative factor that indicates the relative
 184      * order of the atoms in the atom list.
 185      *
 186      * Used for computing the mass factor of this constraint
 187      * relative to any coupled constraints. */
 188     int signFactor_;
 189 };
 190 //! Constructs and returns an atom constraint adjacency list
 191 static std::vector<std::vector<AtomsAdjacencyListElement>>
 192 constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
 193 {
 194     const int                                           stride         = 1 + NRAL(F_CONSTR);
 195     const int                                           numConstraints = iatoms.ssize() / stride;
 196     std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
 197     for (int c = 0; c < numConstraints; c++)
 198     {
 199         int a1 = iatoms[stride * c + 1];
 200         int a2 = iatoms[stride * c + 2];
 201
 202         // Each constraint will be represented as a tuple, containing index of the second
 203         // constrained atom, index of the constraint and a sign that indicates the order of atoms in
 204         // which they are listed. Sign is needed to compute the mass factors.
 205         atomsAdjacencyList[a1].emplace_back(a2, c, +1);
 206         atomsAdjacencyList[a2].emplace_back(a1, c, -1);
 207     }
 208
 209     return atomsAdjacencyList;
 210 }
 211
 212 /*! \brief Helper function to go through constraints recursively.
 213  *
 214  *  For each constraint, counts the number of coupled constraints and stores the value in \p numCoupledConstraints array.
 215  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
 216  *  coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
 217  *  value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
 218  *  after it in the constraints array.
 219  *
 220  * \param[in]     a                   Atom index.
 221  * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
 222  *                                    the number of constraints (i) connected to it and (ii) located
 223  *                                    after it in memory. This array is filled by this recursive function.
 224  *                                    For a set of coupled constraints, only for the first one in this list
 225  *                                    the number of consecutive coupled constraints is needed: if there is
 226  *                                    not enough space for this set of constraints in the thread block,
 227  *                                    the group has to be moved to the next one.
 228  * \param[in]     atomsAdjacencyList  Stores information about connections between atoms.
 229  */
 230 inline int countCoupled(int           a,
 231                         ArrayRef<int> numCoupledConstraints,
 232                         ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
 233
 234 {
 235     int counted = 0;
 236     for (const auto& adjacentAtom : atomsAdjacencyList[a])
 237     {
 238         const int c2 = adjacentAtom.indexOfConstraint_;
 239         if (numCoupledConstraints[c2] == -1)
 240         {
 241             numCoupledConstraints[c2] = 0; // To indicate we've been here
 242             counted += 1
 243                        + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
 244                                       numCoupledConstraints,
 245                                       atomsAdjacencyList);
 246         }
 247     }
 248     return counted;
 249 }
 250
 251 /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
 252  *
 253  *  Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
 254  *  if it was not yet added. Then goes through all the constraints coupled to \p c
 255  *  and calls itself recursively. This ensures that all the coupled constraints will
 256  *  be added to neighboring locations in the final data structures on the device,
 257  *  hence mapping all coupled constraints to the same thread block. A value of -1 in
 258  *  the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
 259  *
 260  * \param[in]     iatoms              The list of constraints.
 261  * \param[in]     stride              Number of elements per constraint in \p iatoms.
 262  * \param[in]     atomsAdjacencyList  Information about connections between atoms.
 263  * \param[out]    splitMap            Map of sequential constraint indexes to indexes to be on the device
 264  * \param[in]     c                   Sequential index for constraint to consider adding.
 265  * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
 266  */
 267 inline void addWithCoupled(ArrayRef<const int>                                    iatoms,
 268                            const int                                              stride,
 269                            ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
 270                            ArrayRef<int>                                          splitMap,
 271                            const int                                              c,
 272                            int*                                                   currentMapIndex)
 273 {
 274     if (splitMap[c] == -1)
 275     {
 276         splitMap[c] = *currentMapIndex;
 277         (*currentMapIndex)++;
 278
 279         // Constraints, coupled through both atoms.
 280         for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
 281         {
 282             const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
 283             for (const auto& adjacentAtom : atomsAdjacencyList[a1])
 284             {
 285                 const int c2 = adjacentAtom.indexOfConstraint_;
 286                 if (c2 != c)
 287                 {
 288                     addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
 289                 }
 290             }
 291         }
 292     }
 293 }
 294
 295 /*! \brief Computes and returns how many constraints are coupled to each constraint
 296  *
 297  * Needed to introduce splits in data so that all coupled constraints will be computed in a
 298  * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
 299  * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
 300  * first index of the connected group of the constraints is needed later in the code, hence the
 301  * numCoupledConstraints vector is also used to keep track if the constrain was already counted.
 302  */
 303 static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
 304                                                    ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
 305 {
 306     const int        stride         = 1 + NRAL(F_CONSTR);
 307     const int        numConstraints = iatoms.ssize() / stride;
 308     std::vector<int> numCoupledConstraints(numConstraints, -1);
 309     for (int c = 0; c < numConstraints; c++)
 310     {
 311         const int a1 = iatoms[stride * c + 1];
 312         const int a2 = iatoms[stride * c + 2];
 313         if (numCoupledConstraints[c] == -1)
 314         {
 315             numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
 316                                        + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
 317         }
 318     }
 319
 320     return numCoupledConstraints;
 321 }
 322
 323 bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
 324 {
 325     for (const gmx_moltype_t& molType : mtop.moltype)
 326     {
 327         ArrayRef<const int> iatoms    = molType.ilist[F_CONSTR].iatoms;
 328         const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
 329         // Compute, how many constraints are coupled to each constraint
 330         const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
 331         for (const int numCoupled : numCoupledConstraints)
 332         {
 333             if (numCoupled > c_threadsPerBlock)
 334             {
 335                 return false;
 336             }
 337         }
 338     }
 339
 340     return true;
 341 }
 342
 343 void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass)
 344 {
 345     GMX_RELEASE_ASSERT(bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL),
 346                        "LINCS GPU is only implemented in CUDA and SYCL.");
 347     // List of constrained atoms (CPU memory)
 348     std::vector<AtomPair> constraintsHost;
 349     // Equilibrium distances for the constraints (CPU)
 350     std::vector<float> constraintsTargetLengthsHost;
 351     // Number of constraints, coupled with the current one (CPU)
 352     std::vector<int> coupledConstraintsCountsHost;
 353     // List of coupled with the current one (CPU)
 354     std::vector<int> coupledConstraintsIndicesHost;
 355     // Mass factors (CPU)
 356     std::vector<float> massFactorsHost;
 357
 358     // List of constrained atoms in local topology
 359     ArrayRef<const int> iatoms         = idef.il[F_CONSTR].iatoms;
 360     const int           stride         = NRAL(F_CONSTR) + 1;
 361     const int           numConstraints = idef.il[F_CONSTR].size() / stride;
 362
 363     // Early exit if no constraints
 364     if (numConstraints == 0)
 365     {
 366         kernelParams_.numConstraintsThreads = 0;
 367         return;
 368     }
 369
 370     // Construct the adjacency list, a useful intermediate structure
 371     const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
 372
 373     // Compute, how many constraints are coupled to each constraint
 374     const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
 375
 376     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
 377     // takes into account the empty spaces which might be needed in the end of each thread block.
 378     std::vector<int> splitMap(numConstraints, -1);
 379     int              currentMapIndex = 0;
 380     for (int c = 0; c < numConstraints; c++)
 381     {
 382         // Check if coupled constraints all fit in one block
 383         if (numCoupledConstraints[c] > c_threadsPerBlock)
 384         {
 385             gmx_fatal(FARGS,
 386                       "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
 387                       "thread block (%d). Most likely, you are trying to use the GPU version of "
 388                       "LINCS with constraints on all-bonds, which is not supported for large "
 389                       "molecules. When compatible with the force field and integration settings, "
 390                       "using constraints on H-bonds only.",
 391                       numCoupledConstraints[c],
 392                       c_threadsPerBlock);
 393         }
 394         if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + numCoupledConstraints[c]) / c_threadsPerBlock)
 395         {
 396             currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
 397         }
 398         addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
 399     }
 400
 401     kernelParams_.numConstraintsThreads =
 402             currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
 403     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
 404                        "Number of threads should be a multiple of the block size");
 405
 406     // Initialize constraints and their target indexes taking into account the splits in the data arrays.
 407     AtomPair pair;
 408     pair.i = -1;
 409     pair.j = -1;
 410     constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
 411     std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
 412     constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
 413     std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
 414     for (int c = 0; c < numConstraints; c++)
 415     {
 416         int a1   = iatoms[stride * c + 1];
 417         int a2   = iatoms[stride * c + 2];
 418         int type = iatoms[stride * c];
 419
 420         AtomPair pair;
 421         pair.i                                    = a1;
 422         pair.j                                    = a2;
 423         constraintsHost[splitMap[c]]              = pair;
 424         constraintsTargetLengthsHost[splitMap[c]] = idef.iparams[type].constr.dA;
 425     }
 426
 427     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
 428     // We map a single thread to a single constraint, hence each thread 'c' will be using one
 429     // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
 430     // to the constraint 'c'. The coupled constraints indexes are placed into the
 431     // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
 432     // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
 433     // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
 434     // array --- a number, greater then total number of constraints, taking into account the splits
 435     // in the constraints array due to the GPU block borders. This number can be adjusted to improve
 436     // memory access pattern. Mass factors are saved in a similar data structure.
 437     int  maxCoupledConstraints             = 0;
 438     bool maxCoupledConstraintsHasIncreased = false;
 439     for (int c = 0; c < numConstraints; c++)
 440     {
 441         int a1 = iatoms[stride * c + 1];
 442         int a2 = iatoms[stride * c + 2];
 443
 444         // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
 445         int nCoupledConstraints = atomsAdjacencyList[a1].size() + atomsAdjacencyList[a2].size() - 2;
 446
 447         if (nCoupledConstraints > maxCoupledConstraints)
 448         {
 449             maxCoupledConstraints             = nCoupledConstraints;
 450             maxCoupledConstraintsHasIncreased = true;
 451         }
 452     }
 453
 454     kernelParams_.haveCoupledConstraints = (maxCoupledConstraints > 0);
 455
 456     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
 457     coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 458     massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 459
 460     for (int c1 = 0; c1 < numConstraints; c1++)
 461     {
 462         coupledConstraintsCountsHost[splitMap[c1]] = 0;
 463         int c1a1                                   = iatoms[stride * c1 + 1];
 464         int c1a2                                   = iatoms[stride * c1 + 2];
 465
 466         // Constraints, coupled through the first atom.
 467         int c2a1 = c1a1;
 468         for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
 469         {
 470             int c2 = atomAdjacencyList.indexOfConstraint_;
 471
 472             if (c1 != c2)
 473             {
 474                 int c2a2  = atomAdjacencyList.indexOfSecondConstrainedAtom_;
 475                 int sign  = atomAdjacencyList.signFactor_;
 476                 int index = kernelParams_.numConstraintsThreads
 477                                     * coupledConstraintsCountsHost[splitMap[c1]]
 478                             + splitMap[c1];
 479                 int threadBlockStarts = splitMap[c1] - splitMap[c1] % c_threadsPerBlock;
 480
 481                 coupledConstraintsIndicesHost[index] = splitMap[c2] - threadBlockStarts;
 482
 483                 int center = c1a1;
 484
 485                 float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
 486                 float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
 487
 488                 massFactorsHost[index] = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
 489
 490                 coupledConstraintsCountsHost[splitMap[c1]]++;
 491             }
 492         }
 493
 494         // Constraints, coupled through the second atom.
 495         c2a1 = c1a2;
 496         for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
 497         {
 498             int c2 = atomAdjacencyList.indexOfConstraint_;
 499
 500             if (c1 != c2)
 501             {
 502                 int c2a2  = atomAdjacencyList.indexOfSecondConstrainedAtom_;
 503                 int sign  = atomAdjacencyList.signFactor_;
 504                 int index = kernelParams_.numConstraintsThreads
 505                                     * coupledConstraintsCountsHost[splitMap[c1]]
 506                             + splitMap[c1];
 507                 int threadBlockStarts = splitMap[c1] - splitMap[c1] % c_threadsPerBlock;
 508
 509                 coupledConstraintsIndicesHost[index] = splitMap[c2] - threadBlockStarts;
 510
 511                 int center = c1a2;
 512
 513                 float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
 514                 float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
 515
 516                 massFactorsHost[index] = sign * invmass[center] * sqrtmu1 * sqrtmu2;
 517
 518                 coupledConstraintsCountsHost[splitMap[c1]]++;
 519             }
 520         }
 521     }
 522
 523     // (Re)allocate the memory, if the number of constraints has increased.
 524     if ((kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_) || maxCoupledConstraintsHasIncreased)
 525     {
 526         // Free memory if it was allocated before (i.e. if not the first time here).
 527         if (numConstraintsThreadsAlloc_ > 0)
 528         {
 529             freeDeviceBuffer(&kernelParams_.d_constraints);
 530             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 531
 532             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 533             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 534             freeDeviceBuffer(&kernelParams_.d_massFactors);
 535             freeDeviceBuffer(&kernelParams_.d_matrixA);
 536         }
 537
 538         numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
 539
 540         allocateDeviceBuffer(
 541                 &kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, deviceContext_);
 542         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 543                              kernelParams_.numConstraintsThreads,
 544                              deviceContext_);
 545
 546         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 547                              kernelParams_.numConstraintsThreads,
 548                              deviceContext_);
 549         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
 550                              maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 551                              deviceContext_);
 552         allocateDeviceBuffer(&kernelParams_.d_massFactors,
 553                              maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 554                              deviceContext_);
 555         allocateDeviceBuffer(&kernelParams_.d_matrixA,
 556                              maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 557                              deviceContext_);
 558     }
 559
 560     // (Re)allocate the memory, if the number of atoms has increased.
 561     if (numAtoms > numAtomsAlloc_)
 562     {
 563         if (numAtomsAlloc_ > 0)
 564         {
 565             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 566         }
 567         numAtomsAlloc_ = numAtoms;
 568         allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
 569     }
 570
 571     // Copy data to GPU.
 572     copyToDeviceBuffer(&kernelParams_.d_constraints,
 573                        constraintsHost.data(),
 574                        0,
 575                        kernelParams_.numConstraintsThreads,
 576                        deviceStream_,
 577                        GpuApiCallBehavior::Sync,
 578                        nullptr);
 579     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 580                        constraintsTargetLengthsHost.data(),
 581                        0,
 582                        kernelParams_.numConstraintsThreads,
 583                        deviceStream_,
 584                        GpuApiCallBehavior::Sync,
 585                        nullptr);
 586     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 587                        coupledConstraintsCountsHost.data(),
 588                        0,
 589                        kernelParams_.numConstraintsThreads,
 590                        deviceStream_,
 591                        GpuApiCallBehavior::Sync,
 592                        nullptr);
 593     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
 594                        coupledConstraintsIndicesHost.data(),
 595                        0,
 596                        maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 597                        deviceStream_,
 598                        GpuApiCallBehavior::Sync,
 599                        nullptr);
 600     copyToDeviceBuffer(&kernelParams_.d_massFactors,
 601                        massFactorsHost.data(),
 602                        0,
 603                        maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 604                        deviceStream_,
 605                        GpuApiCallBehavior::Sync,
 606                        nullptr);
 607
 608     GMX_RELEASE_ASSERT(invmass != nullptr, "Masses of atoms should be specified.\n");
 609     copyToDeviceBuffer(
 610             &kernelParams_.d_inverseMasses, invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
 611 }
 612
 613 } // namespace gmx