2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implementations of LINCS GPU class
39 * This file contains back-end agnostic implementation of LINCS GPU class.
41 * \author Artem Zhmurov <zhmurov@gmail.com>
42 * \author Alan Gray <alang@nvidia.com>
44 * \ingroup module_mdlib
48 #include "lincs_gpu.h"
57 #include "gromacs/gpu_utils/devicebuffer.h"
58 #include "gromacs/gpu_utils/gputraits.h"
59 #include "gromacs/math/functions.h"
60 #include "gromacs/math/vec.h"
61 #include "gromacs/mdlib/constr.h"
62 #include "gromacs/mdlib/lincs_gpu_internal.h"
63 #include "gromacs/pbcutil/pbc.h"
64 #include "gromacs/topology/ifunc.h"
65 #include "gromacs/topology/topology.h"
70 void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
71 DeviceBuffer<Float3> d_xp,
72 const bool updateVelocities,
73 DeviceBuffer<Float3> d_v,
75 const bool computeVirial,
77 const PbcAiuc& pbcAiuc)
79 GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
81 // Early exit if no constraints
82 if (kernelParams_.numConstraintsThreads == 0)
89 // Fill with zeros so the values can be reduced to it
90 // Only 6 values are needed because virial is symmetrical
91 clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
94 kernelParams_.pbcAiuc = pbcAiuc;
97 kernelParams_, d_x, d_xp, updateVelocities, d_v, invdt, computeVirial, deviceStream_);
101 // Copy LINCS virial data and add it to the common virial
102 copyFromDeviceBuffer(h_virialScaled_.data(),
103 &kernelParams_.d_virialScaled,
107 GpuApiCallBehavior::Sync,
110 // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
111 virialScaled[XX][XX] += h_virialScaled_[0];
112 virialScaled[XX][YY] += h_virialScaled_[1];
113 virialScaled[XX][ZZ] += h_virialScaled_[2];
115 virialScaled[YY][XX] += h_virialScaled_[1];
116 virialScaled[YY][YY] += h_virialScaled_[3];
117 virialScaled[YY][ZZ] += h_virialScaled_[4];
119 virialScaled[ZZ][XX] += h_virialScaled_[2];
120 virialScaled[ZZ][YY] += h_virialScaled_[4];
121 virialScaled[ZZ][ZZ] += h_virialScaled_[5];
125 LincsGpu::LincsGpu(int numIterations,
127 const DeviceContext& deviceContext,
128 const DeviceStream& deviceStream) :
129 deviceContext_(deviceContext), deviceStream_(deviceStream)
131 GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
132 kernelParams_.numIterations = numIterations;
133 kernelParams_.expansionOrder = expansionOrder;
135 static_assert(sizeof(real) == sizeof(float),
136 "Real numbers should be in single precision in GPU code.");
138 gmx::isPowerOfTwo(c_threadsPerBlock),
139 "Number of threads per block should be a power of two in order for reduction to work.");
141 allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
142 h_virialScaled_.resize(6);
144 // The data arrays should be expanded/reallocated on first call of set() function.
145 numConstraintsThreadsAlloc_ = 0;
149 LincsGpu::~LincsGpu()
151 freeDeviceBuffer(&kernelParams_.d_virialScaled);
153 if (numConstraintsThreadsAlloc_ > 0)
155 freeDeviceBuffer(&kernelParams_.d_constraints);
156 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
158 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
159 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
160 freeDeviceBuffer(&kernelParams_.d_massFactors);
161 freeDeviceBuffer(&kernelParams_.d_matrixA);
163 if (numAtomsAlloc_ > 0)
165 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
169 //! Helper type for discovering coupled constraints
170 struct AtomsAdjacencyListElement
172 AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
173 const int indexOfConstraint,
174 const int signFactor) :
175 indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
176 indexOfConstraint_(indexOfConstraint),
177 signFactor_(signFactor)
180 //! The index of the other atom constrained to this atom.
181 int indexOfSecondConstrainedAtom_;
182 //! The index of this constraint in the container of constraints.
183 int indexOfConstraint_;
184 /*! \brief A multiplicative factor that indicates the relative
185 * order of the atoms in the atom list.
187 * Used for computing the mass factor of this constraint
188 * relative to any coupled constraints. */
191 //! Constructs and returns an atom constraint adjacency list
192 static std::vector<std::vector<AtomsAdjacencyListElement>>
193 constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
195 const int stride = 1 + NRAL(F_CONSTR);
196 const int numConstraints = iatoms.ssize() / stride;
197 std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
198 for (int c = 0; c < numConstraints; c++)
200 int a1 = iatoms[stride * c + 1];
201 int a2 = iatoms[stride * c + 2];
203 // Each constraint will be represented as a tuple, containing index of the second
204 // constrained atom, index of the constraint and a sign that indicates the order of atoms in
205 // which they are listed. Sign is needed to compute the mass factors.
206 atomsAdjacencyList[a1].emplace_back(a2, c, +1);
207 atomsAdjacencyList[a2].emplace_back(a1, c, -1);
210 return atomsAdjacencyList;
213 /*! \brief Helper function to go through constraints recursively.
215 * For each constraint, counts the number of coupled constraints and stores the value in \p numCoupledConstraints array.
216 * This information is used to split the array of constraints between thread blocks on a GPU so there is no
217 * coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
218 * value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
219 * after it in the constraints array.
221 * \param[in] a Atom index.
222 * \param[in,out] numCoupledConstraints Indicates if the constraint was already counted and stores
223 * the number of constraints (i) connected to it and (ii) located
224 * after it in memory. This array is filled by this recursive function.
225 * For a set of coupled constraints, only for the first one in this list
226 * the number of consecutive coupled constraints is needed: if there is
227 * not enough space for this set of constraints in the thread block,
228 * the group has to be moved to the next one.
229 * \param[in] atomsAdjacencyList Stores information about connections between atoms.
231 inline int countCoupled(int a,
232 ArrayRef<int> numCoupledConstraints,
233 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
237 for (const auto& adjacentAtom : atomsAdjacencyList[a])
239 const int c2 = adjacentAtom.indexOfConstraint_;
240 if (numCoupledConstraints[c2] == -1)
242 numCoupledConstraints[c2] = 0; // To indicate we've been here
244 + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
245 numCoupledConstraints,
252 /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
254 * Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
255 * if it was not yet added. Then goes through all the constraints coupled to \p c
256 * and calls itself recursively. This ensures that all the coupled constraints will
257 * be added to neighboring locations in the final data structures on the device,
258 * hence mapping all coupled constraints to the same thread block. A value of -1 in
259 * the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
261 * \param[in] iatoms The list of constraints.
262 * \param[in] stride Number of elements per constraint in \p iatoms.
263 * \param[in] atomsAdjacencyList Information about connections between atoms.
264 * \param[out] splitMap Map of sequential constraint indexes to indexes to be on the device
265 * \param[in] c Sequential index for constraint to consider adding.
266 * \param[in,out] currentMapIndex The rolling index for the constraints mapping.
268 inline void addWithCoupled(ArrayRef<const int> iatoms,
270 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
271 ArrayRef<int> splitMap,
273 int* currentMapIndex)
275 if (splitMap[c] == -1)
277 splitMap[c] = *currentMapIndex;
278 (*currentMapIndex)++;
280 // Constraints, coupled through both atoms.
281 for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
283 const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
284 for (const auto& adjacentAtom : atomsAdjacencyList[a1])
286 const int c2 = adjacentAtom.indexOfConstraint_;
289 addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
296 /*! \brief Computes and returns how many constraints are coupled to each constraint
298 * Needed to introduce splits in data so that all coupled constraints will be computed in a
299 * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
300 * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
301 * first index of the connected group of the constraints is needed later in the code, hence the
302 * numCoupledConstraints vector is also used to keep track if the constrain was already counted.
304 static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
305 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
307 const int stride = 1 + NRAL(F_CONSTR);
308 const int numConstraints = iatoms.ssize() / stride;
309 std::vector<int> numCoupledConstraints(numConstraints, -1);
310 for (int c = 0; c < numConstraints; c++)
312 const int a1 = iatoms[stride * c + 1];
313 const int a2 = iatoms[stride * c + 2];
314 if (numCoupledConstraints[c] == -1)
316 numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
317 + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
321 return numCoupledConstraints;
324 bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
326 for (const gmx_moltype_t& molType : mtop.moltype)
328 ArrayRef<const int> iatoms = molType.ilist[F_CONSTR].iatoms;
329 const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
330 // Compute, how many constraints are coupled to each constraint
331 const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
332 for (const int numCoupled : numCoupledConstraints)
334 if (numCoupled > c_threadsPerBlock)
344 void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass)
346 GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
347 // List of constrained atoms (CPU memory)
348 std::vector<AtomPair> constraintsHost;
349 // Equilibrium distances for the constraints (CPU)
350 std::vector<float> constraintsTargetLengthsHost;
351 // Number of constraints, coupled with the current one (CPU)
352 std::vector<int> coupledConstraintsCountsHost;
353 // List of coupled with the current one (CPU)
354 std::vector<int> coupledConstraintsIndicesHost;
355 // Mass factors (CPU)
356 std::vector<float> massFactorsHost;
358 // List of constrained atoms in local topology
359 ArrayRef<const int> iatoms = idef.il[F_CONSTR].iatoms;
360 const int stride = NRAL(F_CONSTR) + 1;
361 const int numConstraints = idef.il[F_CONSTR].size() / stride;
363 // Early exit if no constraints
364 if (numConstraints == 0)
366 kernelParams_.numConstraintsThreads = 0;
370 // Construct the adjacency list, a useful intermediate structure
371 const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
373 // Compute, how many constraints are coupled to each constraint
374 const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
376 // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
377 // takes into account the empty spaces which might be needed in the end of each thread block.
378 std::vector<int> splitMap(numConstraints, -1);
379 int currentMapIndex = 0;
380 for (int c = 0; c < numConstraints; c++)
382 // Check if coupled constraints all fit in one block
383 if (numCoupledConstraints[c] > c_threadsPerBlock)
386 "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
387 "thread block (%d). Most likely, you are trying to use the GPU version of "
388 "LINCS with constraints on all-bonds, which is not supported for large "
389 "molecules. When compatible with the force field and integration settings, "
390 "using constraints on H-bonds only.",
391 numCoupledConstraints[c],
394 if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + numCoupledConstraints[c]) / c_threadsPerBlock)
396 currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
398 addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, ¤tMapIndex);
401 kernelParams_.numConstraintsThreads =
402 currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
403 GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
404 "Number of threads should be a multiple of the block size");
406 // Initialize constraints and their target indexes taking into account the splits in the data arrays.
410 constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
411 std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
412 constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
413 std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
414 for (int c = 0; c < numConstraints; c++)
416 int a1 = iatoms[stride * c + 1];
417 int a2 = iatoms[stride * c + 2];
418 int type = iatoms[stride * c];
423 constraintsHost[splitMap[c]] = pair;
424 constraintsTargetLengthsHost[splitMap[c]] = idef.iparams[type].constr.dA;
427 // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
428 // We map a single thread to a single constraint, hence each thread 'c' will be using one
429 // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
430 // to the constraint 'c'. The coupled constraints indexes are placed into the
431 // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
432 // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
433 // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
434 // array --- a number, greater then total number of constraints, taking into account the splits
435 // in the constraints array due to the GPU block borders. This number can be adjusted to improve
436 // memory access pattern. Mass factors are saved in a similar data structure.
437 int maxCoupledConstraints = 0;
438 bool maxCoupledConstraintsHasIncreased = false;
439 for (int c = 0; c < numConstraints; c++)
441 int a1 = iatoms[stride * c + 1];
442 int a2 = iatoms[stride * c + 2];
444 // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
445 int nCoupledConstraints = atomsAdjacencyList[a1].size() + atomsAdjacencyList[a2].size() - 2;
447 if (nCoupledConstraints > maxCoupledConstraints)
449 maxCoupledConstraints = nCoupledConstraints;
450 maxCoupledConstraintsHasIncreased = true;
454 coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
455 coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
456 massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
458 for (int c1 = 0; c1 < numConstraints; c1++)
460 coupledConstraintsCountsHost[splitMap[c1]] = 0;
461 int c1a1 = iatoms[stride * c1 + 1];
462 int c1a2 = iatoms[stride * c1 + 2];
464 // Constraints, coupled through the first atom.
466 for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
468 int c2 = atomAdjacencyList.indexOfConstraint_;
472 int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
473 int sign = atomAdjacencyList.signFactor_;
474 int index = kernelParams_.numConstraintsThreads
475 * coupledConstraintsCountsHost[splitMap[c1]]
477 int threadBlockStarts = splitMap[c1] - splitMap[c1] % c_threadsPerBlock;
479 coupledConstraintsIndicesHost[index] = splitMap[c2] - threadBlockStarts;
483 float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
484 float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
486 massFactorsHost[index] = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
488 coupledConstraintsCountsHost[splitMap[c1]]++;
492 // Constraints, coupled through the second atom.
494 for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
496 int c2 = atomAdjacencyList.indexOfConstraint_;
500 int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
501 int sign = atomAdjacencyList.signFactor_;
502 int index = kernelParams_.numConstraintsThreads
503 * coupledConstraintsCountsHost[splitMap[c1]]
505 int threadBlockStarts = splitMap[c1] - splitMap[c1] % c_threadsPerBlock;
507 coupledConstraintsIndicesHost[index] = splitMap[c2] - threadBlockStarts;
511 float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
512 float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
514 massFactorsHost[index] = sign * invmass[center] * sqrtmu1 * sqrtmu2;
516 coupledConstraintsCountsHost[splitMap[c1]]++;
521 // (Re)allocate the memory, if the number of constraints has increased.
522 if ((kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_) || maxCoupledConstraintsHasIncreased)
524 // Free memory if it was allocated before (i.e. if not the first time here).
525 if (numConstraintsThreadsAlloc_ > 0)
527 freeDeviceBuffer(&kernelParams_.d_constraints);
528 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
530 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
531 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
532 freeDeviceBuffer(&kernelParams_.d_massFactors);
533 freeDeviceBuffer(&kernelParams_.d_matrixA);
536 numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
538 allocateDeviceBuffer(
539 &kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, deviceContext_);
540 allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
541 kernelParams_.numConstraintsThreads,
544 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
545 kernelParams_.numConstraintsThreads,
547 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
548 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
550 allocateDeviceBuffer(&kernelParams_.d_massFactors,
551 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
553 allocateDeviceBuffer(&kernelParams_.d_matrixA,
554 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
558 // (Re)allocate the memory, if the number of atoms has increased.
559 if (numAtoms > numAtomsAlloc_)
561 if (numAtomsAlloc_ > 0)
563 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
565 numAtomsAlloc_ = numAtoms;
566 allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
570 copyToDeviceBuffer(&kernelParams_.d_constraints,
571 constraintsHost.data(),
573 kernelParams_.numConstraintsThreads,
575 GpuApiCallBehavior::Sync,
577 copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
578 constraintsTargetLengthsHost.data(),
580 kernelParams_.numConstraintsThreads,
582 GpuApiCallBehavior::Sync,
584 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
585 coupledConstraintsCountsHost.data(),
587 kernelParams_.numConstraintsThreads,
589 GpuApiCallBehavior::Sync,
591 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
592 coupledConstraintsIndicesHost.data(),
594 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
596 GpuApiCallBehavior::Sync,
598 copyToDeviceBuffer(&kernelParams_.d_massFactors,
599 massFactorsHost.data(),
601 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
603 GpuApiCallBehavior::Sync,
606 GMX_RELEASE_ASSERT(invmass != nullptr, "Masses of atoms should be specified.\n");
608 &kernelParams_.d_inverseMasses, invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr);