2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implementations of LINCS GPU class
39 * This file contains back-end agnostic implementation of LINCS GPU class.
41 * \author Artem Zhmurov <zhmurov@gmail.com>
42 * \author Alan Gray <alang@nvidia.com>
44 * \ingroup module_mdlib
48 #include "lincs_gpu.h"
57 #include "gromacs/gpu_utils/devicebuffer.h"
58 #include "gromacs/gpu_utils/gputraits.h"
59 #include "gromacs/math/functions.h"
60 #include "gromacs/math/vec.h"
61 #include "gromacs/mdlib/constr.h"
62 #include "gromacs/mdlib/lincs_gpu_internal.h"
63 #include "gromacs/pbcutil/pbc.h"
64 #include "gromacs/topology/ifunc.h"
65 #include "gromacs/topology/topology.h"
70 void LincsGpu::apply(const DeviceBuffer<Float3> d_x,
71 DeviceBuffer<Float3> d_xp,
72 const bool updateVelocities,
73 DeviceBuffer<Float3> d_v,
75 const bool computeVirial,
77 const PbcAiuc pbcAiuc)
79 GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
81 // Early exit if no constraints
82 if (kernelParams_.numConstraintsThreads == 0)
89 // Fill with zeros so the values can be reduced to it
90 // Only 6 values are needed because virial is symmetrical
91 clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
94 kernelParams_.pbcAiuc = pbcAiuc;
97 kernelParams_, d_x, d_xp, updateVelocities, d_v, invdt, computeVirial, deviceStream_);
101 // Copy LINCS virial data and add it to the common virial
102 copyFromDeviceBuffer(h_virialScaled_.data(),
103 &kernelParams_.d_virialScaled,
107 GpuApiCallBehavior::Sync,
110 // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
111 virialScaled[XX][XX] += h_virialScaled_[0];
112 virialScaled[XX][YY] += h_virialScaled_[1];
113 virialScaled[XX][ZZ] += h_virialScaled_[2];
115 virialScaled[YY][XX] += h_virialScaled_[1];
116 virialScaled[YY][YY] += h_virialScaled_[3];
117 virialScaled[YY][ZZ] += h_virialScaled_[4];
119 virialScaled[ZZ][XX] += h_virialScaled_[2];
120 virialScaled[ZZ][YY] += h_virialScaled_[4];
121 virialScaled[ZZ][ZZ] += h_virialScaled_[5];
127 LincsGpu::LincsGpu(int numIterations,
129 const DeviceContext& deviceContext,
130 const DeviceStream& deviceStream) :
131 deviceContext_(deviceContext),
132 deviceStream_(deviceStream)
134 GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
135 kernelParams_.numIterations = numIterations;
136 kernelParams_.expansionOrder = expansionOrder;
138 static_assert(sizeof(real) == sizeof(float),
139 "Real numbers should be in single precision in GPU code.");
141 gmx::isPowerOfTwo(c_threadsPerBlock),
142 "Number of threads per block should be a power of two in order for reduction to work.");
144 allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
145 h_virialScaled_.resize(6);
147 // The data arrays should be expanded/reallocated on first call of set() function.
148 numConstraintsThreadsAlloc_ = 0;
152 LincsGpu::~LincsGpu()
154 freeDeviceBuffer(&kernelParams_.d_virialScaled);
156 if (numConstraintsThreadsAlloc_ > 0)
158 freeDeviceBuffer(&kernelParams_.d_constraints);
159 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
161 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
162 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
163 freeDeviceBuffer(&kernelParams_.d_massFactors);
164 freeDeviceBuffer(&kernelParams_.d_matrixA);
166 if (numAtomsAlloc_ > 0)
168 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
172 //! Helper type for discovering coupled constraints
173 struct AtomsAdjacencyListElement
175 AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
176 const int indexOfConstraint,
177 const int signFactor) :
178 indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
179 indexOfConstraint_(indexOfConstraint),
180 signFactor_(signFactor)
183 //! The index of the other atom constrained to this atom.
184 int indexOfSecondConstrainedAtom_;
185 //! The index of this constraint in the container of constraints.
186 int indexOfConstraint_;
187 /*! \brief A multiplicative factor that indicates the relative
188 * order of the atoms in the atom list.
190 * Used for computing the mass factor of this constraint
191 * relative to any coupled constraints. */
194 //! Constructs and returns an atom constraint adjacency list
195 static std::vector<std::vector<AtomsAdjacencyListElement>>
196 constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
198 const int stride = 1 + NRAL(F_CONSTR);
199 const int numConstraints = iatoms.ssize() / stride;
200 std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
201 for (int c = 0; c < numConstraints; c++)
203 int a1 = iatoms[stride * c + 1];
204 int a2 = iatoms[stride * c + 2];
206 // Each constraint will be represented as a tuple, containing index of the second
207 // constrained atom, index of the constraint and a sign that indicates the order of atoms in
208 // which they are listed. Sign is needed to compute the mass factors.
209 atomsAdjacencyList[a1].emplace_back(a2, c, +1);
210 atomsAdjacencyList[a2].emplace_back(a1, c, -1);
213 return atomsAdjacencyList;
216 /*! \brief Helper function to go through constraints recursively.
218 * For each constraint, counts the number of coupled constraints and stores the value in \p numCoupledConstraints array.
219 * This information is used to split the array of constraints between thread blocks on a GPU so there is no
220 * coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
221 * value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
222 * after it in the constraints array.
224 * \param[in] a Atom index.
225 * \param[in,out] numCoupledConstraints Indicates if the constraint was already counted and stores
226 * the number of constraints (i) connected to it and (ii) located
227 * after it in memory. This array is filled by this recursive function.
228 * For a set of coupled constraints, only for the first one in this list
229 * the number of consecutive coupled constraints is needed: if there is
230 * not enough space for this set of constraints in the thread block,
231 * the group has to be moved to the next one.
232 * \param[in] atomsAdjacencyList Stores information about connections between atoms.
234 inline int countCoupled(int a,
235 ArrayRef<int> numCoupledConstraints,
236 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
240 for (const auto& adjacentAtom : atomsAdjacencyList[a])
242 const int c2 = adjacentAtom.indexOfConstraint_;
243 if (numCoupledConstraints[c2] == -1)
245 numCoupledConstraints[c2] = 0; // To indicate we've been here
247 + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
248 numCoupledConstraints,
255 /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
257 * Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
258 * if it was not yet added. Then goes through all the constraints coupled to \p c
259 * and calls itself recursively. This ensures that all the coupled constraints will
260 * be added to neighboring locations in the final data structures on the device,
261 * hence mapping all coupled constraints to the same thread block. A value of -1 in
262 * the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
264 * \param[in] iatoms The list of constraints.
265 * \param[in] stride Number of elements per constraint in \p iatoms.
266 * \param[in] atomsAdjacencyList Information about connections between atoms.
267 * \param[out] splitMap Map of sequential constraint indexes to indexes to be on the device
268 * \param[in] c Sequential index for constraint to consider adding.
269 * \param[in,out] currentMapIndex The rolling index for the constraints mapping.
271 inline void addWithCoupled(ArrayRef<const int> iatoms,
273 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
274 ArrayRef<int> splitMap,
276 int* currentMapIndex)
278 if (splitMap[c] == -1)
280 splitMap[c] = *currentMapIndex;
281 (*currentMapIndex)++;
283 // Constraints, coupled through both atoms.
284 for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
286 const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
287 for (const auto& adjacentAtom : atomsAdjacencyList[a1])
289 const int c2 = adjacentAtom.indexOfConstraint_;
292 addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
299 /*! \brief Computes and returns how many constraints are coupled to each constraint
301 * Needed to introduce splits in data so that all coupled constraints will be computed in a
302 * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
303 * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
304 * first index of the connected group of the constraints is needed later in the code, hence the
305 * numCoupledConstraints vector is also used to keep track if the constrain was already counted.
307 static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
308 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
310 const int stride = 1 + NRAL(F_CONSTR);
311 const int numConstraints = iatoms.ssize() / stride;
312 std::vector<int> numCoupledConstraints(numConstraints, -1);
313 for (int c = 0; c < numConstraints; c++)
315 const int a1 = iatoms[stride * c + 1];
316 const int a2 = iatoms[stride * c + 2];
317 if (numCoupledConstraints[c] == -1)
319 numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
320 + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
324 return numCoupledConstraints;
327 bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
329 for (const gmx_moltype_t& molType : mtop.moltype)
331 ArrayRef<const int> iatoms = molType.ilist[F_CONSTR].iatoms;
332 const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
333 // Compute, how many constraints are coupled to each constraint
334 const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
335 for (const int numCoupled : numCoupledConstraints)
337 if (numCoupled > c_threadsPerBlock)
347 void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass)
349 GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA.");
350 // List of constrained atoms (CPU memory)
351 std::vector<AtomPair> constraintsHost;
352 // Equilibrium distances for the constraints (CPU)
353 std::vector<float> constraintsTargetLengthsHost;
354 // Number of constraints, coupled with the current one (CPU)
355 std::vector<int> coupledConstraintsCountsHost;
356 // List of coupled with the current one (CPU)
357 std::vector<int> coupledConstraintsIndicesHost;
358 // Mass factors (CPU)
359 std::vector<float> massFactorsHost;
361 // List of constrained atoms in local topology
362 ArrayRef<const int> iatoms = idef.il[F_CONSTR].iatoms;
363 const int stride = NRAL(F_CONSTR) + 1;
364 const int numConstraints = idef.il[F_CONSTR].size() / stride;
366 // Early exit if no constraints
367 if (numConstraints == 0)
369 kernelParams_.numConstraintsThreads = 0;
373 // Construct the adjacency list, a useful intermediate structure
374 const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
376 // Compute, how many constraints are coupled to each constraint
377 const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
379 // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
380 // takes into account the empty spaces which might be needed in the end of each thread block.
381 std::vector<int> splitMap(numConstraints, -1);
382 int currentMapIndex = 0;
383 for (int c = 0; c < numConstraints; c++)
385 // Check if coupled constraints all fit in one block
386 if (numCoupledConstraints.at(c) > c_threadsPerBlock)
389 "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
390 "thread block (%d). Most likely, you are trying to use the GPU version of "
391 "LINCS with constraints on all-bonds, which is not supported for large "
392 "molecules. When compatible with the force field and integration settings, "
393 "using constraints on H-bonds only.",
394 numCoupledConstraints.at(c),
397 if (currentMapIndex / c_threadsPerBlock
398 != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
400 currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
402 addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, ¤tMapIndex);
405 kernelParams_.numConstraintsThreads =
406 currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
407 GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
408 "Number of threads should be a multiple of the block size");
410 // Initialize constraints and their target indexes taking into account the splits in the data arrays.
414 constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
415 std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
416 constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
417 std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
418 for (int c = 0; c < numConstraints; c++)
420 int a1 = iatoms[stride * c + 1];
421 int a2 = iatoms[stride * c + 2];
422 int type = iatoms[stride * c];
427 constraintsHost.at(splitMap.at(c)) = pair;
428 constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
431 // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
432 // We map a single thread to a single constraint, hence each thread 'c' will be using one
433 // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
434 // to the constraint 'c'. The coupled constraints indexes are placed into the
435 // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
436 // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
437 // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
438 // array --- a number, greater then total number of constraints, taking into account the splits
439 // in the constraints array due to the GPU block borders. This number can be adjusted to improve
440 // memory access pattern. Mass factors are saved in a similar data structure.
441 int maxCoupledConstraints = 0;
442 bool maxCoupledConstraintsHasIncreased = false;
443 for (int c = 0; c < numConstraints; c++)
445 int a1 = iatoms[stride * c + 1];
446 int a2 = iatoms[stride * c + 2];
448 // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
449 int nCoupledConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
451 if (nCoupledConstraints > maxCoupledConstraints)
453 maxCoupledConstraints = nCoupledConstraints;
454 maxCoupledConstraintsHasIncreased = true;
458 coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
459 coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
460 massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
462 for (int c1 = 0; c1 < numConstraints; c1++)
464 coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0;
465 int c1a1 = iatoms[stride * c1 + 1];
466 int c1a2 = iatoms[stride * c1 + 2];
468 // Constraints, coupled through the first atom.
470 for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
472 int c2 = atomAdjacencyList.indexOfConstraint_;
476 int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
477 int sign = atomAdjacencyList.signFactor_;
478 int index = kernelParams_.numConstraintsThreads
479 * coupledConstraintsCountsHost.at(splitMap.at(c1))
482 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
486 float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
487 float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
489 massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
491 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
495 // Constraints, coupled through the second atom.
497 for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
499 int c2 = atomAdjacencyList.indexOfConstraint_;
503 int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
504 int sign = atomAdjacencyList.signFactor_;
505 int index = kernelParams_.numConstraintsThreads
506 * coupledConstraintsCountsHost.at(splitMap.at(c1))
509 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
513 float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
514 float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
516 massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;
518 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
523 // (Re)allocate the memory, if the number of constraints has increased.
524 if ((kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_) || maxCoupledConstraintsHasIncreased)
526 // Free memory if it was allocated before (i.e. if not the first time here).
527 if (numConstraintsThreadsAlloc_ > 0)
529 freeDeviceBuffer(&kernelParams_.d_constraints);
530 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
532 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
533 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
534 freeDeviceBuffer(&kernelParams_.d_massFactors);
535 freeDeviceBuffer(&kernelParams_.d_matrixA);
538 numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
540 allocateDeviceBuffer(
541 &kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, deviceContext_);
542 allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
543 kernelParams_.numConstraintsThreads,
546 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
547 kernelParams_.numConstraintsThreads,
549 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
550 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
552 allocateDeviceBuffer(&kernelParams_.d_massFactors,
553 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
555 allocateDeviceBuffer(&kernelParams_.d_matrixA,
556 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
560 // (Re)allocate the memory, if the number of atoms has increased.
561 if (numAtoms > numAtomsAlloc_)
563 if (numAtomsAlloc_ > 0)
565 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
567 numAtomsAlloc_ = numAtoms;
568 allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
572 copyToDeviceBuffer(&kernelParams_.d_constraints,
573 constraintsHost.data(),
575 kernelParams_.numConstraintsThreads,
577 GpuApiCallBehavior::Sync,
579 copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
580 constraintsTargetLengthsHost.data(),
582 kernelParams_.numConstraintsThreads,
584 GpuApiCallBehavior::Sync,
586 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
587 coupledConstraintsCountsHost.data(),
589 kernelParams_.numConstraintsThreads,
591 GpuApiCallBehavior::Sync,
593 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
594 coupledConstraintsIndicesHost.data(),
596 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
598 GpuApiCallBehavior::Sync,
600 copyToDeviceBuffer(&kernelParams_.d_massFactors,
601 massFactorsHost.data(),
603 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
605 GpuApiCallBehavior::Sync,
608 GMX_RELEASE_ASSERT(invmass != nullptr, "Masses of atoms should be specified.\n");
610 &kernelParams_.d_inverseMasses, invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr);