2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implementations of LINCS GPU class
39 * This file contains back-end agnostic implementation of LINCS GPU class.
41 * \author Artem Zhmurov <zhmurov@gmail.com>
42 * \author Alan Gray <alang@nvidia.com>
44 * \ingroup module_mdlib
48 #include "lincs_gpu.h"
57 #include "gromacs/gpu_utils/devicebuffer.h"
58 #include "gromacs/gpu_utils/gputraits.h"
59 #include "gromacs/math/functions.h"
60 #include "gromacs/math/vec.h"
61 #include "gromacs/mdlib/constr.h"
62 #include "gromacs/mdlib/lincs_gpu_internal.h"
63 #include "gromacs/pbcutil/pbc.h"
64 #include "gromacs/topology/ifunc.h"
65 #include "gromacs/topology/topology.h"
70 void LincsGpu::apply(const DeviceBuffer<Float3>& d_x,
71 DeviceBuffer<Float3> d_xp,
72 const bool updateVelocities,
73 DeviceBuffer<Float3> d_v,
75 const bool computeVirial,
77 const PbcAiuc& pbcAiuc)
79 // Early exit if no constraints
80 if (kernelParams_.numConstraintsThreads == 0)
87 // Fill with zeros so the values can be reduced to it
88 // Only 6 values are needed because virial is symmetrical
89 clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
92 kernelParams_.pbcAiuc = pbcAiuc;
95 kernelParams_, d_x, d_xp, updateVelocities, d_v, invdt, computeVirial, deviceStream_);
99 // Copy LINCS virial data and add it to the common virial
100 copyFromDeviceBuffer(h_virialScaled_.data(),
101 &kernelParams_.d_virialScaled,
105 GpuApiCallBehavior::Sync,
108 // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
109 virialScaled[XX][XX] += h_virialScaled_[0];
110 virialScaled[XX][YY] += h_virialScaled_[1];
111 virialScaled[XX][ZZ] += h_virialScaled_[2];
113 virialScaled[YY][XX] += h_virialScaled_[1];
114 virialScaled[YY][YY] += h_virialScaled_[3];
115 virialScaled[YY][ZZ] += h_virialScaled_[4];
117 virialScaled[ZZ][XX] += h_virialScaled_[2];
118 virialScaled[ZZ][YY] += h_virialScaled_[4];
119 virialScaled[ZZ][ZZ] += h_virialScaled_[5];
123 LincsGpu::LincsGpu(int numIterations,
125 const DeviceContext& deviceContext,
126 const DeviceStream& deviceStream) :
127 deviceContext_(deviceContext), deviceStream_(deviceStream)
129 GMX_RELEASE_ASSERT(bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL),
130 "LINCS GPU is only implemented in CUDA and SYCL.");
131 kernelParams_.numIterations = numIterations;
132 kernelParams_.expansionOrder = expansionOrder;
134 static_assert(sizeof(real) == sizeof(float),
135 "Real numbers should be in single precision in GPU code.");
137 gmx::isPowerOfTwo(c_threadsPerBlock),
138 "Number of threads per block should be a power of two in order for reduction to work.");
140 allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
141 h_virialScaled_.resize(6);
143 // The data arrays should be expanded/reallocated on first call of set() function.
144 numConstraintsThreadsAlloc_ = 0;
148 LincsGpu::~LincsGpu()
150 freeDeviceBuffer(&kernelParams_.d_virialScaled);
152 if (numConstraintsThreadsAlloc_ > 0)
154 freeDeviceBuffer(&kernelParams_.d_constraints);
155 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
157 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
158 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
159 freeDeviceBuffer(&kernelParams_.d_massFactors);
160 freeDeviceBuffer(&kernelParams_.d_matrixA);
162 if (numAtomsAlloc_ > 0)
164 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
168 //! Helper type for discovering coupled constraints
169 struct AtomsAdjacencyListElement
171 AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
172 const int indexOfConstraint,
173 const int signFactor) :
174 indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
175 indexOfConstraint_(indexOfConstraint),
176 signFactor_(signFactor)
179 //! The index of the other atom constrained to this atom.
180 int indexOfSecondConstrainedAtom_;
181 //! The index of this constraint in the container of constraints.
182 int indexOfConstraint_;
183 /*! \brief A multiplicative factor that indicates the relative
184 * order of the atoms in the atom list.
186 * Used for computing the mass factor of this constraint
187 * relative to any coupled constraints. */
190 //! Constructs and returns an atom constraint adjacency list
191 static std::vector<std::vector<AtomsAdjacencyListElement>>
192 constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
194 const int stride = 1 + NRAL(F_CONSTR);
195 const int numConstraints = iatoms.ssize() / stride;
196 std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
197 for (int c = 0; c < numConstraints; c++)
199 int a1 = iatoms[stride * c + 1];
200 int a2 = iatoms[stride * c + 2];
202 // Each constraint will be represented as a tuple, containing index of the second
203 // constrained atom, index of the constraint and a sign that indicates the order of atoms in
204 // which they are listed. Sign is needed to compute the mass factors.
205 atomsAdjacencyList[a1].emplace_back(a2, c, +1);
206 atomsAdjacencyList[a2].emplace_back(a1, c, -1);
209 return atomsAdjacencyList;
212 /*! \brief Helper function to go through constraints recursively.
214 * For each constraint, counts the number of coupled constraints and stores the value in \p numCoupledConstraints array.
215 * This information is used to split the array of constraints between thread blocks on a GPU so there is no
216 * coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
217 * value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
218 * after it in the constraints array.
220 * \param[in] a Atom index.
221 * \param[in,out] numCoupledConstraints Indicates if the constraint was already counted and stores
222 * the number of constraints (i) connected to it and (ii) located
223 * after it in memory. This array is filled by this recursive function.
224 * For a set of coupled constraints, only for the first one in this list
225 * the number of consecutive coupled constraints is needed: if there is
226 * not enough space for this set of constraints in the thread block,
227 * the group has to be moved to the next one.
228 * \param[in] atomsAdjacencyList Stores information about connections between atoms.
230 inline int countCoupled(int a,
231 ArrayRef<int> numCoupledConstraints,
232 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
236 for (const auto& adjacentAtom : atomsAdjacencyList[a])
238 const int c2 = adjacentAtom.indexOfConstraint_;
239 if (numCoupledConstraints[c2] == -1)
241 numCoupledConstraints[c2] = 0; // To indicate we've been here
243 + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
244 numCoupledConstraints,
251 /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
253 * Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
254 * if it was not yet added. Then goes through all the constraints coupled to \p c
255 * and calls itself recursively. This ensures that all the coupled constraints will
256 * be added to neighboring locations in the final data structures on the device,
257 * hence mapping all coupled constraints to the same thread block. A value of -1 in
258 * the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
260 * \param[in] iatoms The list of constraints.
261 * \param[in] stride Number of elements per constraint in \p iatoms.
262 * \param[in] atomsAdjacencyList Information about connections between atoms.
263 * \param[out] splitMap Map of sequential constraint indexes to indexes to be on the device
264 * \param[in] c Sequential index for constraint to consider adding.
265 * \param[in,out] currentMapIndex The rolling index for the constraints mapping.
267 inline void addWithCoupled(ArrayRef<const int> iatoms,
269 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
270 ArrayRef<int> splitMap,
272 int* currentMapIndex)
274 if (splitMap[c] == -1)
276 splitMap[c] = *currentMapIndex;
277 (*currentMapIndex)++;
279 // Constraints, coupled through both atoms.
280 for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
282 const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
283 for (const auto& adjacentAtom : atomsAdjacencyList[a1])
285 const int c2 = adjacentAtom.indexOfConstraint_;
288 addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
295 /*! \brief Computes and returns how many constraints are coupled to each constraint
297 * Needed to introduce splits in data so that all coupled constraints will be computed in a
298 * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
299 * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
300 * first index of the connected group of the constraints is needed later in the code, hence the
301 * numCoupledConstraints vector is also used to keep track if the constrain was already counted.
303 static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
304 ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
306 const int stride = 1 + NRAL(F_CONSTR);
307 const int numConstraints = iatoms.ssize() / stride;
308 std::vector<int> numCoupledConstraints(numConstraints, -1);
309 for (int c = 0; c < numConstraints; c++)
311 const int a1 = iatoms[stride * c + 1];
312 const int a2 = iatoms[stride * c + 2];
313 if (numCoupledConstraints[c] == -1)
315 numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
316 + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
320 return numCoupledConstraints;
323 bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
325 for (const gmx_moltype_t& molType : mtop.moltype)
327 ArrayRef<const int> iatoms = molType.ilist[F_CONSTR].iatoms;
328 const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
329 // Compute, how many constraints are coupled to each constraint
330 const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
331 for (const int numCoupled : numCoupledConstraints)
333 if (numCoupled > c_threadsPerBlock)
343 void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass)
345 GMX_RELEASE_ASSERT(bool(GMX_GPU_CUDA) || bool(GMX_GPU_SYCL),
346 "LINCS GPU is only implemented in CUDA and SYCL.");
347 // List of constrained atoms (CPU memory)
348 std::vector<AtomPair> constraintsHost;
349 // Equilibrium distances for the constraints (CPU)
350 std::vector<float> constraintsTargetLengthsHost;
351 // Number of constraints, coupled with the current one (CPU)
352 std::vector<int> coupledConstraintsCountsHost;
353 // List of coupled with the current one (CPU)
354 std::vector<int> coupledConstraintsIndicesHost;
355 // Mass factors (CPU)
356 std::vector<float> massFactorsHost;
358 // List of constrained atoms in local topology
359 ArrayRef<const int> iatoms = idef.il[F_CONSTR].iatoms;
360 const int stride = NRAL(F_CONSTR) + 1;
361 const int numConstraints = idef.il[F_CONSTR].size() / stride;
363 // Early exit if no constraints
364 if (numConstraints == 0)
366 kernelParams_.numConstraintsThreads = 0;
370 // Construct the adjacency list, a useful intermediate structure
371 const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
373 // Compute, how many constraints are coupled to each constraint
374 const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
376 // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
377 // takes into account the empty spaces which might be needed in the end of each thread block.
378 std::vector<int> splitMap(numConstraints, -1);
379 int currentMapIndex = 0;
380 for (int c = 0; c < numConstraints; c++)
382 // Check if coupled constraints all fit in one block
383 if (numCoupledConstraints[c] > c_threadsPerBlock)
386 "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
387 "thread block (%d). Most likely, you are trying to use the GPU version of "
388 "LINCS with constraints on all-bonds, which is not supported for large "
389 "molecules. When compatible with the force field and integration settings, "
390 "using constraints on H-bonds only.",
391 numCoupledConstraints[c],
394 if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + numCoupledConstraints[c]) / c_threadsPerBlock)
396 currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
398 addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, ¤tMapIndex);
401 kernelParams_.numConstraintsThreads =
402 currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
403 GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
404 "Number of threads should be a multiple of the block size");
406 // Initialize constraints and their target indexes taking into account the splits in the data arrays.
410 constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
411 std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
412 constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
413 std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
414 for (int c = 0; c < numConstraints; c++)
416 int a1 = iatoms[stride * c + 1];
417 int a2 = iatoms[stride * c + 2];
418 int type = iatoms[stride * c];
423 constraintsHost[splitMap[c]] = pair;
424 constraintsTargetLengthsHost[splitMap[c]] = idef.iparams[type].constr.dA;
427 // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
428 // We map a single thread to a single constraint, hence each thread 'c' will be using one
429 // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
430 // to the constraint 'c'. The coupled constraints indexes are placed into the
431 // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
432 // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
433 // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
434 // array --- a number, greater then total number of constraints, taking into account the splits
435 // in the constraints array due to the GPU block borders. This number can be adjusted to improve
436 // memory access pattern. Mass factors are saved in a similar data structure.
437 int maxCoupledConstraints = 0;
438 bool maxCoupledConstraintsHasIncreased = false;
439 for (int c = 0; c < numConstraints; c++)
441 int a1 = iatoms[stride * c + 1];
442 int a2 = iatoms[stride * c + 2];
444 // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
445 int nCoupledConstraints = atomsAdjacencyList[a1].size() + atomsAdjacencyList[a2].size() - 2;
447 if (nCoupledConstraints > maxCoupledConstraints)
449 maxCoupledConstraints = nCoupledConstraints;
450 maxCoupledConstraintsHasIncreased = true;
454 kernelParams_.haveCoupledConstraints = (maxCoupledConstraints > 0);
456 coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
457 coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
458 massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
460 for (int c1 = 0; c1 < numConstraints; c1++)
462 coupledConstraintsCountsHost[splitMap[c1]] = 0;
463 int c1a1 = iatoms[stride * c1 + 1];
464 int c1a2 = iatoms[stride * c1 + 2];
466 // Constraints, coupled through the first atom.
468 for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
470 int c2 = atomAdjacencyList.indexOfConstraint_;
474 int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
475 int sign = atomAdjacencyList.signFactor_;
476 int index = kernelParams_.numConstraintsThreads
477 * coupledConstraintsCountsHost[splitMap[c1]]
479 int threadBlockStarts = splitMap[c1] - splitMap[c1] % c_threadsPerBlock;
481 coupledConstraintsIndicesHost[index] = splitMap[c2] - threadBlockStarts;
485 float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
486 float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
488 massFactorsHost[index] = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
490 coupledConstraintsCountsHost[splitMap[c1]]++;
494 // Constraints, coupled through the second atom.
496 for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
498 int c2 = atomAdjacencyList.indexOfConstraint_;
502 int c2a2 = atomAdjacencyList.indexOfSecondConstrainedAtom_;
503 int sign = atomAdjacencyList.signFactor_;
504 int index = kernelParams_.numConstraintsThreads
505 * coupledConstraintsCountsHost[splitMap[c1]]
507 int threadBlockStarts = splitMap[c1] - splitMap[c1] % c_threadsPerBlock;
509 coupledConstraintsIndicesHost[index] = splitMap[c2] - threadBlockStarts;
513 float sqrtmu1 = 1.0 / std::sqrt(invmass[c1a1] + invmass[c1a2]);
514 float sqrtmu2 = 1.0 / std::sqrt(invmass[c2a1] + invmass[c2a2]);
516 massFactorsHost[index] = sign * invmass[center] * sqrtmu1 * sqrtmu2;
518 coupledConstraintsCountsHost[splitMap[c1]]++;
523 // (Re)allocate the memory, if the number of constraints has increased.
524 if ((kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_) || maxCoupledConstraintsHasIncreased)
526 // Free memory if it was allocated before (i.e. if not the first time here).
527 if (numConstraintsThreadsAlloc_ > 0)
529 freeDeviceBuffer(&kernelParams_.d_constraints);
530 freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
532 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
533 freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
534 freeDeviceBuffer(&kernelParams_.d_massFactors);
535 freeDeviceBuffer(&kernelParams_.d_matrixA);
538 numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
540 allocateDeviceBuffer(
541 &kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, deviceContext_);
542 allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
543 kernelParams_.numConstraintsThreads,
546 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
547 kernelParams_.numConstraintsThreads,
549 allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
550 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
552 allocateDeviceBuffer(&kernelParams_.d_massFactors,
553 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
555 allocateDeviceBuffer(&kernelParams_.d_matrixA,
556 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
560 // (Re)allocate the memory, if the number of atoms has increased.
561 if (numAtoms > numAtomsAlloc_)
563 if (numAtomsAlloc_ > 0)
565 freeDeviceBuffer(&kernelParams_.d_inverseMasses);
567 numAtomsAlloc_ = numAtoms;
568 allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
572 copyToDeviceBuffer(&kernelParams_.d_constraints,
573 constraintsHost.data(),
575 kernelParams_.numConstraintsThreads,
577 GpuApiCallBehavior::Sync,
579 copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
580 constraintsTargetLengthsHost.data(),
582 kernelParams_.numConstraintsThreads,
584 GpuApiCallBehavior::Sync,
586 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
587 coupledConstraintsCountsHost.data(),
589 kernelParams_.numConstraintsThreads,
591 GpuApiCallBehavior::Sync,
593 copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
594 coupledConstraintsIndicesHost.data(),
596 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
598 GpuApiCallBehavior::Sync,
600 copyToDeviceBuffer(&kernelParams_.d_massFactors,
601 massFactorsHost.data(),
603 maxCoupledConstraints * kernelParams_.numConstraintsThreads,
605 GpuApiCallBehavior::Sync,
608 GMX_RELEASE_ASSERT(invmass != nullptr, "Masses of atoms should be specified.\n");
610 &kernelParams_.d_inverseMasses, invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr);