2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements LINCS kernels using CUDA
39 * This file contains CUDA kernels of LINCS constraints algorithm.
41 * \author Artem Zhmurov <zhmurov@gmail.com>
42 * \author Alan Gray <alang@nvidia.com>
44 * \ingroup module_mdlib
46 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
47 #include "gromacs/gpu_utils/cudautils.cuh"
48 #include "gromacs/gpu_utils/devicebuffer.cuh"
49 #include "gromacs/gpu_utils/gputraits.h"
50 #include "gromacs/gpu_utils/typecasts.cuh"
51 #include "gromacs/gpu_utils/vectype_ops.cuh"
52 #include "gromacs/mdlib/lincs_gpu.h"
53 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
55 #include "lincs_gpu_internal.h"
60 //! Maximum number of threads in a block (for __launch_bounds__)
61 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
63 /*! \brief Main kernel for LINCS constraints.
65 * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
67 * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
68 * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
69 * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
70 * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
71 * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
72 * there is no need to synchronize across the device. However, extensive communication in a thread block
75 * \todo Reduce synchronization overhead. Some ideas are:
76 * 1. Consider going to warp-level synchronization for the coupled constraints.
77 * 2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
79 * 3. Use analytical solution for matrix A inversion.
80 * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating
81 * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
82 * See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885)
83 * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic
84 operations. Investigate this issue further.
86 * \param[in,out] kernelParams All parameters and pointers for the kernel condensed in single struct.
87 * \param[in] invdt Inverse timestep (needed to update velocities).
89 template<bool updateVelocities, bool computeVirial>
90 __launch_bounds__(c_maxThreadsPerBlock) __global__
91 void lincs_kernel(LincsGpuKernelParameters kernelParams,
92 const float3* __restrict__ gm_x,
97 const PbcAiuc pbcAiuc = kernelParams.pbcAiuc;
98 const int numConstraintsThreads = kernelParams.numConstraintsThreads;
99 const int numIterations = kernelParams.numIterations;
100 const int expansionOrder = kernelParams.expansionOrder;
101 const AtomPair* __restrict__ gm_constraints = kernelParams.d_constraints;
102 const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
103 const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts;
104 const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices;
105 const float* __restrict__ gm_massFactors = kernelParams.d_massFactors;
106 float* __restrict__ gm_matrixA = kernelParams.d_matrixA;
107 const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses;
108 float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled;
110 int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
112 // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
113 // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
114 assert(threadIndex < numConstraintsThreads);
116 // Vectors connecting constrained atoms before algorithm was applied.
117 // Needed to construct constrain matrix A
118 extern __shared__ float3 sm_r[];
120 AtomPair pair = gm_constraints[threadIndex];
124 // Mass-scaled Lagrange multiplier
125 float lagrangeScaled = 0.0f;
130 float sqrtReducedMass;
136 // i == -1 indicates dummy constraint at the end of the thread block.
137 bool isDummyThread = (i == -1);
139 // Everything computed for these dummies will be equal to zero
145 sqrtReducedMass = 0.0f;
147 xi = make_float3(0.0f, 0.0f, 0.0f);
148 xj = make_float3(0.0f, 0.0f, 0.0f);
149 rc = make_float3(0.0f, 0.0f, 0.0f);
154 targetLength = gm_constraintsTargetLengths[threadIndex];
155 inverseMassi = gm_inverseMasses[i];
156 inverseMassj = gm_inverseMasses[j];
157 sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
162 float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
164 float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z);
168 sm_r[threadIdx.x] = rc;
169 // Make sure that all r's are saved into shared memory
170 // before they are accessed in the loop below
174 * Constructing LINCS matrix (A)
177 // Only non-zero values are saved (for coupled constraints)
178 int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
179 for (int n = 0; n < coupledConstraintsCount; n++)
181 int index = n * numConstraintsThreads + threadIndex;
182 int c1 = gm_coupledConstraintsIdxes[index];
184 float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x];
185 gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
188 // Skipping in dummy threads
195 float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
197 float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength);
200 * Inverse matrix using a set of expansionOrder matrix multiplications
203 // This will use the same memory space as sm_r, which is no longer needed.
204 extern __shared__ float sm_rhs[];
205 // Save current right-hand-side vector in the shared memory
206 sm_rhs[threadIdx.x] = sol;
208 for (int rec = 0; rec < expansionOrder; rec++)
210 // Making sure that all sm_rhs are saved before they are accessed in a loop below
214 for (int n = 0; n < coupledConstraintsCount; n++)
216 int index = n * numConstraintsThreads + threadIndex;
217 int c1 = gm_coupledConstraintsIdxes[index];
218 // Convolute current right-hand-side with A
219 // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
220 mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
222 // 'Switch' rhs vectors, save current result
223 // These values will be accessed in the loop above during the next iteration.
224 sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
228 // Current mass-scaled Lagrange multipliers
229 lagrangeScaled = sqrtReducedMass * sol;
231 // Save updated coordinates before correction for the rotational lengthening
232 float3 tmp = rc * lagrangeScaled;
234 // Writing for all but dummy constraints
237 atomicAdd(&gm_xp[i], -tmp * inverseMassi);
238 atomicAdd(&gm_xp[j], tmp * inverseMassj);
242 * Correction for centripetal effects
244 for (int iter = 0; iter < numIterations; iter++)
246 // Make sure that all xp's are saved: atomic operation calls before are
247 // communicating current xp[..] values across thread block.
256 float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
258 float len2 = targetLength * targetLength;
259 float dlen2 = 2.0f * len2 - norm2(dx);
261 // TODO A little bit more effective but slightly less readable version of the below would be:
262 // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
266 proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
270 proj = sqrtReducedMass * targetLength;
273 sm_rhs[threadIdx.x] = proj;
277 * Same matrix inversion as above is used for updated data
279 for (int rec = 0; rec < expansionOrder; rec++)
281 // Make sure that all elements of rhs are saved into shared memory
285 for (int n = 0; n < coupledConstraintsCount; n++)
287 int index = n * numConstraintsThreads + threadIndex;
288 int c1 = gm_coupledConstraintsIdxes[index];
290 mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
292 sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
296 // Add corrections to Lagrange multipliers
297 float sqrtmu_sol = sqrtReducedMass * sol;
298 lagrangeScaled += sqrtmu_sol;
300 // Save updated coordinates for the next iteration
301 // Dummy constraints are skipped
304 float3 tmp = rc * sqrtmu_sol;
305 atomicAdd(&gm_xp[i], -tmp * inverseMassi);
306 atomicAdd(&gm_xp[j], tmp * inverseMassj);
310 // Updating particle velocities for all but dummy threads
311 if (updateVelocities && !isDummyThread)
313 float3 tmp = rc * invdt * lagrangeScaled;
314 atomicAdd(&gm_v[i], -tmp * inverseMassi);
315 atomicAdd(&gm_v[j], tmp * inverseMassj);
321 // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
322 // (targetLength) and the normalized vector connecting constrained atoms before
323 // the algorithm was applied (rc). The evaluation of virial in each thread is
324 // followed by basic reduction for the values inside single thread block.
325 // Then, the values are reduced across grid by atomicAdd(...).
327 // TODO Shuffle reduction.
328 // TODO Should be unified and/or done once when virial is actually needed.
329 // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
330 // one that works for any datatype.
332 // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
333 // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
334 // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
335 // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
336 // two are no longer in use.
337 extern __shared__ float sm_threadVirial[];
338 float mult = targetLength * lagrangeScaled;
339 sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
340 sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y;
341 sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z;
342 sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y;
343 sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z;
344 sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z;
348 // Reduce up to one virial per thread block. All blocks are divided by half, the first
349 // half of threads sums two virials. Then the first half is divided by two and the first
350 // half of it sums two values. This procedure is repeated until only one thread is left.
351 // Only works if the threads per blocks is a power of two (hence static_assert
352 // in the beginning of the kernel).
353 for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
355 int dividedAt = blockDim.x / divideBy;
356 if (static_cast<int>(threadIdx.x) < dividedAt)
358 for (int d = 0; d < 6; d++)
360 sm_threadVirial[d * blockDim.x + threadIdx.x] +=
361 sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
364 // Syncronize if not within one warp
365 if (dividedAt > warpSize / 2)
370 // First 6 threads in the block add the results of 6 tensor components to the global memory address.
373 atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
380 /*! \brief Select templated kernel.
382 * Returns pointer to a CUDA kernel based on provided booleans.
384 * \param[in] updateVelocities If the velocities should be constrained.
385 * \param[in] computeVirial If virial should be updated.
387 * \return Pointer to CUDA kernel
389 inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
392 auto kernelPtr = lincs_kernel<true, true>;
393 if (updateVelocities && computeVirial)
395 kernelPtr = lincs_kernel<true, true>;
397 else if (updateVelocities && !computeVirial)
399 kernelPtr = lincs_kernel<true, false>;
401 else if (!updateVelocities && computeVirial)
403 kernelPtr = lincs_kernel<false, true>;
405 else if (!updateVelocities && !computeVirial)
407 kernelPtr = lincs_kernel<false, false>;
412 void launchLincsGpuKernel(LincsGpuKernelParameters& kernelParams,
413 const DeviceBuffer<Float3> d_x,
414 DeviceBuffer<Float3> d_xp,
415 const bool updateVelocities,
416 DeviceBuffer<Float3> d_v,
418 const bool computeVirial,
419 const DeviceStream& deviceStream)
422 auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
424 KernelLaunchConfig config;
425 config.blockSize[0] = c_threadsPerBlock;
426 config.blockSize[1] = 1;
427 config.blockSize[2] = 1;
428 config.gridSize[0] = (kernelParams.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
429 config.gridSize[1] = 1;
430 config.gridSize[2] = 1;
432 // Shared memory is used to store:
433 // -- Current coordinates (3 floats per thread)
434 // -- Right-hand-sides for matrix inversion (2 floats per thread)
435 // -- Virial tensor components (6 floats per thread)
436 // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
437 // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
438 // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
441 config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
445 config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
448 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
451 asFloat3Pointer(&d_x),
452 asFloat3Pointer(&d_xp),
453 asFloat3Pointer(&d_v),
456 launchGpuKernel(kernelPtr,
460 "lincs_kernel<updateVelocities, computeVirial>",