src/gromacs/mdlib/lincs_gpu_internal.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements LINCS kernels using CUDA
  38  *
  39  * This file contains CUDA kernels of LINCS constraints algorithm.
  40  *
  41  * \author Artem Zhmurov <zhmurov@gmail.com>
  42  * \author Alan Gray <alang@nvidia.com>
  43  *
  44  * \ingroup module_mdlib
  45  */
  46 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  47 #include "gromacs/gpu_utils/cudautils.cuh"
  48 #include "gromacs/gpu_utils/devicebuffer.cuh"
  49 #include "gromacs/gpu_utils/gputraits.h"
  50 #include "gromacs/gpu_utils/typecasts.cuh"
  51 #include "gromacs/gpu_utils/vectype_ops.cuh"
  52 #include "gromacs/mdlib/lincs_gpu.h"
  53 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  54
  55 #include "lincs_gpu_internal.h"
  56
  57 namespace gmx
  58 {
  59
  60 //! Maximum number of threads in a block (for __launch_bounds__)
  61 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  62
  63 /*! \brief Main kernel for LINCS constraints.
  64  *
  65  * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
  66  *
  67  * In GPU version, one thread is responsible for all computations for one constraint. The blocks are
  68  * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
  69  * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
  70  * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
  71  * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
  72  * there is no need to synchronize across the device. However, extensive communication in a thread block
  73  * are still needed.
  74  *
  75  * \todo Reduce synchronization overhead. Some ideas are:
  76  *        1. Consider going to warp-level synchronization for the coupled constraints.
  77  *        2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
  78  *           the device level).
  79  *        3. Use analytical solution for matrix A inversion.
  80  *        4. Introduce mapping of thread id to both single constraint and single atom, thus designating
  81  *           Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
  82  *       See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885)
  83  * \todo The use of __restrict__  for gm_xp and gm_v causes failure, probably because of the atomic
  84          operations. Investigate this issue further.
  85  *
  86  * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in single struct.
  87  * \param[in]     invdt         Inverse timestep (needed to update velocities).
  88  */
  89 template<bool updateVelocities, bool computeVirial>
  90 __launch_bounds__(c_maxThreadsPerBlock) __global__
  91         void lincs_kernel(LincsGpuKernelParameters kernelParams,
  92                           const float3* __restrict__ gm_x,
  93                           float3*     gm_xp,
  94                           float3*     gm_v,
  95                           const float invdt)
  96 {
  97     const PbcAiuc pbcAiuc                                 = kernelParams.pbcAiuc;
  98     const int     numConstraintsThreads                   = kernelParams.numConstraintsThreads;
  99     const int     numIterations                           = kernelParams.numIterations;
 100     const int     expansionOrder                          = kernelParams.expansionOrder;
 101     const AtomPair* __restrict__ gm_constraints           = kernelParams.d_constraints;
 102     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
 103     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
 104     const int* __restrict__ gm_coupledConstraintsIndices = kernelParams.d_coupledConstraintsIndices;
 105     const float* __restrict__ gm_massFactors             = kernelParams.d_massFactors;
 106     float* __restrict__ gm_matrixA                       = kernelParams.d_matrixA;
 107     const float* __restrict__ gm_inverseMasses           = kernelParams.d_inverseMasses;
 108     float* __restrict__ gm_virialScaled                  = kernelParams.d_virialScaled;
 109
 110     const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 111
 112     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
 113     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
 114     assert(threadIndex < numConstraintsThreads);
 115
 116     // Vectors connecting constrained atoms before algorithm was applied.
 117     // Needed to construct constrain matrix A
 118     extern __shared__ float3 sm_r[];
 119
 120     AtomPair pair = gm_constraints[threadIndex];
 121     int      i    = pair.i;
 122     int      j    = pair.j;
 123
 124     // Mass-scaled Lagrange multiplier
 125     float lagrangeScaled = 0.0F;
 126
 127     float targetLength;
 128     float inverseMassi;
 129     float inverseMassj;
 130     float sqrtReducedMass;
 131
 132     float3 xi;
 133     float3 xj;
 134     float3 rc;
 135
 136     // i == -1 indicates dummy constraint at the end of the thread block.
 137     bool isDummyThread = (i == -1);
 138
 139     // Everything computed for these dummies will be equal to zero
 140     if (isDummyThread)
 141     {
 142         targetLength    = 0.0F;
 143         inverseMassi    = 0.0F;
 144         inverseMassj    = 0.0F;
 145         sqrtReducedMass = 0.0F;
 146
 147         xi = make_float3(0.0F, 0.0F, 0.0F);
 148         xj = make_float3(0.0F, 0.0F, 0.0F);
 149         rc = make_float3(0.0F, 0.0F, 0.0F);
 150     }
 151     else
 152     {
 153         // Collecting data
 154         targetLength    = gm_constraintsTargetLengths[threadIndex];
 155         inverseMassi    = gm_inverseMasses[i];
 156         inverseMassj    = gm_inverseMasses[j];
 157         sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
 158
 159         xi = gm_x[i];
 160         xj = gm_x[j];
 161
 162         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 163
 164         float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z);
 165         rc         = rlen * dx;
 166     }
 167
 168     sm_r[threadIdx.x] = rc;
 169     // Make sure that all r's are saved into shared memory
 170     // before they are accessed in the loop below
 171     __syncthreads();
 172
 173     /*
 174      * Constructing LINCS matrix (A)
 175      */
 176
 177     // Only non-zero values are saved (for coupled constraints)
 178     int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
 179     for (int n = 0; n < coupledConstraintsCount; n++)
 180     {
 181         int index = n * numConstraintsThreads + threadIndex;
 182         int c1    = gm_coupledConstraintsIndices[index];
 183
 184         float3 rc1        = sm_r[c1];
 185         gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
 186     }
 187
 188     // Skipping in dummy threads
 189     if (!isDummyThread)
 190     {
 191         xi = gm_xp[i];
 192         xj = gm_xp[j];
 193     }
 194
 195     float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 196
 197     float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength);
 198
 199     /*
 200      *  Inverse matrix using a set of expansionOrder matrix multiplications
 201      */
 202
 203     // Make sure that we don't overwrite the sm_r[..] array.
 204     __syncthreads();
 205
 206     // This will use the same memory space as sm_r, which is no longer needed.
 207     extern __shared__ float sm_rhs[];
 208     // Save current right-hand-side vector in the shared memory
 209     sm_rhs[threadIdx.x] = sol;
 210
 211     for (int rec = 0; rec < expansionOrder; rec++)
 212     {
 213         // Making sure that all sm_rhs are saved before they are accessed in a loop below
 214         __syncthreads();
 215         float mvb = 0.0F;
 216
 217         for (int n = 0; n < coupledConstraintsCount; n++)
 218         {
 219             int index = n * numConstraintsThreads + threadIndex;
 220             int c1    = gm_coupledConstraintsIndices[index];
 221             // Convolute current right-hand-side with A
 222             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
 223             mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
 224         }
 225         // 'Switch' rhs vectors, save current result
 226         // These values will be accessed in the loop above during the next iteration.
 227         sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 228         sol                                                = sol + mvb;
 229     }
 230
 231     // Current mass-scaled Lagrange multipliers
 232     lagrangeScaled = sqrtReducedMass * sol;
 233
 234     // Save updated coordinates before correction for the rotational lengthening
 235     float3 tmp = rc * lagrangeScaled;
 236
 237     // Writing for all but dummy constraints
 238     if (!isDummyThread)
 239     {
 240         atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 241         atomicAdd(&gm_xp[j], tmp * inverseMassj);
 242     }
 243
 244     /*
 245      *  Correction for centripetal effects
 246      */
 247     for (int iter = 0; iter < numIterations; iter++)
 248     {
 249         // Make sure that all xp's are saved: atomic operation calls before are
 250         // communicating current xp[..] values across thread block.
 251         __syncthreads();
 252
 253         if (!isDummyThread)
 254         {
 255             xi = gm_xp[i];
 256             xj = gm_xp[j];
 257         }
 258
 259         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 260
 261         float len2  = targetLength * targetLength;
 262         float dlen2 = 2.0F * len2 - norm2(dx);
 263
 264         // TODO A little bit more effective but slightly less readable version of the below would be:
 265         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
 266         float proj;
 267         if (dlen2 > 0.0F)
 268         {
 269             proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
 270         }
 271         else
 272         {
 273             proj = sqrtReducedMass * targetLength;
 274         }
 275
 276         sm_rhs[threadIdx.x] = proj;
 277         float sol           = proj;
 278
 279         /*
 280          * Same matrix inversion as above is used for updated data
 281          */
 282         for (int rec = 0; rec < expansionOrder; rec++)
 283         {
 284             // Make sure that all elements of rhs are saved into shared memory
 285             __syncthreads();
 286             float mvb = 0;
 287
 288             for (int n = 0; n < coupledConstraintsCount; n++)
 289             {
 290                 int index = n * numConstraintsThreads + threadIndex;
 291                 int c1    = gm_coupledConstraintsIndices[index];
 292
 293                 mvb = mvb + gm_matrixA[index] * sm_rhs[c1 + blockDim.x * (rec % 2)];
 294             }
 295             sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 296             sol                                                = sol + mvb;
 297         }
 298
 299         // Add corrections to Lagrange multipliers
 300         float sqrtmu_sol = sqrtReducedMass * sol;
 301         lagrangeScaled += sqrtmu_sol;
 302
 303         // Save updated coordinates for the next iteration
 304         // Dummy constraints are skipped
 305         if (!isDummyThread)
 306         {
 307             float3 tmp = rc * sqrtmu_sol;
 308             atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 309             atomicAdd(&gm_xp[j], tmp * inverseMassj);
 310         }
 311     }
 312
 313     // Updating particle velocities for all but dummy threads
 314     if (updateVelocities && !isDummyThread)
 315     {
 316         float3 tmp = rc * invdt * lagrangeScaled;
 317         atomicAdd(&gm_v[i], -tmp * inverseMassi);
 318         atomicAdd(&gm_v[j], tmp * inverseMassj);
 319     }
 320
 321
 322     if (computeVirial)
 323     {
 324         // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
 325         // (targetLength) and the normalized vector connecting constrained atoms before
 326         // the algorithm was applied (rc). The evaluation of virial in each thread is
 327         // followed by basic reduction for the values inside single thread block.
 328         // Then, the values are reduced across grid by atomicAdd(...).
 329         //
 330         // TODO Shuffle reduction.
 331         // TODO Should be unified and/or done once when virial is actually needed.
 332         // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
 333         //      one that works for any datatype.
 334
 335         // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
 336         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
 337         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
 338         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
 339         // two are no longer in use, which we make sure by waiting for all threads in block.
 340         __syncthreads();
 341         extern __shared__ float sm_threadVirial[];
 342         float                   mult                  = targetLength * lagrangeScaled;
 343         sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
 344         sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y;
 345         sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z;
 346         sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y;
 347         sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z;
 348         sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z;
 349
 350         __syncthreads();
 351
 352         // Reduce up to one virial per thread block. All blocks are divided by half, the first
 353         // half of threads sums two virials. Then the first half is divided by two and the first
 354         // half of it sums two values. This procedure is repeated until only one thread is left.
 355         // Only works if the threads per blocks is a power of two (hence static_assert
 356         // in the beginning of the kernel).
 357         for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
 358         {
 359             int dividedAt = blockDim.x / divideBy;
 360             if (static_cast<int>(threadIdx.x) < dividedAt)
 361             {
 362                 for (int d = 0; d < 6; d++)
 363                 {
 364                     sm_threadVirial[d * blockDim.x + threadIdx.x] +=
 365                             sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
 366                 }
 367             }
 368             // Syncronize if not within one warp
 369             if (dividedAt > warpSize / 2)
 370             {
 371                 __syncthreads();
 372             }
 373         }
 374         // First 6 threads in the block add the results of 6 tensor components to the global memory address.
 375         if (threadIdx.x < 6)
 376         {
 377             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
 378         }
 379     }
 380 }
 381
 382 /*! \brief Select templated kernel.
 383  *
 384  * Returns pointer to a CUDA kernel based on provided booleans.
 385  *
 386  * \param[in] updateVelocities  If the velocities should be constrained.
 387  * \param[in] computeVirial     If virial should be updated.
 388  *
 389  * \return                      Pointer to CUDA kernel
 390  */
 391 inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
 392 {
 393
 394     auto kernelPtr = lincs_kernel<true, true>;
 395     if (updateVelocities && computeVirial)
 396     {
 397         kernelPtr = lincs_kernel<true, true>;
 398     }
 399     else if (updateVelocities && !computeVirial)
 400     {
 401         kernelPtr = lincs_kernel<true, false>;
 402     }
 403     else if (!updateVelocities && computeVirial)
 404     {
 405         kernelPtr = lincs_kernel<false, true>;
 406     }
 407     else if (!updateVelocities && !computeVirial)
 408     {
 409         kernelPtr = lincs_kernel<false, false>;
 410     }
 411     return kernelPtr;
 412 }
 413
 414 void launchLincsGpuKernel(LincsGpuKernelParameters*   kernelParams,
 415                           const DeviceBuffer<Float3>& d_x,
 416                           DeviceBuffer<Float3>        d_xp,
 417                           const bool                  updateVelocities,
 418                           DeviceBuffer<Float3>        d_v,
 419                           const real                  invdt,
 420                           const bool                  computeVirial,
 421                           const DeviceStream&         deviceStream)
 422 {
 423
 424     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
 425
 426     KernelLaunchConfig config;
 427     config.blockSize[0] = c_threadsPerBlock;
 428     config.blockSize[1] = 1;
 429     config.blockSize[2] = 1;
 430     config.gridSize[0] = (kernelParams->numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
 431     config.gridSize[1] = 1;
 432     config.gridSize[2] = 1;
 433
 434     // Shared memory is used to store:
 435     // -- Current coordinates (3 floats per thread)
 436     // -- Right-hand-sides for matrix inversion (2 floats per thread)
 437     // -- Virial tensor components (6 floats per thread)
 438     // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
 439     // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
 440     // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
 441     if (computeVirial)
 442     {
 443         config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
 444     }
 445     else
 446     {
 447         config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
 448     }
 449
 450     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
 451                                                       config,
 452                                                       kernelParams,
 453                                                       asFloat3Pointer(&d_x),
 454                                                       asFloat3Pointer(&d_xp),
 455                                                       asFloat3Pointer(&d_v),
 456                                                       &invdt);
 457
 458     launchGpuKernel(kernelPtr,
 459                     config,
 460                     deviceStream,
 461                     nullptr,
 462                     "lincs_kernel<updateVelocities, computeVirial>",
 463                     kernelArgs);
 464 }
 465
 466 } // namespace gmx