src/gromacs/mdlib/lincs_gpu.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements LINCS using CUDA
  38  *
  39  * This file contains implementation of LINCS constraints algorithm
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \author Artem Zhmurov <zhmurov@gmail.com>
  44  * \author Alan Gray <alang@nvidia.com>
  45  *
  46  * \ingroup module_mdlib
  47  */
  48 #include "gmxpre.h"
  49
  50 #include "lincs_gpu.cuh"
  51
  52 #include <assert.h>
  53 #include <stdio.h>
  54
  55 #include <cmath>
  56
  57 #include <algorithm>
  58
  59 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  60 #include "gromacs/gpu_utils/cudautils.cuh"
  61 #include "gromacs/gpu_utils/devicebuffer.cuh"
  62 #include "gromacs/gpu_utils/gputraits.cuh"
  63 #include "gromacs/gpu_utils/vectype_ops.cuh"
  64 #include "gromacs/math/vec.h"
  65 #include "gromacs/mdlib/constr.h"
  66 #include "gromacs/pbcutil/pbc.h"
  67 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  68 #include "gromacs/topology/forcefieldparameters.h"
  69 #include "gromacs/topology/ifunc.h"
  70 #include "gromacs/topology/topology.h"
  71
  72 namespace gmx
  73 {
  74
  75 //! Number of CUDA threads in a block
  76 constexpr static int c_threadsPerBlock = 256;
  77 //! Maximum number of threads in a block (for __launch_bounds__)
  78 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  79
  80 /*! \brief Main kernel for LINCS constraints.
  81  *
  82  * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
  83  *
  84  * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
  85  * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
  86  * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
  87  * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
  88  * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
  89  * there is no need to synchronize across the device. However, extensive communication in a thread block
  90  * are still needed.
  91  *
  92  * \todo Reduce synchronization overhead. Some ideas are:
  93  *        1. Consider going to warp-level synchronization for the coupled constraints.
  94  *        2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
  95  *           the device level).
  96  *        3. Use analytical solution for matrix A inversion.
  97  *        4. Introduce mapping of thread id to both single constraint and single atom, thus designating
  98  *           Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
  99  *       See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885)
 100  * \todo The use of __restrict__  for gm_xp and gm_v causes failure, probably because of the atomic
 101          operations. Investigate this issue further.
 102  *
 103  * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in single struct.
 104  * \param[in]     invdt         Inverse timestep (needed to update velocities).
 105  */
 106 template<bool updateVelocities, bool computeVirial>
 107 __launch_bounds__(c_maxThreadsPerBlock) __global__
 108         void lincs_kernel(LincsGpuKernelParameters kernelParams,
 109                           const float3* __restrict__ gm_x,
 110                           float3*     gm_xp,
 111                           float3*     gm_v,
 112                           const float invdt)
 113 {
 114     const PbcAiuc pbcAiuc                                 = kernelParams.pbcAiuc;
 115     const int     numConstraintsThreads                   = kernelParams.numConstraintsThreads;
 116     const int     numIterations                           = kernelParams.numIterations;
 117     const int     expansionOrder                          = kernelParams.expansionOrder;
 118     const int2* __restrict__ gm_constraints               = kernelParams.d_constraints;
 119     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
 120     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
 121     const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices;
 122     const float* __restrict__ gm_massFactors           = kernelParams.d_massFactors;
 123     float* __restrict__ gm_matrixA                     = kernelParams.d_matrixA;
 124     const float* __restrict__ gm_inverseMasses         = kernelParams.d_inverseMasses;
 125     float* __restrict__ gm_virialScaled                = kernelParams.d_virialScaled;
 126
 127     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 128
 129     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
 130     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
 131     assert(threadIndex < numConstraintsThreads);
 132
 133     // Vectors connecting constrained atoms before algorithm was applied.
 134     // Needed to construct constrain matrix A
 135     extern __shared__ float3 sm_r[];
 136
 137     int2 pair = gm_constraints[threadIndex];
 138     int  i    = pair.x;
 139     int  j    = pair.y;
 140
 141     // Mass-scaled Lagrange multiplier
 142     float lagrangeScaled = 0.0f;
 143
 144     float targetLength;
 145     float inverseMassi;
 146     float inverseMassj;
 147     float sqrtReducedMass;
 148
 149     float3 xi;
 150     float3 xj;
 151     float3 rc;
 152
 153     // i == -1 indicates dummy constraint at the end of the thread block.
 154     bool isDummyThread = (i == -1);
 155
 156     // Everything computed for these dummies will be equal to zero
 157     if (isDummyThread)
 158     {
 159         targetLength    = 0.0f;
 160         inverseMassi    = 0.0f;
 161         inverseMassj    = 0.0f;
 162         sqrtReducedMass = 0.0f;
 163
 164         xi = make_float3(0.0f, 0.0f, 0.0f);
 165         xj = make_float3(0.0f, 0.0f, 0.0f);
 166         rc = make_float3(0.0f, 0.0f, 0.0f);
 167     }
 168     else
 169     {
 170         // Collecting data
 171         targetLength    = gm_constraintsTargetLengths[threadIndex];
 172         inverseMassi    = gm_inverseMasses[i];
 173         inverseMassj    = gm_inverseMasses[j];
 174         sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
 175
 176         xi = gm_x[i];
 177         xj = gm_x[j];
 178
 179         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 180
 181         float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z);
 182         rc         = rlen * dx;
 183     }
 184
 185     sm_r[threadIdx.x] = rc;
 186     // Make sure that all r's are saved into shared memory
 187     // before they are accessed in the loop below
 188     __syncthreads();
 189
 190     /*
 191      * Constructing LINCS matrix (A)
 192      */
 193
 194     // Only non-zero values are saved (for coupled constraints)
 195     int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
 196     for (int n = 0; n < coupledConstraintsCount; n++)
 197     {
 198         int index = n * numConstraintsThreads + threadIndex;
 199         int c1    = gm_coupledConstraintsIdxes[index];
 200
 201         float3 rc1        = sm_r[c1 - blockIdx.x * blockDim.x];
 202         gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
 203     }
 204
 205     // Skipping in dummy threads
 206     if (!isDummyThread)
 207     {
 208         xi = gm_xp[i];
 209         xj = gm_xp[j];
 210     }
 211
 212     float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 213
 214     float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength);
 215
 216     /*
 217      *  Inverse matrix using a set of expansionOrder matrix multiplications
 218      */
 219
 220     // This will use the same memory space as sm_r, which is no longer needed.
 221     extern __shared__ float sm_rhs[];
 222     // Save current right-hand-side vector in the shared memory
 223     sm_rhs[threadIdx.x] = sol;
 224
 225     for (int rec = 0; rec < expansionOrder; rec++)
 226     {
 227         // Making sure that all sm_rhs are saved before they are accessed in a loop below
 228         __syncthreads();
 229         float mvb = 0.0f;
 230
 231         for (int n = 0; n < coupledConstraintsCount; n++)
 232         {
 233             int index = n * numConstraintsThreads + threadIndex;
 234             int c1    = gm_coupledConstraintsIdxes[index];
 235             // Convolute current right-hand-side with A
 236             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
 237             mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
 238         }
 239         // 'Switch' rhs vectors, save current result
 240         // These values will be accessed in the loop above during the next iteration.
 241         sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 242         sol                                                = sol + mvb;
 243     }
 244
 245     // Current mass-scaled Lagrange multipliers
 246     lagrangeScaled = sqrtReducedMass * sol;
 247
 248     // Save updated coordinates before correction for the rotational lengthening
 249     float3 tmp = rc * lagrangeScaled;
 250
 251     // Writing for all but dummy constraints
 252     if (!isDummyThread)
 253     {
 254         atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 255         atomicAdd(&gm_xp[j], tmp * inverseMassj);
 256     }
 257
 258     /*
 259      *  Correction for centripetal effects
 260      */
 261     for (int iter = 0; iter < numIterations; iter++)
 262     {
 263         // Make sure that all xp's are saved: atomic operation calls before are
 264         // communicating current xp[..] values across thread block.
 265         __syncthreads();
 266
 267         if (!isDummyThread)
 268         {
 269             xi = gm_xp[i];
 270             xj = gm_xp[j];
 271         }
 272
 273         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 274
 275         float len2  = targetLength * targetLength;
 276         float dlen2 = 2.0f * len2 - norm2(dx);
 277
 278         // TODO A little bit more effective but slightly less readable version of the below would be:
 279         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
 280         float proj;
 281         if (dlen2 > 0.0f)
 282         {
 283             proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
 284         }
 285         else
 286         {
 287             proj = sqrtReducedMass * targetLength;
 288         }
 289
 290         sm_rhs[threadIdx.x] = proj;
 291         float sol           = proj;
 292
 293         /*
 294          * Same matrix inversion as above is used for updated data
 295          */
 296         for (int rec = 0; rec < expansionOrder; rec++)
 297         {
 298             // Make sure that all elements of rhs are saved into shared memory
 299             __syncthreads();
 300             float mvb = 0;
 301
 302             for (int n = 0; n < coupledConstraintsCount; n++)
 303             {
 304                 int index = n * numConstraintsThreads + threadIndex;
 305                 int c1    = gm_coupledConstraintsIdxes[index];
 306
 307                 mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
 308             }
 309             sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 310             sol                                                = sol + mvb;
 311         }
 312
 313         // Add corrections to Lagrange multipliers
 314         float sqrtmu_sol = sqrtReducedMass * sol;
 315         lagrangeScaled += sqrtmu_sol;
 316
 317         // Save updated coordinates for the next iteration
 318         // Dummy constraints are skipped
 319         if (!isDummyThread)
 320         {
 321             float3 tmp = rc * sqrtmu_sol;
 322             atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 323             atomicAdd(&gm_xp[j], tmp * inverseMassj);
 324         }
 325     }
 326
 327     // Updating particle velocities for all but dummy threads
 328     if (updateVelocities && !isDummyThread)
 329     {
 330         float3 tmp = rc * invdt * lagrangeScaled;
 331         atomicAdd(&gm_v[i], -tmp * inverseMassi);
 332         atomicAdd(&gm_v[j], tmp * inverseMassj);
 333     }
 334
 335
 336     if (computeVirial)
 337     {
 338         // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
 339         // (targetLength) and the normalized vector connecting constrained atoms before
 340         // the algorithm was applied (rc). The evaluation of virial in each thread is
 341         // followed by basic reduction for the values inside single thread block.
 342         // Then, the values are reduced across grid by atomicAdd(...).
 343         //
 344         // TODO Shuffle reduction.
 345         // TODO Should be unified and/or done once when virial is actually needed.
 346         // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
 347         //      one that works for any datatype.
 348
 349         // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
 350         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
 351         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
 352         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
 353         // two are no longer in use.
 354         extern __shared__ float sm_threadVirial[];
 355         float                   mult                  = targetLength * lagrangeScaled;
 356         sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
 357         sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y;
 358         sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z;
 359         sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y;
 360         sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z;
 361         sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z;
 362
 363         __syncthreads();
 364
 365         // Reduce up to one virial per thread block. All blocks are divided by half, the first
 366         // half of threads sums two virials. Then the first half is divided by two and the first
 367         // half of it sums two values. This procedure is repeated until only one thread is left.
 368         // Only works if the threads per blocks is a power of two (hence static_assert
 369         // in the beginning of the kernel).
 370         for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
 371         {
 372             int dividedAt = blockDim.x / divideBy;
 373             if (static_cast<int>(threadIdx.x) < dividedAt)
 374             {
 375                 for (int d = 0; d < 6; d++)
 376                 {
 377                     sm_threadVirial[d * blockDim.x + threadIdx.x] +=
 378                             sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
 379                 }
 380             }
 381             // Syncronize if not within one warp
 382             if (dividedAt > warpSize / 2)
 383             {
 384                 __syncthreads();
 385             }
 386         }
 387         // First 6 threads in the block add the results of 6 tensor components to the global memory address.
 388         if (threadIdx.x < 6)
 389         {
 390             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
 391         }
 392     }
 393
 394     return;
 395 }
 396
 397 /*! \brief Select templated kernel.
 398  *
 399  * Returns pointer to a CUDA kernel based on provided booleans.
 400  *
 401  * \param[in] updateVelocities  If the velocities should be constrained.
 402  * \param[in] computeVirial     If virial should be updated.
 403  *
 404  * \return                      Pointer to CUDA kernel
 405  */
 406 inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
 407 {
 408
 409     auto kernelPtr = lincs_kernel<true, true>;
 410     if (updateVelocities && computeVirial)
 411     {
 412         kernelPtr = lincs_kernel<true, true>;
 413     }
 414     else if (updateVelocities && !computeVirial)
 415     {
 416         kernelPtr = lincs_kernel<true, false>;
 417     }
 418     else if (!updateVelocities && computeVirial)
 419     {
 420         kernelPtr = lincs_kernel<false, true>;
 421     }
 422     else if (!updateVelocities && !computeVirial)
 423     {
 424         kernelPtr = lincs_kernel<false, false>;
 425     }
 426     return kernelPtr;
 427 }
 428
 429 void LincsGpu::apply(const float3* d_x,
 430                      float3*       d_xp,
 431                      const bool    updateVelocities,
 432                      float3*       d_v,
 433                      const real    invdt,
 434                      const bool    computeVirial,
 435                      tensor        virialScaled,
 436                      const PbcAiuc pbcAiuc)
 437 {
 438     ensureNoPendingDeviceError("In CUDA version of LINCS");
 439
 440     // Early exit if no constraints
 441     if (kernelParams_.numConstraintsThreads == 0)
 442     {
 443         return;
 444     }
 445
 446     if (computeVirial)
 447     {
 448         // Fill with zeros so the values can be reduced to it
 449         // Only 6 values are needed because virial is symmetrical
 450         clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_);
 451     }
 452
 453     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
 454
 455     KernelLaunchConfig config;
 456     config.blockSize[0] = c_threadsPerBlock;
 457     config.blockSize[1] = 1;
 458     config.blockSize[2] = 1;
 459     config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
 460     config.gridSize[1] = 1;
 461     config.gridSize[2] = 1;
 462
 463     // Shared memory is used to store:
 464     // -- Current coordinates (3 floats per thread)
 465     // -- Right-hand-sides for matrix inversion (2 floats per thread)
 466     // -- Virial tensor components (6 floats per thread)
 467     // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
 468     // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
 469     // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
 470     if (computeVirial)
 471     {
 472         config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
 473     }
 474     else
 475     {
 476         config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
 477     }
 478
 479     kernelParams_.pbcAiuc = pbcAiuc;
 480
 481     const auto kernelArgs =
 482             prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
 483
 484     launchGpuKernel(kernelPtr,
 485                     config,
 486                     deviceStream_,
 487                     nullptr,
 488                     "lincs_kernel<updateVelocities, computeVirial>",
 489                     kernelArgs);
 490
 491     if (computeVirial)
 492     {
 493         // Copy LINCS virial data and add it to the common virial
 494         copyFromDeviceBuffer(h_virialScaled_.data(),
 495                              &kernelParams_.d_virialScaled,
 496                              0,
 497                              6,
 498                              deviceStream_,
 499                              GpuApiCallBehavior::Sync,
 500                              nullptr);
 501
 502         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
 503         virialScaled[XX][XX] += h_virialScaled_[0];
 504         virialScaled[XX][YY] += h_virialScaled_[1];
 505         virialScaled[XX][ZZ] += h_virialScaled_[2];
 506
 507         virialScaled[YY][XX] += h_virialScaled_[1];
 508         virialScaled[YY][YY] += h_virialScaled_[3];
 509         virialScaled[YY][ZZ] += h_virialScaled_[4];
 510
 511         virialScaled[ZZ][XX] += h_virialScaled_[2];
 512         virialScaled[ZZ][YY] += h_virialScaled_[4];
 513         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
 514     }
 515
 516     return;
 517 }
 518
 519 LincsGpu::LincsGpu(int                  numIterations,
 520                    int                  expansionOrder,
 521                    const DeviceContext& deviceContext,
 522                    const DeviceStream&  deviceStream) :
 523     deviceContext_(deviceContext),
 524     deviceStream_(deviceStream)
 525 {
 526     kernelParams_.numIterations  = numIterations;
 527     kernelParams_.expansionOrder = expansionOrder;
 528
 529     static_assert(sizeof(real) == sizeof(float),
 530                   "Real numbers should be in single precision in GPU code.");
 531     static_assert(
 532             c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
 533             "Number of threads per block should be a power of two in order for reduction to work.");
 534
 535     allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
 536     h_virialScaled_.resize(6);
 537
 538     // The data arrays should be expanded/reallocated on first call of set() function.
 539     numConstraintsThreadsAlloc_ = 0;
 540     numAtomsAlloc_              = 0;
 541 }
 542
 543 LincsGpu::~LincsGpu()
 544 {
 545     freeDeviceBuffer(&kernelParams_.d_virialScaled);
 546
 547     if (numConstraintsThreadsAlloc_ > 0)
 548     {
 549         freeDeviceBuffer(&kernelParams_.d_constraints);
 550         freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 551
 552         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 553         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 554         freeDeviceBuffer(&kernelParams_.d_massFactors);
 555         freeDeviceBuffer(&kernelParams_.d_matrixA);
 556     }
 557     if (numAtomsAlloc_ > 0)
 558     {
 559         freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 560     }
 561 }
 562
 563 //! Helper type for discovering coupled constraints
 564 struct AtomsAdjacencyListElement
 565 {
 566     AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
 567                               const int indexOfConstraint,
 568                               const int signFactor) :
 569         indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
 570         indexOfConstraint_(indexOfConstraint),
 571         signFactor_(signFactor)
 572     {
 573     }
 574     //! The index of the other atom constrained to this atom.
 575     int indexOfSecondConstrainedAtom_;
 576     //! The index of this constraint in the container of constraints.
 577     int indexOfConstraint_;
 578     /*! \brief A multiplicative factor that indicates the relative
 579      * order of the atoms in the atom list.
 580      *
 581      * Used for computing the mass factor of this constraint
 582      * relative to any coupled constraints. */
 583     int signFactor_;
 584 };
 585 //! Constructs and returns an atom constraint adjacency list
 586 static std::vector<std::vector<AtomsAdjacencyListElement>>
 587 constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
 588 {
 589     const int                                           stride         = 1 + NRAL(F_CONSTR);
 590     const int                                           numConstraints = iatoms.ssize() / stride;
 591     std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
 592     for (int c = 0; c < numConstraints; c++)
 593     {
 594         int a1 = iatoms[stride * c + 1];
 595         int a2 = iatoms[stride * c + 2];
 596
 597         // Each constraint will be represented as a tuple, containing index of the second
 598         // constrained atom, index of the constraint and a sign that indicates the order of atoms in
 599         // which they are listed. Sign is needed to compute the mass factors.
 600         atomsAdjacencyList[a1].emplace_back(a2, c, +1);
 601         atomsAdjacencyList[a2].emplace_back(a1, c, -1);
 602     }
 603
 604     return atomsAdjacencyList;
 605 }
 606
 607 /*! \brief Helper function to go through constraints recursively.
 608  *
 609  *  For each constraint, counts the number of coupled constraints and stores the value in \p numCoupledConstraints array.
 610  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
 611  *  coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
 612  *  value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
 613  *  after it in the constraints array.
 614  *
 615  * \param[in]     a                   Atom index.
 616  * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
 617  *                                    the number of constraints (i) connected to it and (ii) located
 618  *                                    after it in memory. This array is filled by this recursive function.
 619  *                                    For a set of coupled constraints, only for the first one in this list
 620  *                                    the number of consecutive coupled constraints is needed: if there is
 621  *                                    not enough space for this set of constraints in the thread block,
 622  *                                    the group has to be moved to the next one.
 623  * \param[in]     atomsAdjacencyList  Stores information about connections between atoms.
 624  */
 625 inline int countCoupled(int           a,
 626                         ArrayRef<int> numCoupledConstraints,
 627                         ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
 628
 629 {
 630     int counted = 0;
 631     for (const auto& adjacentAtom : atomsAdjacencyList[a])
 632     {
 633         const int c2 = adjacentAtom.indexOfConstraint_;
 634         if (numCoupledConstraints[c2] == -1)
 635         {
 636             numCoupledConstraints[c2] = 0; // To indicate we've been here
 637             counted += 1
 638                        + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
 639                                       numCoupledConstraints,
 640                                       atomsAdjacencyList);
 641         }
 642     }
 643     return counted;
 644 }
 645
 646 /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
 647  *
 648  *  Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
 649  *  if it was not yet added. Then goes through all the constraints coupled to \p c
 650  *  and calls itself recursively. This ensures that all the coupled constraints will
 651  *  be added to neighboring locations in the final data structures on the device,
 652  *  hence mapping all coupled constraints to the same thread block. A value of -1 in
 653  *  the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
 654  *
 655  * \param[in]     iatoms              The list of constraints.
 656  * \param[in]     stride              Number of elements per constraint in \p iatoms.
 657  * \param[in]     atomsAdjacencyList  Information about connections between atoms.
 658  * \param[our]    splitMap            Map of sequential constraint indexes to indexes to be on the device
 659  * \param[in]     c                   Sequential index for constraint to consider adding.
 660  * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
 661  */
 662 inline void addWithCoupled(ArrayRef<const int>                                    iatoms,
 663                            const int                                              stride,
 664                            ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
 665                            ArrayRef<int>                                          splitMap,
 666                            const int                                              c,
 667                            int*                                                   currentMapIndex)
 668 {
 669     if (splitMap[c] == -1)
 670     {
 671         splitMap[c] = *currentMapIndex;
 672         (*currentMapIndex)++;
 673
 674         // Constraints, coupled through both atoms.
 675         for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
 676         {
 677             const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
 678             for (const auto& adjacentAtom : atomsAdjacencyList[a1])
 679             {
 680                 const int c2 = adjacentAtom.indexOfConstraint_;
 681                 if (c2 != c)
 682                 {
 683                     addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
 684                 }
 685             }
 686         }
 687     }
 688 }
 689
 690 /*! \brief Computes and returns how many constraints are coupled to each constraint
 691  *
 692  * Needed to introduce splits in data so that all coupled constraints will be computed in a
 693  * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
 694  * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
 695  * first index of the connected group of the constraints is needed later in the code, hence the
 696  * numCoupledConstraints vector is also used to keep track if the constrain was already counted.
 697  */
 698 static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
 699                                                    ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
 700 {
 701     const int        stride         = 1 + NRAL(F_CONSTR);
 702     const int        numConstraints = iatoms.ssize() / stride;
 703     std::vector<int> numCoupledConstraints(numConstraints, -1);
 704     for (int c = 0; c < numConstraints; c++)
 705     {
 706         const int a1 = iatoms[stride * c + 1];
 707         const int a2 = iatoms[stride * c + 2];
 708         if (numCoupledConstraints[c] == -1)
 709         {
 710             numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
 711                                        + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
 712         }
 713     }
 714
 715     return numCoupledConstraints;
 716 }
 717
 718 bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
 719 {
 720     for (const gmx_moltype_t& molType : mtop.moltype)
 721     {
 722         ArrayRef<const int> iatoms    = molType.ilist[F_CONSTR].iatoms;
 723         const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
 724         // Compute, how many constraints are coupled to each constraint
 725         const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
 726         for (const int numCoupled : numCoupledConstraints)
 727         {
 728             if (numCoupled > c_threadsPerBlock)
 729             {
 730                 return false;
 731             }
 732         }
 733     }
 734
 735     return true;
 736 }
 737
 738 void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass)
 739 {
 740     // List of constrained atoms (CPU memory)
 741     std::vector<int2> constraintsHost;
 742     // Equilibrium distances for the constraints (CPU)
 743     std::vector<float> constraintsTargetLengthsHost;
 744     // Number of constraints, coupled with the current one (CPU)
 745     std::vector<int> coupledConstraintsCountsHost;
 746     // List of coupled with the current one (CPU)
 747     std::vector<int> coupledConstraintsIndicesHost;
 748     // Mass factors (CPU)
 749     std::vector<float> massFactorsHost;
 750
 751     // List of constrained atoms in local topology
 752     ArrayRef<const int> iatoms         = idef.il[F_CONSTR].iatoms;
 753     const int           stride         = NRAL(F_CONSTR) + 1;
 754     const int           numConstraints = idef.il[F_CONSTR].size() / stride;
 755
 756     // Early exit if no constraints
 757     if (numConstraints == 0)
 758     {
 759         kernelParams_.numConstraintsThreads = 0;
 760         return;
 761     }
 762
 763     // Construct the adjacency list, a useful intermediate structure
 764     const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
 765
 766     // Compute, how many constraints are coupled to each constraint
 767     const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
 768
 769     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
 770     // takes into account the empty spaces which might be needed in the end of each thread block.
 771     std::vector<int> splitMap(numConstraints, -1);
 772     int              currentMapIndex = 0;
 773     for (int c = 0; c < numConstraints; c++)
 774     {
 775         // Check if coupled constraints all fit in one block
 776         if (numCoupledConstraints.at(c) > c_threadsPerBlock)
 777         {
 778             gmx_fatal(FARGS,
 779                       "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
 780                       "thread block (%d). Most likely, you are trying to use the GPU version of "
 781                       "LINCS with constraints on all-bonds, which is not supported for large "
 782                       "molecules. When compatible with the force field and integration settings, "
 783                       "using constraints on H-bonds only.",
 784                       numCoupledConstraints.at(c),
 785                       c_threadsPerBlock);
 786         }
 787         if (currentMapIndex / c_threadsPerBlock
 788             != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
 789         {
 790             currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
 791         }
 792         addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
 793     }
 794
 795     kernelParams_.numConstraintsThreads =
 796             currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
 797     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
 798                        "Number of threads should be a multiple of the block size");
 799
 800     // Initialize constraints and their target indexes taking into account the splits in the data arrays.
 801     int2 pair;
 802     pair.x = -1;
 803     pair.y = -1;
 804     constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
 805     std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
 806     constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
 807     std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
 808     for (int c = 0; c < numConstraints; c++)
 809     {
 810         int a1   = iatoms[stride * c + 1];
 811         int a2   = iatoms[stride * c + 2];
 812         int type = iatoms[stride * c];
 813
 814         int2 pair;
 815         pair.x                                          = a1;
 816         pair.y                                          = a2;
 817         constraintsHost.at(splitMap.at(c))              = pair;
 818         constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
 819     }
 820
 821     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
 822     // We map a single thread to a single constraint, hence each thread 'c' will be using one
 823     // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
 824     // to the constraint 'c'. The coupled constraints indexes are placed into the
 825     // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
 826     // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
 827     // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
 828     // array --- a number, greater then total number of constraints, taking into account the splits
 829     // in the constraints array due to the GPU block borders. This number can be adjusted to improve
 830     // memory access pattern. Mass factors are saved in a similar data structure.
 831     int  maxCoupledConstraints             = 0;
 832     bool maxCoupledConstraintsHasIncreased = false;
 833     for (int c = 0; c < numConstraints; c++)
 834     {
 835         int a1 = iatoms[stride * c + 1];
 836         int a2 = iatoms[stride * c + 2];
 837
 838         // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
 839         int nCoupledConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
 840
 841         if (nCoupledConstraints > maxCoupledConstraints)
 842         {
 843             maxCoupledConstraints             = nCoupledConstraints;
 844             maxCoupledConstraintsHasIncreased = true;
 845         }
 846     }
 847
 848     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
 849     coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 850     massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 851
 852     for (int c1 = 0; c1 < numConstraints; c1++)
 853     {
 854         coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0;
 855         int c1a1                                         = iatoms[stride * c1 + 1];
 856         int c1a2                                         = iatoms[stride * c1 + 2];
 857
 858         // Constraints, coupled through the first atom.
 859         int c2a1 = c1a1;
 860         for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
 861         {
 862             int c2 = atomAdjacencyList.indexOfConstraint_;
 863
 864             if (c1 != c2)
 865             {
 866                 int c2a2  = atomAdjacencyList.indexOfSecondConstrainedAtom_;
 867                 int sign  = atomAdjacencyList.signFactor_;
 868                 int index = kernelParams_.numConstraintsThreads
 869                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
 870                             + splitMap.at(c1);
 871
 872                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 873
 874                 int center = c1a1;
 875
 876                 float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
 877                 float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
 878
 879                 massFactorsHost.at(index) = -sign * invmass[center] * sqrtmu1 * sqrtmu2;
 880
 881                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 882             }
 883         }
 884
 885         // Constraints, coupled through the second atom.
 886         c2a1 = c1a2;
 887         for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
 888         {
 889             int c2 = atomAdjacencyList.indexOfConstraint_;
 890
 891             if (c1 != c2)
 892             {
 893                 int c2a2  = atomAdjacencyList.indexOfSecondConstrainedAtom_;
 894                 int sign  = atomAdjacencyList.signFactor_;
 895                 int index = kernelParams_.numConstraintsThreads
 896                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
 897                             + splitMap.at(c1);
 898
 899                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 900
 901                 int center = c1a2;
 902
 903                 float sqrtmu1 = 1.0 / sqrt(invmass[c1a1] + invmass[c1a2]);
 904                 float sqrtmu2 = 1.0 / sqrt(invmass[c2a1] + invmass[c2a2]);
 905
 906                 massFactorsHost.at(index) = sign * invmass[center] * sqrtmu1 * sqrtmu2;
 907
 908                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 909             }
 910         }
 911     }
 912
 913     // (Re)allocate the memory, if the number of constraints has increased.
 914     if ((kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_) || maxCoupledConstraintsHasIncreased)
 915     {
 916         // Free memory if it was allocated before (i.e. if not the first time here).
 917         if (numConstraintsThreadsAlloc_ > 0)
 918         {
 919             freeDeviceBuffer(&kernelParams_.d_constraints);
 920             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 921
 922             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 923             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 924             freeDeviceBuffer(&kernelParams_.d_massFactors);
 925             freeDeviceBuffer(&kernelParams_.d_matrixA);
 926         }
 927
 928         numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
 929
 930         allocateDeviceBuffer(
 931                 &kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, deviceContext_);
 932         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 933                              kernelParams_.numConstraintsThreads,
 934                              deviceContext_);
 935
 936         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 937                              kernelParams_.numConstraintsThreads,
 938                              deviceContext_);
 939         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
 940                              maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 941                              deviceContext_);
 942         allocateDeviceBuffer(&kernelParams_.d_massFactors,
 943                              maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 944                              deviceContext_);
 945         allocateDeviceBuffer(&kernelParams_.d_matrixA,
 946                              maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 947                              deviceContext_);
 948     }
 949
 950     // (Re)allocate the memory, if the number of atoms has increased.
 951     if (numAtoms > numAtomsAlloc_)
 952     {
 953         if (numAtomsAlloc_ > 0)
 954         {
 955             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 956         }
 957         numAtomsAlloc_ = numAtoms;
 958         allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
 959     }
 960
 961     // Copy data to GPU.
 962     copyToDeviceBuffer(&kernelParams_.d_constraints,
 963                        constraintsHost.data(),
 964                        0,
 965                        kernelParams_.numConstraintsThreads,
 966                        deviceStream_,
 967                        GpuApiCallBehavior::Sync,
 968                        nullptr);
 969     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 970                        constraintsTargetLengthsHost.data(),
 971                        0,
 972                        kernelParams_.numConstraintsThreads,
 973                        deviceStream_,
 974                        GpuApiCallBehavior::Sync,
 975                        nullptr);
 976     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 977                        coupledConstraintsCountsHost.data(),
 978                        0,
 979                        kernelParams_.numConstraintsThreads,
 980                        deviceStream_,
 981                        GpuApiCallBehavior::Sync,
 982                        nullptr);
 983     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
 984                        coupledConstraintsIndicesHost.data(),
 985                        0,
 986                        maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 987                        deviceStream_,
 988                        GpuApiCallBehavior::Sync,
 989                        nullptr);
 990     copyToDeviceBuffer(&kernelParams_.d_massFactors,
 991                        massFactorsHost.data(),
 992                        0,
 993                        maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 994                        deviceStream_,
 995                        GpuApiCallBehavior::Sync,
 996                        nullptr);
 997
 998     GMX_RELEASE_ASSERT(invmass != nullptr, "Masses of atoms should be specified.\n");
 999     copyToDeviceBuffer(
1000             &kernelParams_.d_inverseMasses, invmass, 0, numAtoms, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
1001 }
1002
1003 } // namespace gmx