src/gromacs/mdlib/lincs_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements LINCS using CUDA
  38  *
  39  * This file contains implementation of LINCS constraints algorithm
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \note Management of periodic boundary should be unified with SETTLE and
  44  *       removed from here.
  45  * \todo Reconsider naming, i.e. "cuda" suffics should be changed to "gpu".
  46  *
  47  * \author Artem Zhmurov <zhmurov@gmail.com>
  48  * \author Alan Gray <alang@nvidia.com>
  49  *
  50  * \ingroup module_mdlib
  51  */
  52 #include "gmxpre.h"
  53
  54 #include "lincs_cuda.cuh"
  55
  56 #include <assert.h>
  57 #include <stdio.h>
  58
  59 #include <cmath>
  60
  61 #include <algorithm>
  62
  63 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  64 #include "gromacs/gpu_utils/cudautils.cuh"
  65 #include "gromacs/gpu_utils/devicebuffer.cuh"
  66 #include "gromacs/gpu_utils/gputraits.cuh"
  67 #include "gromacs/gpu_utils/vectype_ops.cuh"
  68 #include "gromacs/math/vec.h"
  69 #include "gromacs/mdlib/constr.h"
  70 #include "gromacs/pbcutil/pbc.h"
  71 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  72 #include "gromacs/topology/ifunc.h"
  73 #include "gromacs/topology/topology.h"
  74
  75 namespace gmx
  76 {
  77
  78 //! Number of CUDA threads in a block
  79 constexpr static int c_threadsPerBlock = 256;
  80 //! Maximum number of threads in a block (for __launch_bounds__)
  81 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  82
  83 /*! \brief Main kernel for LINCS constraints.
  84  *
  85  * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
  86  *
  87  * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
  88  * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
  89  * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
  90  * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
  91  * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
  92  * there is no need to synchronize across the device. However, extensive communication in a thread block
  93  * are still needed.
  94  *
  95  * \todo Reduce synchronization overhead. Some ideas are:
  96  *        1. Consider going to warp-level synchronization for the coupled constraints.
  97  *        2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
  98  *           the device level).
  99  *        3. Use analytical solution for matrix A inversion.
 100  *        4. Introduce mapping of thread id to both single constraint and single atom, thus designating
 101  *           Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
 102  *       See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885)
 103  * \todo The use of __restrict__  for gm_xp and gm_v causes failure, probably because of the atomic
 104          operations. Investigate this issue further.
 105  *
 106  * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in single struct.
 107  * \param[in]     invdt         Inverse timestep (needed to update velocities).
 108  */
 109 template<bool updateVelocities, bool computeVirial>
 110 __launch_bounds__(c_maxThreadsPerBlock) __global__
 111         void lincs_kernel(LincsCudaKernelParameters kernelParams,
 112                           const float3* __restrict__ gm_x,
 113                           float3*     gm_xp,
 114                           float3*     gm_v,
 115                           const float invdt)
 116 {
 117     const PbcAiuc pbcAiuc                                 = kernelParams.pbcAiuc;
 118     const int     numConstraintsThreads                   = kernelParams.numConstraintsThreads;
 119     const int     numIterations                           = kernelParams.numIterations;
 120     const int     expansionOrder                          = kernelParams.expansionOrder;
 121     const int2* __restrict__ gm_constraints               = kernelParams.d_constraints;
 122     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
 123     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
 124     const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices;
 125     const float* __restrict__ gm_massFactors           = kernelParams.d_massFactors;
 126     float* __restrict__ gm_matrixA                     = kernelParams.d_matrixA;
 127     const float* __restrict__ gm_inverseMasses         = kernelParams.d_inverseMasses;
 128     float* __restrict__ gm_virialScaled                = kernelParams.d_virialScaled;
 129
 130     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 131
 132     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
 133     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
 134     assert(threadIndex < numConstraintsThreads);
 135
 136     // Vectors connecting constrained atoms before algorithm was applied.
 137     // Needed to construct constrain matrix A
 138     extern __shared__ float3 sm_r[];
 139
 140     int2 pair = gm_constraints[threadIndex];
 141     int  i    = pair.x;
 142     int  j    = pair.y;
 143
 144     // Mass-scaled Lagrange multiplier
 145     float lagrangeScaled = 0.0f;
 146
 147     float targetLength;
 148     float inverseMassi;
 149     float inverseMassj;
 150     float sqrtReducedMass;
 151
 152     float3 xi;
 153     float3 xj;
 154     float3 rc;
 155
 156     // i == -1 indicates dummy constraint at the end of the thread block.
 157     bool isDummyThread = (i == -1);
 158
 159     // Everything computed for these dummies will be equal to zero
 160     if (isDummyThread)
 161     {
 162         targetLength    = 0.0f;
 163         inverseMassi    = 0.0f;
 164         inverseMassj    = 0.0f;
 165         sqrtReducedMass = 0.0f;
 166
 167         xi = make_float3(0.0f, 0.0f, 0.0f);
 168         xj = make_float3(0.0f, 0.0f, 0.0f);
 169         rc = make_float3(0.0f, 0.0f, 0.0f);
 170     }
 171     else
 172     {
 173         // Collecting data
 174         targetLength    = gm_constraintsTargetLengths[threadIndex];
 175         inverseMassi    = gm_inverseMasses[i];
 176         inverseMassj    = gm_inverseMasses[j];
 177         sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
 178
 179         xi = gm_x[i];
 180         xj = gm_x[j];
 181
 182         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 183
 184         float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z);
 185         rc         = rlen * dx;
 186     }
 187
 188     sm_r[threadIdx.x] = rc;
 189     // Make sure that all r's are saved into shared memory
 190     // before they are accessed in the loop below
 191     __syncthreads();
 192
 193     /*
 194      * Constructing LINCS matrix (A)
 195      */
 196
 197     // Only non-zero values are saved (for coupled constraints)
 198     int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
 199     for (int n = 0; n < coupledConstraintsCount; n++)
 200     {
 201         int index = n * numConstraintsThreads + threadIndex;
 202         int c1    = gm_coupledConstraintsIdxes[index];
 203
 204         float3 rc1        = sm_r[c1 - blockIdx.x * blockDim.x];
 205         gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
 206     }
 207
 208     // Skipping in dummy threads
 209     if (!isDummyThread)
 210     {
 211         xi = gm_xp[i];
 212         xj = gm_xp[j];
 213     }
 214
 215     float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 216
 217     float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength);
 218
 219     /*
 220      *  Inverse matrix using a set of expansionOrder matrix multiplications
 221      */
 222
 223     // This will use the same memory space as sm_r, which is no longer needed.
 224     extern __shared__ float sm_rhs[];
 225     // Save current right-hand-side vector in the shared memory
 226     sm_rhs[threadIdx.x] = sol;
 227
 228     for (int rec = 0; rec < expansionOrder; rec++)
 229     {
 230         // Making sure that all sm_rhs are saved before they are accessed in a loop below
 231         __syncthreads();
 232         float mvb = 0.0f;
 233
 234         for (int n = 0; n < coupledConstraintsCount; n++)
 235         {
 236             int index = n * numConstraintsThreads + threadIndex;
 237             int c1    = gm_coupledConstraintsIdxes[index];
 238             // Convolute current right-hand-side with A
 239             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
 240             mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
 241         }
 242         // 'Switch' rhs vectors, save current result
 243         // These values will be accessed in the loop above during the next iteration.
 244         sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 245         sol                                                = sol + mvb;
 246     }
 247
 248     // Current mass-scaled Lagrange multipliers
 249     lagrangeScaled = sqrtReducedMass * sol;
 250
 251     // Save updated coordinates before correction for the rotational lengthening
 252     float3 tmp = rc * lagrangeScaled;
 253
 254     // Writing for all but dummy constraints
 255     if (!isDummyThread)
 256     {
 257         atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 258         atomicAdd(&gm_xp[j], tmp * inverseMassj);
 259     }
 260
 261     /*
 262      *  Correction for centripetal effects
 263      */
 264     for (int iter = 0; iter < numIterations; iter++)
 265     {
 266         // Make sure that all xp's are saved: atomic operation calls before are
 267         // communicating current xp[..] values across thread block.
 268         __syncthreads();
 269
 270         if (!isDummyThread)
 271         {
 272             xi = gm_xp[i];
 273             xj = gm_xp[j];
 274         }
 275
 276         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 277
 278         float len2  = targetLength * targetLength;
 279         float dlen2 = 2.0f * len2 - norm2(dx);
 280
 281         // TODO A little bit more effective but slightly less readable version of the below would be:
 282         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
 283         float proj;
 284         if (dlen2 > 0.0f)
 285         {
 286             proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
 287         }
 288         else
 289         {
 290             proj = sqrtReducedMass * targetLength;
 291         }
 292
 293         sm_rhs[threadIdx.x] = proj;
 294         float sol           = proj;
 295
 296         /*
 297          * Same matrix inversion as above is used for updated data
 298          */
 299         for (int rec = 0; rec < expansionOrder; rec++)
 300         {
 301             // Make sure that all elements of rhs are saved into shared memory
 302             __syncthreads();
 303             float mvb = 0;
 304
 305             for (int n = 0; n < coupledConstraintsCount; n++)
 306             {
 307                 int index = n * numConstraintsThreads + threadIndex;
 308                 int c1    = gm_coupledConstraintsIdxes[index];
 309
 310                 mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
 311             }
 312             sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 313             sol                                                = sol + mvb;
 314         }
 315
 316         // Add corrections to Lagrange multipliers
 317         float sqrtmu_sol = sqrtReducedMass * sol;
 318         lagrangeScaled += sqrtmu_sol;
 319
 320         // Save updated coordinates for the next iteration
 321         // Dummy constraints are skipped
 322         if (!isDummyThread)
 323         {
 324             float3 tmp = rc * sqrtmu_sol;
 325             atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 326             atomicAdd(&gm_xp[j], tmp * inverseMassj);
 327         }
 328     }
 329
 330     // Updating particle velocities for all but dummy threads
 331     if (updateVelocities && !isDummyThread)
 332     {
 333         float3 tmp = rc * invdt * lagrangeScaled;
 334         atomicAdd(&gm_v[i], -tmp * inverseMassi);
 335         atomicAdd(&gm_v[j], tmp * inverseMassj);
 336     }
 337
 338
 339     if (computeVirial)
 340     {
 341         // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
 342         // (targetLength) and the normalized vector connecting constrained atoms before
 343         // the algorithm was applied (rc). The evaluation of virial in each thread is
 344         // followed by basic reduction for the values inside single thread block.
 345         // Then, the values are reduced across grid by atomicAdd(...).
 346         //
 347         // TODO Shuffle reduction.
 348         // TODO Should be unified and/or done once when virial is actually needed.
 349         // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
 350         //      one that works for any datatype.
 351
 352         // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
 353         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
 354         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
 355         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
 356         // two are no longer in use.
 357         extern __shared__ float sm_threadVirial[];
 358         float                   mult                  = targetLength * lagrangeScaled;
 359         sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
 360         sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y;
 361         sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z;
 362         sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y;
 363         sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z;
 364         sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z;
 365
 366         __syncthreads();
 367
 368         // Reduce up to one virial per thread block. All blocks are divided by half, the first
 369         // half of threads sums two virials. Then the first half is divided by two and the first
 370         // half of it sums two values. This procedure is repeated until only one thread is left.
 371         // Only works if the threads per blocks is a power of two (hence static_assert
 372         // in the beginning of the kernel).
 373         for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
 374         {
 375             int dividedAt = blockDim.x / divideBy;
 376             if (static_cast<int>(threadIdx.x) < dividedAt)
 377             {
 378                 for (int d = 0; d < 6; d++)
 379                 {
 380                     sm_threadVirial[d * blockDim.x + threadIdx.x] +=
 381                             sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
 382                 }
 383             }
 384             // Syncronize if not within one warp
 385             if (dividedAt > warpSize / 2)
 386             {
 387                 __syncthreads();
 388             }
 389         }
 390         // First 6 threads in the block add the results of 6 tensor components to the global memory address.
 391         if (threadIdx.x < 6)
 392         {
 393             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
 394         }
 395     }
 396
 397     return;
 398 }
 399
 400 /*! \brief Select templated kernel.
 401  *
 402  * Returns pointer to a CUDA kernel based on provided booleans.
 403  *
 404  * \param[in] updateVelocities  If the velocities should be constrained.
 405  * \param[in] computeVirial     If virial should be updated.
 406  *
 407  * \return                      Pointer to CUDA kernel
 408  */
 409 inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
 410 {
 411
 412     auto kernelPtr = lincs_kernel<true, true>;
 413     if (updateVelocities && computeVirial)
 414     {
 415         kernelPtr = lincs_kernel<true, true>;
 416     }
 417     else if (updateVelocities && !computeVirial)
 418     {
 419         kernelPtr = lincs_kernel<true, false>;
 420     }
 421     else if (!updateVelocities && computeVirial)
 422     {
 423         kernelPtr = lincs_kernel<false, true>;
 424     }
 425     else if (!updateVelocities && !computeVirial)
 426     {
 427         kernelPtr = lincs_kernel<false, false>;
 428     }
 429     return kernelPtr;
 430 }
 431
 432 void LincsCuda::apply(const float3* d_x,
 433                       float3*       d_xp,
 434                       const bool    updateVelocities,
 435                       float3*       d_v,
 436                       const real    invdt,
 437                       const bool    computeVirial,
 438                       tensor        virialScaled)
 439 {
 440     ensureNoPendingCudaError("In CUDA version of LINCS");
 441
 442     // Early exit if no constraints
 443     if (kernelParams_.numConstraintsThreads == 0)
 444     {
 445         return;
 446     }
 447
 448     if (computeVirial)
 449     {
 450         // Fill with zeros so the values can be reduced to it
 451         // Only 6 values are needed because virial is symmetrical
 452         clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, commandStream_);
 453     }
 454
 455     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
 456
 457     KernelLaunchConfig config;
 458     config.blockSize[0] = c_threadsPerBlock;
 459     config.blockSize[1] = 1;
 460     config.blockSize[2] = 1;
 461     config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
 462     config.gridSize[1] = 1;
 463     config.gridSize[2] = 1;
 464
 465     // Shared memory is used to store:
 466     // -- Current coordinates (3 floats per thread)
 467     // -- Right-hand-sides for matrix inversion (2 floats per thread)
 468     // -- Virial tensor components (6 floats per thread)
 469     // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
 470     // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
 471     // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
 472     if (computeVirial)
 473     {
 474         config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
 475     }
 476     else
 477     {
 478         config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
 479     }
 480     config.stream = commandStream_;
 481
 482     const auto kernelArgs =
 483             prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
 484
 485     launchGpuKernel(kernelPtr, config, nullptr, "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
 486
 487     if (computeVirial)
 488     {
 489         // Copy LINCS virial data and add it to the common virial
 490         copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
 491                              commandStream_, GpuApiCallBehavior::Sync, nullptr);
 492
 493         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
 494         virialScaled[XX][XX] += h_virialScaled_[0];
 495         virialScaled[XX][YY] += h_virialScaled_[1];
 496         virialScaled[XX][ZZ] += h_virialScaled_[2];
 497
 498         virialScaled[YY][XX] += h_virialScaled_[1];
 499         virialScaled[YY][YY] += h_virialScaled_[3];
 500         virialScaled[YY][ZZ] += h_virialScaled_[4];
 501
 502         virialScaled[ZZ][XX] += h_virialScaled_[2];
 503         virialScaled[ZZ][YY] += h_virialScaled_[4];
 504         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
 505     }
 506
 507     return;
 508 }
 509
 510 LincsCuda::LincsCuda(int numIterations, int expansionOrder, CommandStream commandStream) :
 511     commandStream_(commandStream)
 512 {
 513     kernelParams_.numIterations  = numIterations;
 514     kernelParams_.expansionOrder = expansionOrder;
 515
 516     static_assert(sizeof(real) == sizeof(float),
 517                   "Real numbers should be in single precision in GPU code.");
 518     static_assert(
 519             c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
 520             "Number of threads per block should be a power of two in order for reduction to work.");
 521
 522     allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
 523     h_virialScaled_.resize(6);
 524
 525     // The data arrays should be expanded/reallocated on first call of set() function.
 526     numConstraintsThreadsAlloc_ = 0;
 527     numAtomsAlloc_              = 0;
 528 }
 529
 530 LincsCuda::~LincsCuda()
 531 {
 532     freeDeviceBuffer(&kernelParams_.d_virialScaled);
 533
 534     if (numConstraintsThreadsAlloc_ > 0)
 535     {
 536         freeDeviceBuffer(&kernelParams_.d_constraints);
 537         freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 538
 539         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 540         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 541         freeDeviceBuffer(&kernelParams_.d_massFactors);
 542         freeDeviceBuffer(&kernelParams_.d_matrixA);
 543     }
 544     if (numAtomsAlloc_ > 0)
 545     {
 546         freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 547     }
 548 }
 549
 550 //! Helper type for discovering coupled constraints
 551 struct AtomsAdjacencyListElement
 552 {
 553     AtomsAdjacencyListElement(const int indexOfSecondConstrainedAtom,
 554                               const int indexOfConstraint,
 555                               const int signFactor) :
 556         indexOfSecondConstrainedAtom_(indexOfSecondConstrainedAtom),
 557         indexOfConstraint_(indexOfConstraint),
 558         signFactor_(signFactor)
 559     {
 560     }
 561     //! The index of the other atom constrained to this atom.
 562     int indexOfSecondConstrainedAtom_;
 563     //! The index of this constraint in the container of constraints.
 564     int indexOfConstraint_;
 565     /*! \brief A multiplicative factor that indicates the relative
 566      * order of the atoms in the atom list.
 567      *
 568      * Used for computing the mass factor of this constraint
 569      * relative to any coupled constraints. */
 570     int signFactor_;
 571 };
 572 //! Constructs and returns an atom constraint adjacency list
 573 static std::vector<std::vector<AtomsAdjacencyListElement>>
 574 constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
 575 {
 576     const int                                           stride         = 1 + NRAL(F_CONSTR);
 577     const int                                           numConstraints = iatoms.ssize() / stride;
 578     std::vector<std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList(numAtoms);
 579     for (int c = 0; c < numConstraints; c++)
 580     {
 581         int a1 = iatoms[stride * c + 1];
 582         int a2 = iatoms[stride * c + 2];
 583
 584         // Each constraint will be represented as a tuple, containing index of the second
 585         // constrained atom, index of the constraint and a sign that indicates the order of atoms in
 586         // which they are listed. Sign is needed to compute the mass factors.
 587         atomsAdjacencyList[a1].emplace_back(a2, c, +1);
 588         atomsAdjacencyList[a2].emplace_back(a1, c, -1);
 589     }
 590
 591     return atomsAdjacencyList;
 592 }
 593
 594 /*! \brief Helper function to go through constraints recursively.
 595  *
 596  *  For each constraint, counts the number of coupled constraints and stores the value in \p numCoupledConstraints array.
 597  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
 598  *  coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
 599  *  value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
 600  *  after it in the constraints array.
 601  *
 602  * \param[in]     a                   Atom index.
 603  * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
 604  *                                    the number of constraints (i) connected to it and (ii) located
 605  *                                    after it in memory. This array is filled by this recursive function.
 606  *                                    For a set of coupled constraints, only for the first one in this list
 607  *                                    the number of consecutive coupled constraints is needed: if there is
 608  *                                    not enough space for this set of constraints in the thread block,
 609  *                                    the group has to be moved to the next one.
 610  * \param[in]     atomsAdjacencyList  Stores information about connections between atoms.
 611  */
 612 inline int countCoupled(int           a,
 613                         ArrayRef<int> numCoupledConstraints,
 614                         ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
 615
 616 {
 617     int counted = 0;
 618     for (const auto& adjacentAtom : atomsAdjacencyList[a])
 619     {
 620         const int c2 = adjacentAtom.indexOfConstraint_;
 621         if (numCoupledConstraints[c2] == -1)
 622         {
 623             numCoupledConstraints[c2] = 0; // To indicate we've been here
 624             counted += 1
 625                        + countCoupled(adjacentAtom.indexOfSecondConstrainedAtom_,
 626                                       numCoupledConstraints, atomsAdjacencyList);
 627         }
 628     }
 629     return counted;
 630 }
 631
 632 /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
 633  *
 634  *  Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
 635  *  if it was not yet added. Then goes through all the constraints coupled to \p c
 636  *  and calls itself recursively. This ensures that all the coupled constraints will
 637  *  be added to neighboring locations in the final data structures on the device,
 638  *  hence mapping all coupled constraints to the same thread block. A value of -1 in
 639  *  the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
 640  *
 641  * \param[in]     iatoms              The list of constraints.
 642  * \param[in]     stride              Number of elements per constraint in \p iatoms.
 643  * \param[in]     atomsAdjacencyList  Information about connections between atoms.
 644  * \param[our]    splitMap            Map of sequential constraint indexes to indexes to be on the device
 645  * \param[in]     c                   Sequential index for constraint to consider adding.
 646  * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
 647  */
 648 inline void addWithCoupled(ArrayRef<const int>                                    iatoms,
 649                            const int                                              stride,
 650                            ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList,
 651                            ArrayRef<int>                                          splitMap,
 652                            const int                                              c,
 653                            int*                                                   currentMapIndex)
 654 {
 655     if (splitMap[c] == -1)
 656     {
 657         splitMap[c] = *currentMapIndex;
 658         (*currentMapIndex)++;
 659
 660         // Constraints, coupled through both atoms.
 661         for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
 662         {
 663             const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
 664             for (const auto& adjacentAtom : atomsAdjacencyList[a1])
 665             {
 666                 const int c2 = adjacentAtom.indexOfConstraint_;
 667                 if (c2 != c)
 668                 {
 669                     addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
 670                 }
 671             }
 672         }
 673     }
 674 }
 675
 676 /*! \brief Computes and returns how many constraints are coupled to each constraint
 677  *
 678  * Needed to introduce splits in data so that all coupled constraints will be computed in a
 679  * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
 680  * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
 681  * first index of the connected group of the constraints is needed later in the code, hence the
 682  * numCoupledConstraints vector is also used to keep track if the constrain was already counted.
 683  */
 684 static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
 685                                                    ArrayRef<const std::vector<AtomsAdjacencyListElement>> atomsAdjacencyList)
 686 {
 687     const int        stride         = 1 + NRAL(F_CONSTR);
 688     const int        numConstraints = iatoms.ssize() / stride;
 689     std::vector<int> numCoupledConstraints(numConstraints, -1);
 690     for (int c = 0; c < numConstraints; c++)
 691     {
 692         const int a1 = iatoms[stride * c + 1];
 693         const int a2 = iatoms[stride * c + 2];
 694         if (numCoupledConstraints[c] == -1)
 695         {
 696             numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
 697                                        + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
 698         }
 699     }
 700
 701     return numCoupledConstraints;
 702 }
 703
 704 bool LincsCuda::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
 705 {
 706     for (const gmx_moltype_t& molType : mtop.moltype)
 707     {
 708         ArrayRef<const int> iatoms    = molType.ilist[F_CONSTR].iatoms;
 709         const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
 710         // Compute, how many constraints are coupled to each constraint
 711         const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
 712         for (const int numCoupled : numCoupledConstraints)
 713         {
 714             if (numCoupled > c_threadsPerBlock)
 715             {
 716                 return false;
 717             }
 718         }
 719     }
 720
 721     return true;
 722 }
 723
 724 void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
 725 {
 726     int numAtoms = md.nr;
 727     // List of constrained atoms (CPU memory)
 728     std::vector<int2> constraintsHost;
 729     // Equilibrium distances for the constraints (CPU)
 730     std::vector<float> constraintsTargetLengthsHost;
 731     // Number of constraints, coupled with the current one (CPU)
 732     std::vector<int> coupledConstraintsCountsHost;
 733     // List of coupled with the current one (CPU)
 734     std::vector<int> coupledConstraintsIndicesHost;
 735     // Mass factors (CPU)
 736     std::vector<float> massFactorsHost;
 737
 738     // List of constrained atoms in local topology
 739     gmx::ArrayRef<const int> iatoms =
 740             constArrayRefFromArray(idef.il[F_CONSTR].iatoms, idef.il[F_CONSTR].nr);
 741     const int stride         = NRAL(F_CONSTR) + 1;
 742     const int numConstraints = idef.il[F_CONSTR].nr / stride;
 743
 744     // Early exit if no constraints
 745     if (numConstraints == 0)
 746     {
 747         kernelParams_.numConstraintsThreads = 0;
 748         return;
 749     }
 750
 751     // Construct the adjacency list, a useful intermediate structure
 752     const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
 753
 754     // Compute, how many constraints are coupled to each constraint
 755     const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
 756
 757     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
 758     // takes into account the empty spaces which might be needed in the end of each thread block.
 759     std::vector<int> splitMap(numConstraints, -1);
 760     int              currentMapIndex = 0;
 761     for (int c = 0; c < numConstraints; c++)
 762     {
 763         // Check if coupled constraints all fit in one block
 764         if (numCoupledConstraints.at(c) > c_threadsPerBlock)
 765         {
 766             gmx_fatal(FARGS,
 767                       "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
 768                       "thread block (%d). Most likely, you are trying to use the GPU version of "
 769                       "LINCS with constraints on all-bonds, which is not supported for large "
 770                       "molecules. When compatible with the force field and integration settings, "
 771                       "using constraints on H-bonds only.",
 772                       numCoupledConstraints.at(c), c_threadsPerBlock);
 773         }
 774         if (currentMapIndex / c_threadsPerBlock
 775             != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
 776         {
 777             currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
 778         }
 779         addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
 780     }
 781
 782     kernelParams_.numConstraintsThreads =
 783             currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
 784     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
 785                        "Number of threads should be a multiple of the block size");
 786
 787     // Initialize constraints and their target indexes taking into account the splits in the data arrays.
 788     int2 pair;
 789     pair.x = -1;
 790     pair.y = -1;
 791     constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
 792     std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
 793     constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
 794     std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
 795     for (int c = 0; c < numConstraints; c++)
 796     {
 797         int a1   = iatoms[stride * c + 1];
 798         int a2   = iatoms[stride * c + 2];
 799         int type = iatoms[stride * c];
 800
 801         int2 pair;
 802         pair.x                                          = a1;
 803         pair.y                                          = a2;
 804         constraintsHost.at(splitMap.at(c))              = pair;
 805         constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
 806     }
 807
 808     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
 809     // We map a single thread to a single constraint, hence each thread 'c' will be using one
 810     // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
 811     // to the constraint 'c'. The coupled constraints indexes are placed into the
 812     // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
 813     // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
 814     // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
 815     // array --- a number, greater then total number of constraints, taking into account the splits
 816     // in the constraints array due to the GPU block borders. This number can be adjusted to improve
 817     // memory access pattern. Mass factors are saved in a similar data structure.
 818     int maxCoupledConstraints = 0;
 819     for (int c = 0; c < numConstraints; c++)
 820     {
 821         int a1 = iatoms[stride * c + 1];
 822         int a2 = iatoms[stride * c + 2];
 823
 824         // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
 825         int nCoupedConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
 826
 827         if (nCoupedConstraints > maxCoupledConstraints)
 828         {
 829             maxCoupledConstraints = nCoupedConstraints;
 830         }
 831     }
 832
 833     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
 834     coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 835     massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 836
 837     for (int c1 = 0; c1 < numConstraints; c1++)
 838     {
 839         coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0;
 840         int c1a1                                         = iatoms[stride * c1 + 1];
 841         int c1a2                                         = iatoms[stride * c1 + 2];
 842
 843         // Constraints, coupled through the first atom.
 844         int c2a1 = c1a1;
 845         for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a1])
 846         {
 847             int c2 = atomAdjacencyList.indexOfConstraint_;
 848
 849             if (c1 != c2)
 850             {
 851                 int c2a2  = atomAdjacencyList.indexOfSecondConstrainedAtom_;
 852                 int sign  = atomAdjacencyList.signFactor_;
 853                 int index = kernelParams_.numConstraintsThreads
 854                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
 855                             + splitMap.at(c1);
 856
 857                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 858
 859                 int center = c1a1;
 860
 861                 float sqrtmu1 = 1.0 / sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 862                 float sqrtmu2 = 1.0 / sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 863
 864                 massFactorsHost.at(index) = -sign * md.invmass[center] * sqrtmu1 * sqrtmu2;
 865
 866                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 867             }
 868         }
 869
 870         // Constraints, coupled through the second atom.
 871         c2a1 = c1a2;
 872         for (const auto& atomAdjacencyList : atomsAdjacencyList[c1a2])
 873         {
 874             int c2 = atomAdjacencyList.indexOfConstraint_;
 875
 876             if (c1 != c2)
 877             {
 878                 int c2a2  = atomAdjacencyList.indexOfSecondConstrainedAtom_;
 879                 int sign  = atomAdjacencyList.signFactor_;
 880                 int index = kernelParams_.numConstraintsThreads
 881                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
 882                             + splitMap.at(c1);
 883
 884                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 885
 886                 int center = c1a2;
 887
 888                 float sqrtmu1 = 1.0 / sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 889                 float sqrtmu2 = 1.0 / sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 890
 891                 massFactorsHost.at(index) = sign * md.invmass[center] * sqrtmu1 * sqrtmu2;
 892
 893                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 894             }
 895         }
 896     }
 897
 898     // (Re)allocate the memory, if the number of constraints has increased.
 899     if (kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_)
 900     {
 901         // Free memory if it was allocated before (i.e. if not the first time here).
 902         if (numConstraintsThreadsAlloc_ > 0)
 903         {
 904             freeDeviceBuffer(&kernelParams_.d_constraints);
 905             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 906
 907             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 908             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 909             freeDeviceBuffer(&kernelParams_.d_massFactors);
 910             freeDeviceBuffer(&kernelParams_.d_matrixA);
 911         }
 912
 913         numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
 914
 915         allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
 916         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 917                              kernelParams_.numConstraintsThreads, nullptr);
 918
 919         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 920                              kernelParams_.numConstraintsThreads, nullptr);
 921         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
 922                              maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
 923         allocateDeviceBuffer(&kernelParams_.d_massFactors,
 924                              maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
 925         allocateDeviceBuffer(&kernelParams_.d_matrixA,
 926                              maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
 927     }
 928
 929     // (Re)allocate the memory, if the number of atoms has increased.
 930     if (numAtoms > numAtomsAlloc_)
 931     {
 932         if (numAtomsAlloc_ > 0)
 933         {
 934             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 935         }
 936         numAtomsAlloc_ = numAtoms;
 937         allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr);
 938     }
 939
 940     // Copy data to GPU.
 941     copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0,
 942                        kernelParams_.numConstraintsThreads, commandStream_,
 943                        GpuApiCallBehavior::Sync, nullptr);
 944     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 945                        constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads,
 946                        commandStream_, GpuApiCallBehavior::Sync, nullptr);
 947     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 948                        coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads,
 949                        commandStream_, GpuApiCallBehavior::Sync, nullptr);
 950     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
 951                        0, maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 952                        commandStream_, GpuApiCallBehavior::Sync, nullptr);
 953     copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0,
 954                        maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
 955                        GpuApiCallBehavior::Sync, nullptr);
 956
 957     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
 958     copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
 959                        GpuApiCallBehavior::Sync, nullptr);
 960 }
 961
 962 void LincsCuda::setPbc(const t_pbc* pbc)
 963 {
 964     setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
 965 }
 966
 967 } // namespace gmx