src/gromacs/mdlib/lincs_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements LINCS using CUDA
  38  *
  39  * This file contains implementation of LINCS constraints algorithm
  40  * using CUDA, including class initialization, data-structures management
  41  * and GPU kernel.
  42  *
  43  * \note Management of periodic boundary should be unified with SETTLE and
  44  *       removed from here.
  45  * \todo Reconsider naming, i.e. "cuda" suffics should be changed to "gpu".
  46  *
  47  * \author Artem Zhmurov <zhmurov@gmail.com>
  48  * \author Alan Gray <alang@nvidia.com>
  49  *
  50  * \ingroup module_mdlib
  51  */
  52 #include "gmxpre.h"
  53
  54 #include "lincs_cuda.cuh"
  55
  56 #include <assert.h>
  57 #include <stdio.h>
  58
  59 #include <cmath>
  60
  61 #include <algorithm>
  62
  63 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  64 #include "gromacs/gpu_utils/cudautils.cuh"
  65 #include "gromacs/gpu_utils/devicebuffer.cuh"
  66 #include "gromacs/gpu_utils/gputraits.cuh"
  67 #include "gromacs/gpu_utils/vectype_ops.cuh"
  68 #include "gromacs/math/vec.h"
  69 #include "gromacs/mdlib/constr.h"
  70 #include "gromacs/pbcutil/pbc.h"
  71 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  72 #include "gromacs/topology/ifunc.h"
  73
  74 namespace gmx
  75 {
  76
  77 //! Number of CUDA threads in a block
  78 constexpr static int c_threadsPerBlock = 256;
  79 //! Maximum number of threads in a block (for __launch_bounds__)
  80 constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  81
  82 /*! \brief Main kernel for LINCS constraints.
  83  *
  84  * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
  85  *
  86  * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are
  87  * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved
  88  * by moving active threads to the next block, if the correspondent group of coupled constraints is to big
  89  * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e.
  90  * threads that are not required to do actual work. Since constraints from different blocks are not coupled,
  91  * there is no need to synchronize across the device. However, extensive communication in a thread block
  92  * are still needed.
  93  *
  94  * \todo Reduce synchronization overhead. Some ideas are:
  95  *        1. Consider going to warp-level synchronization for the coupled constraints.
  96  *        2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
  97  *           the device level).
  98  *        3. Use analytical solution for matrix A inversion.
  99  *        4. Introduce mapping of thread id to both single constraint and single atom, thus designating
 100  *           Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
 101  *       See Redmine issue #2885 for details (https://redmine.gromacs.org/issues/2885)
 102  * \todo The use of __restrict__  for gm_xp and gm_v causes failure, probably because of the atomic
 103          operations. Investigate this issue further.
 104  *
 105  * \param[in,out] kernelParams  All parameters and pointers for the kernel condensed in single struct.
 106  * \param[in]     invdt         Inverse timestep (needed to update velocities).
 107  */
 108 template<bool updateVelocities, bool computeVirial>
 109 __launch_bounds__(c_maxThreadsPerBlock) __global__
 110         void lincs_kernel(LincsCudaKernelParameters kernelParams,
 111                           const float3* __restrict__ gm_x,
 112                           float3*     gm_xp,
 113                           float3*     gm_v,
 114                           const float invdt)
 115 {
 116     const PbcAiuc pbcAiuc                                 = kernelParams.pbcAiuc;
 117     const int     numConstraintsThreads                   = kernelParams.numConstraintsThreads;
 118     const int     numIterations                           = kernelParams.numIterations;
 119     const int     expansionOrder                          = kernelParams.expansionOrder;
 120     const int2* __restrict__ gm_constraints               = kernelParams.d_constraints;
 121     const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths;
 122     const int* __restrict__ gm_coupledConstraintsCounts   = kernelParams.d_coupledConstraintsCounts;
 123     const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices;
 124     const float* __restrict__ gm_massFactors           = kernelParams.d_massFactors;
 125     float* __restrict__ gm_matrixA                     = kernelParams.d_matrixA;
 126     const float* __restrict__ gm_inverseMasses         = kernelParams.d_inverseMasses;
 127     float* __restrict__ gm_virialScaled                = kernelParams.d_virialScaled;
 128
 129     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
 130
 131     // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize).
 132     // This is to ensure proper synchronizations and reduction. All array are padded to the required size.
 133     assert(threadIndex < numConstraintsThreads);
 134
 135     // Vectors connecting constrained atoms before algorithm was applied.
 136     // Needed to construct constrain matrix A
 137     extern __shared__ float3 sm_r[];
 138
 139     int2 pair = gm_constraints[threadIndex];
 140     int  i    = pair.x;
 141     int  j    = pair.y;
 142
 143     // Mass-scaled Lagrange multiplier
 144     float lagrangeScaled = 0.0f;
 145
 146     float targetLength;
 147     float inverseMassi;
 148     float inverseMassj;
 149     float sqrtReducedMass;
 150
 151     float3 xi;
 152     float3 xj;
 153     float3 rc;
 154
 155     // i == -1 indicates dummy constraint at the end of the thread block.
 156     bool isDummyThread = (i == -1);
 157
 158     // Everything computed for these dummies will be equal to zero
 159     if (isDummyThread)
 160     {
 161         targetLength    = 0.0f;
 162         inverseMassi    = 0.0f;
 163         inverseMassj    = 0.0f;
 164         sqrtReducedMass = 0.0f;
 165
 166         xi = make_float3(0.0f, 0.0f, 0.0f);
 167         xj = make_float3(0.0f, 0.0f, 0.0f);
 168         rc = make_float3(0.0f, 0.0f, 0.0f);
 169     }
 170     else
 171     {
 172         // Collecting data
 173         targetLength    = gm_constraintsTargetLengths[threadIndex];
 174         inverseMassi    = gm_inverseMasses[i];
 175         inverseMassj    = gm_inverseMasses[j];
 176         sqrtReducedMass = rsqrt(inverseMassi + inverseMassj);
 177
 178         xi = gm_x[i];
 179         xj = gm_x[j];
 180
 181         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 182
 183         float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z);
 184         rc         = rlen * dx;
 185     }
 186
 187     sm_r[threadIdx.x] = rc;
 188     // Make sure that all r's are saved into shared memory
 189     // before they are accessed in the loop below
 190     __syncthreads();
 191
 192     /*
 193      * Constructing LINCS matrix (A)
 194      */
 195
 196     // Only non-zero values are saved (for coupled constraints)
 197     int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex];
 198     for (int n = 0; n < coupledConstraintsCount; n++)
 199     {
 200         int index = n * numConstraintsThreads + threadIndex;
 201         int c1    = gm_coupledConstraintsIdxes[index];
 202
 203         float3 rc1        = sm_r[c1 - blockIdx.x * blockDim.x];
 204         gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z);
 205     }
 206
 207     // Skipping in dummy threads
 208     if (!isDummyThread)
 209     {
 210         xi = gm_xp[i];
 211         xj = gm_xp[j];
 212     }
 213
 214     float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 215
 216     float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength);
 217
 218     /*
 219      *  Inverse matrix using a set of expansionOrder matrix multiplications
 220      */
 221
 222     // This will use the same memory space as sm_r, which is no longer needed.
 223     extern __shared__ float sm_rhs[];
 224     // Save current right-hand-side vector in the shared memory
 225     sm_rhs[threadIdx.x] = sol;
 226
 227     for (int rec = 0; rec < expansionOrder; rec++)
 228     {
 229         // Making sure that all sm_rhs are saved before they are accessed in a loop below
 230         __syncthreads();
 231         float mvb = 0.0f;
 232
 233         for (int n = 0; n < coupledConstraintsCount; n++)
 234         {
 235             int index = n * numConstraintsThreads + threadIndex;
 236             int c1    = gm_coupledConstraintsIdxes[index];
 237             // Convolute current right-hand-side with A
 238             // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations
 239             mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
 240         }
 241         // 'Switch' rhs vectors, save current result
 242         // These values will be accessed in the loop above during the next iteration.
 243         sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 244         sol                                                = sol + mvb;
 245     }
 246
 247     // Current mass-scaled Lagrange multipliers
 248     lagrangeScaled = sqrtReducedMass * sol;
 249
 250     // Save updated coordinates before correction for the rotational lengthening
 251     float3 tmp = rc * lagrangeScaled;
 252
 253     // Writing for all but dummy constraints
 254     if (!isDummyThread)
 255     {
 256         atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 257         atomicAdd(&gm_xp[j], tmp * inverseMassj);
 258     }
 259
 260     /*
 261      *  Correction for centripetal effects
 262      */
 263     for (int iter = 0; iter < numIterations; iter++)
 264     {
 265         // Make sure that all xp's are saved: atomic operation calls before are
 266         // communicating current xp[..] values across thread block.
 267         __syncthreads();
 268
 269         if (!isDummyThread)
 270         {
 271             xi = gm_xp[i];
 272             xj = gm_xp[j];
 273         }
 274
 275         float3 dx = pbcDxAiuc(pbcAiuc, xi, xj);
 276
 277         float len2  = targetLength * targetLength;
 278         float dlen2 = 2.0f * len2 - norm2(dx);
 279
 280         // TODO A little bit more effective but slightly less readable version of the below would be:
 281         //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
 282         float proj;
 283         if (dlen2 > 0.0f)
 284         {
 285             proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2));
 286         }
 287         else
 288         {
 289             proj = sqrtReducedMass * targetLength;
 290         }
 291
 292         sm_rhs[threadIdx.x] = proj;
 293         float sol           = proj;
 294
 295         /*
 296          * Same matrix inversion as above is used for updated data
 297          */
 298         for (int rec = 0; rec < expansionOrder; rec++)
 299         {
 300             // Make sure that all elements of rhs are saved into shared memory
 301             __syncthreads();
 302             float mvb = 0;
 303
 304             for (int n = 0; n < coupledConstraintsCount; n++)
 305             {
 306                 int index = n * numConstraintsThreads + threadIndex;
 307                 int c1    = gm_coupledConstraintsIdxes[index];
 308
 309                 mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)];
 310             }
 311             sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb;
 312             sol                                                = sol + mvb;
 313         }
 314
 315         // Add corrections to Lagrange multipliers
 316         float sqrtmu_sol = sqrtReducedMass * sol;
 317         lagrangeScaled += sqrtmu_sol;
 318
 319         // Save updated coordinates for the next iteration
 320         // Dummy constraints are skipped
 321         if (!isDummyThread)
 322         {
 323             float3 tmp = rc * sqrtmu_sol;
 324             atomicAdd(&gm_xp[i], -tmp * inverseMassi);
 325             atomicAdd(&gm_xp[j], tmp * inverseMassj);
 326         }
 327     }
 328
 329     // Updating particle velocities for all but dummy threads
 330     if (updateVelocities && !isDummyThread)
 331     {
 332         float3 tmp = rc * invdt * lagrangeScaled;
 333         atomicAdd(&gm_v[i], -tmp * inverseMassi);
 334         atomicAdd(&gm_v[j], tmp * inverseMassj);
 335     }
 336
 337
 338     if (computeVirial)
 339     {
 340         // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length
 341         // (targetLength) and the normalized vector connecting constrained atoms before
 342         // the algorithm was applied (rc). The evaluation of virial in each thread is
 343         // followed by basic reduction for the values inside single thread block.
 344         // Then, the values are reduced across grid by atomicAdd(...).
 345         //
 346         // TODO Shuffle reduction.
 347         // TODO Should be unified and/or done once when virial is actually needed.
 348         // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
 349         //      one that works for any datatype.
 350
 351         // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only
 352         // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
 353         // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel.
 354         // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter
 355         // two are no longer in use.
 356         extern __shared__ float sm_threadVirial[];
 357         float                   mult                  = targetLength * lagrangeScaled;
 358         sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x;
 359         sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y;
 360         sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z;
 361         sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y;
 362         sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z;
 363         sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z;
 364
 365         __syncthreads();
 366
 367         // Reduce up to one virial per thread block. All blocks are divided by half, the first
 368         // half of threads sums two virials. Then the first half is divided by two and the first
 369         // half of it sums two values. This procedure is repeated until only one thread is left.
 370         // Only works if the threads per blocks is a power of two (hence static_assert
 371         // in the beginning of the kernel).
 372         for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
 373         {
 374             int dividedAt = blockDim.x / divideBy;
 375             if (static_cast<int>(threadIdx.x) < dividedAt)
 376             {
 377                 for (int d = 0; d < 6; d++)
 378                 {
 379                     sm_threadVirial[d * blockDim.x + threadIdx.x] +=
 380                             sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
 381                 }
 382             }
 383             // Syncronize if not within one warp
 384             if (dividedAt > warpSize / 2)
 385             {
 386                 __syncthreads();
 387             }
 388         }
 389         // First 6 threads in the block add the results of 6 tensor components to the global memory address.
 390         if (threadIdx.x < 6)
 391         {
 392             atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
 393         }
 394     }
 395
 396     return;
 397 }
 398
 399 /*! \brief Select templated kernel.
 400  *
 401  * Returns pointer to a CUDA kernel based on provided booleans.
 402  *
 403  * \param[in] updateVelocities  If the velocities should be constrained.
 404  * \param[in] computeVirial     If virial should be updated.
 405  *
 406  * \return                      Pointer to CUDA kernel
 407  */
 408 inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
 409 {
 410
 411     auto kernelPtr = lincs_kernel<true, true>;
 412     if (updateVelocities && computeVirial)
 413     {
 414         kernelPtr = lincs_kernel<true, true>;
 415     }
 416     else if (updateVelocities && !computeVirial)
 417     {
 418         kernelPtr = lincs_kernel<true, false>;
 419     }
 420     else if (!updateVelocities && computeVirial)
 421     {
 422         kernelPtr = lincs_kernel<false, true>;
 423     }
 424     else if (!updateVelocities && !computeVirial)
 425     {
 426         kernelPtr = lincs_kernel<false, false>;
 427     }
 428     return kernelPtr;
 429 }
 430
 431 void LincsCuda::apply(const float3* d_x,
 432                       float3*       d_xp,
 433                       const bool    updateVelocities,
 434                       float3*       d_v,
 435                       const real    invdt,
 436                       const bool    computeVirial,
 437                       tensor        virialScaled)
 438 {
 439     ensureNoPendingCudaError("In CUDA version of LINCS");
 440
 441     // Early exit if no constraints
 442     if (kernelParams_.numConstraintsThreads == 0)
 443     {
 444         return;
 445     }
 446
 447     if (computeVirial)
 448     {
 449         // Fill with zeros so the values can be reduced to it
 450         // Only 6 values are needed because virial is symmetrical
 451         clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, commandStream_);
 452     }
 453
 454     auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
 455
 456     KernelLaunchConfig config;
 457     config.blockSize[0] = c_threadsPerBlock;
 458     config.blockSize[1] = 1;
 459     config.blockSize[2] = 1;
 460     config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
 461     config.gridSize[1] = 1;
 462     config.gridSize[2] = 1;
 463
 464     // Shared memory is used to store:
 465     // -- Current coordinates (3 floats per thread)
 466     // -- Right-hand-sides for matrix inversion (2 floats per thread)
 467     // -- Virial tensor components (6 floats per thread)
 468     // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
 469     // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only
 470     // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
 471     if (computeVirial)
 472     {
 473         config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
 474     }
 475     else
 476     {
 477         config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
 478     }
 479     config.stream = commandStream_;
 480
 481     const auto kernelArgs =
 482             prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
 483
 484     launchGpuKernel(kernelPtr, config, nullptr, "lincs_kernel<updateVelocities, computeVirial>", kernelArgs);
 485
 486     if (computeVirial)
 487     {
 488         // Copy LINCS virial data and add it to the common virial
 489         copyFromDeviceBuffer(h_virialScaled_.data(), &kernelParams_.d_virialScaled, 0, 6,
 490                              commandStream_, GpuApiCallBehavior::Sync, nullptr);
 491
 492         // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
 493         virialScaled[XX][XX] += h_virialScaled_[0];
 494         virialScaled[XX][YY] += h_virialScaled_[1];
 495         virialScaled[XX][ZZ] += h_virialScaled_[2];
 496
 497         virialScaled[YY][XX] += h_virialScaled_[1];
 498         virialScaled[YY][YY] += h_virialScaled_[3];
 499         virialScaled[YY][ZZ] += h_virialScaled_[4];
 500
 501         virialScaled[ZZ][XX] += h_virialScaled_[2];
 502         virialScaled[ZZ][YY] += h_virialScaled_[4];
 503         virialScaled[ZZ][ZZ] += h_virialScaled_[5];
 504     }
 505
 506     return;
 507 }
 508
 509 LincsCuda::LincsCuda(int numIterations, int expansionOrder, CommandStream commandStream) :
 510     commandStream_(commandStream)
 511 {
 512     kernelParams_.numIterations  = numIterations;
 513     kernelParams_.expansionOrder = expansionOrder;
 514
 515     static_assert(sizeof(real) == sizeof(float),
 516                   "Real numbers should be in single precision in GPU code.");
 517     static_assert(
 518             c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
 519             "Number of threads per block should be a power of two in order for reduction to work.");
 520
 521     allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
 522     h_virialScaled_.resize(6);
 523
 524     // The data arrays should be expanded/reallocated on first call of set() function.
 525     numConstraintsThreadsAlloc_ = 0;
 526     numAtomsAlloc_              = 0;
 527 }
 528
 529 LincsCuda::~LincsCuda()
 530 {
 531     freeDeviceBuffer(&kernelParams_.d_virialScaled);
 532
 533     if (numConstraintsThreadsAlloc_ > 0)
 534     {
 535         freeDeviceBuffer(&kernelParams_.d_constraints);
 536         freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 537
 538         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 539         freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 540         freeDeviceBuffer(&kernelParams_.d_massFactors);
 541         freeDeviceBuffer(&kernelParams_.d_matrixA);
 542     }
 543     if (numAtomsAlloc_ > 0)
 544     {
 545         freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 546     }
 547 }
 548
 549 /*! \brief Helper function to go through constraints recursively.
 550  *
 551  *  For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
 552  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
 553  *  coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
 554  *  value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
 555  *  after it in the constraints array.
 556  *
 557  * \param[in]     a                  Atom index.
 558  * \param[in,out] spaceNeeded        Indicates if the constraint was already counted and stores
 559  *                                   the number of constraints (i) connected to it and (ii) located
 560  *                                   after it in memory. This array is filled by this recursive function.
 561  *                                   For a set of coupled constraints, only for the first one in this list
 562  *                                   the number of consecutive coupled constraints is needed: if there is
 563  *                                   not enough space for this set of constraints in the thread block,
 564  *                                   the group has to be moved to the next one.
 565  * \param[in]     atomAdjacencyList  Stores information about connections between atoms.
 566  */
 567 inline int countCoupled(int                                                  a,
 568                         std::vector<int>*                                    spaceNeeded,
 569                         std::vector<std::vector<std::tuple<int, int, int>>>* atomsAdjacencyList)
 570
 571 {
 572     int c2, a2, sign;
 573     int counted = 0;
 574     for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
 575     {
 576         std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
 577         if (spaceNeeded->at(c2) == -1)
 578         {
 579             spaceNeeded->at(c2) = 0; // To indicate we've been here
 580             counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
 581         }
 582     }
 583     return counted;
 584 }
 585
 586 void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
 587 {
 588     int numAtoms = md.nr;
 589     // List of constrained atoms (CPU memory)
 590     std::vector<int2> constraintsHost;
 591     // Equilibrium distances for the constraints (CPU)
 592     std::vector<float> constraintsTargetLengthsHost;
 593     // Number of constraints, coupled with the current one (CPU)
 594     std::vector<int> coupledConstraintsCountsHost;
 595     // List of coupled with the current one (CPU)
 596     std::vector<int> coupledConstraintsIndicesHost;
 597     // Mass factors (CPU)
 598     std::vector<float> massFactorsHost;
 599
 600     // List of constrained atoms in local topology
 601     t_iatom*  iatoms         = idef.il[F_CONSTR].iatoms;
 602     const int stride         = NRAL(F_CONSTR) + 1;
 603     const int numConstraints = idef.il[F_CONSTR].nr / stride;
 604
 605     // Early exit if no constraints
 606     if (numConstraints == 0)
 607     {
 608         kernelParams_.numConstraintsThreads = 0;
 609         return;
 610     }
 611
 612     // Constructing adjacency list --- usefull intermediate structure
 613     std::vector<std::vector<std::tuple<int, int, int>>> atomsAdjacencyList(numAtoms);
 614     for (int c = 0; c < numConstraints; c++)
 615     {
 616         int a1 = iatoms[stride * c + 1];
 617         int a2 = iatoms[stride * c + 2];
 618
 619         // Each constraint will be represented as a tuple, containing index of the second
 620         // constrained atom, index of the constraint and a sign that indicates the order of atoms in
 621         // which they are listed. Sign is needed to compute the mass factors.
 622         atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1));
 623         atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
 624     }
 625
 626     // Compute, how many coupled constraints are in front of each constraint.
 627     // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
 628     // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
 629     // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
 630     // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
 631     std::vector<int> spaceNeeded;
 632     spaceNeeded.resize(numConstraints, -1);
 633     std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
 634     for (int c = 0; c < numConstraints; c++)
 635     {
 636         int a1 = iatoms[stride * c + 1];
 637         int a2 = iatoms[stride * c + 2];
 638         if (spaceNeeded.at(c) == -1)
 639         {
 640             spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList)
 641                                 + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
 642         }
 643     }
 644
 645     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
 646     // takes into account the empty spaces which might be needed in the end of each thread block.
 647     std::vector<int> splitMap;
 648     splitMap.resize(numConstraints, -1);
 649     int currentMapIndex = 0;
 650     for (int c = 0; c < numConstraints; c++)
 651     {
 652         // Check if coupled constraints all fit in one block
 653         GMX_RELEASE_ASSERT(
 654                 spaceNeeded.at(c) < c_threadsPerBlock,
 655                 "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
 656                 "Most likely, you are trying to use GPU version of LINCS with constraints on "
 657                 "all-bonds, "
 658                 "which is not supported. Try using H-bonds constraints only.");
 659         if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
 660         {
 661             currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
 662         }
 663         splitMap.at(c) = currentMapIndex;
 664         currentMapIndex++;
 665     }
 666     kernelParams_.numConstraintsThreads =
 667             currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
 668     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
 669                        "Number of threads should be a multiple of the block size");
 670
 671     // Initialize constraints and their target indexes taking into account the splits in the data arrays.
 672     int2 pair;
 673     pair.x = -1;
 674     pair.y = -1;
 675     constraintsHost.resize(kernelParams_.numConstraintsThreads, pair);
 676     std::fill(constraintsHost.begin(), constraintsHost.end(), pair);
 677     constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0);
 678     std::fill(constraintsTargetLengthsHost.begin(), constraintsTargetLengthsHost.end(), 0.0);
 679     for (int c = 0; c < numConstraints; c++)
 680     {
 681         int a1   = iatoms[stride * c + 1];
 682         int a2   = iatoms[stride * c + 2];
 683         int type = iatoms[stride * c];
 684
 685         int2 pair;
 686         pair.x                                          = a1;
 687         pair.y                                          = a2;
 688         constraintsHost.at(splitMap.at(c))              = pair;
 689         constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA;
 690     }
 691
 692     // The adjacency list of constraints (i.e. the list of coupled constraints for each constraint).
 693     // We map a single thread to a single constraint, hence each thread 'c' will be using one
 694     // element from coupledConstraintsCountsHost array, which is the number of constraints coupled
 695     // to the constraint 'c'. The coupled constraints indexes are placed into the
 696     // coupledConstraintsIndicesHost array. Latter is organized as a one-dimensional array to ensure
 697     // good memory alignment. It is addressed as [c + i*numConstraintsThreads], where 'i' goes from
 698     // zero to the number of constraints coupled to 'c'. 'numConstraintsThreads' is the width of the
 699     // array --- a number, greater then total number of constraints, taking into account the splits
 700     // in the constraints array due to the GPU block borders. This number can be adjusted to improve
 701     // memory access pattern. Mass factors are saved in a similar data structure.
 702     int maxCoupledConstraints = 0;
 703     for (int c = 0; c < numConstraints; c++)
 704     {
 705         int a1 = iatoms[stride * c + 1];
 706         int a2 = iatoms[stride * c + 2];
 707
 708         // Constraint 'c' is counted twice, but it should be excluded altogether. Hence '-2'.
 709         int nCoupedConstraints = atomsAdjacencyList.at(a1).size() + atomsAdjacencyList.at(a2).size() - 2;
 710
 711         if (nCoupedConstraints > maxCoupledConstraints)
 712         {
 713             maxCoupledConstraints = nCoupedConstraints;
 714         }
 715     }
 716
 717     coupledConstraintsCountsHost.resize(kernelParams_.numConstraintsThreads, 0);
 718     coupledConstraintsIndicesHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 719     massFactorsHost.resize(maxCoupledConstraints * kernelParams_.numConstraintsThreads, -1);
 720
 721     for (int c1 = 0; c1 < numConstraints; c1++)
 722     {
 723         coupledConstraintsCountsHost.at(splitMap.at(c1)) = 0;
 724         int c1a1                                         = iatoms[stride * c1 + 1];
 725         int c1a2                                         = iatoms[stride * c1 + 2];
 726         int c2;
 727         int c2a1;
 728         int c2a2;
 729
 730         int sign;
 731
 732         // Constraints, coupled trough the first atom.
 733         c2a1 = c1a1;
 734         for (unsigned j = 0; j < atomsAdjacencyList.at(c1a1).size(); j++)
 735         {
 736
 737             std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a1).at(j);
 738
 739             if (c1 != c2)
 740             {
 741                 int index = kernelParams_.numConstraintsThreads
 742                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
 743                             + splitMap.at(c1);
 744
 745                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 746
 747                 int center = c1a1;
 748
 749                 float sqrtmu1 = 1.0 / sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 750                 float sqrtmu2 = 1.0 / sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 751
 752                 massFactorsHost.at(index) = -sign * md.invmass[center] * sqrtmu1 * sqrtmu2;
 753
 754                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 755             }
 756         }
 757
 758         // Constraints, coupled through the second atom.
 759         c2a1 = c1a2;
 760         for (unsigned j = 0; j < atomsAdjacencyList.at(c1a2).size(); j++)
 761         {
 762
 763             std::tie(c2a2, c2, sign) = atomsAdjacencyList.at(c1a2).at(j);
 764
 765             if (c1 != c2)
 766             {
 767                 int index = kernelParams_.numConstraintsThreads
 768                                     * coupledConstraintsCountsHost.at(splitMap.at(c1))
 769                             + splitMap.at(c1);
 770
 771                 coupledConstraintsIndicesHost.at(index) = splitMap.at(c2);
 772
 773                 int center = c1a2;
 774
 775                 float sqrtmu1 = 1.0 / sqrt(md.invmass[c1a1] + md.invmass[c1a2]);
 776                 float sqrtmu2 = 1.0 / sqrt(md.invmass[c2a1] + md.invmass[c2a2]);
 777
 778                 massFactorsHost.at(index) = sign * md.invmass[center] * sqrtmu1 * sqrtmu2;
 779
 780                 coupledConstraintsCountsHost.at(splitMap.at(c1))++;
 781             }
 782         }
 783     }
 784
 785     // (Re)allocate the memory, if the number of constraints has increased.
 786     if (kernelParams_.numConstraintsThreads > numConstraintsThreadsAlloc_)
 787     {
 788         // Free memory if it was allocated before (i.e. if not the first time here).
 789         if (numConstraintsThreadsAlloc_ > 0)
 790         {
 791             freeDeviceBuffer(&kernelParams_.d_constraints);
 792             freeDeviceBuffer(&kernelParams_.d_constraintsTargetLengths);
 793
 794             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts);
 795             freeDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices);
 796             freeDeviceBuffer(&kernelParams_.d_massFactors);
 797             freeDeviceBuffer(&kernelParams_.d_matrixA);
 798         }
 799
 800         numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
 801
 802         allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
 803         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 804                              kernelParams_.numConstraintsThreads, nullptr);
 805
 806         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 807                              kernelParams_.numConstraintsThreads, nullptr);
 808         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
 809                              maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
 810         allocateDeviceBuffer(&kernelParams_.d_massFactors,
 811                              maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
 812         allocateDeviceBuffer(&kernelParams_.d_matrixA,
 813                              maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
 814     }
 815
 816     // (Re)allocate the memory, if the number of atoms has increased.
 817     if (numAtoms > numAtomsAlloc_)
 818     {
 819         if (numAtomsAlloc_ > 0)
 820         {
 821             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
 822         }
 823         numAtomsAlloc_ = numAtoms;
 824         allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr);
 825     }
 826
 827     // Copy data to GPU.
 828     copyToDeviceBuffer(&kernelParams_.d_constraints, constraintsHost.data(), 0,
 829                        kernelParams_.numConstraintsThreads, commandStream_,
 830                        GpuApiCallBehavior::Sync, nullptr);
 831     copyToDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
 832                        constraintsTargetLengthsHost.data(), 0, kernelParams_.numConstraintsThreads,
 833                        commandStream_, GpuApiCallBehavior::Sync, nullptr);
 834     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
 835                        coupledConstraintsCountsHost.data(), 0, kernelParams_.numConstraintsThreads,
 836                        commandStream_, GpuApiCallBehavior::Sync, nullptr);
 837     copyToDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices, coupledConstraintsIndicesHost.data(),
 838                        0, maxCoupledConstraints * kernelParams_.numConstraintsThreads,
 839                        commandStream_, GpuApiCallBehavior::Sync, nullptr);
 840     copyToDeviceBuffer(&kernelParams_.d_massFactors, massFactorsHost.data(), 0,
 841                        maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
 842                        GpuApiCallBehavior::Sync, nullptr);
 843
 844     GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
 845     copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
 846                        GpuApiCallBehavior::Sync, nullptr);
 847 }
 848
 849 void LincsCuda::setPbc(const t_pbc* pbc)
 850 {
 851     setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
 852 }
 853
 854 } // namespace gmx