/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements LINCS kernels using SYCL
 *
 * This file contains SYCL kernels of the LINCS constraints algorithm.
 *
 * \author Artem Zhmurov <zhmurov@gmail.com>
 *
 * \ingroup module_mdlib
 */
#include "lincs_gpu_internal.h"

#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gmxsycl.h"
#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "gromacs/mdlib/lincs_gpu.h"
#include "gromacs/pbcutil/pbc_aiuc_sycl.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/template_mp.h"
namespace gmx
{

using cl::sycl::access::fence_space;
using cl::sycl::access::mode;
using cl::sycl::access::target;
/*! \brief Main kernel for LINCS constraints.
 *
 * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm.
 *
 * In the GPU version, one thread is responsible for all computations for one constraint. The blocks are
 * filled in a way that no constraint is coupled to a constraint from the next block. This is achieved
 * by moving active threads to the next block if the corresponding group of coupled constraints is too big
 * to fit into the current thread block. This may leave some 'dummy' threads at the end of the thread block,
 * i.e. threads that are not required to do actual work. Since constraints from different blocks are not
 * coupled, there is no need to synchronize across the device. However, extensive communication within
 * a thread block is still needed.
 *
 * \todo Reduce synchronization overhead. Some ideas are:
 *       1. Consider going to warp-level synchronization for the coupled constraints.
 *       2. Move more data to local/shared memory and try to get rid of atomic operations (at least on
 *          the device level).
 *       3. Use an analytical solution for matrix A inversion.
 *       4. Introduce a mapping of thread id to both a single constraint and a single atom, thus designating
 *          Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints.
 *       See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885).
 * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic
 *       operations. Investigate this issue further.
 *
 * \tparam updateVelocities       Whether velocities should be updated this step.
 * \tparam computeVirial          Whether the virial tensor should be computed this step.
 * \tparam haveCoupledConstraints Whether there are coupled constraints (i.e. LINCS iterations are needed).
 *
 * \param[in]     cgh                         SYCL handler.
 * \param[in]     numConstraintsThreads       Total number of threads.
 * \param[in]     a_constraints               List of constrained atom pairs.
 * \param[in]     a_constraintsTargetLengths  Equilibrium distances for the constraints.
 * \param[in]     a_coupledConstraintsCounts  Number of constraints coupled with the current one.
 * \param[in]     a_coupledConstraintsIndices List of constraints coupled with the current one.
 * \param[in]     a_massFactors               Mass factors.
 * \param[in]     a_matrixA                   Elements of the coupling matrix.
 * \param[in]     a_inverseMasses             1/mass for all atoms.
 * \param[in]     numIterations               Number of iterations used to correct the projection.
 * \param[in]     expansionOrder              Order of expansion when inverting the matrix.
 * \param[in]     a_x                         Positions before the timestep; used as the reference for the constraint directions.
 * \param[in,out] a_xp                        Unconstrained positions after the timestep; corrected in place.
 * \param[in]     invdt                       Inverse timestep (needed to update velocities).
 * \param[in,out] a_v                         Velocities of atoms, updated if \c updateVelocities.
 * \param[in,out] a_virialScaled              Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ]).
 *                                            Updated if \c computeVirial.
 * \param[in]     pbcAiuc                     Periodic boundary data.
 */
template<bool updateVelocities, bool computeVirial, bool haveCoupledConstraints>
auto lincsKernel(cl::sycl::handler&                   cgh,
                 const int                            numConstraintsThreads,
                 DeviceAccessor<AtomPair, mode::read> a_constraints,
                 DeviceAccessor<float, mode::read>    a_constraintsTargetLengths,
                 OptionalAccessor<int, mode::read, haveCoupledConstraints> a_coupledConstraintsCounts,
                 OptionalAccessor<int, mode::read, haveCoupledConstraints> a_coupledConstraintsIndices,
                 OptionalAccessor<float, mode::read, haveCoupledConstraints> a_massFactors,
                 OptionalAccessor<float, mode::read_write, haveCoupledConstraints> a_matrixA,
                 DeviceAccessor<float, mode::read> a_inverseMasses,
                 const int                         numIterations,
                 const int                         expansionOrder,
                 DeviceAccessor<Float3, mode::read>       a_x,
                 DeviceAccessor<Float3, mode::read_write> a_xp,
                 const float                              invdt,
                 OptionalAccessor<Float3, mode::read_write, updateVelocities> a_v,
                 OptionalAccessor<float, mode::read_write, computeVirial>     a_virialScaled,
                 PbcAiuc                                                      pbcAiuc)
{
    a_constraints.bind(cgh);
    a_constraintsTargetLengths.bind(cgh);
    if constexpr (haveCoupledConstraints)
    {
        a_coupledConstraintsCounts.bind(cgh);
        a_coupledConstraintsIndices.bind(cgh);
        a_massFactors.bind(cgh);
        a_matrixA.bind(cgh);
    }
    a_inverseMasses.bind(cgh);
    a_x.bind(cgh);
    a_xp.bind(cgh);
    if constexpr (updateVelocities)
    {
        a_v.bind(cgh);
    }
    if constexpr (computeVirial)
    {
        a_virialScaled.bind(cgh);
    }

    /* Shared local memory buffer. Corresponds to sm_r, sm_rhs, and sm_threadVirial in CUDA.
     * sm_r: one Float3 per thread.
     * sm_rhs: two floats per thread.
     * sm_threadVirial: six floats per thread.
     * So, without virials we need max(1*3, 2) floats, and with virials we need max(1*3, 2, 6) floats.
     */
    static constexpr int smBufferElementsPerThread = computeVirial ? 6 : 3;
    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buffer{
        cl::sycl::range<1>(c_threadsPerBlock * smBufferElementsPerThread), cgh
    };
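    // For reference: the same local buffer is reused for three purposes below. First it holds one
    // normalized constraint vector (DIM floats) per thread; during the matrix-inversion loops it
    // holds two ping-pong right-hand-side slots of c_threadsPerBlock floats each (slot rec % 2 is
    // read, slot (rec + 1) % 2 is written); and, when computeVirial is set, it finally holds six
    // virial components of c_threadsPerBlock floats each for the block-level reduction.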
    return [=](cl::sycl::nd_item<1> itemIdx) {
        const int threadIndex   = itemIdx.get_global_linear_id();
        const int threadInBlock = itemIdx.get_local_linear_id(); // Work-item index in work-group

        AtomPair pair = a_constraints[threadIndex];
        int      i    = pair.i;
        int      j    = pair.j;

        // Mass-scaled Lagrange multiplier
        float lagrangeScaled = 0.0F;

        float targetLength;
        float inverseMassi;
        float inverseMassj;
        float sqrtReducedMass;

        Float3 xi;
        Float3 xj;
        Float3 rc;

        // i == -1 indicates dummy constraint at the end of the thread block.
        bool isDummyThread = (i == -1);

        // Everything computed for these dummies will be equal to zero
        if (isDummyThread)
        {
            targetLength    = 0.0F;
            inverseMassi    = 0.0F;
            inverseMassj    = 0.0F;
            sqrtReducedMass = 0.0F;

            xi = Float3(0.0F, 0.0F, 0.0F);
            xj = Float3(0.0F, 0.0F, 0.0F);
            rc = Float3(0.0F, 0.0F, 0.0F);
        }
        else
        {
            // Collecting data
            targetLength    = a_constraintsTargetLengths[threadIndex];
            inverseMassi    = a_inverseMasses[i];
            inverseMassj    = a_inverseMasses[j];
            sqrtReducedMass = cl::sycl::rsqrt(inverseMassi + inverseMassj);

            xi = a_x[i];
            xj = a_x[j];

            Float3 dx;
            pbcDxAiucSycl(pbcAiuc, xi, xj, dx);

            float rlen = cl::sycl::rsqrt(dx[XX] * dx[XX] + dx[YY] * dx[YY] + dx[ZZ] * dx[ZZ]);
            rc         = rlen * dx;
        }

        sm_buffer[threadInBlock * DIM + XX] = rc[XX];
        sm_buffer[threadInBlock * DIM + YY] = rc[YY];
        sm_buffer[threadInBlock * DIM + ZZ] = rc[ZZ];
        // Make sure that all r's are saved into shared memory
        // before they are accessed in the loop below
        itemIdx.barrier(fence_space::global_and_local);

        /*
         * Constructing LINCS matrix (A)
         */
        int coupledConstraintsCount = 0;
        if constexpr (haveCoupledConstraints)
        {
            // Only non-zero values are saved (for coupled constraints)
            coupledConstraintsCount = a_coupledConstraintsCounts[threadIndex];
            for (int n = 0; n < coupledConstraintsCount; n++)
            {
                int index = n * numConstraintsThreads + threadIndex;
                int c1    = a_coupledConstraintsIndices[index];

                Float3 rc1{ sm_buffer[c1 * DIM + XX], sm_buffer[c1 * DIM + YY], sm_buffer[c1 * DIM + ZZ] };
                a_matrixA[index] = a_massFactors[index]
                                   * (rc[XX] * rc1[XX] + rc[YY] * rc1[YY] + rc[ZZ] * rc1[ZZ]);
            }
        }
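        // For reference: each stored element is massFactor * (r_n . r_k), the coupling between this
        // constraint and coupled constraint k (cf. the coupling matrix in Hess et al. 1997). Only
        // the non-zero, off-diagonal couplings are stored; they drive the series expansion of the
        // matrix inverse below.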

        // Skipping in dummy threads
        if (!isDummyThread)
        {
            xi[XX] = atomicLoad(a_xp[i][XX]);
            xi[YY] = atomicLoad(a_xp[i][YY]);
            xi[ZZ] = atomicLoad(a_xp[i][ZZ]);
            xj[XX] = atomicLoad(a_xp[j][XX]);
            xj[YY] = atomicLoad(a_xp[j][YY]);
            xj[ZZ] = atomicLoad(a_xp[j][ZZ]);
        }

        Float3 dx;
        pbcDxAiucSycl(pbcAiuc, xi, xj, dx);

        float sol = sqrtReducedMass * ((rc[XX] * dx[XX] + rc[YY] * dx[YY] + rc[ZZ] * dx[ZZ]) - targetLength);
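        // For reference: sol is the mass-weighted deviation of the current (unconstrained) bond
        // from its target length, projected onto the reference direction rc; it is the right-hand
        // side of the LINCS linear system that the expansion below approximately inverts.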

        /*
         * Inverse matrix using a set of expansionOrder matrix multiplications
         */

        // This will reuse the same buffer, because the old values are no longer needed.
        itemIdx.barrier(fence_space::local_space);
        sm_buffer[threadInBlock] = sol;

        // No need to iterate if there are no coupled constraints.
        if constexpr (haveCoupledConstraints)
        {
            for (int rec = 0; rec < expansionOrder; rec++)
            {
                // Making sure that all sm_buffer values are saved before they are accessed in the loop below
                itemIdx.barrier(fence_space::global_and_local);
                float mvb = 0.0F;
                for (int n = 0; n < coupledConstraintsCount; n++)
                {
                    int index = n * numConstraintsThreads + threadIndex;
                    int c1    = a_coupledConstraintsIndices[index];
                    // Convolute current right-hand-side with A
                    // Different, non-overlapping parts of sm_buffer[..] are read during odd and even iterations
                    mvb = mvb + a_matrixA[index] * sm_buffer[c1 + c_threadsPerBlock * (rec % 2)];
                }
                // 'Switch' rhs vectors, save current result
                // These values will be accessed in the loop above during the next iteration.
                sm_buffer[threadInBlock + c_threadsPerBlock * ((rec + 1) % 2)] = mvb;
                sol = sol + mvb;
            }
        }

        // Current mass-scaled Lagrange multipliers
        lagrangeScaled = sqrtReducedMass * sol;
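        // For reference: after the loop, sol holds the truncated series
        // (I + A + A^2 + ... + A^expansionOrder) applied to the initial right-hand side, i.e. an
        // approximate solution of the coupled-constraint linear system.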

        // Save updated coordinates before correction for the rotational lengthening
        Float3 tmp = rc * lagrangeScaled;

        // Writing for all but dummy constraints
        if (!isDummyThread)
        {
            /*
             * Note: Using memory_scope::work_group for atomic_ref can be better here,
             * but for now we re-use the existing function for memory_scope::device atomics.
             */
            atomicFetchAdd(a_xp[i][XX], -tmp[XX] * inverseMassi);
            atomicFetchAdd(a_xp[i][YY], -tmp[YY] * inverseMassi);
            atomicFetchAdd(a_xp[i][ZZ], -tmp[ZZ] * inverseMassi);
            atomicFetchAdd(a_xp[j][XX], tmp[XX] * inverseMassj);
            atomicFetchAdd(a_xp[j][YY], tmp[YY] * inverseMassj);
            atomicFetchAdd(a_xp[j][ZZ], tmp[ZZ] * inverseMassj);
        }

        /*
         * Correction for centripetal effects
         */
        for (int iter = 0; iter < numIterations; iter++)
        {
            // Make sure that all xp's are saved: the atomic operation calls before are
            // communicating current xp[..] values across the thread block.
            itemIdx.barrier(fence_space::global_and_local);

            if (!isDummyThread)
            {
                xi[XX] = atomicLoad(a_xp[i][XX]);
                xi[YY] = atomicLoad(a_xp[i][YY]);
                xi[ZZ] = atomicLoad(a_xp[i][ZZ]);
                xj[XX] = atomicLoad(a_xp[j][XX]);
                xj[YY] = atomicLoad(a_xp[j][YY]);
                xj[ZZ] = atomicLoad(a_xp[j][ZZ]);
            }

            Float3 dx;
            pbcDxAiucSycl(pbcAiuc, xi, xj, dx);

            float len2  = targetLength * targetLength;
            float dlen2 = 2.0F * len2 - (dx[XX] * dx[XX] + dx[YY] * dx[YY] + dx[ZZ] * dx[ZZ]);

            // TODO A somewhat more efficient but slightly less readable version of the below would be:
            //      float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2));
            float proj;
            if (dlen2 > 0.0F)
            {
                proj = sqrtReducedMass * (targetLength - dlen2 * cl::sycl::rsqrt(dlen2));
            }
            else
            {
                proj = sqrtReducedMass * targetLength;
            }

            sm_buffer[threadInBlock] = proj;
            float sol = proj;
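            // For reference: dlen2 = 2*d^2 - |dx|^2, so proj implements the rotational-lengthening
            // correction p = mu * (d - sqrt(2*d^2 - |dx|^2)) of Hess et al. (1997); the else branch
            // guards against a negative argument for strongly distorted constraints.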

            /*
             * Same matrix inversion as above is used for updated data
             */
            if constexpr (haveCoupledConstraints)
            {
                for (int rec = 0; rec < expansionOrder; rec++)
                {
                    // Make sure that all elements of rhs are saved into shared memory
                    itemIdx.barrier(fence_space::global_and_local);
                    float mvb = 0.0F;
                    for (int n = 0; n < coupledConstraintsCount; n++)
                    {
                        int index = n * numConstraintsThreads + threadIndex;
                        int c1    = a_coupledConstraintsIndices[index];

                        mvb = mvb + a_matrixA[index] * sm_buffer[c1 + c_threadsPerBlock * (rec % 2)];
                    }
                    sm_buffer[threadInBlock + c_threadsPerBlock * ((rec + 1) % 2)] = mvb;
                    sol = sol + mvb;
                }
            }

            // Add corrections to Lagrange multipliers
            float sqrtmu_sol = sqrtReducedMass * sol;
            lagrangeScaled += sqrtmu_sol;

            // Save updated coordinates for the next iteration
            // Dummy constraints are skipped
            if (!isDummyThread)
            {
                Float3 tmp = rc * sqrtmu_sol;
                atomicFetchAdd(a_xp[i][XX], -tmp[XX] * inverseMassi);
                atomicFetchAdd(a_xp[i][YY], -tmp[YY] * inverseMassi);
                atomicFetchAdd(a_xp[i][ZZ], -tmp[ZZ] * inverseMassi);
                atomicFetchAdd(a_xp[j][XX], tmp[XX] * inverseMassj);
                atomicFetchAdd(a_xp[j][YY], tmp[YY] * inverseMassj);
                atomicFetchAdd(a_xp[j][ZZ], tmp[ZZ] * inverseMassj);
            }
        }

        // Updating particle velocities for all but dummy threads
        if constexpr (updateVelocities)
        {
            if (!isDummyThread)
            {
                Float3 tmp = rc * invdt * lagrangeScaled;
                atomicFetchAdd(a_v[i][XX], -tmp[XX] * inverseMassi);
                atomicFetchAdd(a_v[i][YY], -tmp[YY] * inverseMassi);
                atomicFetchAdd(a_v[i][ZZ], -tmp[ZZ] * inverseMassi);
                atomicFetchAdd(a_v[j][XX], tmp[XX] * inverseMassj);
                atomicFetchAdd(a_v[j][YY], tmp[YY] * inverseMassj);
                atomicFetchAdd(a_v[j][ZZ], tmp[ZZ] * inverseMassj);
            }
        }
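        // For reference: the velocity correction is the accumulated position correction
        // (rc * lagrangeScaled) multiplied by 1/dt, keeping positions and velocities consistent.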

        if constexpr (computeVirial)
        {
            // Virial is computed from the Lagrange multiplier (lagrangeScaled), the target constraint
            // length (targetLength) and the normalized vector connecting constrained atoms before
            // the algorithm was applied (rc). The evaluation of the virial in each thread is
            // followed by a basic reduction of the values inside a single thread block.
            // Then, the values are reduced across the grid by atomicAdd(...).
            //
            // TODO Shuffle reduction.
            // TODO Should be unified and/or done once when the virial is actually needed.
            // TODO A recursive version that removes atomicAdd(...)'s entirely is needed. Ideally,
            //      one that works for any datatype.

            // Save the virial for each thread into shared memory. The tensor is symmetric, hence only
            // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength,
            // lagrangeScaled and rc are all set to zero for them at the beginning of the kernel.
            // We reuse the same shared memory buffer, so we make sure we don't need its old values:
            itemIdx.barrier(fence_space::local_space);
            float mult = targetLength * lagrangeScaled;
            sm_buffer[0 * c_threadsPerBlock + threadInBlock] = mult * rc[XX] * rc[XX];
            sm_buffer[1 * c_threadsPerBlock + threadInBlock] = mult * rc[XX] * rc[YY];
            sm_buffer[2 * c_threadsPerBlock + threadInBlock] = mult * rc[XX] * rc[ZZ];
            sm_buffer[3 * c_threadsPerBlock + threadInBlock] = mult * rc[YY] * rc[YY];
            sm_buffer[4 * c_threadsPerBlock + threadInBlock] = mult * rc[YY] * rc[ZZ];
            sm_buffer[5 * c_threadsPerBlock + threadInBlock] = mult * rc[ZZ] * rc[ZZ];

            itemIdx.barrier(fence_space::local_space);
            // This casts unsigned into signed integers to avoid clang warnings
            const int tib          = static_cast<int>(threadInBlock);
            const int blockSize    = static_cast<int>(c_threadsPerBlock);
            const int subGroupSize = itemIdx.get_sub_group().get_max_local_range()[0];

            // Reduce up to one virial per thread block.
            // The block is divided in half and each thread in the first half sums two virials.
            // Then the first half is itself divided in half, and so on, until only one thread is left.
            // Only works if the number of threads per block is a power of two.
            for (int divideBy = 2; divideBy <= blockSize; divideBy *= 2)
            {
                int dividedAt = blockSize / divideBy;
                if (tib < dividedAt)
                {
                    for (int d = 0; d < 6; d++)
                    {
                        sm_buffer[d * blockSize + tib] += sm_buffer[d * blockSize + (tib + dividedAt)];
                    }
                }
                if (dividedAt > subGroupSize / 2)
                {
                    itemIdx.barrier(fence_space::local_space);
                }
                else
                {
                    subGroupBarrier(itemIdx);
                }
            }
            // The first 6 threads in the block add the 6 components of the virial to global memory
            if (tib < 6)
            {
                atomicFetchAdd(a_virialScaled[tib], sm_buffer[tib * blockSize]);
            }
        }
    };
}

// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
template<bool updateVelocities, bool computeVirial, bool haveCoupledConstraints>
class LincsKernelName;

template<bool updateVelocities, bool computeVirial, bool haveCoupledConstraints, class... Args>
static cl::sycl::event launchLincsKernel(const DeviceStream& deviceStream,
                                         const int           numConstraintsThreads,
                                         Args&&... args)
{
    // Should not be needed for SYCL2020.
    using kernelNameType = LincsKernelName<updateVelocities, computeVirial, haveCoupledConstraints>;

    const cl::sycl::nd_range<1> rangeAllLincs(numConstraintsThreads, c_threadsPerBlock);
    cl::sycl::queue             q = deviceStream.stream();

    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
        auto kernel = lincsKernel<updateVelocities, computeVirial, haveCoupledConstraints>(
                cgh, numConstraintsThreads, std::forward<Args>(args)...);
        cgh.parallel_for<kernelNameType>(rangeAllLincs, kernel);
    });

    return e;
}

/*! \brief Select templated kernel and launch it. */
template<class... Args>
static inline cl::sycl::event
launchLincsKernel(bool updateVelocities, bool computeVirial, bool haveCoupledConstraints, Args&&... args)
{
    return dispatchTemplatedFunction(
            [&](auto updateVelocities_, auto computeVirial_, auto haveCoupledConstraints_) {
                return launchLincsKernel<updateVelocities_, computeVirial_, haveCoupledConstraints_>(
                        std::forward<Args>(args)...);
            },
            updateVelocities,
            computeVirial,
            haveCoupledConstraints);
}
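
// For reference: dispatchTemplatedFunction (gromacs/utility/template_mp.h) maps the run-time
// booleans onto compile-time constants, so the lambda above instantiates the templated
// launchLincsKernel once for each combination of updateVelocities, computeVirial and
// haveCoupledConstraints.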

void launchLincsGpuKernel(LincsGpuKernelParameters*   kernelParams,
                          const DeviceBuffer<Float3>& d_x,
                          DeviceBuffer<Float3>        d_xp,
                          const bool                  updateVelocities,
                          DeviceBuffer<Float3>        d_v,
                          const real                  invdt,
                          const bool                  computeVirial,
                          const DeviceStream&         deviceStream)
{
    launchLincsKernel(updateVelocities,
                      computeVirial,
                      kernelParams->haveCoupledConstraints,
                      deviceStream,
                      kernelParams->numConstraintsThreads,
                      kernelParams->d_constraints,
                      kernelParams->d_constraintsTargetLengths,
                      kernelParams->d_coupledConstraintsCounts,
                      kernelParams->d_coupledConstraintsIndices,
                      kernelParams->d_massFactors,
                      kernelParams->d_matrixA,
                      kernelParams->d_inverseMasses,
                      kernelParams->numIterations,
                      kernelParams->expansionOrder,
                      d_x,
                      d_xp,
                      invdt,
                      d_v,
                      kernelParams->d_virialScaled,
                      kernelParams->pbcAiuc);
}

} // namespace gmx