2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements CUDA bonded functionality
39 * \author Jon Vincent <jvincent@nvidia.com>
40 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41 * \author Berk Hess <hess@kth.se>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Alan Gray <alang@nvidia.com>
44 * \author Mark Abraham <mark.j.abraham@gmail.com>
46 * \ingroup module_listed_forces
53 #include <math_constants.h>
55 #include "gromacs/gpu_utils/cudautils.cuh"
56 #include "gromacs/gpu_utils/typecasts.cuh"
57 #include "gromacs/gpu_utils/vectype_ops.cuh"
58 #include "gromacs/listed_forces/gpubonded.h"
59 #include "gromacs/math/units.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/simulation_workload.h"
63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
64 #include "gromacs/timing/wallcycle.h"
65 #include "gromacs/utility/gmxassert.h"
67 #include "gpubonded_impl.h"
73 /*-------------------------------- CUDA kernels-------------------------------- */
74 /*------------------------------------------------------------------------------*/
76 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
78 /*---------------- BONDED CUDA kernels--------------*/
/* Evaluates a harmonic potential term: given force constant kA and
 * reference value xA, writes the energy to *V and a force factor to *F
 * for the coordinate x.
 * NOTE(review): the function body is truncated in this view of the file;
 * only the 0.5f prefactor constant is visible below — confirm against the
 * full source. */
81 __device__ __forceinline__ static void
82 harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
84     constexpr float half = 0.5f;
/* Harmonic bond interaction for bond index i.
 * Reads the (type, ai, aj) triplet from d_forceatoms, computes the
 * PBC-corrected bond vector dx and length dr, evaluates the harmonic term
 * via harmonic_gpu(), and accumulates per-atom forces into gm_f with
 * atomics. When calcVir is set, shift forces go into the shared-memory
 * sm_fShiftLoc accumulator.
 * NOTE(review): braces, the ai/aj extraction from bondData, and the
 * calcEner energy accumulation are missing from this view. */
94 template<bool calcVir, bool calcEner>
95 __device__ void bonds_gpu(const int i,
98 const t_iatom d_forceatoms[],
99 const t_iparams d_forceparams[],
100 const float4 gm_xq[],
102 float3 sm_fShiftLoc[],
103 const PbcAiuc pbcAiuc)
/* Each bond entry is 3 ints: interaction type + two atom indices. */
107 int3 bondData = *(int3*)(d_forceatoms + 3 * i);
108 int type = bondData.x;
112 /* dx = xi - xj, corrected for periodic boundary conditions. */
/* ki is the shift index of the periodic image used (for the virial). */
114 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
116 float dr2 = norm2(dx);
117 float dr = sqrt(dr2);
121 harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
/* Scale by 1/dr so that fbond*dx below yields the Cartesian force. */
130 fbond *= rsqrtf(dr2);
132 float3 fij = fbond * dx;
133 atomicAdd(&gm_f[ai], fij);
134 atomicAdd(&gm_f[aj], -fij);
/* Shift-force accumulation for the virial; CENTRAL means no PBC shift. */
135 if (calcVir && ki != CENTRAL)
137 atomicAdd(&sm_fShiftLoc[ki], fij);
138 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
/* Computes the angle between bonds i-j and j-k under PBC.
 * Outputs: *r_ij and *r_kj bond vectors, *costh = cos(theta), and the
 * shift indices *t1/*t2 from the PBC correction (used for the virial
 * when returnShift is set). Returns theta in radians.
 * NOTE(review): the trailing return statement is not visible in this
 * view of the file. */
144 template<bool returnShift>
145 __device__ __forceinline__ static float bond_angle_gpu(const float4 xi,
148 const PbcAiuc& pbcAiuc,
154 /* Return value is the angle between the bonds i-j and j-k */
156 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
157 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
159 *costh = cos_angle(*r_ij, *r_kj);
160 float th = acosf(*costh);
/* Harmonic angle interaction for angle index i.
 * Reads (type, ai, aj, ak) from d_forceatoms, computes the angle via
 * bond_angle_gpu(), evaluates the harmonic angle term (reference angle
 * converted from degrees to radians), and distributes forces to the
 * three atoms. The cos_theta2 < 1 guard avoids division by zero in
 * rsqrtf(1 - cos^2) for collinear configurations.
 * NOTE(review): braces and the calcEner energy accumulation are missing
 * from this view. */
165 template<bool calcVir, bool calcEner>
166 __device__ void angles_gpu(const int i,
169 const t_iatom d_forceatoms[],
170 const t_iparams d_forceparams[],
171 const float4 gm_xq[],
173 float3 sm_fShiftLoc[],
174 const PbcAiuc pbcAiuc)
/* Each angle entry is 4 ints: type + three atom indices. */
178 int4 angleData = *(int4*)(d_forceatoms + 4 * i);
179 int type = angleData.x;
180 int ai = angleData.y;
181 int aj = angleData.z;
182 int ak = angleData.w;
189 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
190 &r_kj, &cos_theta, &t1, &t2);
194 harmonic_gpu(d_forceparams[type].harmonic.krA,
195 d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F, theta, &va, &dVdt);
202 float cos_theta2 = cos_theta * cos_theta;
203 if (cos_theta2 < 1.0f)
/* st = -dV/dtheta / sin(theta); sth = st * cos(theta). */
205 float st = dVdt * rsqrtf(1.0f - cos_theta2);
206 float sth = st * cos_theta;
207 float nrij2 = norm2(r_ij);
208 float nrkj2 = norm2(r_kj);
210 float nrij_1 = rsqrtf(nrij2);
211 float nrkj_1 = rsqrtf(nrkj2);
213 float cik = st * nrij_1 * nrkj_1;
214 float cii = sth * nrij_1 * nrij_1;
215 float ckk = sth * nrkj_1 * nrkj_1;
/* f_j closes the triple so the net force on the angle is zero. */
217 float3 f_i = cii * r_ij - cik * r_kj;
218 float3 f_k = ckk * r_kj - cik * r_ij;
219 float3 f_j = -f_i - f_k;
221 atomicAdd(&gm_f[ai], f_i);
222 atomicAdd(&gm_f[aj], f_j);
223 atomicAdd(&gm_f[ak], f_k);
/* Shift forces for the virial (guarded by calcVir in the full source). */
227 atomicAdd(&sm_fShiftLoc[t1], f_i);
228 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
229 atomicAdd(&sm_fShiftLoc[t2], f_k);
/* Urey-Bradley interaction for entry i: a harmonic angle term on
 * ai-aj-ak plus a harmonic 1-3 "bond" term between ai and ak.
 * The angle part mirrors angles_gpu(); the 1-3 part mirrors bonds_gpu().
 * NOTE(review): braces, atom-index extraction from ubData, and the
 * calcEner energy accumulations are missing from this view. */
235 template<bool calcVir, bool calcEner>
236 __device__ void urey_bradley_gpu(const int i,
239 const t_iatom d_forceatoms[],
240 const t_iparams d_forceparams[],
241 const float4 gm_xq[],
243 float3 sm_fShiftLoc[],
244 const PbcAiuc pbcAiuc)
/* Each U-B entry is 4 ints: type + three atom indices. */
248 int4 ubData = *(int4*)(d_forceatoms + 4 * i);
/* Angle parameters (theta in degrees -> radians) and 1-3 bond parameters. */
254 float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
255 float kthA = d_forceparams[type].u_b.kthetaA;
256 float r13A = d_forceparams[type].u_b.r13A;
257 float kUBA = d_forceparams[type].u_b.kUBA;
264 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
265 &r_kj, &cos_theta, &t1, &t2);
269 harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
/* 1-3 distance between ai and ak, PBC-corrected. */
277 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
279 float dr2 = norm2(r_ik);
/* dr = dr2 / sqrt(dr2) == sqrt(dr2), computed via rsqrtf. */
280 float dr = dr2 * rsqrtf(dr2);
284 harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
/* Angle force distribution; guard avoids rsqrtf(0) at collinearity. */
286 float cos_theta2 = cos_theta * cos_theta;
287 if (cos_theta2 < 1.0f)
289 float st = dVdt * rsqrtf(1.0f - cos_theta2);
290 float sth = st * cos_theta;
292 float nrkj2 = norm2(r_kj);
293 float nrij2 = norm2(r_ij);
295 float cik = st * rsqrtf(nrkj2 * nrij2);
296 float cii = sth / nrij2;
297 float ckk = sth / nrkj2;
299 float3 f_i = cii * r_ij - cik * r_kj;
300 float3 f_k = ckk * r_kj - cik * r_ij;
301 float3 f_j = -f_i - f_k;
303 atomicAdd(&gm_f[ai], f_i);
304 atomicAdd(&gm_f[aj], f_j);
305 atomicAdd(&gm_f[ak], f_k);
309 atomicAdd(&sm_fShiftLoc[t1], f_i);
310 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
311 atomicAdd(&sm_fShiftLoc[t2], f_k);
315 /* Time for the bond calculations */
/* Scale by 1/dr so fbond*r_ik gives the Cartesian 1-3 force. */
323 fbond *= rsqrtf(dr2);
325 float3 fik = fbond * r_ik;
326 atomicAdd(&gm_f[ai], fik);
327 atomicAdd(&gm_f[ak], -fik);
329 if (calcVir && ki != CENTRAL)
331 atomicAdd(&sm_fShiftLoc[ki], fik);
332 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
/* Computes the dihedral angle phi for atoms i-j-k-l under PBC.
 * Outputs the three bond vectors r_ij/r_kj/r_kl, the plane normals
 * m = r_ij x r_kj and n = r_kj x r_kl, and shift indices t1/t2/t3.
 * The sign of phi is taken from iprod(r_ij, n).
 * NOTE(review): the trailing return (sign-adjusted phi) is not visible
 * in this view of the file. */
338 template<bool returnShift, typename T>
339 __device__ __forceinline__ static float dih_angle_gpu(const T xi,
343 const PbcAiuc& pbcAiuc,
353 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
354 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
355 *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
357 *m = cprod(*r_ij, *r_kj);
358 *n = cprod(*r_kj, *r_kl);
359 float phi = gmx_angle(*m, *n);
360 float ipr = iprod(*r_ij, *n);
361 float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
/* Proper-dihedral potential: V = cpA*(1 + cos(mult*phi - phi0)) with
 * phi0 = phiA in degrees converted to radians. Writes energy to *v and
 * the derivative factor -cpA*mult*sin(mdphi) to *f.
 * NOTE(review): the declarations of mdphi and sdphi (presumably
 * sdphi = sinf(mdphi)) are missing from this view — confirm against the
 * full source. */
368 __device__ __forceinline__ static void
369 dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
373 mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
375 *v = cpA * (1.0f + cosf(mdphi));
376 *f = -cpA * mult * sdphi;
/* Distributes the dihedral force -dV/dphi (= ddphi) onto the four atoms
 * i, j, k, l, given the bond vectors and plane normals m, n from
 * dih_angle_gpu(). Skips near-degenerate geometries where either normal
 * is shorter than toler (relative to |r_kj|^2 * GMX_REAL_EPS).
 * When calcVir, also accumulates shift forces, recomputing t3 from the
 * j-l vector.
 * NOTE(review): the declarations of f_i and f_l (scaled m and n) are
 * missing from this view. */
379 template<bool calcVir>
380 __device__ static void do_dih_fup_gpu(const int i,
391 float3 sm_fShiftLoc[],
392 const PbcAiuc& pbcAiuc,
393 const float4 gm_xq[],
396 const int gmx_unused t3)
398 float iprm = norm2(m);
399 float iprn = norm2(n);
400 float nrkj2 = norm2(r_kj);
/* Tolerance scales with |r_kj|^2 to stay dimensionally consistent. */
401 float toler = nrkj2 * GMX_REAL_EPS;
402 if ((iprm > toler) && (iprn > toler))
404 float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
405 float nrkj_2 = nrkj_1 * nrkj_1;
406 float nrkj = nrkj2 * nrkj_1;
407 float a = -ddphi * nrkj / iprm;
409 float b = ddphi * nrkj / iprn;
411 float p = iprod(r_ij, r_kj);
413 float q = iprod(r_kl, r_kj);
/* svec projects the end-atom forces onto the inner atoms j and k. */
415 float3 uvec = p * f_i;
416 float3 vvec = q * f_l;
417 float3 svec = uvec - vvec;
418 float3 f_j = f_i - svec;
419 float3 f_k = f_l + svec;
421 atomicAdd(&gm_f[i], f_i);
422 atomicAdd(&gm_f[j], -f_j);
423 atomicAdd(&gm_f[k], -f_k);
424 atomicAdd(&gm_f[l], f_l);
/* Shadow t3 with the shift index of the j-l image for the virial. */
429 int t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
431 atomicAdd(&sm_fShiftLoc[t1], f_i);
432 atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
433 atomicAdd(&sm_fShiftLoc[t2], -f_k);
434 atomicAdd(&sm_fShiftLoc[t3], f_l);
/* Proper dihedral interaction for entry i: reads 5 ints (type + four
 * atom indices), computes phi via dih_angle_gpu(), evaluates the cosine
 * potential via dopdihs_gpu(), and applies forces via do_dih_fup_gpu().
 * NOTE(review): braces and the calcEner energy accumulation are missing
 * from this view. */
439 template<bool calcVir, bool calcEner>
440 __device__ void pdihs_gpu(const int i,
443 const t_iatom d_forceatoms[],
444 const t_iparams d_forceparams[],
445 const float4 gm_xq[],
447 float3 sm_fShiftLoc[],
448 const PbcAiuc pbcAiuc)
452 int type = d_forceatoms[5 * i];
453 int ai = d_forceatoms[5 * i + 1];
454 int aj = d_forceatoms[5 * i + 2];
455 int ak = d_forceatoms[5 * i + 3];
456 int al = d_forceatoms[5 * i + 4];
466 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
467 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
471 dopdihs_gpu(d_forceparams[type].pdihs.cpA, d_forceparams[type].pdihs.phiA,
472 d_forceparams[type].pdihs.mult, phi, &vpd, &ddphi);
479 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
480 pbcAiuc, gm_xq, t1, t2, t3);
/* Ryckaert-Bellemans dihedral for entry i: a 5th-order polynomial in
 * cos(psi) (polymer convention). Loads the NR_RBDIHS coefficients,
 * accumulates the derivative ddphi term-by-term, converts it via
 * ddphi *= -sin(phi) (chain rule d cos/d phi), and applies forces via
 * do_dih_fup_gpu().
 * NOTE(review): the cosfac power updates and energy accumulation lines
 * between the ddphi terms are missing from this view. */
484 template<bool calcVir, bool calcEner>
485 __device__ void rbdihs_gpu(const int i,
488 const t_iatom d_forceatoms[],
489 const t_iparams d_forceparams[],
490 const float4 gm_xq[],
492 float3 sm_fShiftLoc[],
493 const PbcAiuc pbcAiuc)
495 constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
499 int type = d_forceatoms[5 * i];
500 int ai = d_forceatoms[5 * i + 1];
501 int aj = d_forceatoms[5 * i + 2];
502 int ak = d_forceatoms[5 * i + 3];
503 int al = d_forceatoms[5 * i + 4];
513 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
514 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
516 /* Change to polymer convention */
525 float cos_phi = cosf(phi);
526 /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
527 float sin_phi = sinf(phi);
529 float parm[NR_RBDIHS];
530 for (int j = 0; j < NR_RBDIHS; j++)
532 parm[j] = d_forceparams[type].rbdihs.rbcA[j];
534 /* Calculate cosine powers */
535 /* Calculate the energy */
536 /* Calculate the derivative */
/* d/dcos of sum_k parm[k]*cos^k: each term contributes k*rbp*cos^(k-1). */
542 ddphi += rbp * cosfac;
549 ddphi += c2 * rbp * cosfac;
556 ddphi += c3 * rbp * cosfac;
563 ddphi += c4 * rbp * cosfac;
570 ddphi += c5 * rbp * cosfac;
/* Chain rule: dV/dphi = (dV/dcos) * (-sin(phi)). */
577 ddphi = -ddphi * sin_phi;
579 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
580 pbcAiuc, gm_xq, t1, t2, t3);
/* Wraps an angle difference *dp into the half-open interval [-pi, pi)
 * by adding/subtracting one period of 2*pi. Assumes the input is within
 * one period of the target interval (a single correction suffices). */
588 __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
590 /* dp cannot be outside (-pi,pi) */
591 if (*dp >= CUDART_PI_F)
593 *dp -= 2.0f * CUDART_PI_F;
595 else if (*dp < -CUDART_PI_F)
597 *dp += 2.0f * CUDART_PI_F;
/* Improper (harmonic) dihedral for entry i. Computes dp = phi - phi0,
 * wraps it into (-pi, pi) via make_dp_periodic_gpu() to avoid the
 * periodicity discontinuity, then applies a plain harmonic force
 * (ddphi = -kA*dp) via do_dih_fup_gpu() and accumulates the energy
 * V = 0.5*kA*dp^2 (written as -0.5*ddphi*dp).
 * NOTE(review): braces and the calcEner guard around the vtot_loc
 * update are missing from this view. */
601 template<bool calcVir, bool calcEner>
602 __device__ void idihs_gpu(const int i,
605 const t_iatom d_forceatoms[],
606 const t_iparams d_forceparams[],
607 const float4 gm_xq[],
609 float3 sm_fShiftLoc[],
610 const PbcAiuc pbcAiuc)
614 int type = d_forceatoms[5 * i];
615 int ai = d_forceatoms[5 * i + 1];
616 int aj = d_forceatoms[5 * i + 2];
617 int ak = d_forceatoms[5 * i + 3];
618 int al = d_forceatoms[5 * i + 4];
628 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
629 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
631 /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
632 * force changes if we just apply a normal harmonic.
633 * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
634 * This means we will never have the periodicity problem, unless
635 * the dihedral is Pi away from phiO, which is very unlikely due to
638 float kA = d_forceparams[type].harmonic.krA;
639 float pA = d_forceparams[type].harmonic.rA;
641 float phi0 = pA * CUDA_DEG2RAD_F;
643 float dp = phi - phi0;
645 make_dp_periodic_gpu(&dp);
647 float ddphi = -kA * dp;
/* Note: -ddphi is passed, matching the sign convention of the energy
 * accumulation below. */
649 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
650 pbcAiuc, gm_xq, t1, t2, t3);
654 *vtot_loc += -0.5f * ddphi * dp;
/* 1-4 pair (LJ-14 + scaled Coulomb) interaction for entry i.
 * Reads (type, ai, aj), charges come from the w component of gm_xq,
 * LJ parameters from iparams[type].lj14. Computes F*r for both terms,
 * converts to a Cartesian force, and accumulates per-atom and shift
 * forces. Vdw and electrostatic energies are accumulated into separate
 * *vtotVdw_loc / *vtotElec_loc pointers.
 * NOTE(review): braces and the calcEner guard around the energy updates
 * are missing from this view. */
659 template<bool calcVir, bool calcEner>
660 __device__ void pairs_gpu(const int i,
662 const t_iatom d_forceatoms[],
663 const t_iparams iparams[],
664 const float4 gm_xq[],
666 float3 sm_fShiftLoc[],
667 const PbcAiuc pbcAiuc,
668 const float scale_factor,
674 // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
675 int3 pairData = *(int3*)(d_forceatoms + 3 * i);
676 int type = pairData.x;
/* Charge product: the .w component of xq holds the atomic charge. */
680 float qq = gm_xq[ai].w * gm_xq[aj].w;
681 float c6 = iparams[type].lj14.c6A;
682 float c12 = iparams[type].lj14.c12A;
684 /* Do we need to apply full periodic boundary conditions? */
686 int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
688 float r2 = norm2(dr);
689 float rinv = rsqrtf(r2);
690 float rinv2 = rinv * rinv;
691 float rinv6 = rinv2 * rinv2 * rinv2;
693 /* Calculate the Coulomb force * r */
694 float velec = scale_factor * qq * rinv;
696 /* Calculate the LJ force * r and add it to the Coulomb part */
697 float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
/* Divide F*r by r^2 to get the factor for the Cartesian force F/r * dr. */
699 float finvr = fr * rinv2;
700 float3 f = finvr * dr;
703 atomicAdd(&gm_f[ai], f);
704 atomicAdd(&gm_f[aj], -f);
705 if (calcVir && fshift_index != CENTRAL)
707 atomicAdd(&sm_fShiftLoc[fshift_index], f);
708 atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
/* LJ energy (c12/r^12 - c6/r^6) and Coulomb energy accumulation. */
713 *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
714 *vtotElec_loc += velec;
/* Fused bonded-forces kernel: each thread handles one interaction of
 * one function type. Thread ranges per fType are given by
 * kernelParams.fTypeRangeStart/End; a thread finds its type by scanning
 * the numFTypesOnGpu ranges and dispatches to the matching *_gpu device
 * function. Dynamic shared memory layout (carved from sm_dynamicShmem):
 * SHIFTS float3 shift-force accumulators, then (when energies are
 * needed) numWarps floats each for vTot, vTotVdw and vTotElec partials.
 * Requires a 1-D block (asserted) and, for the warp reductions,
 * blockDim.x a multiple of warpSize.
 * NOTE(review): braces, the fType/vtot_loc declarations, the switch on
 * fType, and the __syncthreads() before the shift-force writeback are
 * missing from this view. */
722 template<bool calcVir, bool calcEner>
723 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
725 assert(blockDim.y == 1 && blockDim.z == 1);
726 const int tid = blockIdx.x * blockDim.x + threadIdx.x;
728 float vtotVdw_loc = 0;
729 float vtotElec_loc = 0;
/* Carve the shift-force slot out of the dynamic shared-memory buffer. */
731 extern __shared__ char sm_dynamicShmem[];
732 char* sm_nextSlotPtr = sm_dynamicShmem;
733 float3* sm_fShiftLoc = (float3*)sm_nextSlotPtr;
734 sm_nextSlotPtr += SHIFTS * sizeof(float3);
/* First SHIFTS threads zero the shared shift-force accumulators. */
738 if (threadIdx.x < SHIFTS)
740 sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
746 bool threadComputedPotential = false;
748 for (int j = 0; j < numFTypesOnGpu; j++)
750 if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
752 const int numBonds = kernelParams.numFTypeBonds[j];
753 int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
754 const t_iatom* iatoms = kernelParams.d_iatoms[j];
755 fType = kernelParams.fTypesOnGpu[j];
758 threadComputedPotential = true;
/* Dispatch to the per-interaction-type device function. */
764 bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
765 kernelParams.d_forceParams, kernelParams.d_xq,
766 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
769 angles_gpu<calcVir, calcEner>(
770 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
771 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
774 urey_bradley_gpu<calcVir, calcEner>(
775 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
776 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
780 pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
781 kernelParams.d_forceParams, kernelParams.d_xq,
782 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
785 rbdihs_gpu<calcVir, calcEner>(
786 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
787 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
790 idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
791 kernelParams.d_forceParams, kernelParams.d_xq,
792 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
795 pairs_gpu<calcVir, calcEner>(
796 fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
797 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
798 kernelParams.electrostaticsScaleFactor, &vtotVdw_loc, &vtotElec_loc);
805 if (threadComputedPotential)
807 float* vtotVdw = kernelParams.d_vTot + F_LJ14;
808 float* vtotElec = kernelParams.d_vTot + F_COUL14;
810 // Stage atomic accumulation through shared memory:
811 // each warp will accumulate its own partial sum
812 // and then a single thread per warp will accumulate this to the global sum
814 int numWarps = blockDim.x / warpSize;
815 int warpId = threadIdx.x / warpSize;
817 // Shared memory variables to hold block-local partial sum
818 float* sm_vTot = (float*)sm_nextSlotPtr;
819 sm_nextSlotPtr += numWarps * sizeof(float);
820 float* sm_vTotVdw = (float*)sm_nextSlotPtr;
821 sm_nextSlotPtr += numWarps * sizeof(float);
822 float* sm_vTotElec = (float*)sm_nextSlotPtr;
824 if (threadIdx.x % warpSize == 0)
826 // One thread per warp initializes to zero
827 sm_vTot[warpId] = 0.;
828 sm_vTotVdw[warpId] = 0.;
829 sm_vTotElec[warpId] = 0.;
831 __syncwarp(); // All threads in warp must wait for initialization
833 // Perform warp-local accumulation in shared memory
834 atomicAdd(sm_vTot + warpId, vtot_loc);
835 atomicAdd(sm_vTotVdw + warpId, vtotVdw_loc);
836 atomicAdd(sm_vTotElec + warpId, vtotElec_loc);
838 __syncwarp(); // Ensure all threads in warp have completed
839 if (threadIdx.x % warpSize == 0)
840 { // One thread per warp accumulates partial sum into global sum
841 atomicAdd(kernelParams.d_vTot + fType, sm_vTot[warpId]);
842 atomicAdd(vtotVdw, sm_vTotVdw[warpId]);
843 atomicAdd(vtotElec, sm_vTotElec[warpId]);
846 /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
850 if (threadIdx.x < SHIFTS)
852 atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
858 /*-------------------------------- End CUDA kernels-----------------------------*/
/* Host-side launcher for exec_kernel_gpu<calcVir, calcEner>.
 * Asserts that bonded GPU work was scheduled, wraps the launch in
 * launch-GPU wallcycle counters, and returns early (without launching)
 * when the last fType range end is negative, i.e. there is no work.
 * NOTE(review): braces and the early-return body are missing from this
 * view of the file. */
861 template<bool calcVir, bool calcEner>
862 void GpuBonded::Impl::launchKernel()
864 GMX_ASSERT(haveInteractions_,
865 "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
867 wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
868 wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
/* End of the last populated thread range; negative means nothing to do. */
870 int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
872 if (fTypeRangeEnd < 0)
877 auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
879 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig_, &kernelParams_);
881 launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr,
882 "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
884 wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
885 wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
/* Public entry point: selects the kernel template instantiation from
 * the step workload. Energy implies virial (calcVir=calcEner=true);
 * virial-only uses <true, false>; otherwise forces only <false, false>.
 * NOTE(review): closing braces fall outside this view of the file. */
888 void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
890 if (stepWork.computeEnergy)
892 // When we need the energy, we also need the virial
893 impl_->launchKernel<true, true>();
895 else if (stepWork.computeVirial)
897 impl_->launchKernel<true, false>();
901 impl_->launchKernel<false, false>();