src/gromacs/listed_forces/gpubondedkernels.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements CUDA bonded functionality
  38  *
  39  * \author Jon Vincent <jvincent@nvidia.com>
  40  * \author Magnus Lundborg <lundborg.magnus@gmail.com>
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Szilárd Páll <pall.szilard@gmail.com>
  43  * \author Alan Gray <alang@nvidia.com>
  44  * \author Mark Abraham <mark.j.abraham@gmail.com>
  45  *
  46  * \ingroup module_listed_forces
  47  */
  48
  49 #include "gmxpre.h"
  50
  51 #include <cassert>
  52
  53 #include <math_constants.h>
  54
  55 #include "gromacs/gpu_utils/cudautils.cuh"
  56 #include "gromacs/gpu_utils/typecasts.cuh"
  57 #include "gromacs/gpu_utils/vectype_ops.cuh"
  58 #include "gromacs/listed_forces/gpubonded.h"
  59 #include "gromacs/math/units.h"
  60 #include "gromacs/mdlib/force_flags.h"
  61 #include "gromacs/mdtypes/interaction_const.h"
  62 #include "gromacs/mdtypes/simulation_workload.h"
  63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  64 #include "gromacs/timing/wallcycle.h"
  65 #include "gromacs/utility/gmxassert.h"
  66
  67 #include "gpubonded_impl.h"
  68
  69 #if defined(_MSVC)
  70 #    include <limits>
  71 #endif
  72
  73 /*-------------------------------- CUDA kernels-------------------------------- */
  74 /*------------------------------------------------------------------------------*/
  75
  76 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
  77
  78 /*---------------- BONDED CUDA kernels--------------*/
  79
  80 /* Harmonic */
  81 __device__ __forceinline__ static void
  82            harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
  83 {
  84     constexpr float half = 0.5f;
  85     float           dx, dx2;
  86
  87     dx  = x - xA;
  88     dx2 = dx * dx;
  89
  90     *F = -kA * dx;
  91     *V = half * kA * dx2;
  92 }
  93
  94 template<bool calcVir, bool calcEner>
  95 __device__ void bonds_gpu(const int       i,
  96                           float*          vtot_loc,
  97                           const int       numBonds,
  98                           const t_iatom   d_forceatoms[],
  99                           const t_iparams d_forceparams[],
 100                           const float4    gm_xq[],
 101                           float3          gm_f[],
 102                           float3          sm_fShiftLoc[],
 103                           const PbcAiuc   pbcAiuc)
 104 {
 105     if (i < numBonds)
 106     {
 107         int3 bondData = *(int3*)(d_forceatoms + 3 * i);
 108         int  type     = bondData.x;
 109         int  ai       = bondData.y;
 110         int  aj       = bondData.z;
 111
 112         /* dx = xi - xj, corrected for periodic boundary conditions. */
 113         float3 dx;
 114         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
 115
 116         float dr2 = norm2(dx);
 117         float dr  = sqrt(dr2);
 118
 119         float vbond;
 120         float fbond;
 121         harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
 122
 123         if (calcEner)
 124         {
 125             *vtot_loc += vbond;
 126         }
 127
 128         if (dr2 != 0.0f)
 129         {
 130             fbond *= rsqrtf(dr2);
 131
 132             float3 fij = fbond * dx;
 133             atomicAdd(&gm_f[ai], fij);
 134             atomicAdd(&gm_f[aj], -fij);
 135             if (calcVir && ki != CENTRAL)
 136             {
 137                 atomicAdd(&sm_fShiftLoc[ki], fij);
 138                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
 139             }
 140         }
 141     }
 142 }
 143
 144 template<bool returnShift>
 145 __device__ __forceinline__ static float bond_angle_gpu(const float4   xi,
 146                                                        const float4   xj,
 147                                                        const float4   xk,
 148                                                        const PbcAiuc& pbcAiuc,
 149                                                        float3*        r_ij,
 150                                                        float3*        r_kj,
 151                                                        float*         costh,
 152                                                        int*           t1,
 153                                                        int*           t2)
 154 /* Return value is the angle between the bonds i-j and j-k */
 155 {
 156     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
 157     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
 158
 159     *costh   = cos_angle(*r_ij, *r_kj);
 160     float th = acosf(*costh);
 161
 162     return th;
 163 }
 164
 165 template<bool calcVir, bool calcEner>
 166 __device__ void angles_gpu(const int       i,
 167                            float*          vtot_loc,
 168                            const int       numBonds,
 169                            const t_iatom   d_forceatoms[],
 170                            const t_iparams d_forceparams[],
 171                            const float4    gm_xq[],
 172                            float3          gm_f[],
 173                            float3          sm_fShiftLoc[],
 174                            const PbcAiuc   pbcAiuc)
 175 {
 176     if (i < numBonds)
 177     {
 178         int4 angleData = *(int4*)(d_forceatoms + 4 * i);
 179         int  type      = angleData.x;
 180         int  ai        = angleData.y;
 181         int  aj        = angleData.z;
 182         int  ak        = angleData.w;
 183
 184         float3 r_ij;
 185         float3 r_kj;
 186         float  cos_theta;
 187         int    t1;
 188         int    t2;
 189         float  theta = bond_angle_gpu<calcVir>(
 190                 gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij, &r_kj, &cos_theta, &t1, &t2);
 191
 192         float va;
 193         float dVdt;
 194         harmonic_gpu(d_forceparams[type].harmonic.krA,
 195                      d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F,
 196                      theta,
 197                      &va,
 198                      &dVdt);
 199
 200         if (calcEner)
 201         {
 202             *vtot_loc += va;
 203         }
 204
 205         float cos_theta2 = cos_theta * cos_theta;
 206         if (cos_theta2 < 1.0f)
 207         {
 208             float st    = dVdt * rsqrtf(1.0f - cos_theta2);
 209             float sth   = st * cos_theta;
 210             float nrij2 = norm2(r_ij);
 211             float nrkj2 = norm2(r_kj);
 212
 213             float nrij_1 = rsqrtf(nrij2);
 214             float nrkj_1 = rsqrtf(nrkj2);
 215
 216             float cik = st * nrij_1 * nrkj_1;
 217             float cii = sth * nrij_1 * nrij_1;
 218             float ckk = sth * nrkj_1 * nrkj_1;
 219
 220             float3 f_i = cii * r_ij - cik * r_kj;
 221             float3 f_k = ckk * r_kj - cik * r_ij;
 222             float3 f_j = -f_i - f_k;
 223
 224             atomicAdd(&gm_f[ai], f_i);
 225             atomicAdd(&gm_f[aj], f_j);
 226             atomicAdd(&gm_f[ak], f_k);
 227
 228             if (calcVir)
 229             {
 230                 atomicAdd(&sm_fShiftLoc[t1], f_i);
 231                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
 232                 atomicAdd(&sm_fShiftLoc[t2], f_k);
 233             }
 234         }
 235     }
 236 }
 237
 238 template<bool calcVir, bool calcEner>
 239 __device__ void urey_bradley_gpu(const int       i,
 240                                  float*          vtot_loc,
 241                                  const int       numBonds,
 242                                  const t_iatom   d_forceatoms[],
 243                                  const t_iparams d_forceparams[],
 244                                  const float4    gm_xq[],
 245                                  float3          gm_f[],
 246                                  float3          sm_fShiftLoc[],
 247                                  const PbcAiuc   pbcAiuc)
 248 {
 249     if (i < numBonds)
 250     {
 251         int4 ubData = *(int4*)(d_forceatoms + 4 * i);
 252         int  type   = ubData.x;
 253         int  ai     = ubData.y;
 254         int  aj     = ubData.z;
 255         int  ak     = ubData.w;
 256
 257         float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
 258         float kthA = d_forceparams[type].u_b.kthetaA;
 259         float r13A = d_forceparams[type].u_b.r13A;
 260         float kUBA = d_forceparams[type].u_b.kUBA;
 261
 262         float3 r_ij;
 263         float3 r_kj;
 264         float  cos_theta;
 265         int    t1;
 266         int    t2;
 267         float  theta = bond_angle_gpu<calcVir>(
 268                 gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij, &r_kj, &cos_theta, &t1, &t2);
 269
 270         float va;
 271         float dVdt;
 272         harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
 273
 274         if (calcEner)
 275         {
 276             *vtot_loc += va;
 277         }
 278
 279         float3 r_ik;
 280         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
 281
 282         float dr2 = norm2(r_ik);
 283         float dr  = dr2 * rsqrtf(dr2);
 284
 285         float vbond;
 286         float fbond;
 287         harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
 288
 289         float cos_theta2 = cos_theta * cos_theta;
 290         if (cos_theta2 < 1.0f)
 291         {
 292             float st  = dVdt * rsqrtf(1.0f - cos_theta2);
 293             float sth = st * cos_theta;
 294
 295             float nrkj2 = norm2(r_kj);
 296             float nrij2 = norm2(r_ij);
 297
 298             float cik = st * rsqrtf(nrkj2 * nrij2);
 299             float cii = sth / nrij2;
 300             float ckk = sth / nrkj2;
 301
 302             float3 f_i = cii * r_ij - cik * r_kj;
 303             float3 f_k = ckk * r_kj - cik * r_ij;
 304             float3 f_j = -f_i - f_k;
 305
 306             atomicAdd(&gm_f[ai], f_i);
 307             atomicAdd(&gm_f[aj], f_j);
 308             atomicAdd(&gm_f[ak], f_k);
 309
 310             if (calcVir)
 311             {
 312                 atomicAdd(&sm_fShiftLoc[t1], f_i);
 313                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
 314                 atomicAdd(&sm_fShiftLoc[t2], f_k);
 315             }
 316         }
 317
 318         /* Time for the bond calculations */
 319         if (dr2 != 0.0f)
 320         {
 321             if (calcEner)
 322             {
 323                 *vtot_loc += vbond;
 324             }
 325
 326             fbond *= rsqrtf(dr2);
 327
 328             float3 fik = fbond * r_ik;
 329             atomicAdd(&gm_f[ai], fik);
 330             atomicAdd(&gm_f[ak], -fik);
 331
 332             if (calcVir && ki != CENTRAL)
 333             {
 334                 atomicAdd(&sm_fShiftLoc[ki], fik);
 335                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
 336             }
 337         }
 338     }
 339 }
 340
 341 template<bool returnShift, typename T>
 342 __device__ __forceinline__ static float dih_angle_gpu(const T        xi,
 343                                                       const T        xj,
 344                                                       const T        xk,
 345                                                       const T        xl,
 346                                                       const PbcAiuc& pbcAiuc,
 347                                                       float3*        r_ij,
 348                                                       float3*        r_kj,
 349                                                       float3*        r_kl,
 350                                                       float3*        m,
 351                                                       float3*        n,
 352                                                       int*           t1,
 353                                                       int*           t2,
 354                                                       int*           t3)
 355 {
 356     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
 357     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
 358     *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
 359
 360     *m         = cprod(*r_ij, *r_kj);
 361     *n         = cprod(*r_kj, *r_kl);
 362     float phi  = gmx_angle(*m, *n);
 363     float ipr  = iprod(*r_ij, *n);
 364     float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
 365     phi        = sign * phi;
 366
 367     return phi;
 368 }
 369
 370
 371 __device__ __forceinline__ static void
 372            dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
 373 {
 374     float mdphi, sdphi;
 375
 376     mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
 377     sdphi = sinf(mdphi);
 378     *v    = cpA * (1.0f + cosf(mdphi));
 379     *f    = -cpA * mult * sdphi;
 380 }
 381
 382 template<bool calcVir>
 383 __device__ static void do_dih_fup_gpu(const int      i,
 384                                       const int      j,
 385                                       const int      k,
 386                                       const int      l,
 387                                       const float    ddphi,
 388                                       const float3   r_ij,
 389                                       const float3   r_kj,
 390                                       const float3   r_kl,
 391                                       const float3   m,
 392                                       const float3   n,
 393                                       float3         gm_f[],
 394                                       float3         sm_fShiftLoc[],
 395                                       const PbcAiuc& pbcAiuc,
 396                                       const float4   gm_xq[],
 397                                       const int      t1,
 398                                       const int      t2,
 399                                       const int gmx_unused t3)
 400 {
 401     float iprm  = norm2(m);
 402     float iprn  = norm2(n);
 403     float nrkj2 = norm2(r_kj);
 404     float toler = nrkj2 * GMX_REAL_EPS;
 405     if ((iprm > toler) && (iprn > toler))
 406     {
 407         float  nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
 408         float  nrkj_2 = nrkj_1 * nrkj_1;
 409         float  nrkj   = nrkj2 * nrkj_1;
 410         float  a      = -ddphi * nrkj / iprm;
 411         float3 f_i    = a * m;
 412         float  b      = ddphi * nrkj / iprn;
 413         float3 f_l    = b * n;
 414         float  p      = iprod(r_ij, r_kj);
 415         p *= nrkj_2;
 416         float q = iprod(r_kl, r_kj);
 417         q *= nrkj_2;
 418         float3 uvec = p * f_i;
 419         float3 vvec = q * f_l;
 420         float3 svec = uvec - vvec;
 421         float3 f_j  = f_i - svec;
 422         float3 f_k  = f_l + svec;
 423
 424         atomicAdd(&gm_f[i], f_i);
 425         atomicAdd(&gm_f[j], -f_j);
 426         atomicAdd(&gm_f[k], -f_k);
 427         atomicAdd(&gm_f[l], f_l);
 428
 429         if (calcVir)
 430         {
 431             float3 dx_jl;
 432             int    t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
 433
 434             atomicAdd(&sm_fShiftLoc[t1], f_i);
 435             atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
 436             atomicAdd(&sm_fShiftLoc[t2], -f_k);
 437             atomicAdd(&sm_fShiftLoc[t3], f_l);
 438         }
 439     }
 440 }
 441
 442 template<bool calcVir, bool calcEner>
 443 __device__ void pdihs_gpu(const int       i,
 444                           float*          vtot_loc,
 445                           const int       numBonds,
 446                           const t_iatom   d_forceatoms[],
 447                           const t_iparams d_forceparams[],
 448                           const float4    gm_xq[],
 449                           float3          gm_f[],
 450                           float3          sm_fShiftLoc[],
 451                           const PbcAiuc   pbcAiuc)
 452 {
 453     if (i < numBonds)
 454     {
 455         int type = d_forceatoms[5 * i];
 456         int ai   = d_forceatoms[5 * i + 1];
 457         int aj   = d_forceatoms[5 * i + 2];
 458         int ak   = d_forceatoms[5 * i + 3];
 459         int al   = d_forceatoms[5 * i + 4];
 460
 461         float3 r_ij;
 462         float3 r_kj;
 463         float3 r_kl;
 464         float3 m;
 465         float3 n;
 466         int    t1;
 467         int    t2;
 468         int    t3;
 469         float  phi = dih_angle_gpu<calcVir>(
 470                 gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc, &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 471
 472         float vpd;
 473         float ddphi;
 474         dopdihs_gpu(d_forceparams[type].pdihs.cpA,
 475                     d_forceparams[type].pdihs.phiA,
 476                     d_forceparams[type].pdihs.mult,
 477                     phi,
 478                     &vpd,
 479                     &ddphi);
 480
 481         if (calcEner)
 482         {
 483             *vtot_loc += vpd;
 484         }
 485
 486         do_dih_fup_gpu<calcVir>(
 487                 ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc, pbcAiuc, gm_xq, t1, t2, t3);
 488     }
 489 }
 490
 491 template<bool calcVir, bool calcEner>
 492 __device__ void rbdihs_gpu(const int       i,
 493                            float*          vtot_loc,
 494                            const int       numBonds,
 495                            const t_iatom   d_forceatoms[],
 496                            const t_iparams d_forceparams[],
 497                            const float4    gm_xq[],
 498                            float3          gm_f[],
 499                            float3          sm_fShiftLoc[],
 500                            const PbcAiuc   pbcAiuc)
 501 {
 502     constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 503
 504     if (i < numBonds)
 505     {
 506         int type = d_forceatoms[5 * i];
 507         int ai   = d_forceatoms[5 * i + 1];
 508         int aj   = d_forceatoms[5 * i + 2];
 509         int ak   = d_forceatoms[5 * i + 3];
 510         int al   = d_forceatoms[5 * i + 4];
 511
 512         float3 r_ij;
 513         float3 r_kj;
 514         float3 r_kl;
 515         float3 m;
 516         float3 n;
 517         int    t1;
 518         int    t2;
 519         int    t3;
 520         float  phi = dih_angle_gpu<calcVir>(
 521                 gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc, &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 522
 523         /* Change to polymer convention */
 524         if (phi < c0)
 525         {
 526             phi += CUDART_PI_F;
 527         }
 528         else
 529         {
 530             phi -= CUDART_PI_F;
 531         }
 532         float cos_phi = cosf(phi);
 533         /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
 534         float sin_phi = sinf(phi);
 535
 536         float parm[NR_RBDIHS];
 537         for (int j = 0; j < NR_RBDIHS; j++)
 538         {
 539             parm[j] = d_forceparams[type].rbdihs.rbcA[j];
 540         }
 541         /* Calculate cosine powers */
 542         /* Calculate the energy */
 543         /* Calculate the derivative */
 544         float v      = parm[0];
 545         float ddphi  = c0;
 546         float cosfac = c1;
 547
 548         float rbp = parm[1];
 549         ddphi += rbp * cosfac;
 550         cosfac *= cos_phi;
 551         if (calcEner)
 552         {
 553             v += cosfac * rbp;
 554         }
 555         rbp = parm[2];
 556         ddphi += c2 * rbp * cosfac;
 557         cosfac *= cos_phi;
 558         if (calcEner)
 559         {
 560             v += cosfac * rbp;
 561         }
 562         rbp = parm[3];
 563         ddphi += c3 * rbp * cosfac;
 564         cosfac *= cos_phi;
 565         if (calcEner)
 566         {
 567             v += cosfac * rbp;
 568         }
 569         rbp = parm[4];
 570         ddphi += c4 * rbp * cosfac;
 571         cosfac *= cos_phi;
 572         if (calcEner)
 573         {
 574             v += cosfac * rbp;
 575         }
 576         rbp = parm[5];
 577         ddphi += c5 * rbp * cosfac;
 578         cosfac *= cos_phi;
 579         if (calcEner)
 580         {
 581             v += cosfac * rbp;
 582         }
 583
 584         ddphi = -ddphi * sin_phi;
 585
 586         do_dih_fup_gpu<calcVir>(
 587                 ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc, pbcAiuc, gm_xq, t1, t2, t3);
 588         if (calcEner)
 589         {
 590             *vtot_loc += v;
 591         }
 592     }
 593 }
 594
 595 __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
 596 {
 597     /* dp cannot be outside (-pi,pi) */
 598     if (*dp >= CUDART_PI_F)
 599     {
 600         *dp -= 2.0f * CUDART_PI_F;
 601     }
 602     else if (*dp < -CUDART_PI_F)
 603     {
 604         *dp += 2.0f * CUDART_PI_F;
 605     }
 606 }
 607
 608 template<bool calcVir, bool calcEner>
 609 __device__ void idihs_gpu(const int       i,
 610                           float*          vtot_loc,
 611                           const int       numBonds,
 612                           const t_iatom   d_forceatoms[],
 613                           const t_iparams d_forceparams[],
 614                           const float4    gm_xq[],
 615                           float3          gm_f[],
 616                           float3          sm_fShiftLoc[],
 617                           const PbcAiuc   pbcAiuc)
 618 {
 619     if (i < numBonds)
 620     {
 621         int type = d_forceatoms[5 * i];
 622         int ai   = d_forceatoms[5 * i + 1];
 623         int aj   = d_forceatoms[5 * i + 2];
 624         int ak   = d_forceatoms[5 * i + 3];
 625         int al   = d_forceatoms[5 * i + 4];
 626
 627         float3 r_ij;
 628         float3 r_kj;
 629         float3 r_kl;
 630         float3 m;
 631         float3 n;
 632         int    t1;
 633         int    t2;
 634         int    t3;
 635         float  phi = dih_angle_gpu<calcVir>(
 636                 gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc, &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 637
 638         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
 639          * force changes if we just apply a normal harmonic.
 640          * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
 641          * This means we will never have the periodicity problem, unless
 642          * the dihedral is Pi away from phiO, which is very unlikely due to
 643          * the potential.
 644          */
 645         float kA = d_forceparams[type].harmonic.krA;
 646         float pA = d_forceparams[type].harmonic.rA;
 647
 648         float phi0 = pA * CUDA_DEG2RAD_F;
 649
 650         float dp = phi - phi0;
 651
 652         make_dp_periodic_gpu(&dp);
 653
 654         float ddphi = -kA * dp;
 655
 656         do_dih_fup_gpu<calcVir>(
 657                 ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc, pbcAiuc, gm_xq, t1, t2, t3);
 658
 659         if (calcEner)
 660         {
 661             *vtot_loc += -0.5f * ddphi * dp;
 662         }
 663     }
 664 }
 665
 666 template<bool calcVir, bool calcEner>
 667 __device__ void pairs_gpu(const int       i,
 668                           const int       numBonds,
 669                           const t_iatom   d_forceatoms[],
 670                           const t_iparams iparams[],
 671                           const float4    gm_xq[],
 672                           float3          gm_f[],
 673                           float3          sm_fShiftLoc[],
 674                           const PbcAiuc   pbcAiuc,
 675                           const float     scale_factor,
 676                           float*          vtotVdw_loc,
 677                           float*          vtotElec_loc)
 678 {
 679     if (i < numBonds)
 680     {
 681         // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
 682         int3 pairData = *(int3*)(d_forceatoms + 3 * i);
 683         int  type     = pairData.x;
 684         int  ai       = pairData.y;
 685         int  aj       = pairData.z;
 686
 687         float qq  = gm_xq[ai].w * gm_xq[aj].w;
 688         float c6  = iparams[type].lj14.c6A;
 689         float c12 = iparams[type].lj14.c12A;
 690
 691         /* Do we need to apply full periodic boundary conditions? */
 692         float3 dr;
 693         int    fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
 694
 695         float r2    = norm2(dr);
 696         float rinv  = rsqrtf(r2);
 697         float rinv2 = rinv * rinv;
 698         float rinv6 = rinv2 * rinv2 * rinv2;
 699
 700         /* Calculate the Coulomb force * r */
 701         float velec = scale_factor * qq * rinv;
 702
 703         /* Calculate the LJ force * r and add it to the Coulomb part */
 704         float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
 705
 706         float  finvr = fr * rinv2;
 707         float3 f     = finvr * dr;
 708
 709         /* Add the forces */
 710         atomicAdd(&gm_f[ai], f);
 711         atomicAdd(&gm_f[aj], -f);
 712         if (calcVir && fshift_index != CENTRAL)
 713         {
 714             atomicAdd(&sm_fShiftLoc[fshift_index], f);
 715             atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
 716         }
 717
 718         if (calcEner)
 719         {
 720             *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
 721             *vtotElec_loc += velec;
 722         }
 723     }
 724 }
 725
 726 namespace gmx
 727 {
 728
 729 template<bool calcVir, bool calcEner>
 730 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 731 {
 732     assert(blockDim.y == 1 && blockDim.z == 1);
 733     const int tid          = blockIdx.x * blockDim.x + threadIdx.x;
 734     float     vtot_loc     = 0;
 735     float     vtotVdw_loc  = 0;
 736     float     vtotElec_loc = 0;
 737
 738     extern __shared__ char sm_dynamicShmem[];
 739     char*                  sm_nextSlotPtr = sm_dynamicShmem;
 740     float3*                sm_fShiftLoc   = (float3*)sm_nextSlotPtr;
 741     sm_nextSlotPtr += SHIFTS * sizeof(float3);
 742
 743     if (calcVir)
 744     {
 745         if (threadIdx.x < SHIFTS)
 746         {
 747             sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
 748         }
 749         __syncthreads();
 750     }
 751
 752     int  fType;
 753     bool threadComputedPotential = false;
 754 #pragma unroll
 755     for (int j = 0; j < numFTypesOnGpu; j++)
 756     {
 757         if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
 758         {
 759             const int      numBonds = kernelParams.numFTypeBonds[j];
 760             int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
 761             const t_iatom* iatoms   = kernelParams.d_iatoms[j];
 762             fType                   = kernelParams.fTypesOnGpu[j];
 763             if (calcEner)
 764             {
 765                 threadComputedPotential = true;
 766             }
 767
 768             switch (fType)
 769             {
 770                 case F_BONDS:
 771                     bonds_gpu<calcVir, calcEner>(fTypeTid,
 772                                                  &vtot_loc,
 773                                                  numBonds,
 774                                                  iatoms,
 775                                                  kernelParams.d_forceParams,
 776                                                  kernelParams.d_xq,
 777                                                  kernelParams.d_f,
 778                                                  sm_fShiftLoc,
 779                                                  kernelParams.pbcAiuc);
 780                     break;
 781                 case F_ANGLES:
 782                     angles_gpu<calcVir, calcEner>(fTypeTid,
 783                                                   &vtot_loc,
 784                                                   numBonds,
 785                                                   iatoms,
 786                                                   kernelParams.d_forceParams,
 787                                                   kernelParams.d_xq,
 788                                                   kernelParams.d_f,
 789                                                   sm_fShiftLoc,
 790                                                   kernelParams.pbcAiuc);
 791                     break;
 792                 case F_UREY_BRADLEY:
 793                     urey_bradley_gpu<calcVir, calcEner>(fTypeTid,
 794                                                         &vtot_loc,
 795                                                         numBonds,
 796                                                         iatoms,
 797                                                         kernelParams.d_forceParams,
 798                                                         kernelParams.d_xq,
 799                                                         kernelParams.d_f,
 800                                                         sm_fShiftLoc,
 801                                                         kernelParams.pbcAiuc);
 802                     break;
 803                 case F_PDIHS:
 804                 case F_PIDIHS:
 805                     pdihs_gpu<calcVir, calcEner>(fTypeTid,
 806                                                  &vtot_loc,
 807                                                  numBonds,
 808                                                  iatoms,
 809                                                  kernelParams.d_forceParams,
 810                                                  kernelParams.d_xq,
 811                                                  kernelParams.d_f,
 812                                                  sm_fShiftLoc,
 813                                                  kernelParams.pbcAiuc);
 814                     break;
 815                 case F_RBDIHS:
 816                     rbdihs_gpu<calcVir, calcEner>(fTypeTid,
 817                                                   &vtot_loc,
 818                                                   numBonds,
 819                                                   iatoms,
 820                                                   kernelParams.d_forceParams,
 821                                                   kernelParams.d_xq,
 822                                                   kernelParams.d_f,
 823                                                   sm_fShiftLoc,
 824                                                   kernelParams.pbcAiuc);
 825                     break;
 826                 case F_IDIHS:
 827                     idihs_gpu<calcVir, calcEner>(fTypeTid,
 828                                                  &vtot_loc,
 829                                                  numBonds,
 830                                                  iatoms,
 831                                                  kernelParams.d_forceParams,
 832                                                  kernelParams.d_xq,
 833                                                  kernelParams.d_f,
 834                                                  sm_fShiftLoc,
 835                                                  kernelParams.pbcAiuc);
 836                     break;
 837                 case F_LJ14:
 838                     pairs_gpu<calcVir, calcEner>(fTypeTid,
 839                                                  numBonds,
 840                                                  iatoms,
 841                                                  kernelParams.d_forceParams,
 842                                                  kernelParams.d_xq,
 843                                                  kernelParams.d_f,
 844                                                  sm_fShiftLoc,
 845                                                  kernelParams.pbcAiuc,
 846                                                  kernelParams.electrostaticsScaleFactor,
 847                                                  &vtotVdw_loc,
 848                                                  &vtotElec_loc);
 849                     break;
 850             }
 851             break;
 852         }
 853     }
 854
 855     if (threadComputedPotential)
 856     {
 857         float* vtotVdw  = kernelParams.d_vTot + F_LJ14;
 858         float* vtotElec = kernelParams.d_vTot + F_COUL14;
 859
 860         // Stage atomic accumulation through shared memory:
 861         // each warp will accumulate its own partial sum
 862         // and then a single thread per warp will accumulate this to the global sum
 863
 864         int numWarps = blockDim.x / warpSize;
 865         int warpId   = threadIdx.x / warpSize;
 866
 867         // Shared memory variables to hold block-local partial sum
 868         float* sm_vTot = (float*)sm_nextSlotPtr;
 869         sm_nextSlotPtr += numWarps * sizeof(float);
 870         float* sm_vTotVdw = (float*)sm_nextSlotPtr;
 871         sm_nextSlotPtr += numWarps * sizeof(float);
 872         float* sm_vTotElec = (float*)sm_nextSlotPtr;
 873
 874         if (threadIdx.x % warpSize == 0)
 875         {
 876             // One thread per warp initializes to zero
 877             sm_vTot[warpId]     = 0.;
 878             sm_vTotVdw[warpId]  = 0.;
 879             sm_vTotElec[warpId] = 0.;
 880         }
 881         __syncwarp(); // All threads in warp must wait for initialization
 882
 883         // Perform warp-local accumulation in shared memory
 884         atomicAdd(sm_vTot + warpId, vtot_loc);
 885         atomicAdd(sm_vTotVdw + warpId, vtotVdw_loc);
 886         atomicAdd(sm_vTotElec + warpId, vtotElec_loc);
 887
 888         __syncwarp(); // Ensure all threads in warp have completed
 889         if (threadIdx.x % warpSize == 0)
 890         { // One thread per warp accumulates partial sum into global sum
 891             atomicAdd(kernelParams.d_vTot + fType, sm_vTot[warpId]);
 892             atomicAdd(vtotVdw, sm_vTotVdw[warpId]);
 893             atomicAdd(vtotElec, sm_vTotElec[warpId]);
 894         }
 895     }
 896     /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
 897     if (calcVir)
 898     {
 899         __syncthreads();
 900         if (threadIdx.x < SHIFTS)
 901         {
 902             atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
 903         }
 904     }
 905 }
 906
 907
 908 /*-------------------------------- End CUDA kernels-----------------------------*/
 909
 910
 911 template<bool calcVir, bool calcEner>
 912 void GpuBonded::Impl::launchKernel()
 913 {
 914     GMX_ASSERT(haveInteractions_,
 915                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
 916
 917     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
 918     wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
 919
 920     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 921
 922     if (fTypeRangeEnd < 0)
 923     {
 924         return;
 925     }
 926
 927     auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
 928
 929     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig_, &kernelParams_);
 930
 931     launchGpuKernel(kernelPtr,
 932                     kernelLaunchConfig_,
 933                     deviceStream_,
 934                     nullptr,
 935                     "exec_kernel_gpu<calcVir, calcEner>",
 936                     kernelArgs);
 937
 938     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
 939     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 940 }
 941
 942 void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
 943 {
 944     if (stepWork.computeEnergy)
 945     {
 946         // When we need the energy, we also need the virial
 947         impl_->launchKernel<true, true>();
 948     }
 949     else if (stepWork.computeVirial)
 950     {
 951         impl_->launchKernel<true, false>();
 952     }
 953     else
 954     {
 955         impl_->launchKernel<false, false>();
 956     }
 957 }
 958
 959 } // namespace gmx