src/gromacs/listed_forces/gpubondedkernels.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements CUDA bonded functionality
  38  *
  39  * \author Jon Vincent <jvincent@nvidia.com>
  40  * \author Magnus Lundborg <lundborg.magnus@gmail.com>
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Szilárd Páll <pall.szilard@gmail.com>
  43  * \author Alan Gray <alang@nvidia.com>
  44  * \author Mark Abraham <mark.j.abraham@gmail.com>
  45  *
  46  * \ingroup module_listed_forces
  47  */
  48
  49 #include "gmxpre.h"
  50
  51 #include <cassert>
  52
  53 #include <math_constants.h>
  54
  55 #include "gromacs/gpu_utils/cudautils.cuh"
  56 #include "gromacs/gpu_utils/typecasts.cuh"
  57 #include "gromacs/gpu_utils/vectype_ops.cuh"
  58 #include "gromacs/listed_forces/gpubonded.h"
  59 #include "gromacs/math/units.h"
  60 #include "gromacs/mdlib/force_flags.h"
  61 #include "gromacs/mdtypes/forcerec.h"
  62 #include "gromacs/mdtypes/interaction_const.h"
  63 #include "gromacs/mdtypes/simulation_workload.h"
  64 #include "gromacs/pbcutil/pbc.h"
  65 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  66 #include "gromacs/utility/gmxassert.h"
  67
  68 #include "gpubonded_impl.h"
  69
  70 #if defined(_MSVC)
  71 #    include <limits>
  72 #endif
  73
  74 // CUDA threads per block
  75 #define TPB_BONDED 256
  76
  77 /*-------------------------------- CUDA kernels-------------------------------- */
  78 /*------------------------------------------------------------------------------*/
  79
  80 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
  81
  82 /*---------------- BONDED CUDA kernels--------------*/
  83
  84 /* Harmonic */
  85 __device__ __forceinline__ static void
  86            harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
  87 {
  88     constexpr float half = 0.5f;
  89     float           dx, dx2;
  90
  91     dx  = x - xA;
  92     dx2 = dx * dx;
  93
  94     *F = -kA * dx;
  95     *V = half * kA * dx2;
  96 }
  97
  98 template<bool calcVir, bool calcEner>
  99 __device__ void bonds_gpu(const int       i,
 100                           float*          vtot_loc,
 101                           const int       numBonds,
 102                           const t_iatom   d_forceatoms[],
 103                           const t_iparams d_forceparams[],
 104                           const float4    gm_xq[],
 105                           float3          gm_f[],
 106                           float3          sm_fShiftLoc[],
 107                           const PbcAiuc   pbcAiuc)
 108 {
 109     if (i < numBonds)
 110     {
 111         int3 bondData = *(int3*)(d_forceatoms + 3 * i);
 112         int  type     = bondData.x;
 113         int  ai       = bondData.y;
 114         int  aj       = bondData.z;
 115
 116         /* dx = xi - xj, corrected for periodic boundary conditions. */
 117         float3 dx;
 118         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
 119
 120         float dr2 = norm2(dx);
 121         float dr  = sqrt(dr2);
 122
 123         float vbond;
 124         float fbond;
 125         harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
 126
 127         if (calcEner)
 128         {
 129             *vtot_loc += vbond;
 130         }
 131
 132         if (dr2 != 0.0f)
 133         {
 134             fbond *= rsqrtf(dr2);
 135
 136             float3 fij = fbond * dx;
 137             atomicAdd(&gm_f[ai], fij);
 138             atomicAdd(&gm_f[aj], -fij);
 139             if (calcVir && ki != CENTRAL)
 140             {
 141                 atomicAdd(&sm_fShiftLoc[ki], fij);
 142                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
 143             }
 144         }
 145     }
 146 }
 147
 148 template<bool returnShift>
 149 __device__ __forceinline__ static float bond_angle_gpu(const float4   xi,
 150                                                        const float4   xj,
 151                                                        const float4   xk,
 152                                                        const PbcAiuc& pbcAiuc,
 153                                                        float3*        r_ij,
 154                                                        float3*        r_kj,
 155                                                        float*         costh,
 156                                                        int*           t1,
 157                                                        int*           t2)
 158 /* Return value is the angle between the bonds i-j and j-k */
 159 {
 160     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
 161     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
 162
 163     *costh   = cos_angle(*r_ij, *r_kj);
 164     float th = acosf(*costh);
 165
 166     return th;
 167 }
 168
 169 template<bool calcVir, bool calcEner>
 170 __device__ void angles_gpu(const int       i,
 171                            float*          vtot_loc,
 172                            const int       numBonds,
 173                            const t_iatom   d_forceatoms[],
 174                            const t_iparams d_forceparams[],
 175                            const float4    gm_xq[],
 176                            float3          gm_f[],
 177                            float3          sm_fShiftLoc[],
 178                            const PbcAiuc   pbcAiuc)
 179 {
 180     if (i < numBonds)
 181     {
 182         int4 angleData = *(int4*)(d_forceatoms + 4 * i);
 183         int  type      = angleData.x;
 184         int  ai        = angleData.y;
 185         int  aj        = angleData.z;
 186         int  ak        = angleData.w;
 187
 188         float3 r_ij;
 189         float3 r_kj;
 190         float  cos_theta;
 191         int    t1;
 192         int    t2;
 193         float  theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
 194                                               &r_kj, &cos_theta, &t1, &t2);
 195
 196         float va;
 197         float dVdt;
 198         harmonic_gpu(d_forceparams[type].harmonic.krA,
 199                      d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F, theta, &va, &dVdt);
 200
 201         if (calcEner)
 202         {
 203             *vtot_loc += va;
 204         }
 205
 206         float cos_theta2 = cos_theta * cos_theta;
 207         if (cos_theta2 < 1.0f)
 208         {
 209             float st    = dVdt * rsqrtf(1.0f - cos_theta2);
 210             float sth   = st * cos_theta;
 211             float nrij2 = norm2(r_ij);
 212             float nrkj2 = norm2(r_kj);
 213
 214             float nrij_1 = rsqrtf(nrij2);
 215             float nrkj_1 = rsqrtf(nrkj2);
 216
 217             float cik = st * nrij_1 * nrkj_1;
 218             float cii = sth * nrij_1 * nrij_1;
 219             float ckk = sth * nrkj_1 * nrkj_1;
 220
 221             float3 f_i = cii * r_ij - cik * r_kj;
 222             float3 f_k = ckk * r_kj - cik * r_ij;
 223             float3 f_j = -f_i - f_k;
 224
 225             atomicAdd(&gm_f[ai], f_i);
 226             atomicAdd(&gm_f[aj], f_j);
 227             atomicAdd(&gm_f[ak], f_k);
 228
 229             if (calcVir)
 230             {
 231                 atomicAdd(&sm_fShiftLoc[t1], f_i);
 232                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
 233                 atomicAdd(&sm_fShiftLoc[t2], f_k);
 234             }
 235         }
 236     }
 237 }
 238
 239 template<bool calcVir, bool calcEner>
 240 __device__ void urey_bradley_gpu(const int       i,
 241                                  float*          vtot_loc,
 242                                  const int       numBonds,
 243                                  const t_iatom   d_forceatoms[],
 244                                  const t_iparams d_forceparams[],
 245                                  const float4    gm_xq[],
 246                                  float3          gm_f[],
 247                                  float3          sm_fShiftLoc[],
 248                                  const PbcAiuc   pbcAiuc)
 249 {
 250     if (i < numBonds)
 251     {
 252         int4 ubData = *(int4*)(d_forceatoms + 4 * i);
 253         int  type   = ubData.x;
 254         int  ai     = ubData.y;
 255         int  aj     = ubData.z;
 256         int  ak     = ubData.w;
 257
 258         float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
 259         float kthA = d_forceparams[type].u_b.kthetaA;
 260         float r13A = d_forceparams[type].u_b.r13A;
 261         float kUBA = d_forceparams[type].u_b.kUBA;
 262
 263         float3 r_ij;
 264         float3 r_kj;
 265         float  cos_theta;
 266         int    t1;
 267         int    t2;
 268         float  theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
 269                                               &r_kj, &cos_theta, &t1, &t2);
 270
 271         float va;
 272         float dVdt;
 273         harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
 274
 275         if (calcEner)
 276         {
 277             *vtot_loc += va;
 278         }
 279
 280         float3 r_ik;
 281         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
 282
 283         float dr2 = norm2(r_ik);
 284         float dr  = dr2 * rsqrtf(dr2);
 285
 286         float vbond;
 287         float fbond;
 288         harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
 289
 290         float cos_theta2 = cos_theta * cos_theta;
 291         if (cos_theta2 < 1.0f)
 292         {
 293             float st  = dVdt * rsqrtf(1.0f - cos_theta2);
 294             float sth = st * cos_theta;
 295
 296             float nrkj2 = norm2(r_kj);
 297             float nrij2 = norm2(r_ij);
 298
 299             float cik = st * rsqrtf(nrkj2 * nrij2);
 300             float cii = sth / nrij2;
 301             float ckk = sth / nrkj2;
 302
 303             float3 f_i = cii * r_ij - cik * r_kj;
 304             float3 f_k = ckk * r_kj - cik * r_ij;
 305             float3 f_j = -f_i - f_k;
 306
 307             atomicAdd(&gm_f[ai], f_i);
 308             atomicAdd(&gm_f[aj], f_j);
 309             atomicAdd(&gm_f[ak], f_k);
 310
 311             if (calcVir)
 312             {
 313                 atomicAdd(&sm_fShiftLoc[t1], f_i);
 314                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
 315                 atomicAdd(&sm_fShiftLoc[t2], f_k);
 316             }
 317         }
 318
 319         /* Time for the bond calculations */
 320         if (dr2 != 0.0f)
 321         {
 322             if (calcEner)
 323             {
 324                 *vtot_loc += vbond;
 325             }
 326
 327             fbond *= rsqrtf(dr2);
 328
 329             float3 fik = fbond * r_ik;
 330             atomicAdd(&gm_f[ai], fik);
 331             atomicAdd(&gm_f[ak], -fik);
 332
 333             if (calcVir && ki != CENTRAL)
 334             {
 335                 atomicAdd(&sm_fShiftLoc[ki], fik);
 336                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
 337             }
 338         }
 339     }
 340 }
 341
 342 template<bool returnShift, typename T>
 343 __device__ __forceinline__ static float dih_angle_gpu(const T        xi,
 344                                                       const T        xj,
 345                                                       const T        xk,
 346                                                       const T        xl,
 347                                                       const PbcAiuc& pbcAiuc,
 348                                                       float3*        r_ij,
 349                                                       float3*        r_kj,
 350                                                       float3*        r_kl,
 351                                                       float3*        m,
 352                                                       float3*        n,
 353                                                       int*           t1,
 354                                                       int*           t2,
 355                                                       int*           t3)
 356 {
 357     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
 358     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
 359     *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
 360
 361     *m         = cprod(*r_ij, *r_kj);
 362     *n         = cprod(*r_kj, *r_kl);
 363     float phi  = gmx_angle(*m, *n);
 364     float ipr  = iprod(*r_ij, *n);
 365     float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
 366     phi        = sign * phi;
 367
 368     return phi;
 369 }
 370
 371
 372 __device__ __forceinline__ static void
 373            dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
 374 {
 375     float mdphi, sdphi;
 376
 377     mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
 378     sdphi = sinf(mdphi);
 379     *v    = cpA * (1.0f + cosf(mdphi));
 380     *f    = -cpA * mult * sdphi;
 381 }
 382
 383 template<bool calcVir>
 384 __device__ static void do_dih_fup_gpu(const int      i,
 385                                       const int      j,
 386                                       const int      k,
 387                                       const int      l,
 388                                       const float    ddphi,
 389                                       const float3   r_ij,
 390                                       const float3   r_kj,
 391                                       const float3   r_kl,
 392                                       const float3   m,
 393                                       const float3   n,
 394                                       float3         gm_f[],
 395                                       float3         sm_fShiftLoc[],
 396                                       const PbcAiuc& pbcAiuc,
 397                                       const float4   gm_xq[],
 398                                       const int      t1,
 399                                       const int      t2,
 400                                       const int gmx_unused t3)
 401 {
 402     float iprm  = norm2(m);
 403     float iprn  = norm2(n);
 404     float nrkj2 = norm2(r_kj);
 405     float toler = nrkj2 * GMX_REAL_EPS;
 406     if ((iprm > toler) && (iprn > toler))
 407     {
 408         float  nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
 409         float  nrkj_2 = nrkj_1 * nrkj_1;
 410         float  nrkj   = nrkj2 * nrkj_1;
 411         float  a      = -ddphi * nrkj / iprm;
 412         float3 f_i    = a * m;
 413         float  b      = ddphi * nrkj / iprn;
 414         float3 f_l    = b * n;
 415         float  p      = iprod(r_ij, r_kj);
 416         p *= nrkj_2;
 417         float q = iprod(r_kl, r_kj);
 418         q *= nrkj_2;
 419         float3 uvec = p * f_i;
 420         float3 vvec = q * f_l;
 421         float3 svec = uvec - vvec;
 422         float3 f_j  = f_i - svec;
 423         float3 f_k  = f_l + svec;
 424
 425         atomicAdd(&gm_f[i], f_i);
 426         atomicAdd(&gm_f[j], -f_j);
 427         atomicAdd(&gm_f[k], -f_k);
 428         atomicAdd(&gm_f[l], f_l);
 429
 430         if (calcVir)
 431         {
 432             float3 dx_jl;
 433             int    t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
 434
 435             atomicAdd(&sm_fShiftLoc[t1], f_i);
 436             atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
 437             atomicAdd(&sm_fShiftLoc[t2], -f_k);
 438             atomicAdd(&sm_fShiftLoc[t3], f_l);
 439         }
 440     }
 441 }
 442
 443 template<bool calcVir, bool calcEner>
 444 __device__ void pdihs_gpu(const int       i,
 445                           float*          vtot_loc,
 446                           const int       numBonds,
 447                           const t_iatom   d_forceatoms[],
 448                           const t_iparams d_forceparams[],
 449                           const float4    gm_xq[],
 450                           float3          gm_f[],
 451                           float3          sm_fShiftLoc[],
 452                           const PbcAiuc   pbcAiuc)
 453 {
 454     if (i < numBonds)
 455     {
 456         int type = d_forceatoms[5 * i];
 457         int ai   = d_forceatoms[5 * i + 1];
 458         int aj   = d_forceatoms[5 * i + 2];
 459         int ak   = d_forceatoms[5 * i + 3];
 460         int al   = d_forceatoms[5 * i + 4];
 461
 462         float3 r_ij;
 463         float3 r_kj;
 464         float3 r_kl;
 465         float3 m;
 466         float3 n;
 467         int    t1;
 468         int    t2;
 469         int    t3;
 470         float  phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
 471                                            &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 472
 473         float vpd;
 474         float ddphi;
 475         dopdihs_gpu(d_forceparams[type].pdihs.cpA, d_forceparams[type].pdihs.phiA,
 476                     d_forceparams[type].pdihs.mult, phi, &vpd, &ddphi);
 477
 478         if (calcEner)
 479         {
 480             *vtot_loc += vpd;
 481         }
 482
 483         do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
 484                                 pbcAiuc, gm_xq, t1, t2, t3);
 485     }
 486 }
 487
 488 template<bool calcVir, bool calcEner>
 489 __device__ void rbdihs_gpu(const int       i,
 490                            float*          vtot_loc,
 491                            const int       numBonds,
 492                            const t_iatom   d_forceatoms[],
 493                            const t_iparams d_forceparams[],
 494                            const float4    gm_xq[],
 495                            float3          gm_f[],
 496                            float3          sm_fShiftLoc[],
 497                            const PbcAiuc   pbcAiuc)
 498 {
 499     constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 500
 501     if (i < numBonds)
 502     {
 503         int type = d_forceatoms[5 * i];
 504         int ai   = d_forceatoms[5 * i + 1];
 505         int aj   = d_forceatoms[5 * i + 2];
 506         int ak   = d_forceatoms[5 * i + 3];
 507         int al   = d_forceatoms[5 * i + 4];
 508
 509         float3 r_ij;
 510         float3 r_kj;
 511         float3 r_kl;
 512         float3 m;
 513         float3 n;
 514         int    t1;
 515         int    t2;
 516         int    t3;
 517         float  phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
 518                                            &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 519
 520         /* Change to polymer convention */
 521         if (phi < c0)
 522         {
 523             phi += CUDART_PI_F;
 524         }
 525         else
 526         {
 527             phi -= CUDART_PI_F;
 528         }
 529         float cos_phi = cosf(phi);
 530         /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
 531         float sin_phi = sinf(phi);
 532
 533         float parm[NR_RBDIHS];
 534         for (int j = 0; j < NR_RBDIHS; j++)
 535         {
 536             parm[j] = d_forceparams[type].rbdihs.rbcA[j];
 537         }
 538         /* Calculate cosine powers */
 539         /* Calculate the energy */
 540         /* Calculate the derivative */
 541         float v      = parm[0];
 542         float ddphi  = c0;
 543         float cosfac = c1;
 544
 545         float rbp = parm[1];
 546         ddphi += rbp * cosfac;
 547         cosfac *= cos_phi;
 548         if (calcEner)
 549         {
 550             v += cosfac * rbp;
 551         }
 552         rbp = parm[2];
 553         ddphi += c2 * rbp * cosfac;
 554         cosfac *= cos_phi;
 555         if (calcEner)
 556         {
 557             v += cosfac * rbp;
 558         }
 559         rbp = parm[3];
 560         ddphi += c3 * rbp * cosfac;
 561         cosfac *= cos_phi;
 562         if (calcEner)
 563         {
 564             v += cosfac * rbp;
 565         }
 566         rbp = parm[4];
 567         ddphi += c4 * rbp * cosfac;
 568         cosfac *= cos_phi;
 569         if (calcEner)
 570         {
 571             v += cosfac * rbp;
 572         }
 573         rbp = parm[5];
 574         ddphi += c5 * rbp * cosfac;
 575         cosfac *= cos_phi;
 576         if (calcEner)
 577         {
 578             v += cosfac * rbp;
 579         }
 580
 581         ddphi = -ddphi * sin_phi;
 582
 583         do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
 584                                 pbcAiuc, gm_xq, t1, t2, t3);
 585         if (calcEner)
 586         {
 587             *vtot_loc += v;
 588         }
 589     }
 590 }
 591
 592 __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
 593 {
 594     /* dp cannot be outside (-pi,pi) */
 595     if (*dp >= CUDART_PI_F)
 596     {
 597         *dp -= 2.0f * CUDART_PI_F;
 598     }
 599     else if (*dp < -CUDART_PI_F)
 600     {
 601         *dp += 2.0f * CUDART_PI_F;
 602     }
 603 }
 604
 605 template<bool calcVir, bool calcEner>
 606 __device__ void idihs_gpu(const int       i,
 607                           float*          vtot_loc,
 608                           const int       numBonds,
 609                           const t_iatom   d_forceatoms[],
 610                           const t_iparams d_forceparams[],
 611                           const float4    gm_xq[],
 612                           float3          gm_f[],
 613                           float3          sm_fShiftLoc[],
 614                           const PbcAiuc   pbcAiuc)
 615 {
 616     if (i < numBonds)
 617     {
 618         int type = d_forceatoms[5 * i];
 619         int ai   = d_forceatoms[5 * i + 1];
 620         int aj   = d_forceatoms[5 * i + 2];
 621         int ak   = d_forceatoms[5 * i + 3];
 622         int al   = d_forceatoms[5 * i + 4];
 623
 624         float3 r_ij;
 625         float3 r_kj;
 626         float3 r_kl;
 627         float3 m;
 628         float3 n;
 629         int    t1;
 630         int    t2;
 631         int    t3;
 632         float  phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
 633                                            &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 634
 635         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
 636          * force changes if we just apply a normal harmonic.
 637          * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
 638          * This means we will never have the periodicity problem, unless
 639          * the dihedral is Pi away from phiO, which is very unlikely due to
 640          * the potential.
 641          */
 642         float kA = d_forceparams[type].harmonic.krA;
 643         float pA = d_forceparams[type].harmonic.rA;
 644
 645         float phi0 = pA * CUDA_DEG2RAD_F;
 646
 647         float dp = phi - phi0;
 648
 649         make_dp_periodic_gpu(&dp);
 650
 651         float ddphi = -kA * dp;
 652
 653         do_dih_fup_gpu<calcVir>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
 654                                 pbcAiuc, gm_xq, t1, t2, t3);
 655
 656         if (calcEner)
 657         {
 658             *vtot_loc += -0.5f * ddphi * dp;
 659         }
 660     }
 661 }
 662
 663 template<bool calcVir, bool calcEner>
 664 __device__ void pairs_gpu(const int       i,
 665                           const int       numBonds,
 666                           const t_iatom   d_forceatoms[],
 667                           const t_iparams iparams[],
 668                           const float4    gm_xq[],
 669                           float3          gm_f[],
 670                           float3          sm_fShiftLoc[],
 671                           const PbcAiuc   pbcAiuc,
 672                           const float     scale_factor,
 673                           float*          vtotVdw_loc,
 674                           float*          vtotElec_loc)
 675 {
 676     if (i < numBonds)
 677     {
 678         // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
 679         int3 pairData = *(int3*)(d_forceatoms + 3 * i);
 680         int  type     = pairData.x;
 681         int  ai       = pairData.y;
 682         int  aj       = pairData.z;
 683
 684         float qq  = gm_xq[ai].w * gm_xq[aj].w;
 685         float c6  = iparams[type].lj14.c6A;
 686         float c12 = iparams[type].lj14.c12A;
 687
 688         /* Do we need to apply full periodic boundary conditions? */
 689         float3 dr;
 690         int    fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
 691
 692         float r2    = norm2(dr);
 693         float rinv  = rsqrtf(r2);
 694         float rinv2 = rinv * rinv;
 695         float rinv6 = rinv2 * rinv2 * rinv2;
 696
 697         /* Calculate the Coulomb force * r */
 698         float velec = scale_factor * qq * rinv;
 699
 700         /* Calculate the LJ force * r and add it to the Coulomb part */
 701         float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
 702
 703         float  finvr = fr * rinv2;
 704         float3 f     = finvr * dr;
 705
 706         /* Add the forces */
 707         atomicAdd(&gm_f[ai], f);
 708         atomicAdd(&gm_f[aj], -f);
 709         if (calcVir && fshift_index != CENTRAL)
 710         {
 711             atomicAdd(&sm_fShiftLoc[fshift_index], f);
 712             atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
 713         }
 714
 715         if (calcEner)
 716         {
 717             *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
 718             *vtotElec_loc += velec;
 719         }
 720     }
 721 }
 722
 723 namespace gmx
 724 {
 725
 726 template<bool calcVir, bool calcEner>
 727 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 728 {
 729     assert(blockDim.y == 1 && blockDim.z == 1);
 730     const int  tid          = blockIdx.x * blockDim.x + threadIdx.x;
 731     float      vtot_loc     = 0;
 732     float      vtotVdw_loc  = 0;
 733     float      vtotElec_loc = 0;
 734     __shared__ float3 sm_fShiftLoc[SHIFTS];
 735
 736     if (calcVir)
 737     {
 738         if (threadIdx.x < SHIFTS)
 739         {
 740             sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
 741         }
 742         __syncthreads();
 743     }
 744
 745     int  fType;
 746     bool threadComputedPotential = false;
 747 #pragma unroll
 748     for (int j = 0; j < numFTypesOnGpu; j++)
 749     {
 750         if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
 751         {
 752             const int      numBonds = kernelParams.numFTypeBonds[j];
 753             int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
 754             const t_iatom* iatoms   = kernelParams.d_iatoms[j];
 755             fType                   = kernelParams.fTypesOnGpu[j];
 756             if (calcEner)
 757             {
 758                 threadComputedPotential = true;
 759             }
 760
 761             switch (fType)
 762             {
 763                 case F_BONDS:
 764                     bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
 765                                                  kernelParams.d_forceParams, kernelParams.d_xq,
 766                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 767                     break;
 768                 case F_ANGLES:
 769                     angles_gpu<calcVir, calcEner>(
 770                             fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
 771                             kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 772                     break;
 773                 case F_UREY_BRADLEY:
 774                     urey_bradley_gpu<calcVir, calcEner>(
 775                             fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
 776                             kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 777                     break;
 778                 case F_PDIHS:
 779                 case F_PIDIHS:
 780                     pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
 781                                                  kernelParams.d_forceParams, kernelParams.d_xq,
 782                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 783                     break;
 784                 case F_RBDIHS:
 785                     rbdihs_gpu<calcVir, calcEner>(
 786                             fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
 787                             kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 788                     break;
 789                 case F_IDIHS:
 790                     idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
 791                                                  kernelParams.d_forceParams, kernelParams.d_xq,
 792                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 793                     break;
 794                 case F_LJ14:
 795                     pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
 796                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc,
 797                                                  kernelParams.pbcAiuc, kernelParams.scaleFactor,
 798                                                  &vtotVdw_loc, &vtotElec_loc);
 799                     break;
 800             }
 801             break;
 802         }
 803     }
 804
 805     if (threadComputedPotential)
 806     {
 807         float* vtotVdw  = kernelParams.d_vTot + F_LJ14;
 808         float* vtotElec = kernelParams.d_vTot + F_COUL14;
 809         atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
 810         atomicAdd(vtotVdw, vtotVdw_loc);
 811         atomicAdd(vtotElec, vtotElec_loc);
 812     }
 813     /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
 814     if (calcVir)
 815     {
 816         __syncthreads();
 817         if (threadIdx.x < SHIFTS)
 818         {
 819             atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
 820         }
 821     }
 822 }
 823
 824
 825 /*-------------------------------- End CUDA kernels-----------------------------*/
 826
 827
 828 template<bool calcVir, bool calcEner>
 829 void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
 830 {
 831     GMX_ASSERT(haveInteractions_,
 832                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
 833     static_assert(TPB_BONDED >= SHIFTS,
 834                   "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
 835
 836     PbcAiuc pbcAiuc;
 837     setPbcAiuc(fr->bMolPBC ? numPbcDimensions(fr->pbcType) : 0, box, &pbcAiuc);
 838
 839     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 840
 841     if (fTypeRangeEnd < 0)
 842     {
 843         return;
 844     }
 845
 846     KernelLaunchConfig config;
 847     config.blockSize[0] = TPB_BONDED;
 848     config.blockSize[1] = 1;
 849     config.blockSize[2] = 1;
 850     config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
 851     config.gridSize[1]  = 1;
 852     config.gridSize[2]  = 1;
 853     config.stream       = stream_;
 854
 855     auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
 856     kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
 857     kernelParams_.pbcAiuc     = pbcAiuc;
 858
 859     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
 860
 861     launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
 862 }
 863
 864 void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
 865 {
 866     if (stepWork.computeEnergy)
 867     {
 868         // When we need the energy, we also need the virial
 869         impl_->launchKernel<true, true>(fr, box);
 870     }
 871     else if (stepWork.computeVirial)
 872     {
 873         impl_->launchKernel<true, false>(fr, box);
 874     }
 875     else
 876     {
 877         impl_->launchKernel<false, false>(fr, box);
 878     }
 879 }
 880
 881 } // namespace gmx