src/gromacs/listed_forces/gpubondedkernels.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements CUDA bonded functionality
  38  *
  39  * \author Jon Vincent <jvincent@nvidia.com>
  40  * \author Magnus Lundborg <lundborg.magnus@gmail.com>
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Szilárd Páll <pall.szilard@gmail.com>
  43  * \author Alan Gray <alang@nvidia.com>
  44  * \author Mark Abraham <mark.j.abraham@gmail.com>
  45  *
  46  * \ingroup module_listed_forces
  47  */
  48
  49 #include "gmxpre.h"
  50
  51 #include <cassert>
  52
  53 #include <math_constants.h>
  54
  55 #include "gromacs/gpu_utils/cudautils.cuh"
  56 #include "gromacs/gpu_utils/vectype_ops.cuh"
  57 #include "gromacs/listed_forces/gpubonded.h"
  58 #include "gromacs/math/units.h"
  59 #include "gromacs/mdlib/force_flags.h"
  60 #include "gromacs/mdtypes/forcerec.h"
  61 #include "gromacs/mdtypes/simulation_workload.h"
  62 #include "gromacs/pbcutil/pbc.h"
  63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
  64 #include "gromacs/utility/gmxassert.h"
  65
  66 #include "gpubonded_impl.h"
  67
  68 #if defined(_MSVC)
  69 #    include <limits>
  70 #endif
  71
  72 // CUDA threads per block
  73 #define TPB_BONDED 256
  74
  75 /*-------------------------------- CUDA kernels-------------------------------- */
  76 /*------------------------------------------------------------------------------*/
  77
  78 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
  79
  80 /*---------------- BONDED CUDA kernels--------------*/
  81
  82 /* Harmonic */
  83 __device__ __forceinline__ static void
  84            harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
  85 {
  86     constexpr float half = 0.5f;
  87     float           dx, dx2;
  88
  89     dx  = x - xA;
  90     dx2 = dx * dx;
  91
  92     *F = -kA * dx;
  93     *V = half * kA * dx2;
  94 }
  95
  96 template<bool calcVir, bool calcEner>
  97 __device__ void bonds_gpu(const int       i,
  98                           float*          vtot_loc,
  99                           const int       numBonds,
 100                           const t_iatom   d_forceatoms[],
 101                           const t_iparams d_forceparams[],
 102                           const float4    gm_xq[],
 103                           float3          gm_f[],
 104                           float3          sm_fShiftLoc[],
 105                           const PbcAiuc   pbcAiuc)
 106 {
 107     if (i < numBonds)
 108     {
 109         int3 bondData = *(int3*)(d_forceatoms + 3 * i);
 110         int  type     = bondData.x;
 111         int  ai       = bondData.y;
 112         int  aj       = bondData.z;
 113
 114         /* dx = xi - xj, corrected for periodic boundary conditions. */
 115         float3 dx;
 116         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
 117
 118         float dr2 = norm2(dx);
 119         float dr  = sqrt(dr2);
 120
 121         float vbond;
 122         float fbond;
 123         harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
 124
 125         if (calcEner)
 126         {
 127             *vtot_loc += vbond;
 128         }
 129
 130         if (dr2 != 0.0f)
 131         {
 132             fbond *= rsqrtf(dr2);
 133
 134             float3 fij = fbond * dx;
 135             atomicAdd(&gm_f[ai], fij);
 136             atomicAdd(&gm_f[aj], -fij);
 137             if (calcVir && ki != CENTRAL)
 138             {
 139                 atomicAdd(&sm_fShiftLoc[ki], fij);
 140                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
 141             }
 142         }
 143     }
 144 }
 145
 146 template<bool returnShift>
 147 __device__ __forceinline__ static float bond_angle_gpu(const float4   xi,
 148                                                        const float4   xj,
 149                                                        const float4   xk,
 150                                                        const PbcAiuc& pbcAiuc,
 151                                                        float3*        r_ij,
 152                                                        float3*        r_kj,
 153                                                        float*         costh,
 154                                                        int*           t1,
 155                                                        int*           t2)
 156 /* Return value is the angle between the bonds i-j and j-k */
 157 {
 158     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
 159     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
 160
 161     *costh   = cos_angle(*r_ij, *r_kj);
 162     float th = acosf(*costh);
 163
 164     return th;
 165 }
 166
 167 template<bool calcVir, bool calcEner>
 168 __device__ void angles_gpu(const int       i,
 169                            float*          vtot_loc,
 170                            const int       numBonds,
 171                            const t_iatom   d_forceatoms[],
 172                            const t_iparams d_forceparams[],
 173                            const float4    gm_xq[],
 174                            float3          gm_f[],
 175                            float3          sm_fShiftLoc[],
 176                            const PbcAiuc   pbcAiuc)
 177 {
 178     if (i < numBonds)
 179     {
 180         int4 angleData = *(int4*)(d_forceatoms + 4 * i);
 181         int  type      = angleData.x;
 182         int  ai        = angleData.y;
 183         int  aj        = angleData.z;
 184         int  ak        = angleData.w;
 185
 186         float3 r_ij;
 187         float3 r_kj;
 188         float  cos_theta;
 189         int    t1;
 190         int    t2;
 191         float  theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
 192                                               &r_kj, &cos_theta, &t1, &t2);
 193
 194         float va;
 195         float dVdt;
 196         harmonic_gpu(d_forceparams[type].harmonic.krA,
 197                      d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F, theta, &va, &dVdt);
 198
 199         if (calcEner)
 200         {
 201             *vtot_loc += va;
 202         }
 203
 204         float cos_theta2 = cos_theta * cos_theta;
 205         if (cos_theta2 < 1.0f)
 206         {
 207             float st    = dVdt * rsqrtf(1.0f - cos_theta2);
 208             float sth   = st * cos_theta;
 209             float nrij2 = norm2(r_ij);
 210             float nrkj2 = norm2(r_kj);
 211
 212             float nrij_1 = rsqrtf(nrij2);
 213             float nrkj_1 = rsqrtf(nrkj2);
 214
 215             float cik = st * nrij_1 * nrkj_1;
 216             float cii = sth * nrij_1 * nrij_1;
 217             float ckk = sth * nrkj_1 * nrkj_1;
 218
 219             float3 f_i = cii * r_ij - cik * r_kj;
 220             float3 f_k = ckk * r_kj - cik * r_ij;
 221             float3 f_j = -f_i - f_k;
 222
 223             atomicAdd(&gm_f[ai], f_i);
 224             atomicAdd(&gm_f[aj], f_j);
 225             atomicAdd(&gm_f[ak], f_k);
 226
 227             if (calcVir)
 228             {
 229                 atomicAdd(&sm_fShiftLoc[t1], f_i);
 230                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
 231                 atomicAdd(&sm_fShiftLoc[t2], f_k);
 232             }
 233         }
 234     }
 235 }
 236
 237 template<bool calcVir, bool calcEner>
 238 __device__ void urey_bradley_gpu(const int       i,
 239                                  float*          vtot_loc,
 240                                  const int       numBonds,
 241                                  const t_iatom   d_forceatoms[],
 242                                  const t_iparams d_forceparams[],
 243                                  const float4    gm_xq[],
 244                                  float3          gm_f[],
 245                                  float3          sm_fShiftLoc[],
 246                                  const PbcAiuc   pbcAiuc)
 247 {
 248     if (i < numBonds)
 249     {
 250         int4 ubData = *(int4*)(d_forceatoms + 4 * i);
 251         int  type   = ubData.x;
 252         int  ai     = ubData.y;
 253         int  aj     = ubData.z;
 254         int  ak     = ubData.w;
 255
 256         float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
 257         float kthA = d_forceparams[type].u_b.kthetaA;
 258         float r13A = d_forceparams[type].u_b.r13A;
 259         float kUBA = d_forceparams[type].u_b.kUBA;
 260
 261         float3 r_ij;
 262         float3 r_kj;
 263         float  cos_theta;
 264         int    t1;
 265         int    t2;
 266         float  theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
 267                                               &r_kj, &cos_theta, &t1, &t2);
 268
 269         float va;
 270         float dVdt;
 271         harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
 272
 273         if (calcEner)
 274         {
 275             *vtot_loc += va;
 276         }
 277
 278         float3 r_ik;
 279         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
 280
 281         float dr2 = norm2(r_ik);
 282         float dr  = dr2 * rsqrtf(dr2);
 283
 284         float vbond;
 285         float fbond;
 286         harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
 287
 288         float cos_theta2 = cos_theta * cos_theta;
 289         if (cos_theta2 < 1.0f)
 290         {
 291             float st  = dVdt * rsqrtf(1.0f - cos_theta2);
 292             float sth = st * cos_theta;
 293
 294             float nrkj2 = norm2(r_kj);
 295             float nrij2 = norm2(r_ij);
 296
 297             float cik = st * rsqrtf(nrkj2 * nrij2);
 298             float cii = sth / nrij2;
 299             float ckk = sth / nrkj2;
 300
 301             float3 f_i = cii * r_ij - cik * r_kj;
 302             float3 f_k = ckk * r_kj - cik * r_ij;
 303             float3 f_j = -f_i - f_k;
 304
 305             atomicAdd(&gm_f[ai], f_i);
 306             atomicAdd(&gm_f[aj], f_j);
 307             atomicAdd(&gm_f[ak], f_k);
 308
 309             if (calcVir)
 310             {
 311                 atomicAdd(&sm_fShiftLoc[t1], f_i);
 312                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
 313                 atomicAdd(&sm_fShiftLoc[t2], f_k);
 314             }
 315         }
 316
 317         /* Time for the bond calculations */
 318         if (dr2 != 0.0f)
 319         {
 320             if (calcEner)
 321             {
 322                 *vtot_loc += vbond;
 323             }
 324
 325             fbond *= rsqrtf(dr2);
 326
 327             float3 fik = fbond * r_ik;
 328             atomicAdd(&gm_f[ai], fik);
 329             atomicAdd(&gm_f[ak], -fik);
 330
 331             if (calcVir && ki != CENTRAL)
 332             {
 333                 atomicAdd(&sm_fShiftLoc[ki], fik);
 334                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
 335             }
 336         }
 337     }
 338 }
 339
 340 template<bool returnShift, typename T>
 341 __device__ __forceinline__ static float dih_angle_gpu(const T        xi,
 342                                                       const T        xj,
 343                                                       const T        xk,
 344                                                       const T        xl,
 345                                                       const PbcAiuc& pbcAiuc,
 346                                                       float3*        r_ij,
 347                                                       float3*        r_kj,
 348                                                       float3*        r_kl,
 349                                                       float3*        m,
 350                                                       float3*        n,
 351                                                       int*           t1,
 352                                                       int*           t2,
 353                                                       int*           t3)
 354 {
 355     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
 356     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
 357     *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
 358
 359     *m         = cprod(*r_ij, *r_kj);
 360     *n         = cprod(*r_kj, *r_kl);
 361     float phi  = gmx_angle(*m, *n);
 362     float ipr  = iprod(*r_ij, *n);
 363     float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
 364     phi        = sign * phi;
 365
 366     return phi;
 367 }
 368
 369
 370 __device__ __forceinline__ static void
 371            dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
 372 {
 373     float mdphi, sdphi;
 374
 375     mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
 376     sdphi = sinf(mdphi);
 377     *v    = cpA * (1.0f + cosf(mdphi));
 378     *f    = -cpA * mult * sdphi;
 379 }
 380
 381 template<bool calcVir>
 382 __device__ static void do_dih_fup_gpu(const int      i,
 383                                       const int      j,
 384                                       const int      k,
 385                                       const int      l,
 386                                       const float    ddphi,
 387                                       const float3   r_ij,
 388                                       const float3   r_kj,
 389                                       const float3   r_kl,
 390                                       const float3   m,
 391                                       const float3   n,
 392                                       float3         gm_f[],
 393                                       float3         sm_fShiftLoc[],
 394                                       const PbcAiuc& pbcAiuc,
 395                                       const float4   gm_xq[],
 396                                       const int      t1,
 397                                       const int      t2,
 398                                       const int gmx_unused t3)
 399 {
 400     float iprm  = norm2(m);
 401     float iprn  = norm2(n);
 402     float nrkj2 = norm2(r_kj);
 403     float toler = nrkj2 * GMX_REAL_EPS;
 404     if ((iprm > toler) && (iprn > toler))
 405     {
 406         float  nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
 407         float  nrkj_2 = nrkj_1 * nrkj_1;
 408         float  nrkj   = nrkj2 * nrkj_1;
 409         float  a      = -ddphi * nrkj / iprm;
 410         float3 f_i    = a * m;
 411         float  b      = ddphi * nrkj / iprn;
 412         float3 f_l    = b * n;
 413         float  p      = iprod(r_ij, r_kj);
 414         p *= nrkj_2;
 415         float q = iprod(r_kl, r_kj);
 416         q *= nrkj_2;
 417         float3 uvec = p * f_i;
 418         float3 vvec = q * f_l;
 419         float3 svec = uvec - vvec;
 420         float3 f_j  = f_i - svec;
 421         float3 f_k  = f_l + svec;
 422
 423         atomicAdd(&gm_f[i], f_i);
 424         atomicAdd(&gm_f[j], -f_j);
 425         atomicAdd(&gm_f[k], -f_k);
 426         atomicAdd(&gm_f[l], f_l);
 427
 428         if (calcVir)
 429         {
 430             float3 dx_jl;
 431             int    t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
 432
 433             atomicAdd(&sm_fShiftLoc[t1], f_i);
 434             atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
 435             atomicAdd(&sm_fShiftLoc[t2], -f_k);
 436             atomicAdd(&sm_fShiftLoc[t3], f_l);
 437         }
 438     }
 439 }
 440
 441 template<bool calcVir, bool calcEner>
 442 __device__ void pdihs_gpu(const int       i,
 443                           float*          vtot_loc,
 444                           const int       numBonds,
 445                           const t_iatom   d_forceatoms[],
 446                           const t_iparams d_forceparams[],
 447                           const float4    gm_xq[],
 448                           float3          gm_f[],
 449                           float3          sm_fShiftLoc[],
 450                           const PbcAiuc   pbcAiuc)
 451 {
 452     if (i < numBonds)
 453     {
 454         int type = d_forceatoms[5 * i];
 455         int ai   = d_forceatoms[5 * i + 1];
 456         int aj   = d_forceatoms[5 * i + 2];
 457         int ak   = d_forceatoms[5 * i + 3];
 458         int al   = d_forceatoms[5 * i + 4];
 459
 460         float3 r_ij;
 461         float3 r_kj;
 462         float3 r_kl;
 463         float3 m;
 464         float3 n;
 465         int    t1;
 466         int    t2;
 467         int    t3;
 468         float  phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
 469                                            &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 470
 471         float vpd;
 472         float ddphi;
 473         dopdihs_gpu(d_forceparams[type].pdihs.cpA, d_forceparams[type].pdihs.phiA,
 474                     d_forceparams[type].pdihs.mult, phi, &vpd, &ddphi);
 475
 476         if (calcEner)
 477         {
 478             *vtot_loc += vpd;
 479         }
 480
 481         do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
 482                                 pbcAiuc, gm_xq, t1, t2, t3);
 483     }
 484 }
 485
 486 template<bool calcVir, bool calcEner>
 487 __device__ void rbdihs_gpu(const int       i,
 488                            float*          vtot_loc,
 489                            const int       numBonds,
 490                            const t_iatom   d_forceatoms[],
 491                            const t_iparams d_forceparams[],
 492                            const float4    gm_xq[],
 493                            float3          gm_f[],
 494                            float3          sm_fShiftLoc[],
 495                            const PbcAiuc   pbcAiuc)
 496 {
 497     constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 498
 499     if (i < numBonds)
 500     {
 501         int type = d_forceatoms[5 * i];
 502         int ai   = d_forceatoms[5 * i + 1];
 503         int aj   = d_forceatoms[5 * i + 2];
 504         int ak   = d_forceatoms[5 * i + 3];
 505         int al   = d_forceatoms[5 * i + 4];
 506
 507         float3 r_ij;
 508         float3 r_kj;
 509         float3 r_kl;
 510         float3 m;
 511         float3 n;
 512         int    t1;
 513         int    t2;
 514         int    t3;
 515         float  phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
 516                                            &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 517
 518         /* Change to polymer convention */
 519         if (phi < c0)
 520         {
 521             phi += CUDART_PI_F;
 522         }
 523         else
 524         {
 525             phi -= CUDART_PI_F;
 526         }
 527         float cos_phi = cosf(phi);
 528         /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
 529         float sin_phi = sinf(phi);
 530
 531         float parm[NR_RBDIHS];
 532         for (int j = 0; j < NR_RBDIHS; j++)
 533         {
 534             parm[j] = d_forceparams[type].rbdihs.rbcA[j];
 535         }
 536         /* Calculate cosine powers */
 537         /* Calculate the energy */
 538         /* Calculate the derivative */
 539         float v      = parm[0];
 540         float ddphi  = c0;
 541         float cosfac = c1;
 542
 543         float rbp = parm[1];
 544         ddphi += rbp * cosfac;
 545         cosfac *= cos_phi;
 546         if (calcEner)
 547         {
 548             v += cosfac * rbp;
 549         }
 550         rbp = parm[2];
 551         ddphi += c2 * rbp * cosfac;
 552         cosfac *= cos_phi;
 553         if (calcEner)
 554         {
 555             v += cosfac * rbp;
 556         }
 557         rbp = parm[3];
 558         ddphi += c3 * rbp * cosfac;
 559         cosfac *= cos_phi;
 560         if (calcEner)
 561         {
 562             v += cosfac * rbp;
 563         }
 564         rbp = parm[4];
 565         ddphi += c4 * rbp * cosfac;
 566         cosfac *= cos_phi;
 567         if (calcEner)
 568         {
 569             v += cosfac * rbp;
 570         }
 571         rbp = parm[5];
 572         ddphi += c5 * rbp * cosfac;
 573         cosfac *= cos_phi;
 574         if (calcEner)
 575         {
 576             v += cosfac * rbp;
 577         }
 578
 579         ddphi = -ddphi * sin_phi;
 580
 581         do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
 582                                 pbcAiuc, gm_xq, t1, t2, t3);
 583         if (calcEner)
 584         {
 585             *vtot_loc += v;
 586         }
 587     }
 588 }
 589
 590 __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
 591 {
 592     /* dp cannot be outside (-pi,pi) */
 593     if (*dp >= CUDART_PI_F)
 594     {
 595         *dp -= 2.0f * CUDART_PI_F;
 596     }
 597     else if (*dp < -CUDART_PI_F)
 598     {
 599         *dp += 2.0f * CUDART_PI_F;
 600     }
 601 }
 602
 603 template<bool calcVir, bool calcEner>
 604 __device__ void idihs_gpu(const int       i,
 605                           float*          vtot_loc,
 606                           const int       numBonds,
 607                           const t_iatom   d_forceatoms[],
 608                           const t_iparams d_forceparams[],
 609                           const float4    gm_xq[],
 610                           float3          gm_f[],
 611                           float3          sm_fShiftLoc[],
 612                           const PbcAiuc   pbcAiuc)
 613 {
 614     if (i < numBonds)
 615     {
 616         int type = d_forceatoms[5 * i];
 617         int ai   = d_forceatoms[5 * i + 1];
 618         int aj   = d_forceatoms[5 * i + 2];
 619         int ak   = d_forceatoms[5 * i + 3];
 620         int al   = d_forceatoms[5 * i + 4];
 621
 622         float3 r_ij;
 623         float3 r_kj;
 624         float3 r_kl;
 625         float3 m;
 626         float3 n;
 627         int    t1;
 628         int    t2;
 629         int    t3;
 630         float  phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
 631                                            &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
 632
 633         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
 634          * force changes if we just apply a normal harmonic.
 635          * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
 636          * This means we will never have the periodicity problem, unless
 637          * the dihedral is Pi away from phiO, which is very unlikely due to
 638          * the potential.
 639          */
 640         float kA = d_forceparams[type].harmonic.krA;
 641         float pA = d_forceparams[type].harmonic.rA;
 642
 643         float phi0 = pA * CUDA_DEG2RAD_F;
 644
 645         float dp = phi - phi0;
 646
 647         make_dp_periodic_gpu(&dp);
 648
 649         float ddphi = -kA * dp;
 650
 651         do_dih_fup_gpu<calcVir>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
 652                                 pbcAiuc, gm_xq, t1, t2, t3);
 653
 654         if (calcEner)
 655         {
 656             *vtot_loc += -0.5f * ddphi * dp;
 657         }
 658     }
 659 }
 660
 661 template<bool calcVir, bool calcEner>
 662 __device__ void pairs_gpu(const int       i,
 663                           const int       numBonds,
 664                           const t_iatom   d_forceatoms[],
 665                           const t_iparams iparams[],
 666                           const float4    gm_xq[],
 667                           float3          gm_f[],
 668                           float3          sm_fShiftLoc[],
 669                           const PbcAiuc   pbcAiuc,
 670                           const float     scale_factor,
 671                           float*          vtotVdw_loc,
 672                           float*          vtotElec_loc)
 673 {
 674     if (i < numBonds)
 675     {
 676         // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
 677         int3 pairData = *(int3*)(d_forceatoms + 3 * i);
 678         int  type     = pairData.x;
 679         int  ai       = pairData.y;
 680         int  aj       = pairData.z;
 681
 682         float qq  = gm_xq[ai].w * gm_xq[aj].w;
 683         float c6  = iparams[type].lj14.c6A;
 684         float c12 = iparams[type].lj14.c12A;
 685
 686         /* Do we need to apply full periodic boundary conditions? */
 687         float3 dr;
 688         int    fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
 689
 690         float r2    = norm2(dr);
 691         float rinv  = rsqrtf(r2);
 692         float rinv2 = rinv * rinv;
 693         float rinv6 = rinv2 * rinv2 * rinv2;
 694
 695         /* Calculate the Coulomb force * r */
 696         float velec = scale_factor * qq * rinv;
 697
 698         /* Calculate the LJ force * r and add it to the Coulomb part */
 699         float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
 700
 701         float  finvr = fr * rinv2;
 702         float3 f     = finvr * dr;
 703
 704         /* Add the forces */
 705         atomicAdd(&gm_f[ai], f);
 706         atomicAdd(&gm_f[aj], -f);
 707         if (calcVir && fshift_index != CENTRAL)
 708         {
 709             atomicAdd(&sm_fShiftLoc[fshift_index], f);
 710             atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
 711         }
 712
 713         if (calcEner)
 714         {
 715             *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
 716             *vtotElec_loc += velec;
 717         }
 718     }
 719 }
 720
 721 namespace gmx
 722 {
 723
 724 template<bool calcVir, bool calcEner>
 725 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
 726 {
 727     assert(blockDim.y == 1 && blockDim.z == 1);
 728     const int  tid          = blockIdx.x * blockDim.x + threadIdx.x;
 729     float      vtot_loc     = 0;
 730     float      vtotVdw_loc  = 0;
 731     float      vtotElec_loc = 0;
 732     __shared__ float3 sm_fShiftLoc[SHIFTS];
 733
 734     if (calcVir)
 735     {
 736         if (threadIdx.x < SHIFTS)
 737         {
 738             sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
 739         }
 740         __syncthreads();
 741     }
 742
 743     int  fType;
 744     bool threadComputedPotential = false;
 745 #pragma unroll
 746     for (int j = 0; j < numFTypesOnGpu; j++)
 747     {
 748         if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
 749         {
 750             const int      numBonds = kernelParams.numFTypeBonds[j];
 751             int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
 752             const t_iatom* iatoms   = kernelParams.d_iatoms[j];
 753             fType                   = kernelParams.fTypesOnGpu[j];
 754             if (calcEner)
 755             {
 756                 threadComputedPotential = true;
 757             }
 758
 759             switch (fType)
 760             {
 761                 case F_BONDS:
 762                     bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
 763                                                  kernelParams.d_forceParams, kernelParams.d_xq,
 764                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 765                     break;
 766                 case F_ANGLES:
 767                     angles_gpu<calcVir, calcEner>(
 768                             fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
 769                             kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 770                     break;
 771                 case F_UREY_BRADLEY:
 772                     urey_bradley_gpu<calcVir, calcEner>(
 773                             fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
 774                             kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 775                     break;
 776                 case F_PDIHS:
 777                 case F_PIDIHS:
 778                     pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
 779                                                  kernelParams.d_forceParams, kernelParams.d_xq,
 780                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 781                     break;
 782                 case F_RBDIHS:
 783                     rbdihs_gpu<calcVir, calcEner>(
 784                             fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
 785                             kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 786                     break;
 787                 case F_IDIHS:
 788                     idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
 789                                                  kernelParams.d_forceParams, kernelParams.d_xq,
 790                                                  kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
 791                     break;
 792                 case F_LJ14:
 793                     pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
 794                                                  kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc,
 795                                                  kernelParams.pbcAiuc, kernelParams.scaleFactor,
 796                                                  &vtotVdw_loc, &vtotElec_loc);
 797                     break;
 798             }
 799             break;
 800         }
 801     }
 802
 803     if (threadComputedPotential)
 804     {
 805         float* vtotVdw  = kernelParams.d_vTot + F_LJ14;
 806         float* vtotElec = kernelParams.d_vTot + F_COUL14;
 807         atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
 808         atomicAdd(vtotVdw, vtotVdw_loc);
 809         atomicAdd(vtotElec, vtotElec_loc);
 810     }
 811     /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
 812     if (calcVir)
 813     {
 814         __syncthreads();
 815         if (threadIdx.x < SHIFTS)
 816         {
 817             atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
 818         }
 819     }
 820 }
 821
 822
 823 /*-------------------------------- End CUDA kernels-----------------------------*/
 824
 825
 826 template<bool calcVir, bool calcEner>
 827 void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
 828 {
 829     GMX_ASSERT(haveInteractions_,
 830                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
 831     static_assert(TPB_BONDED >= SHIFTS,
 832                   "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
 833
 834     PbcAiuc pbcAiuc;
 835     setPbcAiuc(fr->bMolPBC ? numPbcDimensions(fr->pbcType) : 0, box, &pbcAiuc);
 836
 837     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 838
 839     if (fTypeRangeEnd < 0)
 840     {
 841         return;
 842     }
 843
 844     KernelLaunchConfig config;
 845     config.blockSize[0] = TPB_BONDED;
 846     config.blockSize[1] = 1;
 847     config.blockSize[2] = 1;
 848     config.gridSize[0]  = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
 849     config.gridSize[1]  = 1;
 850     config.gridSize[2]  = 1;
 851     config.stream       = stream_;
 852
 853     auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
 854     kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
 855     kernelParams_.pbcAiuc     = pbcAiuc;
 856
 857     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
 858
 859     launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
 860 }
 861
 862 void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
 863 {
 864     if (stepWork.computeEnergy)
 865     {
 866         // When we need the energy, we also need the virial
 867         impl_->launchKernel<true, true>(fr, box);
 868     }
 869     else if (stepWork.computeVirial)
 870     {
 871         impl_->launchKernel<true, false>(fr, box);
 872     }
 873     else
 874     {
 875         impl_->launchKernel<false, false>(fr, box);
 876     }
 877 }
 878
 879 } // namespace gmx