src/gromacs/listed_forces/gpubondedkernels.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief Implements CUDA bonded functionality
  38  *
  39  * \author Jon Vincent <jvincent@nvidia.com>
  40  * \author Magnus Lundborg <lundborg.magnus@gmail.com>
  41  * \author Berk Hess <hess@kth.se>
  42  * \author Szilárd Páll <pall.szilard@gmail.com>
  43  * \author Alan Gray <alang@nvidia.com>
  44  * \author Mark Abraham <mark.j.abraham@gmail.com>
  45  *
  46  * \ingroup module_listed_forces
  47  */
  48
  49 #include "gmxpre.h"
  50
  51 #include <math_constants.h>
  52
  53 #include "gromacs/gpu_utils/cudautils.cuh"
  54 #include "gromacs/gpu_utils/devicebuffer.h"
  55 #include "gromacs/gpu_utils/gpu_vec.cuh"
  56 #include "gromacs/gpu_utils/gputraits.cuh"
  57 #include "gromacs/listed_forces/gpubonded.h"
  58 #include "gromacs/listed_forces/listed_forces.h"
  59 #include "gromacs/math/units.h"
  60 #include "gromacs/mdlib/force_flags.h"
  61 #include "gromacs/mdtypes/enerdata.h"
  62 #include "gromacs/mdtypes/forcerec.h"
  63 #include "gromacs/mdtypes/group.h"
  64 #include "gromacs/mdtypes/mdatom.h"
  65 #include "gromacs/pbcutil/gpu_pbc.cuh"
  66 #include "gromacs/pbcutil/ishift.h"
  67 #include "gromacs/pbcutil/pbc.h"
  68 #include "gromacs/topology/idef.h"
  69 #include "gromacs/topology/ifunc.h"
  70 #include "gromacs/utility/gmxassert.h"
  71
  72 #include "gpubonded_impl.h"
  73
  74 #if defined(_MSVC)
  75 #include <limits>
  76 #endif
  77
  78 //CUDA threads per block
  79 #define TPB_BONDED 256
  80
  81 /*-------------------------------- CUDA kernels-------------------------------- */
  82 /*------------------------------------------------------------------------------*/
  83
  84
  85 /*---------------- BONDED CUDA kernels--------------*/
  86
  87 /* Harmonic */
  88 __device__ __forceinline__
  89 static void harmonic_gpu(const float kA, const float xA, const float x, float *V, float *F)
  90 {
  91     constexpr float half = 0.5f;
  92     float           dx, dx2;
  93
  94     dx    = x-xA;
  95     dx2   = dx*dx;
  96
  97     *F = -kA*dx;
  98     *V = half*kA*dx2;
  99 }
 100
 101 template <bool calcVir, bool calcEner>
 102 __global__
 103 void bonds_gpu(float *vtot, const int nbonds,
 104                const t_iatom forceatoms[], const t_iparams forceparams[],
 105                const float4 xq[], fvec force[], fvec fshift[],
 106                const PbcAiuc pbcAiuc)
 107 {
 108     const int        i = blockIdx.x * blockDim.x + threadIdx.x;
 109
 110     __shared__ float vtot_loc;
 111     __shared__ fvec  fshift_loc[SHIFTS];
 112
 113     if (calcVir || calcEner)
 114     {
 115         if (threadIdx.x == 0)
 116         {
 117             vtot_loc = 0.0f;
 118         }
 119         if (threadIdx.x < SHIFTS)
 120         {
 121             fshift_loc[threadIdx.x][XX] = 0.0f;
 122             fshift_loc[threadIdx.x][YY] = 0.0f;
 123             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 124         }
 125         __syncthreads();
 126     }
 127
 128     if (i < nbonds)
 129     {
 130         int type = forceatoms[3*i];
 131         int ai   = forceatoms[3*i + 1];
 132         int aj   = forceatoms[3*i + 2];
 133
 134         /* dx = xi - xj, corrected for periodic boundry conditions. */
 135         fvec  dx;
 136         int   ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
 137
 138         float dr2 = iprod_gpu(dx, dx);
 139         float dr  = sqrt(dr2);
 140
 141         float vbond;
 142         float fbond;
 143         harmonic_gpu(forceparams[type].harmonic.krA,
 144                      forceparams[type].harmonic.rA,
 145                      dr, &vbond, &fbond);
 146
 147         if (calcEner)
 148         {
 149             atomicAdd(&vtot_loc, vbond);
 150         }
 151
 152         if (dr2 != 0.0f)
 153         {
 154             fbond *= rsqrtf(dr2);
 155
 156 #pragma unroll
 157             for (int m = 0; m < DIM; m++)
 158             {
 159                 float fij = fbond*dx[m];
 160                 atomicAdd(&force[ai][m], fij);
 161                 atomicAdd(&force[aj][m], -fij);
 162                 if (calcVir && ki != CENTRAL)
 163                 {
 164                     atomicAdd(&fshift_loc[ki][m], fij);
 165                     atomicAdd(&fshift_loc[CENTRAL][m], -fij);
 166                 }
 167             }
 168         }
 169     }
 170
 171     if (calcVir || calcEner)
 172     {
 173         __syncthreads();
 174         if (calcEner && threadIdx.x == 0)
 175         {
 176             atomicAdd(vtot, vtot_loc);
 177         }
 178         if (calcVir && threadIdx.x < SHIFTS)
 179         {
 180             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 181         }
 182     }
 183 }
 184
 185 template <bool returnShift>
 186 __device__ __forceinline__
 187 static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
 188                             const PbcAiuc &pbcAiuc,
 189                             fvec r_ij, fvec r_kj, float *costh,
 190                             int *t1, int *t2)
 191 /* Return value is the angle between the bonds i-j and j-k */
 192 {
 193     *t1      = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
 194     *t2      = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
 195
 196     *costh   = cos_angle_gpu(r_ij, r_kj);
 197     float th = acosf(*costh);
 198
 199     return th;
 200 }
 201
 202 template <bool calcVir, bool calcEner>
 203 __global__
 204 void angles_gpu(float *vtot, const int nbonds,
 205                 const t_iatom forceatoms[], const t_iparams forceparams[],
 206                 const float4 x[], fvec force[], fvec fshift[],
 207                 const PbcAiuc pbcAiuc)
 208 {
 209     __shared__ float vtot_loc;
 210     __shared__ fvec  fshift_loc[SHIFTS];
 211
 212     const int        i = blockIdx.x*blockDim.x + threadIdx.x;
 213
 214     if (calcVir || calcEner)
 215     {
 216         if (threadIdx.x == 0)
 217         {
 218             vtot_loc = 0.0f;
 219         }
 220         if (threadIdx.x < SHIFTS)
 221         {
 222             fshift_loc[threadIdx.x][XX] = 0.0f;
 223             fshift_loc[threadIdx.x][YY] = 0.0f;
 224             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 225         }
 226
 227         __syncthreads();
 228     }
 229
 230     if (i < nbonds)
 231     {
 232         int   type = forceatoms[4*i];
 233         int   ai   = forceatoms[4*i + 1];
 234         int   aj   = forceatoms[4*i + 2];
 235         int   ak   = forceatoms[4*i + 3];
 236
 237         fvec  r_ij;
 238         fvec  r_kj;
 239         float cos_theta;
 240         int   t1;
 241         int   t2;
 242         float theta =
 243             bond_angle_gpu<calcVir>(x[ai], x[aj], x[ak], pbcAiuc,
 244                                     r_ij, r_kj, &cos_theta, &t1, &t2);
 245
 246         float va;
 247         float dVdt;
 248         harmonic_gpu(forceparams[type].harmonic.krA,
 249                      forceparams[type].harmonic.rA*DEG2RAD,
 250                      theta, &va, &dVdt);
 251
 252         if (calcEner)
 253         {
 254             atomicAdd(&vtot_loc, va);
 255         }
 256
 257         float cos_theta2 = cos_theta*cos_theta;
 258         if (cos_theta2 < 1.0f)
 259         {
 260             float st    = dVdt*rsqrtf(1.0f - cos_theta2);
 261             float sth   = st*cos_theta;
 262             float nrij2 = iprod_gpu(r_ij, r_ij);
 263             float nrkj2 = iprod_gpu(r_kj, r_kj);
 264
 265             float nrij_1 = rsqrtf(nrij2);
 266             float nrkj_1 = rsqrtf(nrkj2);
 267
 268             float cik = st*nrij_1*nrkj_1;
 269             float cii = sth*nrij_1*nrij_1;
 270             float ckk = sth*nrkj_1*nrkj_1;
 271
 272             fvec  f_i;
 273             fvec  f_k;
 274             fvec  f_j;
 275             for (int m = 0; m < DIM; m++)
 276             {
 277                 f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
 278                 f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
 279                 f_j[m]    = -f_i[m] - f_k[m];
 280                 atomicAdd(&force[ai][m], f_i[m]);
 281                 atomicAdd(&force[aj][m], f_j[m]);
 282                 atomicAdd(&force[ak][m], f_k[m]);
 283             }
 284             if (calcVir)
 285             {
 286                 fvec_inc_atomic(fshift_loc[t1], f_i);
 287                 fvec_inc_atomic(fshift_loc[CENTRAL], f_j);
 288                 fvec_inc_atomic(fshift_loc[t2], f_k);
 289             }
 290         }
 291
 292     }
 293
 294     if (calcVir || calcEner)
 295     {
 296         __syncthreads();
 297
 298         if (calcEner && threadIdx.x == 0)
 299         {
 300             atomicAdd(vtot, vtot_loc);
 301         }
 302         if (calcVir && threadIdx.x < SHIFTS)
 303         {
 304             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 305         }
 306     }
 307 }
 308
 309 template <bool calcVir, bool calcEner>
 310 __global__
 311 void urey_bradley_gpu(float *vtot, const int nbonds,
 312                       const t_iatom forceatoms[], const t_iparams forceparams[],
 313                       const float4 x[], fvec force[], fvec fshift[],
 314                       const PbcAiuc pbcAiuc)
 315 {
 316     __shared__ float vtot_loc;
 317     __shared__ fvec  fshift_loc[SHIFTS];
 318
 319     const int        i = blockIdx.x*blockDim.x + threadIdx.x;
 320
 321     if (calcVir || calcEner)
 322     {
 323         if (threadIdx.x == 0)
 324         {
 325             vtot_loc = 0.0f;
 326         }
 327         if (threadIdx.x < SHIFTS)
 328         {
 329             fshift_loc[threadIdx.x][XX] = 0.0f;
 330             fshift_loc[threadIdx.x][YY] = 0.0f;
 331             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 332         }
 333
 334         __syncthreads();
 335     }
 336
 337     if (i < nbonds)
 338     {
 339         int   type  = forceatoms[4*i];
 340         int   ai    = forceatoms[4*i+1];
 341         int   aj    = forceatoms[4*i+2];
 342         int   ak    = forceatoms[4*i+3];
 343
 344         float th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
 345         float kthA  = forceparams[type].u_b.kthetaA;
 346         float r13A  = forceparams[type].u_b.r13A;
 347         float kUBA  = forceparams[type].u_b.kUBA;
 348
 349         fvec  r_ij;
 350         fvec  r_kj;
 351         float cos_theta;
 352         int   t1;
 353         int   t2;
 354         float theta = bond_angle_gpu<calcVir>(x[ai], x[aj], x[ak], pbcAiuc,
 355                                               r_ij, r_kj, &cos_theta, &t1, &t2);
 356
 357         float va;
 358         float dVdt;
 359         harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
 360
 361         if (calcEner)
 362         {
 363             atomicAdd(&vtot_loc, va);
 364         }
 365
 366         fvec  r_ik;
 367         int   ki = pbcDxAiuc<calcVir>(pbcAiuc, x[ai], x[ak], r_ik);
 368
 369         float dr2  = iprod_gpu(r_ik, r_ik);
 370         float dr   = dr2*rsqrtf(dr2);
 371
 372         float vbond;
 373         float fbond;
 374         harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
 375
 376         float cos_theta2 = cos_theta*cos_theta;
 377         if (cos_theta2 < 1.0f)
 378         {
 379             float st    = dVdt*rsqrtf(1.0f - cos_theta2);
 380             float sth   = st*cos_theta;
 381
 382             float nrkj2 = iprod_gpu(r_kj, r_kj);
 383             float nrij2 = iprod_gpu(r_ij, r_ij);
 384
 385             float cik   = st*rsqrtf(nrkj2*nrij2);
 386             float cii   = sth/nrij2;
 387             float ckk   = sth/nrkj2;
 388
 389             fvec  f_i;
 390             fvec  f_j;
 391             fvec  f_k;
 392             for (int m = 0; m < DIM; m++)
 393             {
 394                 f_i[m]    = -(cik*r_kj[m]-cii*r_ij[m]);
 395                 f_k[m]    = -(cik*r_ij[m]-ckk*r_kj[m]);
 396                 f_j[m]    = -f_i[m]-f_k[m];
 397                 atomicAdd(&force[ai][m], f_i[m]);
 398                 atomicAdd(&force[aj][m], f_j[m]);
 399                 atomicAdd(&force[ak][m], f_k[m]);
 400             }
 401             fvec_inc_atomic(fshift_loc[t1], f_i);
 402             fvec_inc_atomic(fshift_loc[CENTRAL], f_j);
 403             fvec_inc_atomic(fshift_loc[t2], f_k);
 404         }
 405
 406         /* Time for the bond calculations */
 407         if (dr2 != 0.0f)
 408         {
 409             if (calcEner)
 410             {
 411                 atomicAdd(&vtot_loc, vbond);
 412             }
 413
 414             fbond *= rsqrtf(dr2);
 415
 416             for (int m = 0; m < DIM; m++)
 417             {
 418                 float fik = fbond*r_ik[m];
 419                 atomicAdd(&force[ai][m], fik);
 420                 atomicAdd(&force[ak][m], -fik);
 421
 422                 if (calcVir && ki != CENTRAL)
 423                 {
 424                     atomicAdd(&fshift_loc[ki][m], fik);
 425                     atomicAdd(&fshift_loc[CENTRAL][m], -fik);
 426                 }
 427             }
 428         }
 429     }
 430
 431     if (calcVir || calcEner)
 432     {
 433         __syncthreads();
 434
 435         if (calcEner && threadIdx.x == 0)
 436         {
 437             atomicAdd(vtot, vtot_loc);
 438         }
 439         if (calcVir && threadIdx.x < SHIFTS)
 440         {
 441             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 442         }
 443     }
 444 }
 445
 446 template <bool returnShift, typename T>
 447 __device__ __forceinline__
 448 static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
 449                            const PbcAiuc &pbcAiuc,
 450                            fvec r_ij, fvec r_kj, fvec r_kl, fvec m, fvec n,
 451                            int *t1, int *t2, int *t3)
 452 {
 453     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
 454     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
 455     *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, r_kl);
 456
 457     cprod_gpu(r_ij, r_kj, m);
 458     cprod_gpu(r_kj, r_kl, n);
 459     float phi  = gmx_angle_gpu(m, n);
 460     float ipr  = iprod_gpu(r_ij, n);
 461     float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
 462     phi        = sign*phi;
 463
 464     return phi;
 465 }
 466
 467
 468 __device__ __forceinline__
 469 static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
 470                         const float phi, float *V, float *F)
 471 {
 472     float mdphi, sdphi;
 473
 474     mdphi = mult*phi - phiA*DEG2RAD;
 475     sdphi = sinf(mdphi);
 476     *V    = cpA * (1.0f + cosf(mdphi));
 477     *F    = -cpA*mult*sdphi;
 478 }
 479
 480 template <bool calcVir>
 481 __device__
 482 static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
 483                            const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
 484                            const fvec m, const fvec n, fvec force[], fvec fshift[],
 485                            const PbcAiuc &pbcAiuc,
 486                            const float4 x[], const int t1, const int t2, const int gmx_unused t3)
 487 {
 488     float iprm  = iprod_gpu(m, m);
 489     float iprn  = iprod_gpu(n, n);
 490     float nrkj2 = iprod_gpu(r_kj, r_kj);
 491     float toler = nrkj2*GMX_REAL_EPS;
 492     if ((iprm > toler) && (iprn > toler))
 493     {
 494         float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
 495         float nrkj_2 = nrkj_1*nrkj_1;
 496         float nrkj   = nrkj2*nrkj_1;
 497         float a      = -ddphi*nrkj/iprm;
 498         fvec  f_i;
 499         svmul_gpu(a, m, f_i);
 500         float b      = ddphi*nrkj/iprn;
 501         fvec  f_l;
 502         svmul_gpu(b, n, f_l);
 503         float p      = iprod_gpu(r_ij, r_kj);
 504         p           *= nrkj_2;
 505         float q      = iprod_gpu(r_kl, r_kj);
 506         q           *= nrkj_2;
 507         fvec  uvec;
 508         svmul_gpu(p, f_i, uvec);
 509         fvec  vvec;
 510         svmul_gpu(q, f_l, vvec);
 511         fvec  svec;
 512         fvec_sub_gpu(uvec, vvec, svec);
 513         fvec  f_j;
 514         fvec_sub_gpu(f_i, svec, f_j);
 515         fvec  f_k;
 516         fvec_add_gpu(f_l, svec, f_k);
 517 #pragma unroll
 518         for (int m = 0; (m < DIM); m++)
 519         {
 520             atomicAdd(&force[i][m], f_i[m]);
 521             atomicAdd(&force[j][m], -f_j[m]);
 522             atomicAdd(&force[k][m], -f_k[m]);
 523             atomicAdd(&force[l][m], f_l[m]);
 524         }
 525
 526         if (calcVir)
 527         {
 528             fvec dx_jl;
 529             int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, x[l], x[j], dx_jl);
 530
 531             fvec_inc_atomic(fshift[t1], f_i);
 532             fvec_dec_atomic(fshift[CENTRAL], f_j);
 533             fvec_dec_atomic(fshift[t2], f_k);
 534             fvec_inc_atomic(fshift[t3], f_l);
 535         }
 536     }
 537 }
 538
 539 template <bool calcVir, bool calcEner>
 540 __global__
 541 void  pdihs_gpu(float *vtot, const int nbonds,
 542                 const t_iatom forceatoms[], const t_iparams forceparams[],
 543                 const float4 x[], fvec f[], fvec fshift[],
 544                 const PbcAiuc pbcAiuc)
 545 {
 546     const int        i = blockIdx.x*blockDim.x + threadIdx.x;
 547
 548     __shared__ float vtot_loc;
 549     __shared__ fvec  fshift_loc[SHIFTS];
 550
 551     if (calcVir || calcEner)
 552     {
 553         if (threadIdx.x == 0)
 554         {
 555             vtot_loc = 0.0f;
 556         }
 557         if (threadIdx.x < SHIFTS)
 558         {
 559             fshift_loc[threadIdx.x][XX] = 0.0f;
 560             fshift_loc[threadIdx.x][YY] = 0.0f;
 561             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 562         }
 563         __syncthreads();
 564     }
 565
 566     if (i < nbonds)
 567     {
 568         int   type = forceatoms[5*i];
 569         int   ai   = forceatoms[5*i + 1];
 570         int   aj   = forceatoms[5*i + 2];
 571         int   ak   = forceatoms[5*i + 3];
 572         int   al   = forceatoms[5*i + 4];
 573
 574         fvec  r_ij;
 575         fvec  r_kj;
 576         fvec  r_kl;
 577         fvec  m;
 578         fvec  n;
 579         int   t1;
 580         int   t2;
 581         int   t3;
 582         float phi  =
 583             dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
 584                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 585
 586         float vpd;
 587         float ddphi;
 588         dopdihs_gpu(forceparams[type].pdihs.cpA,
 589                     forceparams[type].pdihs.phiA,
 590                     forceparams[type].pdihs.mult,
 591                     phi, &vpd, &ddphi);
 592
 593         if (calcEner)
 594         {
 595             atomicAdd(&vtot_loc, vpd);
 596         }
 597
 598         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
 599                                 ddphi, r_ij, r_kj, r_kl, m, n,
 600                                 f, fshift_loc, pbcAiuc,
 601                                 x, t1, t2, t3);
 602
 603     }
 604
 605     if (calcVir || calcEner)
 606     {
 607         __syncthreads();
 608
 609         if (calcEner && threadIdx.x == 0)
 610         {
 611             atomicAdd(vtot, vtot_loc);
 612         }
 613         if (calcVir && threadIdx.x < SHIFTS)
 614         {
 615             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 616         }
 617     }
 618 }
 619
 620 template <bool calcVir, bool calcEner>
 621 __global__
 622 void rbdihs_gpu(float *vtot, const int nbonds,
 623                 const t_iatom forceatoms[], const t_iparams forceparams[],
 624                 const float4 x[], fvec f[], fvec fshift[],
 625                 const PbcAiuc pbcAiuc)
 626 {
 627     constexpr float  c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
 628
 629     __shared__ float vtot_loc;
 630     __shared__ fvec  fshift_loc[SHIFTS];
 631
 632     const int        i = blockIdx.x*blockDim.x + threadIdx.x;
 633
 634     if (calcVir || calcEner)
 635     {
 636         if (threadIdx.x == 0)
 637         {
 638             vtot_loc = 0.0f;
 639         }
 640         if (threadIdx.x < SHIFTS)
 641         {
 642             fshift_loc[threadIdx.x][XX] = 0.0f;
 643             fshift_loc[threadIdx.x][YY] = 0.0f;
 644             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 645         }
 646
 647         __syncthreads();
 648     }
 649
 650     if (i < nbonds)
 651     {
 652         int   type = forceatoms[5*i];
 653         int   ai   = forceatoms[5*i+1];
 654         int   aj   = forceatoms[5*i+2];
 655         int   ak   = forceatoms[5*i+3];
 656         int   al   = forceatoms[5*i+4];
 657
 658         fvec  r_ij;
 659         fvec  r_kj;
 660         fvec  r_kl;
 661         fvec  m;
 662         fvec  n;
 663         int   t1;
 664         int   t2;
 665         int   t3;
 666         float phi  =
 667             dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
 668                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 669
 670         /* Change to polymer convention */
 671         if (phi < c0)
 672         {
 673             phi += CUDART_PI_F;
 674         }
 675         else
 676         {
 677             phi -= CUDART_PI_F;
 678
 679         }
 680         float cos_phi = cosf(phi);
 681         /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
 682         float sin_phi = sinf(phi);
 683
 684         float parm[NR_RBDIHS];
 685         for (int j = 0; j < NR_RBDIHS; j++)
 686         {
 687             parm[j]  = forceparams[type].rbdihs.rbcA[j];
 688         }
 689         /* Calculate cosine powers */
 690         /* Calculate the energy */
 691         /* Calculate the derivative */
 692         float v      = parm[0];
 693         float ddphi  = c0;
 694         float cosfac = c1;
 695
 696         float rbp    = parm[1];
 697         ddphi       += rbp*cosfac;
 698         cosfac      *= cos_phi;
 699         if (calcEner)
 700         {
 701             v       += cosfac*rbp;
 702         }
 703         rbp          = parm[2];
 704         ddphi       += c2*rbp*cosfac;
 705         cosfac      *= cos_phi;
 706         if (calcEner)
 707         {
 708             v       += cosfac*rbp;
 709         }
 710         rbp          = parm[3];
 711         ddphi       += c3*rbp*cosfac;
 712         cosfac      *= cos_phi;
 713         if (calcEner)
 714         {
 715             v       += cosfac*rbp;
 716         }
 717         rbp          = parm[4];
 718         ddphi       += c4*rbp*cosfac;
 719         cosfac      *= cos_phi;
 720         if (calcEner)
 721         {
 722             v       += cosfac*rbp;
 723         }
 724         rbp          = parm[5];
 725         ddphi       += c5*rbp*cosfac;
 726         cosfac      *= cos_phi;
 727         if (calcEner)
 728         {
 729             v       += cosfac*rbp;
 730         }
 731
 732         ddphi = -ddphi*sin_phi;
 733
 734         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
 735                                 ddphi, r_ij, r_kj, r_kl, m, n,
 736                                 f, fshift_loc, pbcAiuc,
 737                                 x, t1, t2, t3);
 738         if (calcEner)
 739         {
 740             atomicAdd(&vtot_loc, v);
 741         }
 742     }
 743
 744     if (calcVir || calcEner)
 745     {
 746         __syncthreads();
 747
 748         if (calcEner && threadIdx.x == 0)
 749         {
 750             atomicAdd(vtot, vtot_loc);
 751         }
 752         if (calcVir && threadIdx.x < SHIFTS)
 753         {
 754             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 755         }
 756     }
 757 }
 758
 759 __device__ __forceinline__
 760 static void make_dp_periodic_gpu(float *dp)
 761 {
 762     /* dp cannot be outside (-pi,pi) */
 763     if (*dp >= CUDART_PI_F)
 764     {
 765         *dp -= 2.0f*CUDART_PI_F;
 766     }
 767     else if (*dp < -CUDART_PI_F)
 768     {
 769         *dp += 2.0f*CUDART_PI_F;
 770     }
 771 }
 772
 773 template <bool calcVir, bool calcEner>
 774 __global__
 775 void  idihs_gpu(float *vtot, const int nbonds,
 776                 const t_iatom forceatoms[], const t_iparams forceparams[],
 777                 const float4 x[], fvec f[], fvec fshift[],
 778                 const PbcAiuc pbcAiuc)
 779 {
 780     const int        i = blockIdx.x*blockDim.x + threadIdx.x;
 781
 782     __shared__ float vtot_loc;
 783     __shared__ fvec  fshift_loc[SHIFTS];
 784
 785     if (calcVir || calcEner)
 786     {
 787         if (threadIdx.x == 0)
 788         {
 789             vtot_loc = 0.0f;
 790         }
 791         if (threadIdx.x < SHIFTS)
 792         {
 793             fshift_loc[threadIdx.x][XX] = 0.0f;
 794             fshift_loc[threadIdx.x][YY] = 0.0f;
 795             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 796         }
 797         __syncthreads();
 798     }
 799
 800     if (i < nbonds)
 801     {
 802         int   type = forceatoms[5*i];
 803         int   ai   = forceatoms[5*i + 1];
 804         int   aj   = forceatoms[5*i + 2];
 805         int   ak   = forceatoms[5*i + 3];
 806         int   al   = forceatoms[5*i + 4];
 807
 808         fvec  r_ij;
 809         fvec  r_kj;
 810         fvec  r_kl;
 811         fvec  m;
 812         fvec  n;
 813         int   t1;
 814         int   t2;
 815         int   t3;
 816         float phi  =
 817             dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
 818                                    r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
 819
 820         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
 821          * force changes if we just apply a normal harmonic.
 822          * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
 823          * This means we will never have the periodicity problem, unless
 824          * the dihedral is Pi away from phiO, which is very unlikely due to
 825          * the potential.
 826          */
 827         float kA   = forceparams[type].harmonic.krA;
 828         float pA   = forceparams[type].harmonic.rA;
 829
 830         float phi0 = pA*DEG2RAD;
 831
 832         float dp   = phi - phi0;
 833
 834         make_dp_periodic_gpu(&dp);
 835
 836         float ddphi = -kA*dp;
 837
 838         do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
 839                                 -ddphi, r_ij, r_kj, r_kl, m, n,
 840                                 f, fshift_loc, pbcAiuc,
 841                                 x, t1, t2, t3);
 842
 843         if (calcEner)
 844         {
 845             atomicAdd(&vtot_loc, -0.5f*ddphi*dp);
 846         }
 847     }
 848
 849     if (calcVir || calcEner)
 850     {
 851         __syncthreads();
 852
 853         if (calcEner && threadIdx.x == 0)
 854         {
 855             atomicAdd(vtot, vtot_loc);
 856         }
 857         if (calcVir && threadIdx.x < SHIFTS)
 858         {
 859             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 860         }
 861     }
 862 }
 863
 864 template <bool calcVir, bool calcEner>
 865 __global__
 866 void pairs_gpu(const int nbonds,
 867                const t_iatom iatoms[], const t_iparams iparams[],
 868                const float4 xq[], fvec force[], fvec fshift[],
 869                const PbcAiuc pbcAiuc,
 870                const float scale_factor,
 871                float *vtotVdw, float *vtotElec)
 872 {
 873     const int        i = blockIdx.x*blockDim.x+threadIdx.x;
 874
 875     __shared__ float vtotVdw_loc;
 876     __shared__ float vtotElec_loc;
 877     __shared__ fvec  fshift_loc[SHIFTS];
 878
 879     if (calcVir || calcEner)
 880     {
 881         if (threadIdx.x == 0)
 882         {
 883             vtotVdw_loc  = 0.0f;
 884             vtotElec_loc = 0.0f;
 885         }
 886
 887         if (threadIdx.x < SHIFTS)
 888         {
 889             fshift_loc[threadIdx.x][XX] = 0.0f;
 890             fshift_loc[threadIdx.x][YY] = 0.0f;
 891             fshift_loc[threadIdx.x][ZZ] = 0.0f;
 892         }
 893         __syncthreads();
 894     }
 895
 896     if (i <  nbonds)
 897     {
 898         int   itype = iatoms[3*i];
 899         int   ai    = iatoms[3*i + 1];
 900         int   aj    = iatoms[3*i + 2];
 901
 902         float qq    = xq[ai].w*xq[aj].w;
 903         float c6    = iparams[itype].lj14.c6A;
 904         float c12   = iparams[itype].lj14.c12A;
 905
 906         /* Do we need to apply full periodic boundary conditions? */
 907         fvec  dr;
 908         int   fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);
 909
 910         float r2    = norm2_gpu(dr);
 911         float rinv  = rsqrtf(r2);
 912         float rinv2 = rinv*rinv;
 913         float rinv6 = rinv2*rinv2*rinv2;
 914
 915         /* Calculate the Coulomb force * r */
 916         float velec = scale_factor*qq*rinv;
 917
 918         /* Calculate the LJ force * r and add it to the Coulomb part */
 919         float fr    = (12.0f*c12*rinv6 - 6.0f*c6)*rinv6 + velec;
 920
 921         float finvr = fr * rinv2;
 922         fvec  f;
 923         svmul_gpu(finvr, dr, f);
 924
 925         /* Add the forces */
 926 #pragma unroll
 927         for (int m = 0; m < DIM; m++)
 928         {
 929             atomicAdd(&force[ai][m], f[m]);
 930             atomicAdd(&force[aj][m], -f[m]);
 931         }
 932
 933         if (calcEner)
 934         {
 935             atomicAdd(&vtotVdw_loc, (c12*rinv6 - c6)*rinv6);
 936             atomicAdd(&vtotElec_loc, velec);
 937         }
 938
 939         if (calcVir && fshift_index != CENTRAL)
 940         {
 941             fvec_inc_atomic(fshift_loc[fshift_index], f);
 942             fvec_dec_atomic(fshift_loc[CENTRAL], f);
 943         }
 944     }
 945
 946     if (calcVir || calcEner)
 947     {
 948         __syncthreads();
 949
 950         if (calcEner && threadIdx.x == 0)
 951         {
 952             atomicAdd(vtotVdw, vtotVdw_loc);
 953             atomicAdd(vtotElec, vtotElec_loc);
 954         }
 955         if (calcVir && threadIdx.x < SHIFTS)
 956         {
 957             fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
 958         }
 959     }
 960 }
 961
 962 /*-------------------------------- End CUDA kernels-----------------------------*/
 963
 964
 965 static void setPbcAiuc(int           numPbcDim,
 966                        const matrix  box,
 967                        PbcAiuc      *pbcAiuc)
 968 {
 969     if (numPbcDim > ZZ)
 970     {
 971         pbcAiuc->invBoxDiagZ = 1/box[ZZ][ZZ];
 972         pbcAiuc->boxZX       = box[ZZ][XX];
 973         pbcAiuc->boxZY       = box[ZZ][YY];
 974         pbcAiuc->boxZZ       = box[ZZ][ZZ];
 975     }
 976     else
 977     {
 978         pbcAiuc->invBoxDiagZ = 0;
 979         pbcAiuc->boxZX       = 0;
 980         pbcAiuc->boxZY       = 0;
 981         pbcAiuc->boxZZ       = 0;
 982     }
 983     if (numPbcDim > YY)
 984     {
 985         pbcAiuc->invBoxDiagY = 1/box[YY][YY];
 986         pbcAiuc->boxYX       = box[YY][XX];
 987         pbcAiuc->boxYY       = box[YY][YY];
 988     }
 989     else
 990     {
 991         pbcAiuc->invBoxDiagY = 0;
 992         pbcAiuc->boxYX       = 0;
 993         pbcAiuc->boxYY       = 0;
 994     }
 995     if (numPbcDim > XX)
 996     {
 997         pbcAiuc->invBoxDiagX = 1/box[XX][XX];
 998         pbcAiuc->boxXX       = box[XX][XX];
 999     }
1000     else
1001     {
1002         pbcAiuc->invBoxDiagX = 0;
1003         pbcAiuc->boxXX       = 0;
1004     }
1005 }
1006
1007 namespace gmx
1008 {
1009
1010 template <bool calcVir, bool calcEner>
1011 void
1012 GpuBonded::Impl::launchKernels(const t_forcerec *fr,
1013                                const matrix      box)
1014 {
1015     GMX_ASSERT(haveInteractions_,
1016                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
1017
1018     PbcAiuc       pbcAiuc;
1019     setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
1020
1021     const t_iparams *forceparams_d = forceparamsDevice;
1022     float           *vtot_d        = vtotDevice;
1023     const float4    *xq_d          = xqDevice;
1024     fvec            *force_d       = forceDevice;
1025     fvec            *fshift_d      = fshiftDevice;
1026
1027     for (int ftype : ftypesOnGpu)
1028     {
1029         const auto &iList = iLists[ftype];
1030
1031         if (iList.size() > 0)
1032         {
1033             int                nat1   = interaction_function[ftype].nratoms + 1;
1034             int                nbonds = iList.size()/nat1;
1035
1036             KernelLaunchConfig config;
1037             config.blockSize[0] = TPB_BONDED;
1038             config.blockSize[1] = 1;
1039             config.blockSize[2] = 1;
1040             config.gridSize[0]  = (nbonds + TPB_BONDED - 1)/TPB_BONDED;
1041             config.gridSize[1]  = 1;
1042             config.gridSize[2]  = 1;
1043             config.stream       = stream;
1044
1045             const t_iatom *iatoms = iListsDevice[ftype].iatoms;
1046
1047             if (ftype == F_PDIHS || ftype == F_PIDIHS)
1048             {
1049                 auto       kernelPtr      = pdihs_gpu<calcVir, calcEner>;
1050                 float     *ftypeEnergyPtr = vtot_d + ftype;
1051                 const auto kernelArgs     = prepareGpuKernelArguments(kernelPtr, config,
1052                                                                       &ftypeEnergyPtr, &nbonds,
1053                                                                       &iatoms, &forceparams_d,
1054                                                                       &xq_d, &force_d, &fshift_d,
1055                                                                       &pbcAiuc);
1056                 launchGpuKernel(kernelPtr, config, nullptr, "pdihs_gpu<calcVir, calcEner>", kernelArgs);
1057             }
1058         }
1059     }
1060
1061     for (int ftype : ftypesOnGpu)
1062     {
1063         const auto &iList = iLists[ftype];
1064
1065         if (iList.size() > 0)
1066         {
1067             int                nat1   = interaction_function[ftype].nratoms + 1;
1068             int                nbonds = iList.size()/nat1;
1069
1070             const t_iatom     *iatoms = iListsDevice[ftype].iatoms;
1071
1072             KernelLaunchConfig config;
1073             config.blockSize[0] = TPB_BONDED;
1074             config.blockSize[1] = 1;
1075             config.blockSize[2] = 1;
1076             config.gridSize[0]  = (nbonds + TPB_BONDED - 1)/TPB_BONDED;
1077             config.gridSize[1]  = 1;
1078             config.gridSize[2]  = 1;
1079             config.stream       = stream;
1080
1081             float *ftypeEnergyPtr = vtot_d + ftype;
1082             // TODO consider using a map to assign the fn pointers to ftypes
1083             if (ftype == F_BONDS)
1084             {
1085                 auto       kernelPtr  = bonds_gpu<calcVir, calcEner>;
1086                 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
1087                                                                   &ftypeEnergyPtr, &nbonds,
1088                                                                   &iatoms, &forceparams_d,
1089                                                                   &xq_d, &force_d, &fshift_d,
1090                                                                   &pbcAiuc);
1091                 launchGpuKernel(kernelPtr, config, nullptr, "bonds_gpu<calcVir, calcEner>", kernelArgs);
1092             }
1093
1094             if (ftype == F_ANGLES)
1095             {
1096                 auto       kernelPtr  = angles_gpu<calcVir, calcEner>;
1097                 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
1098                                                                   &ftypeEnergyPtr, &nbonds,
1099                                                                   &iatoms, &forceparams_d,
1100                                                                   &xq_d, &force_d, &fshift_d,
1101                                                                   &pbcAiuc);
1102                 launchGpuKernel(kernelPtr, config, nullptr, "angles_gpu<calcVir, calcEner>", kernelArgs);
1103             }
1104
1105             if (ftype == F_UREY_BRADLEY)
1106             {
1107                 auto       kernelPtr  = urey_bradley_gpu<calcVir, calcEner>;
1108                 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
1109                                                                   &ftypeEnergyPtr, &nbonds,
1110                                                                   &iatoms, &forceparams_d,
1111                                                                   &xq_d, &force_d, &fshift_d,
1112                                                                   &pbcAiuc);
1113                 launchGpuKernel(kernelPtr, config, nullptr, "urey_bradley_gpu<calcVir, calcEner>", kernelArgs);
1114             }
1115
1116             if (ftype == F_RBDIHS)
1117             {
1118                 auto       kernelPtr  = rbdihs_gpu<calcVir, calcEner>;
1119                 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
1120                                                                   &ftypeEnergyPtr, &nbonds,
1121                                                                   &iatoms, &forceparams_d,
1122                                                                   &xq_d, &force_d, &fshift_d,
1123                                                                   &pbcAiuc);
1124                 launchGpuKernel(kernelPtr, config, nullptr, "rbdihs_gpu<calcVir, calcEner>", kernelArgs);
1125             }
1126
1127             if (ftype == F_IDIHS)
1128             {
1129                 auto       kernelPtr  = idihs_gpu<calcVir, calcEner>;
1130                 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
1131                                                                   &ftypeEnergyPtr, &nbonds,
1132                                                                   &iatoms, &forceparams_d,
1133                                                                   &xq_d, &force_d, &fshift_d,
1134                                                                   &pbcAiuc);
1135                 launchGpuKernel(kernelPtr, config, nullptr, "idihs_gpu<calcVir, calcEner>", kernelArgs);
1136             }
1137
1138             if (ftype == F_LJ14)
1139             {
1140                 auto       kernelPtr       = pairs_gpu<calcVir, calcEner>;
1141                 float      scale_factor    = fr->ic->epsfac*fr->fudgeQQ;
1142                 float     *lj14Energy      = vtot_d + F_LJ14;
1143                 float     *coulomb14Energy = vtot_d + F_COUL14;
1144                 const auto kernelArgs      = prepareGpuKernelArguments(kernelPtr, config,
1145                                                                        &nbonds,
1146                                                                        &iatoms, &forceparams_d,
1147                                                                        &xq_d, &force_d, &fshift_d,
1148                                                                        &pbcAiuc,
1149                                                                        &scale_factor,
1150                                                                        &lj14Energy, &coulomb14Energy);
1151                 launchGpuKernel(kernelPtr, config, nullptr, "pairs_gpu<calcVir, calcEner>", kernelArgs);
1152             }
1153         }
1154     }
1155 }
1156
1157 void
1158 GpuBonded::launchKernels(const t_forcerec *fr,
1159                          int               forceFlags,
1160                          const matrix      box)
1161 {
1162     if (forceFlags & GMX_FORCE_ENERGY)
1163     {
1164         // When we need the energy, we also need the virial
1165         impl_->launchKernels<true, true>
1166             (fr, box);
1167     }
1168     else if (forceFlags & GMX_FORCE_VIRIAL)
1169     {
1170         impl_->launchKernels<true, false>
1171             (fr, box);
1172     }
1173     else
1174     {
1175         impl_->launchKernels<false, false>
1176             (fr, box);
1177     }
1178 }
1179
1180 } // namespace gmx