42d24e40e1a33b2d67a3fb1783e5eef31108e618
[alexxy/gromacs.git] / src / gromacs / listed_forces / gpubondedkernels.cu
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*! \internal \file
36  *
37  * \brief Implements CUDA bonded functionality
38  *
39  * \author Jon Vincent <jvincent@nvidia.com>
40  * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41  * \author Berk Hess <hess@kth.se>
42  * \author Szilárd Páll <pall.szilard@gmail.com>
43  * \author Alan Gray <alang@nvidia.com>
44  * \author Mark Abraham <mark.j.abraham@gmail.com>
45  *
46  * \ingroup module_listed_forces
47  */
48
49 #include "gmxpre.h"
50
51 #include <cassert>
52
53 #include <math_constants.h>
54
55 #include "gromacs/gpu_utils/cudautils.cuh"
56 #include "gromacs/gpu_utils/typecasts.cuh"
57 #include "gromacs/gpu_utils/vectype_ops.cuh"
58 #include "gromacs/listed_forces/gpubonded.h"
59 #include "gromacs/math/units.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/simulation_workload.h"
63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
64 #include "gromacs/timing/wallcycle.h"
65 #include "gromacs/utility/gmxassert.h"
66
67 #include "gpubonded_impl.h"
68
69 #if defined(_MSVC)
70 #    include <limits>
71 #endif
72
73 /*-------------------------------- CUDA kernels-------------------------------- */
74 /*------------------------------------------------------------------------------*/
75
76 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
77
78 /*---------------- BONDED CUDA kernels--------------*/
79
80 /* Harmonic */
81 __device__ __forceinline__ static void
82            harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
83 {
84     constexpr float half = 0.5f;
85     float           dx, dx2;
86
87     dx  = x - xA;
88     dx2 = dx * dx;
89
90     *F = -kA * dx;
91     *V = half * kA * dx2;
92 }
93
94 template<bool calcVir, bool calcEner>
95 __device__ void bonds_gpu(const int       i,
96                           float*          vtot_loc,
97                           const int       numBonds,
98                           const t_iatom   d_forceatoms[],
99                           const t_iparams d_forceparams[],
100                           const float4    gm_xq[],
101                           float3          gm_f[],
102                           float3          sm_fShiftLoc[],
103                           const PbcAiuc   pbcAiuc)
104 {
105     if (i < numBonds)
106     {
107         int3 bondData = *(int3*)(d_forceatoms + 3 * i);
108         int  type     = bondData.x;
109         int  ai       = bondData.y;
110         int  aj       = bondData.z;
111
112         /* dx = xi - xj, corrected for periodic boundary conditions. */
113         float3 dx;
114         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
115
116         float dr2 = norm2(dx);
117         float dr  = sqrt(dr2);
118
119         float vbond;
120         float fbond;
121         harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
122
123         if (calcEner)
124         {
125             *vtot_loc += vbond;
126         }
127
128         if (dr2 != 0.0f)
129         {
130             fbond *= rsqrtf(dr2);
131
132             float3 fij = fbond * dx;
133             atomicAdd(&gm_f[ai], fij);
134             atomicAdd(&gm_f[aj], -fij);
135             if (calcVir && ki != CENTRAL)
136             {
137                 atomicAdd(&sm_fShiftLoc[ki], fij);
138                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
139             }
140         }
141     }
142 }
143
144 template<bool returnShift>
145 __device__ __forceinline__ static float bond_angle_gpu(const float4   xi,
146                                                        const float4   xj,
147                                                        const float4   xk,
148                                                        const PbcAiuc& pbcAiuc,
149                                                        float3*        r_ij,
150                                                        float3*        r_kj,
151                                                        float*         costh,
152                                                        int*           t1,
153                                                        int*           t2)
154 /* Return value is the angle between the bonds i-j and j-k */
155 {
156     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
157     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
158
159     *costh   = cos_angle(*r_ij, *r_kj);
160     float th = acosf(*costh);
161
162     return th;
163 }
164
165 template<bool calcVir, bool calcEner>
166 __device__ void angles_gpu(const int       i,
167                            float*          vtot_loc,
168                            const int       numBonds,
169                            const t_iatom   d_forceatoms[],
170                            const t_iparams d_forceparams[],
171                            const float4    gm_xq[],
172                            float3          gm_f[],
173                            float3          sm_fShiftLoc[],
174                            const PbcAiuc   pbcAiuc)
175 {
176     if (i < numBonds)
177     {
178         int4 angleData = *(int4*)(d_forceatoms + 4 * i);
179         int  type      = angleData.x;
180         int  ai        = angleData.y;
181         int  aj        = angleData.z;
182         int  ak        = angleData.w;
183
184         float3 r_ij;
185         float3 r_kj;
186         float  cos_theta;
187         int    t1;
188         int    t2;
189         float  theta = bond_angle_gpu<calcVir>(
190                 gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij, &r_kj, &cos_theta, &t1, &t2);
191
192         float va;
193         float dVdt;
194         harmonic_gpu(d_forceparams[type].harmonic.krA,
195                      d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F,
196                      theta,
197                      &va,
198                      &dVdt);
199
200         if (calcEner)
201         {
202             *vtot_loc += va;
203         }
204
205         float cos_theta2 = cos_theta * cos_theta;
206         if (cos_theta2 < 1.0f)
207         {
208             float st    = dVdt * rsqrtf(1.0f - cos_theta2);
209             float sth   = st * cos_theta;
210             float nrij2 = norm2(r_ij);
211             float nrkj2 = norm2(r_kj);
212
213             float nrij_1 = rsqrtf(nrij2);
214             float nrkj_1 = rsqrtf(nrkj2);
215
216             float cik = st * nrij_1 * nrkj_1;
217             float cii = sth * nrij_1 * nrij_1;
218             float ckk = sth * nrkj_1 * nrkj_1;
219
220             float3 f_i = cii * r_ij - cik * r_kj;
221             float3 f_k = ckk * r_kj - cik * r_ij;
222             float3 f_j = -f_i - f_k;
223
224             atomicAdd(&gm_f[ai], f_i);
225             atomicAdd(&gm_f[aj], f_j);
226             atomicAdd(&gm_f[ak], f_k);
227
228             if (calcVir)
229             {
230                 atomicAdd(&sm_fShiftLoc[t1], f_i);
231                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
232                 atomicAdd(&sm_fShiftLoc[t2], f_k);
233             }
234         }
235     }
236 }
237
238 template<bool calcVir, bool calcEner>
239 __device__ void urey_bradley_gpu(const int       i,
240                                  float*          vtot_loc,
241                                  const int       numBonds,
242                                  const t_iatom   d_forceatoms[],
243                                  const t_iparams d_forceparams[],
244                                  const float4    gm_xq[],
245                                  float3          gm_f[],
246                                  float3          sm_fShiftLoc[],
247                                  const PbcAiuc   pbcAiuc)
248 {
249     if (i < numBonds)
250     {
251         int4 ubData = *(int4*)(d_forceatoms + 4 * i);
252         int  type   = ubData.x;
253         int  ai     = ubData.y;
254         int  aj     = ubData.z;
255         int  ak     = ubData.w;
256
257         float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
258         float kthA = d_forceparams[type].u_b.kthetaA;
259         float r13A = d_forceparams[type].u_b.r13A;
260         float kUBA = d_forceparams[type].u_b.kUBA;
261
262         float3 r_ij;
263         float3 r_kj;
264         float  cos_theta;
265         int    t1;
266         int    t2;
267         float  theta = bond_angle_gpu<calcVir>(
268                 gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij, &r_kj, &cos_theta, &t1, &t2);
269
270         float va;
271         float dVdt;
272         harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
273
274         if (calcEner)
275         {
276             *vtot_loc += va;
277         }
278
279         float3 r_ik;
280         int    ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
281
282         float dr2 = norm2(r_ik);
283         float dr  = dr2 * rsqrtf(dr2);
284
285         float vbond;
286         float fbond;
287         harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
288
289         float cos_theta2 = cos_theta * cos_theta;
290         if (cos_theta2 < 1.0f)
291         {
292             float st  = dVdt * rsqrtf(1.0f - cos_theta2);
293             float sth = st * cos_theta;
294
295             float nrkj2 = norm2(r_kj);
296             float nrij2 = norm2(r_ij);
297
298             float cik = st * rsqrtf(nrkj2 * nrij2);
299             float cii = sth / nrij2;
300             float ckk = sth / nrkj2;
301
302             float3 f_i = cii * r_ij - cik * r_kj;
303             float3 f_k = ckk * r_kj - cik * r_ij;
304             float3 f_j = -f_i - f_k;
305
306             atomicAdd(&gm_f[ai], f_i);
307             atomicAdd(&gm_f[aj], f_j);
308             atomicAdd(&gm_f[ak], f_k);
309
310             if (calcVir)
311             {
312                 atomicAdd(&sm_fShiftLoc[t1], f_i);
313                 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
314                 atomicAdd(&sm_fShiftLoc[t2], f_k);
315             }
316         }
317
318         /* Time for the bond calculations */
319         if (dr2 != 0.0f)
320         {
321             if (calcEner)
322             {
323                 *vtot_loc += vbond;
324             }
325
326             fbond *= rsqrtf(dr2);
327
328             float3 fik = fbond * r_ik;
329             atomicAdd(&gm_f[ai], fik);
330             atomicAdd(&gm_f[ak], -fik);
331
332             if (calcVir && ki != CENTRAL)
333             {
334                 atomicAdd(&sm_fShiftLoc[ki], fik);
335                 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
336             }
337         }
338     }
339 }
340
341 template<bool returnShift, typename T>
342 __device__ __forceinline__ static float dih_angle_gpu(const T        xi,
343                                                       const T        xj,
344                                                       const T        xk,
345                                                       const T        xl,
346                                                       const PbcAiuc& pbcAiuc,
347                                                       float3*        r_ij,
348                                                       float3*        r_kj,
349                                                       float3*        r_kl,
350                                                       float3*        m,
351                                                       float3*        n,
352                                                       int*           t1,
353                                                       int*           t2,
354                                                       int*           t3)
355 {
356     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
357     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
358     *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
359
360     *m         = cprod(*r_ij, *r_kj);
361     *n         = cprod(*r_kj, *r_kl);
362     float phi  = gmx_angle(*m, *n);
363     float ipr  = iprod(*r_ij, *n);
364     float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
365     phi        = sign * phi;
366
367     return phi;
368 }
369
370
371 __device__ __forceinline__ static void
372            dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
373 {
374     float mdphi, sdphi;
375
376     mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
377     sdphi = sinf(mdphi);
378     *v    = cpA * (1.0f + cosf(mdphi));
379     *f    = -cpA * mult * sdphi;
380 }
381
382 template<bool calcVir>
383 __device__ static void do_dih_fup_gpu(const int      i,
384                                       const int      j,
385                                       const int      k,
386                                       const int      l,
387                                       const float    ddphi,
388                                       const float3   r_ij,
389                                       const float3   r_kj,
390                                       const float3   r_kl,
391                                       const float3   m,
392                                       const float3   n,
393                                       float3         gm_f[],
394                                       float3         sm_fShiftLoc[],
395                                       const PbcAiuc& pbcAiuc,
396                                       const float4   gm_xq[],
397                                       const int      t1,
398                                       const int      t2,
399                                       const int gmx_unused t3)
400 {
401     float iprm  = norm2(m);
402     float iprn  = norm2(n);
403     float nrkj2 = norm2(r_kj);
404     float toler = nrkj2 * GMX_REAL_EPS;
405     if ((iprm > toler) && (iprn > toler))
406     {
407         float  nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
408         float  nrkj_2 = nrkj_1 * nrkj_1;
409         float  nrkj   = nrkj2 * nrkj_1;
410         float  a      = -ddphi * nrkj / iprm;
411         float3 f_i    = a * m;
412         float  b      = ddphi * nrkj / iprn;
413         float3 f_l    = b * n;
414         float  p      = iprod(r_ij, r_kj);
415         p *= nrkj_2;
416         float q = iprod(r_kl, r_kj);
417         q *= nrkj_2;
418         float3 uvec = p * f_i;
419         float3 vvec = q * f_l;
420         float3 svec = uvec - vvec;
421         float3 f_j  = f_i - svec;
422         float3 f_k  = f_l + svec;
423
424         atomicAdd(&gm_f[i], f_i);
425         atomicAdd(&gm_f[j], -f_j);
426         atomicAdd(&gm_f[k], -f_k);
427         atomicAdd(&gm_f[l], f_l);
428
429         if (calcVir)
430         {
431             float3 dx_jl;
432             int    t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
433
434             atomicAdd(&sm_fShiftLoc[t1], f_i);
435             atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
436             atomicAdd(&sm_fShiftLoc[t2], -f_k);
437             atomicAdd(&sm_fShiftLoc[t3], f_l);
438         }
439     }
440 }
441
442 template<bool calcVir, bool calcEner>
443 __device__ void pdihs_gpu(const int       i,
444                           float*          vtot_loc,
445                           const int       numBonds,
446                           const t_iatom   d_forceatoms[],
447                           const t_iparams d_forceparams[],
448                           const float4    gm_xq[],
449                           float3          gm_f[],
450                           float3          sm_fShiftLoc[],
451                           const PbcAiuc   pbcAiuc)
452 {
453     if (i < numBonds)
454     {
455         int type = d_forceatoms[5 * i];
456         int ai   = d_forceatoms[5 * i + 1];
457         int aj   = d_forceatoms[5 * i + 2];
458         int ak   = d_forceatoms[5 * i + 3];
459         int al   = d_forceatoms[5 * i + 4];
460
461         float3 r_ij;
462         float3 r_kj;
463         float3 r_kl;
464         float3 m;
465         float3 n;
466         int    t1;
467         int    t2;
468         int    t3;
469         float  phi = dih_angle_gpu<calcVir>(
470                 gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc, &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
471
472         float vpd;
473         float ddphi;
474         dopdihs_gpu(d_forceparams[type].pdihs.cpA,
475                     d_forceparams[type].pdihs.phiA,
476                     d_forceparams[type].pdihs.mult,
477                     phi,
478                     &vpd,
479                     &ddphi);
480
481         if (calcEner)
482         {
483             *vtot_loc += vpd;
484         }
485
486         do_dih_fup_gpu<calcVir>(
487                 ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc, pbcAiuc, gm_xq, t1, t2, t3);
488     }
489 }
490
491 template<bool calcVir, bool calcEner>
492 __device__ void rbdihs_gpu(const int       i,
493                            float*          vtot_loc,
494                            const int       numBonds,
495                            const t_iatom   d_forceatoms[],
496                            const t_iparams d_forceparams[],
497                            const float4    gm_xq[],
498                            float3          gm_f[],
499                            float3          sm_fShiftLoc[],
500                            const PbcAiuc   pbcAiuc)
501 {
502     constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
503
504     if (i < numBonds)
505     {
506         int type = d_forceatoms[5 * i];
507         int ai   = d_forceatoms[5 * i + 1];
508         int aj   = d_forceatoms[5 * i + 2];
509         int ak   = d_forceatoms[5 * i + 3];
510         int al   = d_forceatoms[5 * i + 4];
511
512         float3 r_ij;
513         float3 r_kj;
514         float3 r_kl;
515         float3 m;
516         float3 n;
517         int    t1;
518         int    t2;
519         int    t3;
520         float  phi = dih_angle_gpu<calcVir>(
521                 gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc, &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
522
523         /* Change to polymer convention */
524         if (phi < c0)
525         {
526             phi += CUDART_PI_F;
527         }
528         else
529         {
530             phi -= CUDART_PI_F;
531         }
532         float cos_phi = cosf(phi);
533         /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
534         float sin_phi = sinf(phi);
535
536         float parm[NR_RBDIHS];
537         for (int j = 0; j < NR_RBDIHS; j++)
538         {
539             parm[j] = d_forceparams[type].rbdihs.rbcA[j];
540         }
541         /* Calculate cosine powers */
542         /* Calculate the energy */
543         /* Calculate the derivative */
544         float v      = parm[0];
545         float ddphi  = c0;
546         float cosfac = c1;
547
548         float rbp = parm[1];
549         ddphi += rbp * cosfac;
550         cosfac *= cos_phi;
551         if (calcEner)
552         {
553             v += cosfac * rbp;
554         }
555         rbp = parm[2];
556         ddphi += c2 * rbp * cosfac;
557         cosfac *= cos_phi;
558         if (calcEner)
559         {
560             v += cosfac * rbp;
561         }
562         rbp = parm[3];
563         ddphi += c3 * rbp * cosfac;
564         cosfac *= cos_phi;
565         if (calcEner)
566         {
567             v += cosfac * rbp;
568         }
569         rbp = parm[4];
570         ddphi += c4 * rbp * cosfac;
571         cosfac *= cos_phi;
572         if (calcEner)
573         {
574             v += cosfac * rbp;
575         }
576         rbp = parm[5];
577         ddphi += c5 * rbp * cosfac;
578         cosfac *= cos_phi;
579         if (calcEner)
580         {
581             v += cosfac * rbp;
582         }
583
584         ddphi = -ddphi * sin_phi;
585
586         do_dih_fup_gpu<calcVir>(
587                 ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc, pbcAiuc, gm_xq, t1, t2, t3);
588         if (calcEner)
589         {
590             *vtot_loc += v;
591         }
592     }
593 }
594
595 __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
596 {
597     /* dp cannot be outside (-pi,pi) */
598     if (*dp >= CUDART_PI_F)
599     {
600         *dp -= 2.0f * CUDART_PI_F;
601     }
602     else if (*dp < -CUDART_PI_F)
603     {
604         *dp += 2.0f * CUDART_PI_F;
605     }
606 }
607
608 template<bool calcVir, bool calcEner>
609 __device__ void idihs_gpu(const int       i,
610                           float*          vtot_loc,
611                           const int       numBonds,
612                           const t_iatom   d_forceatoms[],
613                           const t_iparams d_forceparams[],
614                           const float4    gm_xq[],
615                           float3          gm_f[],
616                           float3          sm_fShiftLoc[],
617                           const PbcAiuc   pbcAiuc)
618 {
619     if (i < numBonds)
620     {
621         int type = d_forceatoms[5 * i];
622         int ai   = d_forceatoms[5 * i + 1];
623         int aj   = d_forceatoms[5 * i + 2];
624         int ak   = d_forceatoms[5 * i + 3];
625         int al   = d_forceatoms[5 * i + 4];
626
627         float3 r_ij;
628         float3 r_kj;
629         float3 r_kl;
630         float3 m;
631         float3 n;
632         int    t1;
633         int    t2;
634         int    t3;
635         float  phi = dih_angle_gpu<calcVir>(
636                 gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc, &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
637
638         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
639          * force changes if we just apply a normal harmonic.
640          * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
641          * This means we will never have the periodicity problem, unless
642          * the dihedral is Pi away from phiO, which is very unlikely due to
643          * the potential.
644          */
645         float kA = d_forceparams[type].harmonic.krA;
646         float pA = d_forceparams[type].harmonic.rA;
647
648         float phi0 = pA * CUDA_DEG2RAD_F;
649
650         float dp = phi - phi0;
651
652         make_dp_periodic_gpu(&dp);
653
654         float ddphi = -kA * dp;
655
656         do_dih_fup_gpu<calcVir>(
657                 ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc, pbcAiuc, gm_xq, t1, t2, t3);
658
659         if (calcEner)
660         {
661             *vtot_loc += -0.5f * ddphi * dp;
662         }
663     }
664 }
665
666 template<bool calcVir, bool calcEner>
667 __device__ void pairs_gpu(const int       i,
668                           const int       numBonds,
669                           const t_iatom   d_forceatoms[],
670                           const t_iparams iparams[],
671                           const float4    gm_xq[],
672                           float3          gm_f[],
673                           float3          sm_fShiftLoc[],
674                           const PbcAiuc   pbcAiuc,
675                           const float     scale_factor,
676                           float*          vtotVdw_loc,
677                           float*          vtotElec_loc)
678 {
679     if (i < numBonds)
680     {
681         // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
682         int3 pairData = *(int3*)(d_forceatoms + 3 * i);
683         int  type     = pairData.x;
684         int  ai       = pairData.y;
685         int  aj       = pairData.z;
686
687         float qq  = gm_xq[ai].w * gm_xq[aj].w;
688         float c6  = iparams[type].lj14.c6A;
689         float c12 = iparams[type].lj14.c12A;
690
691         /* Do we need to apply full periodic boundary conditions? */
692         float3 dr;
693         int    fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
694
695         float r2    = norm2(dr);
696         float rinv  = rsqrtf(r2);
697         float rinv2 = rinv * rinv;
698         float rinv6 = rinv2 * rinv2 * rinv2;
699
700         /* Calculate the Coulomb force * r */
701         float velec = scale_factor * qq * rinv;
702
703         /* Calculate the LJ force * r and add it to the Coulomb part */
704         float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
705
706         float  finvr = fr * rinv2;
707         float3 f     = finvr * dr;
708
709         /* Add the forces */
710         atomicAdd(&gm_f[ai], f);
711         atomicAdd(&gm_f[aj], -f);
712         if (calcVir && fshift_index != CENTRAL)
713         {
714             atomicAdd(&sm_fShiftLoc[fshift_index], f);
715             atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
716         }
717
718         if (calcEner)
719         {
720             *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
721             *vtotElec_loc += velec;
722         }
723     }
724 }
725
726 namespace gmx
727 {
728
729 template<bool calcVir, bool calcEner>
730 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
731 {
732     assert(blockDim.y == 1 && blockDim.z == 1);
733     const int tid          = blockIdx.x * blockDim.x + threadIdx.x;
734     float     vtot_loc     = 0;
735     float     vtotVdw_loc  = 0;
736     float     vtotElec_loc = 0;
737
738     extern __shared__ char sm_dynamicShmem[];
739     char*                  sm_nextSlotPtr = sm_dynamicShmem;
740     float3*                sm_fShiftLoc   = (float3*)sm_nextSlotPtr;
741     sm_nextSlotPtr += SHIFTS * sizeof(float3);
742
743     if (calcVir)
744     {
745         if (threadIdx.x < SHIFTS)
746         {
747             sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
748         }
749         __syncthreads();
750     }
751
752     int  fType;
753     bool threadComputedPotential = false;
754 #pragma unroll
755     for (int j = 0; j < numFTypesOnGpu; j++)
756     {
757         if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
758         {
759             const int      numBonds = kernelParams.numFTypeBonds[j];
760             int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
761             const t_iatom* iatoms   = kernelParams.d_iatoms[j];
762             fType                   = kernelParams.fTypesOnGpu[j];
763             if (calcEner)
764             {
765                 threadComputedPotential = true;
766             }
767
768             switch (fType)
769             {
770                 case F_BONDS:
771                     bonds_gpu<calcVir, calcEner>(fTypeTid,
772                                                  &vtot_loc,
773                                                  numBonds,
774                                                  iatoms,
775                                                  kernelParams.d_forceParams,
776                                                  kernelParams.d_xq,
777                                                  kernelParams.d_f,
778                                                  sm_fShiftLoc,
779                                                  kernelParams.pbcAiuc);
780                     break;
781                 case F_ANGLES:
782                     angles_gpu<calcVir, calcEner>(fTypeTid,
783                                                   &vtot_loc,
784                                                   numBonds,
785                                                   iatoms,
786                                                   kernelParams.d_forceParams,
787                                                   kernelParams.d_xq,
788                                                   kernelParams.d_f,
789                                                   sm_fShiftLoc,
790                                                   kernelParams.pbcAiuc);
791                     break;
792                 case F_UREY_BRADLEY:
793                     urey_bradley_gpu<calcVir, calcEner>(fTypeTid,
794                                                         &vtot_loc,
795                                                         numBonds,
796                                                         iatoms,
797                                                         kernelParams.d_forceParams,
798                                                         kernelParams.d_xq,
799                                                         kernelParams.d_f,
800                                                         sm_fShiftLoc,
801                                                         kernelParams.pbcAiuc);
802                     break;
803                 case F_PDIHS:
804                 case F_PIDIHS:
805                     pdihs_gpu<calcVir, calcEner>(fTypeTid,
806                                                  &vtot_loc,
807                                                  numBonds,
808                                                  iatoms,
809                                                  kernelParams.d_forceParams,
810                                                  kernelParams.d_xq,
811                                                  kernelParams.d_f,
812                                                  sm_fShiftLoc,
813                                                  kernelParams.pbcAiuc);
814                     break;
815                 case F_RBDIHS:
816                     rbdihs_gpu<calcVir, calcEner>(fTypeTid,
817                                                   &vtot_loc,
818                                                   numBonds,
819                                                   iatoms,
820                                                   kernelParams.d_forceParams,
821                                                   kernelParams.d_xq,
822                                                   kernelParams.d_f,
823                                                   sm_fShiftLoc,
824                                                   kernelParams.pbcAiuc);
825                     break;
826                 case F_IDIHS:
827                     idihs_gpu<calcVir, calcEner>(fTypeTid,
828                                                  &vtot_loc,
829                                                  numBonds,
830                                                  iatoms,
831                                                  kernelParams.d_forceParams,
832                                                  kernelParams.d_xq,
833                                                  kernelParams.d_f,
834                                                  sm_fShiftLoc,
835                                                  kernelParams.pbcAiuc);
836                     break;
837                 case F_LJ14:
838                     pairs_gpu<calcVir, calcEner>(fTypeTid,
839                                                  numBonds,
840                                                  iatoms,
841                                                  kernelParams.d_forceParams,
842                                                  kernelParams.d_xq,
843                                                  kernelParams.d_f,
844                                                  sm_fShiftLoc,
845                                                  kernelParams.pbcAiuc,
846                                                  kernelParams.electrostaticsScaleFactor,
847                                                  &vtotVdw_loc,
848                                                  &vtotElec_loc);
849                     break;
850             }
851             break;
852         }
853     }
854
855     if (threadComputedPotential)
856     {
857         float* vtotVdw  = kernelParams.d_vTot + F_LJ14;
858         float* vtotElec = kernelParams.d_vTot + F_COUL14;
859
860         // Stage atomic accumulation through shared memory:
861         // each warp will accumulate its own partial sum
862         // and then a single thread per warp will accumulate this to the global sum
863
864         int numWarps = blockDim.x / warpSize;
865         int warpId   = threadIdx.x / warpSize;
866
867         // Shared memory variables to hold block-local partial sum
868         float* sm_vTot = (float*)sm_nextSlotPtr;
869         sm_nextSlotPtr += numWarps * sizeof(float);
870         float* sm_vTotVdw = (float*)sm_nextSlotPtr;
871         sm_nextSlotPtr += numWarps * sizeof(float);
872         float* sm_vTotElec = (float*)sm_nextSlotPtr;
873
874         if (threadIdx.x % warpSize == 0)
875         {
876             // One thread per warp initializes to zero
877             sm_vTot[warpId]     = 0.;
878             sm_vTotVdw[warpId]  = 0.;
879             sm_vTotElec[warpId] = 0.;
880         }
881         __syncwarp(); // All threads in warp must wait for initialization
882
883         // Perform warp-local accumulation in shared memory
884         atomicAdd(sm_vTot + warpId, vtot_loc);
885         atomicAdd(sm_vTotVdw + warpId, vtotVdw_loc);
886         atomicAdd(sm_vTotElec + warpId, vtotElec_loc);
887
888         __syncwarp(); // Ensure all threads in warp have completed
889         if (threadIdx.x % warpSize == 0)
890         { // One thread per warp accumulates partial sum into global sum
891             atomicAdd(kernelParams.d_vTot + fType, sm_vTot[warpId]);
892             atomicAdd(vtotVdw, sm_vTotVdw[warpId]);
893             atomicAdd(vtotElec, sm_vTotElec[warpId]);
894         }
895     }
896     /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
897     if (calcVir)
898     {
899         __syncthreads();
900         if (threadIdx.x < SHIFTS)
901         {
902             atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
903         }
904     }
905 }
906
907
908 /*-------------------------------- End CUDA kernels-----------------------------*/
909
910
911 template<bool calcVir, bool calcEner>
912 void GpuBonded::Impl::launchKernel()
913 {
914     GMX_ASSERT(haveInteractions_,
915                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
916
917     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
918     wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
919
920     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
921
922     if (fTypeRangeEnd < 0)
923     {
924         return;
925     }
926
927     auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
928
929     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig_, &kernelParams_);
930
931     launchGpuKernel(kernelPtr,
932                     kernelLaunchConfig_,
933                     deviceStream_,
934                     nullptr,
935                     "exec_kernel_gpu<calcVir, calcEner>",
936                     kernelArgs);
937
938     wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
939     wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
940 }
941
942 void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
943 {
944     if (stepWork.computeEnergy)
945     {
946         // When we need the energy, we also need the virial
947         impl_->launchKernel<true, true>();
948     }
949     else if (stepWork.computeVirial)
950     {
951         impl_->launchKernel<true, false>();
952     }
953     else
954     {
955         impl_->launchKernel<false, false>();
956     }
957 }
958
959 } // namespace gmx