2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements CUDA bonded functionality
39 * \author Jon Vincent <jvincent@nvidia.com>
40 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41 * \author Berk Hess <hess@kth.se>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Alan Gray <alang@nvidia.com>
44 * \author Mark Abraham <mark.j.abraham@gmail.com>
46 * \ingroup module_listed_forces
51 #include <math_constants.h>
53 #include "gromacs/gpu_utils/cudautils.cuh"
54 #include "gromacs/gpu_utils/devicebuffer.h"
55 #include "gromacs/gpu_utils/gpu_vec.cuh"
56 #include "gromacs/gpu_utils/gputraits.cuh"
57 #include "gromacs/listed_forces/gpubonded.h"
58 #include "gromacs/listed_forces/listed_forces.h"
59 #include "gromacs/math/units.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/enerdata.h"
62 #include "gromacs/mdtypes/forcerec.h"
63 #include "gromacs/mdtypes/group.h"
64 #include "gromacs/mdtypes/mdatom.h"
65 #include "gromacs/pbcutil/ishift.h"
66 #include "gromacs/pbcutil/pbc.h"
67 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
68 #include "gromacs/topology/idef.h"
69 #include "gromacs/topology/ifunc.h"
70 #include "gromacs/utility/gmxassert.h"
72 #include "gpubonded_impl.h"
78 //CUDA threads per block
79 #define TPB_BONDED 256
81 /*-------------------------------- CUDA kernels-------------------------------- */
82 /*------------------------------------------------------------------------------*/
85 /*---------------- BONDED CUDA kernels--------------*/
/*! \brief Evaluates one harmonic potential term on the GPU.
 *
 * Writes the potential energy to \p *V and the force derivative to \p *F
 * for force constant \p kA, reference value \p xA and actual value \p x.
 *
 * NOTE(review): only a fragment of the body is visible in this extract —
 * the displacement computation and the *V/*F assignments are missing.
 * Presumably V = 0.5*kA*(x-xA)^2 and F = -kA*(x-xA); confirm against the
 * full source.
 */
__device__ __forceinline__
static void harmonic_gpu(const float kA, const float xA, const float x, float *V, float *F)
    /* The 1/2 factor of the harmonic energy 0.5*k*dx^2 */
    constexpr float half = 0.5f;
/*! \brief CUDA kernel computing harmonic bond forces and (optionally) energies.
 *
 * One thread processes one bond. The energy and the shift (virial) forces
 * are accumulated per thread block in shared memory, then reduced into the
 * global outputs with atomics.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energy is accumulated
 *
 * NOTE(review): structural lines are missing from this extract — braces, the
 * __global__ qualifier, the i < nbonds bounds guard, and the __syncthreads()
 * barriers between shared-memory initialization, accumulation and reduction.
 * Confirm these against the full source.
 */
template <bool calcVir, bool calcEner>
void bonds_gpu(float *vtot, const int nbonds,
               const t_iatom forceatoms[], const t_iparams forceparams[],
               const float4 xq[], fvec force[], fvec fshift[],
               const PbcAiuc pbcAiuc)
    /* One thread per bond interaction */
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    /* Block-local accumulators, reduced into vtot/fshift at the end */
    __shared__ float vtot_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Bonds are stored as triplets (type, atom_i, atom_j) in forceatoms */
    int type = forceatoms[3*i];
    int ai = forceatoms[3*i + 1];
    int aj = forceatoms[3*i + 2];

    /* dx = xi - xj, corrected for periodic boundary conditions. */
    int ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);

    float dr2 = iprod_gpu(dx, dx);
    /* NOTE(review): sqrtf would be the explicit single-precision call here */
    float dr = sqrt(dr2);

    harmonic_gpu(forceparams[type].harmonic.krA,
                 forceparams[type].harmonic.rA,

    atomicAdd(&vtot_loc, vbond);

    /* Convert dV/dr into force per unit displacement: multiply by 1/dr */
    fbond *= rsqrtf(dr2);

    for (int m = 0; m < DIM; m++)
        float fij = fbond*dx[m];
        atomicAdd(&force[ai][m], fij);
        atomicAdd(&force[aj][m], -fij);
        /* Shift forces only matter for the virial; CENTRAL images cancel */
        if (calcVir && ki != CENTRAL)
            atomicAdd(&fshift_loc[ki][m], fij);
            atomicAdd(&fshift_loc[CENTRAL][m], -fij);

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtot, vtot_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
/*! \brief Computes the angle between bonds i-j and j-k on the GPU.
 *
 * Also returns the two PBC shift indices via \p *t1 and \p *t2 and the
 * cosine of the angle via \p *costh; \p r_ij and \p r_kj receive the
 * PBC-corrected bond vectors.
 *
 * \tparam returnShift whether pbcDxAiuc computes real shift indices
 *
 * NOTE(review): the parameter list is truncated in this extract (the
 * int *t1, int *t2 output parameters used below are not visible) and the
 * final "return th;" is missing — confirm against the full source.
 */
template <bool returnShift>
__device__ __forceinline__
static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
                            const PbcAiuc &pbcAiuc,
                            fvec r_ij, fvec r_kj, float *costh,
    /* Return value is the angle between the bonds i-j and j-k */
    *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
    *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);

    *costh = cos_angle_gpu(r_ij, r_kj);
    float th = acosf(*costh);
/*! \brief CUDA kernel computing harmonic angle forces and (optionally) energies.
 *
 * One thread processes one angle (quadruplet: type, i, j, k). Energy and
 * shift forces are accumulated per block in shared memory and reduced to
 * global memory with atomics.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energy is accumulated
 *
 * NOTE(review): braces, the __global__ qualifier, the i < nbonds guard,
 * __syncthreads() barriers and several local declarations (theta, va, dVdt,
 * t1, t2, r_ij, r_kj, f_i, f_j, f_k) are missing from this extract —
 * confirm against the full source.
 */
template <bool calcVir, bool calcEner>
void angles_gpu(float *vtot, const int nbonds,
                const t_iatom forceatoms[], const t_iparams forceparams[],
                const float4 x[], fvec force[], fvec fshift[],
                const PbcAiuc pbcAiuc)
    /* Block-local accumulators, reduced into vtot/fshift at the end */
    __shared__ float vtot_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Angles are stored as (type, atom_i, atom_j, atom_k) */
    int type = forceatoms[4*i];
    int ai = forceatoms[4*i + 1];
    int aj = forceatoms[4*i + 2];
    int ak = forceatoms[4*i + 3];

    bond_angle_gpu<calcVir>(x[ai], x[aj], x[ak], pbcAiuc,
                            r_ij, r_kj, &cos_theta, &t1, &t2);

    /* Reference angle is stored in degrees; convert to radians */
    harmonic_gpu(forceparams[type].harmonic.krA,
                 forceparams[type].harmonic.rA*DEG2RAD,

    atomicAdd(&vtot_loc, va);

    /* Guard against sin(theta) == 0, where the force is ill-defined */
    float cos_theta2 = cos_theta*cos_theta;
    if (cos_theta2 < 1.0f)
        /* st = dV/dtheta / sin(theta) */
        float st = dVdt*rsqrtf(1.0f - cos_theta2);
        float sth = st*cos_theta;
        float nrij2 = iprod_gpu(r_ij, r_ij);
        float nrkj2 = iprod_gpu(r_kj, r_kj);

        float nrij_1 = rsqrtf(nrij2);
        float nrkj_1 = rsqrtf(nrkj2);

        float cik = st*nrij_1*nrkj_1;
        float cii = sth*nrij_1*nrij_1;
        float ckk = sth*nrkj_1*nrkj_1;

        for (int m = 0; m < DIM; m++)
            /* Standard angle-force decomposition; f_j balances f_i + f_k */
            f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
            f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
            f_j[m] = -f_i[m] - f_k[m];
            atomicAdd(&force[ai][m], f_i[m]);
            atomicAdd(&force[aj][m], f_j[m]);
            atomicAdd(&force[ak][m], f_k[m]);
        fvec_inc_atomic(fshift_loc[t1], f_i);
        fvec_inc_atomic(fshift_loc[CENTRAL], f_j);
        fvec_inc_atomic(fshift_loc[t2], f_k);

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtot, vtot_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
/*! \brief CUDA kernel for Urey-Bradley interactions (angle + 1-3 bond term).
 *
 * One thread handles one interaction: a harmonic angle i-j-k plus a
 * harmonic 1-3 "bond" between atoms i and k. Energies and shift forces are
 * accumulated per block in shared memory and reduced with atomics.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energy is accumulated
 *
 * NOTE(review): braces, the __global__ qualifier, the i < nbonds guard,
 * __syncthreads() barriers and several local declarations (r_ij, r_kj,
 * r_ik, cos_theta, t1, t2, va, dVdt, vbond, fbond, f_i/f_j/f_k) are missing
 * from this extract — confirm against the full source.
 */
template <bool calcVir, bool calcEner>
void urey_bradley_gpu(float *vtot, const int nbonds,
                      const t_iatom forceatoms[], const t_iparams forceparams[],
                      const float4 x[], fvec force[], fvec fshift[],
                      const PbcAiuc pbcAiuc)
    /* Block-local accumulators, reduced into vtot/fshift at the end */
    __shared__ float vtot_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Stored as (type, atom_i, atom_j, atom_k) */
    int type = forceatoms[4*i];
    int ai = forceatoms[4*i+1];
    int aj = forceatoms[4*i+2];
    int ak = forceatoms[4*i+3];

    /* Angle parameters (reference in degrees) and 1-3 bond parameters */
    float th0A = forceparams[type].u_b.thetaA*DEG2RAD;
    float kthA = forceparams[type].u_b.kthetaA;
    float r13A = forceparams[type].u_b.r13A;
    float kUBA = forceparams[type].u_b.kUBA;

    float theta = bond_angle_gpu<calcVir>(x[ai], x[aj], x[ak], pbcAiuc,
                                          r_ij, r_kj, &cos_theta, &t1, &t2);

    /* Harmonic angle contribution */
    harmonic_gpu(kthA, th0A, theta, &va, &dVdt);

    atomicAdd(&vtot_loc, va);

    /* 1-3 distance, PBC corrected */
    int ki = pbcDxAiuc<calcVir>(pbcAiuc, x[ai], x[ak], r_ik);

    float dr2 = iprod_gpu(r_ik, r_ik);
    /* dr = dr2/sqrt(dr2) = sqrt(dr2) using the fast rsqrtf */
    float dr = dr2*rsqrtf(dr2);

    harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);

    /* Guard against sin(theta) == 0, where the force is ill-defined */
    float cos_theta2 = cos_theta*cos_theta;
    if (cos_theta2 < 1.0f)
        float st = dVdt*rsqrtf(1.0f - cos_theta2);
        float sth = st*cos_theta;

        float nrkj2 = iprod_gpu(r_kj, r_kj);
        float nrij2 = iprod_gpu(r_ij, r_ij);

        float cik = st*rsqrtf(nrkj2*nrij2);
        float cii = sth/nrij2;
        float ckk = sth/nrkj2;

        for (int m = 0; m < DIM; m++)
            /* Angle-force decomposition; f_j balances f_i + f_k */
            f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
            f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
            f_j[m] = -f_i[m]-f_k[m];
            atomicAdd(&force[ai][m], f_i[m]);
            atomicAdd(&force[aj][m], f_j[m]);
            atomicAdd(&force[ak][m], f_k[m]);
        fvec_inc_atomic(fshift_loc[t1], f_i);
        fvec_inc_atomic(fshift_loc[CENTRAL], f_j);
        fvec_inc_atomic(fshift_loc[t2], f_k);

    /* Time for the bond calculations */
    atomicAdd(&vtot_loc, vbond);

    /* Convert dV/dr into force per unit displacement */
    fbond *= rsqrtf(dr2);

    for (int m = 0; m < DIM; m++)
        float fik = fbond*r_ik[m];
        atomicAdd(&force[ai][m], fik);
        atomicAdd(&force[ak][m], -fik);

        /* Shift forces only matter for the virial; CENTRAL images cancel */
        if (calcVir && ki != CENTRAL)
            atomicAdd(&fshift_loc[ki][m], fik);
            atomicAdd(&fshift_loc[CENTRAL][m], -fik);

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtot, vtot_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
/*! \brief Computes the dihedral angle i-j-k-l on the GPU.
 *
 * Outputs the PBC-corrected bond vectors \p r_ij, \p r_kj, \p r_kl, the
 * plane normals \p m and \p n, and the shift indices \p *t1..*t3.
 *
 * \tparam returnShift whether pbcDxAiuc computes real shift indices
 * \tparam T coordinate type (e.g. float4)
 *
 * NOTE(review): the tail of the body is missing from this extract —
 * presumably "phi = sign*phi; return phi;" applies the sign and returns;
 * confirm against the full source.
 */
template <bool returnShift, typename T>
__device__ __forceinline__
static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
                           const PbcAiuc &pbcAiuc,
                           fvec r_ij, fvec r_kj, fvec r_kl, fvec m, fvec n,
                           int *t1, int *t2, int *t3)
    *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
    *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
    *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, r_kl);

    /* Normals of the two planes spanned by (r_ij, r_kj) and (r_kj, r_kl) */
    cprod_gpu(r_ij, r_kj, m);
    cprod_gpu(r_kj, r_kl, n);
    float phi = gmx_angle_gpu(m, n);
    /* The sign of r_ij . n determines the sign of the dihedral angle */
    float ipr = iprod_gpu(r_ij, n);
    float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
/*! \brief Energy and force prefactor for a periodic (proper) dihedral.
 *
 * V = cpA*(1 + cos(mult*phi - phi0)), F = -dV/dphi; \p phiA is the
 * reference angle in degrees.
 *
 * NOTE(review): the declarations of mdphi and sdphi are missing from this
 * extract — presumably "float mdphi, sdphi;" and "sdphi = sinf(mdphi);";
 * confirm against the full source.
 */
__device__ __forceinline__
static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
                        const float phi, float *V, float *F)
    /* Phase offset phiA is stored in degrees */
    mdphi = mult*phi - phiA*DEG2RAD;

    *V = cpA * (1.0f + cosf(mdphi));
    *F = -cpA*mult*sdphi;
/*! \brief Spreads a dihedral force -dV/dphi onto the four atoms i,j,k,l.
 *
 * Standard dihedral force decomposition: forces on i and l act along the
 * plane normals m and n; j and k receive the balancing forces. Shift
 * forces for the virial are accumulated into \p fshift when calcVir.
 *
 * \tparam calcVir whether shift forces are accumulated
 *
 * NOTE(review): braces, the __device__ qualifier line and several local
 * declarations (f_i, f_j, f_k, f_l, uvec, vvec, svec, dx_jl) are missing
 * from this extract; p and q presumably carry the nrkj_2 normalization —
 * confirm against the full source.
 */
template <bool calcVir>
static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
                           const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
                           const fvec m, const fvec n, fvec force[], fvec fshift[],
                           const PbcAiuc &pbcAiuc,
                           const float4 x[], const int t1, const int t2, const int gmx_unused t3)
    float iprm = iprod_gpu(m, m);
    float iprn = iprod_gpu(n, n);
    float nrkj2 = iprod_gpu(r_kj, r_kj);
    /* Skip ill-defined (near-collinear) geometries */
    float toler = nrkj2*GMX_REAL_EPS;
    if ((iprm > toler) && (iprn > toler))
        float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
        float nrkj_2 = nrkj_1*nrkj_1;
        float nrkj = nrkj2*nrkj_1;

        /* Force on atom i along plane normal m */
        float a = -ddphi*nrkj/iprm;
        svmul_gpu(a, m, f_i);
        /* Force on atom l along plane normal n */
        float b = ddphi*nrkj/iprn;
        svmul_gpu(b, n, f_l);

        float p = iprod_gpu(r_ij, r_kj);
        float q = iprod_gpu(r_kl, r_kj);

        svmul_gpu(p, f_i, uvec);
        svmul_gpu(q, f_l, vvec);
        fvec_sub_gpu(uvec, vvec, svec);
        /* f_j and f_k balance f_i and f_l so total force is zero */
        fvec_sub_gpu(f_i, svec, f_j);
        fvec_add_gpu(f_l, svec, f_k);

        for (int m = 0; (m < DIM); m++)
            atomicAdd(&force[i][m], f_i[m]);
            atomicAdd(&force[j][m], -f_j[m]);
            atomicAdd(&force[k][m], -f_k[m]);
            atomicAdd(&force[l][m], f_l[m]);

        /* Recompute the j-l shift index for the virial contribution */
        int t3 = pbcDxAiuc<calcVir>(pbcAiuc, x[l], x[j], dx_jl);
        fvec_inc_atomic(fshift[t1], f_i);
        fvec_dec_atomic(fshift[CENTRAL], f_j);
        fvec_dec_atomic(fshift[t2], f_k);
        fvec_inc_atomic(fshift[t3], f_l);
/*! \brief CUDA kernel for proper (periodic) dihedrals.
 *
 * One thread handles one dihedral (quintuplet: type, i, j, k, l).
 * Energy and shift forces use block-local shared accumulators reduced
 * with atomics.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energy is accumulated
 *
 * NOTE(review): braces, the __global__ qualifier, the i < nbonds guard,
 * __syncthreads() barriers and local declarations (r_ij, r_kj, r_kl, m, n,
 * t1-t3, phi, vpd, ddphi) are missing from this extract — confirm against
 * the full source.
 */
template <bool calcVir, bool calcEner>
void pdihs_gpu(float *vtot, const int nbonds,
               const t_iatom forceatoms[], const t_iparams forceparams[],
               const float4 x[], fvec f[], fvec fshift[],
               const PbcAiuc pbcAiuc)
    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    /* Block-local accumulators, reduced into vtot/fshift at the end */
    __shared__ float vtot_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Dihedrals are stored as (type, atom_i, atom_j, atom_k, atom_l) */
    int type = forceatoms[5*i];
    int ai = forceatoms[5*i + 1];
    int aj = forceatoms[5*i + 2];
    int ak = forceatoms[5*i + 3];
    int al = forceatoms[5*i + 4];

    dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
                           r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);

    /* Periodic-dihedral energy and -dV/dphi prefactor */
    dopdihs_gpu(forceparams[type].pdihs.cpA,
                forceparams[type].pdihs.phiA,
                forceparams[type].pdihs.mult,

    atomicAdd(&vtot_loc, vpd);

    /* Spread -dV/dphi onto the four atoms */
    do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                            ddphi, r_ij, r_kj, r_kl, m, n,
                            f, fshift_loc, pbcAiuc,

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtot, vtot_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
/*! \brief CUDA kernel for Ryckaert-Bellemans dihedrals.
 *
 * One thread handles one dihedral. The RB potential is a 5th-order
 * polynomial in cos(psi), with psi = phi - pi (polymer convention).
 * Energy and shift forces use block-local shared accumulators reduced
 * with atomics.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energy is accumulated
 *
 * NOTE(review): a substantial interior section is missing from this
 * extract — the polymer-convention conversion, the cosfac/v recurrence for
 * polynomial orders 0-1 and the rbp assignments are not visible, and
 * braces/__global__/bounds guard/__syncthreads() are stripped. Confirm
 * against the full source.
 */
template <bool calcVir, bool calcEner>
void rbdihs_gpu(float *vtot, const int nbonds,
                const t_iatom forceatoms[], const t_iparams forceparams[],
                const float4 x[], fvec f[], fvec fshift[],
                const PbcAiuc pbcAiuc)
    /* Small-integer float constants for the polynomial derivative */
    constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;

    /* Block-local accumulators, reduced into vtot/fshift at the end */
    __shared__ float vtot_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Dihedrals are stored as (type, atom_i, atom_j, atom_k, atom_l) */
    int type = forceatoms[5*i];
    int ai = forceatoms[5*i+1];
    int aj = forceatoms[5*i+2];
    int ak = forceatoms[5*i+3];
    int al = forceatoms[5*i+4];

    dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
                           r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);

    /* Change to polymer convention */
    float cos_phi = cosf(phi);
    /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
    float sin_phi = sinf(phi);

    /* Load the RB coefficients for this dihedral type */
    float parm[NR_RBDIHS];
    for (int j = 0; j < NR_RBDIHS; j++)
        parm[j] = forceparams[type].rbdihs.rbcA[j];
    /* Calculate cosine powers */
    /* Calculate the energy */
    /* Calculate the derivative */
    /* d/dpsi of each polynomial term: n * rbc[n] * cos^(n-1)(psi) */
    ddphi += c2*rbp*cosfac;
    ddphi += c3*rbp*cosfac;
    ddphi += c4*rbp*cosfac;
    ddphi += c5*rbp*cosfac;

    /* Chain rule: dV/dphi = -dV/dcos(psi) * sin(psi) */
    ddphi = -ddphi*sin_phi;

    do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                            ddphi, r_ij, r_kj, r_kl, m, n,
                            f, fshift_loc, pbcAiuc,

    atomicAdd(&vtot_loc, v);

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtot, vtot_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
/*! \brief Wraps an angle difference into the half-open interval [-pi, pi).
 *
 * \param[in,out] dp  angle difference in radians; adjusted in place by
 *                    one period when it falls outside (-pi, pi).
 */
__device__ __forceinline__
static void make_dp_periodic_gpu(float *dp)
{
    /* dp cannot be outside (-pi,pi); a single period shift suffices */
    const float period = 2.0f*CUDART_PI_F;
    const float value  = *dp;
    if (value >= CUDART_PI_F)
    {
        *dp = value - period;
    }
    else if (value < -CUDART_PI_F)
    {
        *dp = value + period;
    }
}
/*! \brief CUDA kernel for improper (harmonic) dihedrals.
 *
 * One thread handles one dihedral. The deviation phi - phi0 is first
 * wrapped into (-pi, pi) to avoid the periodicity discontinuity, then a
 * plain harmonic is applied. Energy and shift forces use block-local
 * shared accumulators reduced with atomics.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energy is accumulated
 *
 * NOTE(review): braces, the __global__ qualifier, the i < nbonds guard,
 * __syncthreads() barriers and local declarations (r_ij, r_kj, r_kl, m, n,
 * t1-t3, phi) are missing from this extract — confirm against the full
 * source.
 */
template <bool calcVir, bool calcEner>
void idihs_gpu(float *vtot, const int nbonds,
               const t_iatom forceatoms[], const t_iparams forceparams[],
               const float4 x[], fvec f[], fvec fshift[],
               const PbcAiuc pbcAiuc)
    const int i = blockIdx.x*blockDim.x + threadIdx.x;

    /* Block-local accumulators, reduced into vtot/fshift at the end */
    __shared__ float vtot_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Dihedrals are stored as (type, atom_i, atom_j, atom_k, atom_l) */
    int type = forceatoms[5*i];
    int ai = forceatoms[5*i + 1];
    int aj = forceatoms[5*i + 2];
    int ak = forceatoms[5*i + 3];
    int al = forceatoms[5*i + 4];

    dih_angle_gpu<calcVir>(x[ai], x[aj], x[ak], x[al], pbcAiuc,
                           r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);

    /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
     * force changes if we just apply a normal harmonic.
     * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
     * This means we will never have the periodicity problem, unless
     * the dihedral is Pi away from phi0, which is very unlikely due to
     */
    float kA = forceparams[type].harmonic.krA;
    float pA = forceparams[type].harmonic.rA;

    /* Reference angle is stored in degrees */
    float phi0 = pA*DEG2RAD;

    float dp = phi - phi0;

    make_dp_periodic_gpu(&dp);

    float ddphi = -kA*dp;

    /* Note the double negation: -(-ddphi) gives the harmonic force */
    do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
                            -ddphi, r_ij, r_kj, r_kl, m, n,
                            f, fshift_loc, pbcAiuc,

    /* Energy 0.5*k*dp^2 written as -0.5*ddphi*dp */
    atomicAdd(&vtot_loc, -0.5f*ddphi*dp);

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtot, vtot_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
/*! \brief CUDA kernel for 1-4 pair (LJ-14 + scaled Coulomb) interactions.
 *
 * One thread handles one pair (type, i, j). Computes the Lennard-Jones
 * 12-6 interaction plus the Coulomb interaction scaled by
 * \p scale_factor (epsfac * fudgeQQ), accumulating both energies
 * separately. Shift forces use a block-local shared accumulator.
 *
 * \tparam calcVir  whether shift forces (for the virial) are accumulated
 * \tparam calcEner whether the potential energies are accumulated
 *
 * NOTE(review): braces, the __global__ qualifier, the i < nbonds guard,
 * __syncthreads() barriers, the vtotVdw_loc/vtotElec_loc zeroing and the
 * declarations of dr and f are missing from this extract — confirm
 * against the full source.
 */
template <bool calcVir, bool calcEner>
void pairs_gpu(const int nbonds,
               const t_iatom iatoms[], const t_iparams iparams[],
               const float4 xq[], fvec force[], fvec fshift[],
               const PbcAiuc pbcAiuc,
               const float scale_factor,
               float *vtotVdw, float *vtotElec)
    const int i = blockIdx.x*blockDim.x+threadIdx.x;

    /* Block-local accumulators for the two energy terms and shift forces */
    __shared__ float vtotVdw_loc;
    __shared__ float vtotElec_loc;
    __shared__ fvec fshift_loc[SHIFTS];

    if (calcVir || calcEner)
        if (threadIdx.x == 0)
        /* The first SHIFTS threads zero the shift-force accumulator */
        if (threadIdx.x < SHIFTS)
            fshift_loc[threadIdx.x][XX] = 0.0f;
            fshift_loc[threadIdx.x][YY] = 0.0f;
            fshift_loc[threadIdx.x][ZZ] = 0.0f;

    /* Pairs are stored as (type, atom_i, atom_j); charge is in xq[].w */
    int itype = iatoms[3*i];
    int ai = iatoms[3*i + 1];
    int aj = iatoms[3*i + 2];

    float qq = xq[ai].w*xq[aj].w;
    float c6 = iparams[itype].lj14.c6A;
    float c12 = iparams[itype].lj14.c12A;

    /* Do we need to apply full periodic boundary conditions? */
    int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);

    float r2 = norm2_gpu(dr);
    float rinv = rsqrtf(r2);
    float rinv2 = rinv*rinv;
    float rinv6 = rinv2*rinv2*rinv2;

    /* Calculate the Coulomb force * r */
    float velec = scale_factor*qq*rinv;

    /* Calculate the LJ force * r and add it to the Coulomb part */
    float fr = (12.0f*c12*rinv6 - 6.0f*c6)*rinv6 + velec;

    /* Convert force*r into force per component via 1/r^2 */
    float finvr = fr * rinv2;
    svmul_gpu(finvr, dr, f);

    for (int m = 0; m < DIM; m++)
        atomicAdd(&force[ai][m], f[m]);
        atomicAdd(&force[aj][m], -f[m]);

    /* LJ energy (c12/r^12 - c6/r^6) and Coulomb energy accumulated apart */
    atomicAdd(&vtotVdw_loc, (c12*rinv6 - c6)*rinv6);
    atomicAdd(&vtotElec_loc, velec);

    /* Shift forces only matter for the virial; CENTRAL images cancel */
    if (calcVir && fshift_index != CENTRAL)
        fvec_inc_atomic(fshift_loc[fshift_index], f);
        fvec_dec_atomic(fshift_loc[CENTRAL], f);

    if (calcVir || calcEner)
        /* Reduce the block-local accumulators into global memory */
        if (calcEner && threadIdx.x == 0)
            atomicAdd(vtotVdw, vtotVdw_loc);
            atomicAdd(vtotElec, vtotElec_loc);
        if (calcVir && threadIdx.x < SHIFTS)
            fvec_inc_atomic(fshift[threadIdx.x], fshift_loc[threadIdx.x]);
962 /*-------------------------------- End CUDA kernels-----------------------------*/
/*! \brief Launches the CUDA kernels for all bonded interaction types
 *         scheduled on the GPU.
 *
 * Each supported ftype with a non-empty interaction list gets one kernel
 * launch with TPB_BONDED threads per block and a ceil-divided 1-D grid.
 * Dihedral (pdihs) kernels are launched in a separate first loop —
 * presumably to get the most expensive kernels started earliest; confirm
 * intent against the full source.
 *
 * \tparam calcVir  whether shift forces (for the virial) are computed
 * \tparam calcEner whether energies are computed
 *
 * NOTE(review): the return type line, remaining parameters (box etc.),
 * braces and the trailing kernel-argument lines of each
 * prepareGpuKernelArguments call (the &pbcAiuc argument) are missing from
 * this extract — confirm against the full source.
 */
template <bool calcVir, bool calcEner>
GpuBonded::Impl::launchKernels(const t_forcerec *fr,
    GMX_ASSERT(haveInteractions_,
               "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");

    /* Set up PBC description for this step (none if not molecular PBC) */
    setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);

    /* Local aliases for the device buffers passed to the kernels */
    const t_iparams *forceparams_d = forceparamsDevice;
    float *vtot_d = vtotDevice;
    const float4 *xq_d = xqDevice;
    fvec *force_d = forceDevice;
    fvec *fshift_d = fshiftDevice;

    /* First loop: launch the (periodic/improper-periodic) dihedral kernels */
    for (int ftype : ftypesOnGpu)
        const auto &iList = iLists[ftype];

        if (iList.size() > 0)
            /* nat1 = number of atoms per interaction + 1 for the type index */
            int nat1 = interaction_function[ftype].nratoms + 1;
            int nbonds = iList.size()/nat1;

            KernelLaunchConfig config;
            config.blockSize[0] = TPB_BONDED;
            config.blockSize[1] = 1;
            config.blockSize[2] = 1;
            /* Ceil-divide so the last partial block is launched too */
            config.gridSize[0] = (nbonds + TPB_BONDED - 1)/TPB_BONDED;
            config.gridSize[1] = 1;
            config.gridSize[2] = 1;
            config.stream = stream;

            const t_iatom *iatoms = iListsDevice[ftype].iatoms;

            if (ftype == F_PDIHS || ftype == F_PIDIHS)
                auto kernelPtr = pdihs_gpu<calcVir, calcEner>;
                /* Each ftype accumulates its energy into its own vtot slot */
                float *ftypeEnergyPtr = vtot_d + ftype;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &ftypeEnergyPtr, &nbonds,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                launchGpuKernel(kernelPtr, config, nullptr, "pdihs_gpu<calcVir, calcEner>", kernelArgs);

    /* Second loop: launch all remaining bonded kernel types */
    for (int ftype : ftypesOnGpu)
        const auto &iList = iLists[ftype];

        if (iList.size() > 0)
            int nat1 = interaction_function[ftype].nratoms + 1;
            int nbonds = iList.size()/nat1;

            const t_iatom *iatoms = iListsDevice[ftype].iatoms;

            KernelLaunchConfig config;
            config.blockSize[0] = TPB_BONDED;
            config.blockSize[1] = 1;
            config.blockSize[2] = 1;
            config.gridSize[0] = (nbonds + TPB_BONDED - 1)/TPB_BONDED;
            config.gridSize[1] = 1;
            config.gridSize[2] = 1;
            config.stream = stream;

            float *ftypeEnergyPtr = vtot_d + ftype;
            // TODO consider using a map to assign the fn pointers to ftypes
            if (ftype == F_BONDS)
                auto kernelPtr = bonds_gpu<calcVir, calcEner>;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &ftypeEnergyPtr, &nbonds,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                launchGpuKernel(kernelPtr, config, nullptr, "bonds_gpu<calcVir, calcEner>", kernelArgs);

            if (ftype == F_ANGLES)
                auto kernelPtr = angles_gpu<calcVir, calcEner>;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &ftypeEnergyPtr, &nbonds,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                launchGpuKernel(kernelPtr, config, nullptr, "angles_gpu<calcVir, calcEner>", kernelArgs);

            if (ftype == F_UREY_BRADLEY)
                auto kernelPtr = urey_bradley_gpu<calcVir, calcEner>;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &ftypeEnergyPtr, &nbonds,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                launchGpuKernel(kernelPtr, config, nullptr, "urey_bradley_gpu<calcVir, calcEner>", kernelArgs);

            if (ftype == F_RBDIHS)
                auto kernelPtr = rbdihs_gpu<calcVir, calcEner>;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &ftypeEnergyPtr, &nbonds,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                launchGpuKernel(kernelPtr, config, nullptr, "rbdihs_gpu<calcVir, calcEner>", kernelArgs);

            if (ftype == F_IDIHS)
                auto kernelPtr = idihs_gpu<calcVir, calcEner>;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &ftypeEnergyPtr, &nbonds,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                launchGpuKernel(kernelPtr, config, nullptr, "idihs_gpu<calcVir, calcEner>", kernelArgs);

            if (ftype == F_LJ14)
                auto kernelPtr = pairs_gpu<calcVir, calcEner>;
                /* 1-4 Coulomb is scaled by epsfac * fudgeQQ */
                float scale_factor = fr->ic->epsfac*fr->fudgeQQ;
                /* LJ-14 and Coulomb-14 energies go to separate vtot slots */
                float *lj14Energy = vtot_d + F_LJ14;
                float *coulomb14Energy = vtot_d + F_COUL14;
                const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config,
                                                                  &iatoms, &forceparams_d,
                                                                  &xq_d, &force_d, &fshift_d,
                                                                  &lj14Energy, &coulomb14Energy);
                launchGpuKernel(kernelPtr, config, nullptr, "pairs_gpu<calcVir, calcEner>", kernelArgs);
1116 GpuBonded::launchKernels(const t_forcerec *fr,
1120 if (forceFlags & GMX_FORCE_ENERGY)
1122 // When we need the energy, we also need the virial
1123 impl_->launchKernels<true, true>
1126 else if (forceFlags & GMX_FORCE_VIRIAL)
1128 impl_->launchKernels<true, false>
1133 impl_->launchKernels<false, false>