2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements CUDA bonded functionality
39 * \author Jon Vincent <jvincent@nvidia.com>
40 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41 * \author Berk Hess <hess@kth.se>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Alan Gray <alang@nvidia.com>
44 * \author Mark Abraham <mark.j.abraham@gmail.com>
46 * \ingroup module_listed_forces
53 #include <math_constants.h>
55 #include "gromacs/gpu_utils/cudautils.cuh"
56 #include "gromacs/gpu_utils/gpu_vec.cuh"
57 #include "gromacs/listed_forces/gpubonded.h"
58 #include "gromacs/math/units.h"
59 #include "gromacs/mdlib/force_flags.h"
60 #include "gromacs/mdlib/ppforceworkload.h"
61 #include "gromacs/mdtypes/forcerec.h"
62 #include "gromacs/pbcutil/pbc.h"
63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
64 #include "gromacs/utility/gmxassert.h"
66 #include "gpubonded_impl.h"
72 //CUDA threads per block
73 #define TPB_BONDED 256
75 /*-------------------------------- CUDA kernels-------------------------------- */
76 /*------------------------------------------------------------------------------*/
78 #define CUDA_DEG2RAD_F (CUDART_PI_F/180.0f)
80 /*---------------- BONDED CUDA kernels--------------*/
/* Evaluates a harmonic potential term for one interaction.
 *
 * kA  force constant (A topology state)
 * xA  reference (equilibrium) value
 * x   current value (distance or angle)
 * V   output: potential energy of this term
 * F   output: scalar force factor derived from -dV/dx
 *
 * NOTE(review): the arithmetic lines of this body are on elided lines in this
 * chunk; only the 0.5 constant used by the energy expression is visible —
 * confirm the exact V/F formulas against the full file. */
83 __device__ __forceinline__
84 static void harmonic_gpu(const float kA, const float xA, const float x, float *V, float *F)
86 constexpr float half = 0.5f;
/* Per-thread worker for one harmonic bond interaction.
 *
 * i              bond index handled by this thread (0..numBonds-1)
 * vtot_loc       thread-local energy accumulator (used when calcEner)
 * d_forceatoms   packed (type, ai, aj) triplets, 3 ints per bond
 * d_forceparams  per-type parameters; .harmonic.krA/.rA read here
 * gm_xq          global coordinates (+charge in .w), one float4 per atom
 * gm_f           global force array, updated with atomicAdd
 * sm_fShiftLoc   shared-memory shift-force accumulator (used when calcVir)
 * pbcAiuc        periodic-boundary-condition data for minimum-image vectors
 */
96 template <bool calcVir, bool calcEner>
98 void bonds_gpu(const int i, float *vtot_loc, const int numBonds,
99 const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
100 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
101 const PbcAiuc pbcAiuc)
/* Load the (type, ai, aj) triplet with a single 12-byte vector read. */
105 int3 bondData = *(int3 *)(d_forceatoms + 3 * i);
106 int type = bondData.x;
110 /* dx = xi - xj, corrected for periodic boundary conditions. */
/* ki is the shift-vector index of the periodic image used (virial bookkeeping). */
112 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
114 float dr2 = iprod_gpu(dx, dx);
/* NOTE(review): sqrt() rather than sqrtf(); CUDA resolves the float overload
 * in device code, but sqrtf(dr2) would make single precision explicit. */
115 float dr = sqrt(dr2);
/* Harmonic bond energy/force from the A-state force constant and bond length. */
119 harmonic_gpu(d_forceparams[type].harmonic.krA,
120 d_forceparams[type].harmonic.rA,
/* Scale by 1/|dx| so fbond*dx[m] below yields the Cartesian force component. */
130 fbond *= rsqrtf(dr2);
133 for (int m = 0; m < DIM; m++)
135 float fij = fbond*dx[m];
/* Newton's third law: equal and opposite contributions on the two atoms. */
136 atomicAdd(&gm_f[ai][m], fij);
137 atomicAdd(&gm_f[aj][m], -fij);
/* Shift forces are only needed for the virial, and only for non-central images. */
138 if (calcVir && ki != CENTRAL)
140 atomicAdd(&sm_fShiftLoc[ki][m], fij);
141 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fij);
/* Computes the angle (in radians) between bonds i-j and j-k.
 *
 * Outputs: r_ij and r_kj receive the PBC-corrected bond vectors, *costh the
 * cosine of the angle, and *t1/*t2 the shift-vector indices of the images
 * used (meaningful when returnShift is true). */
148 template <bool returnShift>
149 __device__ __forceinline__
150 static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
151 const PbcAiuc &pbcAiuc,
152 fvec r_ij, fvec r_kj, float *costh,
154 /* Return value is the angle between the bonds i-j and j-k */
/* Both bond vectors point away from the central atom j. */
156 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
157 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
159 *costh = cos_angle_gpu(r_ij, r_kj);
/* acosf is safe here because cos_angle_gpu clamps to [-1,1] by convention
 * in GROMACS — NOTE(review): confirm against gpu_vec.cuh. */
160 float th = acosf(*costh);
/* Per-thread worker for one harmonic angle interaction.
 *
 * i              angle index handled by this thread (0..numBonds-1)
 * vtot_loc       thread-local energy accumulator (used when calcEner)
 * d_forceatoms   packed (type, ai, aj, ak) quadruplets, 4 ints per angle
 * d_forceparams  per-type parameters; .harmonic.krA/.rA (degrees) read here
 * gm_xq          global coordinates (+charge), gm_f global forces,
 *                sm_fShiftLoc shared shift-force buffer (calcVir only)
 */
165 template <bool calcVir, bool calcEner>
167 void angles_gpu(const int i, float *vtot_loc, const int numBonds,
168 const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
169 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
170 const PbcAiuc pbcAiuc)
/* Load (type, ai, aj, ak) with a single 16-byte vector read. */
174 int4 angleData = *(int4 *)(d_forceatoms + 4 * i);
175 int type = angleData.x;
176 int ai = angleData.y;
177 int aj = angleData.z;
178 int ak = angleData.w;
186 bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
187 r_ij, r_kj, &cos_theta, &t1, &t2);
/* Harmonic in theta; the reference angle is stored in degrees, hence the
 * deg->rad conversion. */
191 harmonic_gpu(d_forceparams[type].harmonic.krA,
192 d_forceparams[type].harmonic.rA*CUDA_DEG2RAD_F,
/* Guard against division by sin(theta)=0 when the angle is exactly 0 or pi. */
200 float cos_theta2 = cos_theta*cos_theta;
201 if (cos_theta2 < 1.0f)
/* st = dVdt / sin(theta), computed as dVdt * rsqrt(1 - cos^2(theta)). */
203 float st = dVdt*rsqrtf(1.0f - cos_theta2);
204 float sth = st*cos_theta;
205 float nrij2 = iprod_gpu(r_ij, r_ij);
206 float nrkj2 = iprod_gpu(r_kj, r_kj);
208 float nrij_1 = rsqrtf(nrij2);
209 float nrkj_1 = rsqrtf(nrkj2);
/* Coefficients of the analytic angle-force expression. */
211 float cik = st*nrij_1*nrkj_1;
212 float cii = sth*nrij_1*nrij_1;
213 float ckk = sth*nrkj_1*nrkj_1;
219 for (int m = 0; m < DIM; m++)
221 f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
222 f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
/* Central atom takes the opposite of the two outer forces (momentum conservation). */
223 f_j[m] = -f_i[m] - f_k[m];
224 atomicAdd(&gm_f[ai][m], f_i[m]);
225 atomicAdd(&gm_f[aj][m], f_j[m]);
226 atomicAdd(&gm_f[ak][m], f_k[m]);
/* Shift-force accumulation for the virial (calcVir path). */
229 atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
230 atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
231 atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
/* Per-thread worker for one Urey-Bradley interaction: a harmonic angle term
 * (theta between i-j and j-k) plus a harmonic 1-3 distance term between atoms
 * i and k. Parameters read: u_b.thetaA (degrees), kthetaA, r13A, kUBA. */
239 template <bool calcVir, bool calcEner>
241 void urey_bradley_gpu(const int i, float *vtot_loc, const int numBonds,
242 const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
243 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
244 const PbcAiuc pbcAiuc)
/* Load (type, ai, aj, ak) with a single 16-byte vector read. */
248 int4 ubData = *(int4 *)(d_forceatoms + 4 * i);
254 float th0A = d_forceparams[type].u_b.thetaA*CUDA_DEG2RAD_F;
255 float kthA = d_forceparams[type].u_b.kthetaA;
256 float r13A = d_forceparams[type].u_b.r13A;
257 float kUBA = d_forceparams[type].u_b.kUBA;
264 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc,
265 r_ij, r_kj, &cos_theta, &t1, &t2);
/* Angle part: harmonic in theta about th0A. */
269 harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
/* 1-3 part: PBC-corrected i-k vector; ki is its shift index. */
277 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
279 float dr2 = iprod_gpu(r_ik, r_ik);
/* |r_ik| computed as dr2 * rsqrt(dr2), avoiding a separate sqrt. */
280 float dr = dr2*rsqrtf(dr2);
284 harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
/* Angle forces, guarded against sin(theta)=0 (same scheme as angles_gpu). */
286 float cos_theta2 = cos_theta*cos_theta;
287 if (cos_theta2 < 1.0f)
289 float st = dVdt*rsqrtf(1.0f - cos_theta2);
290 float sth = st*cos_theta;
292 float nrkj2 = iprod_gpu(r_kj, r_kj);
293 float nrij2 = iprod_gpu(r_ij, r_ij);
295 float cik = st*rsqrtf(nrkj2*nrij2);
296 float cii = sth/nrij2;
297 float ckk = sth/nrkj2;
303 for (int m = 0; m < DIM; m++)
305 f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
306 f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
307 f_j[m] = -f_i[m]-f_k[m];
308 atomicAdd(&gm_f[ai][m], f_i[m]);
309 atomicAdd(&gm_f[aj][m], f_j[m]);
310 atomicAdd(&gm_f[ak][m], f_k[m]);
/* Shift forces for the virial (calcVir path). */
313 atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
314 atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
315 atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
320 /* Time for the bond calculations */
/* Scale by 1/|r_ik| so fbond*r_ik[m] yields the Cartesian force component. */
328 fbond *= rsqrtf(dr2);
331 for (int m = 0; m < DIM; m++)
333 float fik = fbond*r_ik[m];
334 atomicAdd(&gm_f[ai][m], fik);
335 atomicAdd(&gm_f[ak][m], -fik);
337 if (calcVir && ki != CENTRAL)
339 atomicAdd(&sm_fShiftLoc[ki][m], fik);
340 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fik);
/* Computes the dihedral angle defined by atoms i-j-k-l.
 *
 * Outputs: r_ij, r_kj, r_kl (PBC-corrected bond vectors), m = r_ij x r_kj and
 * n = r_kj x r_kl (the two plane normals), and *t1/*t2/*t3 (shift indices).
 * NOTE(review): the return statement is on an elided line; from the visible
 * sign computation the result is presumably sign*phi — confirm in full file. */
347 template <bool returnShift, typename T>
348 __device__ __forceinline__
349 static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
350 const PbcAiuc &pbcAiuc,
351 fvec r_ij, fvec r_kj, fvec r_kl, fvec m, fvec n,
352 int *t1, int *t2, int *t3)
354 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
355 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
356 *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, r_kl);
/* Normals of the two planes spanned by (i,j,k) and (j,k,l). */
358 cprod_gpu(r_ij, r_kj, m);
359 cprod_gpu(r_kj, r_kl, n);
360 float phi = gmx_angle_gpu(m, n);
/* The sign of the dihedral follows the sign of r_ij . n. */
361 float ipr = iprod_gpu(r_ij, n);
362 float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
/* Proper-dihedral potential: V = cpA*(1 + cos(mult*phi - phi0)).
 *
 * cpA   barrier height, phiA reference angle in degrees, mult multiplicity,
 * phi   current dihedral angle (radians).
 * Outputs *v (energy) and *f (= -dV/dphi).
 * NOTE(review): sdphi is computed on an elided line; from the force expression
 * it is presumably sinf(mdphi) — confirm in the full file. */
369 __device__ __forceinline__
370 static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
371 const float phi, float *v, float *f)
375 mdphi = mult*phi - phiA*CUDA_DEG2RAD_F;
377 *v = cpA * (1.0f + cosf(mdphi));
378 *f = -cpA*mult*sdphi;
/* Distributes the dihedral force -dV/dphi (ddphi) over the four atoms
 * i, j, k, l, using the precomputed bond vectors and plane normals m, n.
 * Forces go atomically into gm_f; with calcVir, shift forces go into
 * sm_fShiftLoc. t3 is recomputed locally from the j-l vector (the parameter
 * is unused, hence gmx_unused). */
381 template <bool calcVir>
383 static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
384 const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
385 const fvec m, const fvec n, fvec gm_f[], fvec sm_fShiftLoc[],
386 const PbcAiuc &pbcAiuc,
387 const float4 gm_xq[], const int t1, const int t2, const int gmx_unused t3)
389 float iprm = iprod_gpu(m, m);
390 float iprn = iprod_gpu(n, n);
391 float nrkj2 = iprod_gpu(r_kj, r_kj);
/* Skip the force when either plane normal is (numerically) zero, i.e. when
 * three of the four atoms are collinear and the dihedral is ill-defined. */
392 float toler = nrkj2*GMX_REAL_EPS;
393 if ((iprm > toler) && (iprn > toler))
395 float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
396 float nrkj_2 = nrkj_1*nrkj_1;
397 float nrkj = nrkj2*nrkj_1;
/* Outer-atom forces are along the plane normals: f_i = a*m, f_l = b*n. */
398 float a = -ddphi*nrkj/iprm;
400 svmul_gpu(a, m, f_i);
401 float b = ddphi*nrkj/iprn;
403 svmul_gpu(b, n, f_l);
/* p, q project the outer bonds onto r_kj to split the torque between j and k. */
404 float p = iprod_gpu(r_ij, r_kj);
406 float q = iprod_gpu(r_kl, r_kj);
409 svmul_gpu(p, f_i, uvec);
411 svmul_gpu(q, f_l, vvec);
413 fvec_sub_gpu(uvec, vvec, svec);
415 fvec_sub_gpu(f_i, svec, f_j);
417 fvec_add_gpu(f_l, svec, f_k);
419 for (int m = 0; (m < DIM); m++)
/* Note the sign convention: f_j and f_k enter with a minus sign. */
421 atomicAdd(&gm_f[i][m], f_i[m]);
422 atomicAdd(&gm_f[j][m], -f_j[m]);
423 atomicAdd(&gm_f[k][m], -f_k[m]);
424 atomicAdd(&gm_f[l][m], f_l[m]);
/* Recompute the l->j shift index for the virial (shadows the unused t3 param). */
430 int t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
433 for (int m = 0; (m < DIM); m++)
435 atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
436 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f_j[m]);
437 atomicAdd(&sm_fShiftLoc[t2][m], -f_k[m]);
438 atomicAdd(&sm_fShiftLoc[t3][m], f_l[m]);
/* Per-thread worker for one proper dihedral: computes the dihedral angle,
 * evaluates the periodic cosine potential (dopdihs_gpu), and spreads the
 * resulting force over the four atoms (do_dih_fup_gpu).
 * d_forceatoms holds 5 ints per dihedral: (type, ai, aj, ak, al). */
444 template <bool calcVir, bool calcEner>
446 void pdihs_gpu(const int i, float *vtot_loc, const int numBonds,
447 const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
448 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
449 const PbcAiuc pbcAiuc)
/* Scalar loads here: a 5-int stride does not map onto a CUDA vector type. */
453 int type = d_forceatoms[5*i];
454 int ai = d_forceatoms[5*i + 1];
455 int aj = d_forceatoms[5*i + 2];
456 int ak = d_forceatoms[5*i + 3];
457 int al = d_forceatoms[5*i + 4];
468 dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
469 r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
473 dopdihs_gpu(d_forceparams[type].pdihs.cpA,
474 d_forceparams[type].pdihs.phiA,
475 d_forceparams[type].pdihs.mult,
483 do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
484 ddphi, r_ij, r_kj, r_kl, m, n,
485 gm_f, sm_fShiftLoc, pbcAiuc,
/* Per-thread worker for one Ryckaert-Bellemans dihedral: the potential is a
 * 6-term (NR_RBDIHS) power series in cos(psi) with psi in the polymer
 * convention. Energy accumulates into *vtot_loc (calcEner) and the force
 * -dV/dphi is spread by do_dih_fup_gpu. */
491 template <bool calcVir, bool calcEner>
493 void rbdihs_gpu(const int i, float *vtot_loc, const int numBonds,
494 const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
495 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
496 const PbcAiuc pbcAiuc)
/* Named float constants keep the series coefficients in single precision. */
498 constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
/* 5 ints per dihedral: (type, ai, aj, ak, al). */
502 int type = d_forceatoms[5*i];
503 int ai = d_forceatoms[5*i+1];
504 int aj = d_forceatoms[5*i+2];
505 int ak = d_forceatoms[5*i+3];
506 int al = d_forceatoms[5*i+4];
517 dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
518 r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
520 /* Change to polymer convention */
530 float cos_phi = cosf(phi);
531 /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
532 float sin_phi = sinf(phi);
/* Copy the A-state RB coefficients into registers. */
534 float parm[NR_RBDIHS];
535 for (int j = 0; j < NR_RBDIHS; j++)
537 parm[j] = d_forceparams[type].rbdihs.rbcA[j];
539 /* Calculate cosine powers */
540 /* Calculate the energy */
541 /* Calculate the derivative */
/* Unrolled dV/dcos accumulation: each term contributes n*c_n*cos^(n-1).
 * NOTE(review): the intervening energy/cosfac updates are on elided lines. */
554 ddphi += c2*rbp*cosfac;
561 ddphi += c3*rbp*cosfac;
568 ddphi += c4*rbp*cosfac;
575 ddphi += c5*rbp*cosfac;
/* Chain rule: dV/dphi = -(dV/dcos)*sin(phi) (sign also absorbs the polymer
 * convention flip). */
582 ddphi = -ddphi*sin_phi;
584 do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
585 ddphi, r_ij, r_kj, r_kl, m, n,
586 gm_f, sm_fShiftLoc, pbcAiuc,
/* Wraps an angle difference *dp into the principal interval by adding or
 * subtracting one period (2*pi). Assumes the input is at most one period
 * outside the interval, which holds for a difference of two angles each in
 * (-pi, pi]. */
595 __device__ __forceinline__
596 static void make_dp_periodic_gpu(float *dp)
598 /* dp cannot be outside (-pi,pi) */
599 if (*dp >= CUDART_PI_F)
601 *dp -= 2.0f*CUDART_PI_F;
603 else if (*dp < -CUDART_PI_F)
605 *dp += 2.0f*CUDART_PI_F;
/* Per-thread worker for one improper dihedral: harmonic in the periodic
 * difference dp = phi - phi0. Energy (calcEner) is -0.5*ddphi*dp, i.e.
 * 0.5*kA*dp^2. d_forceatoms holds 5 ints per dihedral: (type, ai..al). */
609 template <bool calcVir, bool calcEner>
611 void idihs_gpu(const int i, float *vtot_loc, const int numBonds,
612 const t_iatom d_forceatoms[], const t_iparams d_forceparams[],
613 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
614 const PbcAiuc pbcAiuc)
618 int type = d_forceatoms[5*i];
619 int ai = d_forceatoms[5*i + 1];
620 int aj = d_forceatoms[5*i + 2];
621 int ak = d_forceatoms[5*i + 3];
622 int al = d_forceatoms[5*i + 4];
633 dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
634 r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
636 /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
637 * force changes if we just apply a normal harmonic.
638 * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
639 * This means we will never have the periodicity problem, unless
640 * the dihedral is Pi away from phi0, which is very unlikely due to
/* A-state harmonic parameters; reference angle stored in degrees. */
643 float kA = d_forceparams[type].harmonic.krA;
644 float pA = d_forceparams[type].harmonic.rA;
646 float phi0 = pA*CUDA_DEG2RAD_F;
648 float dp = phi - phi0;
/* Map dp back into (-pi, pi] to avoid the periodicity jump described above. */
650 make_dp_periodic_gpu(&dp);
652 float ddphi = -kA*dp;
/* Note the extra minus sign on ddphi here; it is compensated in the energy
 * expression below. */
654 do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
655 -ddphi, r_ij, r_kj, r_kl, m, n,
656 gm_f, sm_fShiftLoc, pbcAiuc,
/* V = 0.5*kA*dp^2, written via ddphi = -kA*dp. */
661 *vtot_loc += -0.5f*ddphi*dp;
/* Per-thread worker for one 1-4 pair interaction: Lennard-Jones (lj14.c6A /
 * c12A) plus scaled Coulomb between the pair's charges (stored in gm_xq[].w).
 *
 * scale_factor   Coulomb prefactor for 1-4 pairs (set on the host from
 *                epsfac*fudgeQQ; see launchKernel below)
 * vtotVdw_loc    thread-local LJ energy accumulator (calcEner)
 * vtotElec_loc   thread-local Coulomb energy accumulator (calcEner)
 */
666 template <bool calcVir, bool calcEner>
668 void pairs_gpu(const int i, const int numBonds,
669 const t_iatom d_forceatoms[], const t_iparams iparams[],
670 const float4 gm_xq[], fvec gm_f[], fvec sm_fShiftLoc[],
671 const PbcAiuc pbcAiuc,
672 const float scale_factor,
673 float *vtotVdw_loc, float *vtotElec_loc)
/* Load (type, ai, aj) with a single 12-byte vector read. */
677 int3 pairData = *(int3 *)(d_forceatoms + 3 * i);
678 int type = pairData.x;
/* Product of the two charges, packed in the .w component of the coordinates. */
682 float qq = gm_xq[ai].w*gm_xq[aj].w;
683 float c6 = iparams[type].lj14.c6A;
684 float c12 = iparams[type].lj14.c12A;
686 /* Do we need to apply full periodic boundary conditions? */
688 int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
690 float r2 = norm2_gpu(dr);
691 float rinv = rsqrtf(r2);
692 float rinv2 = rinv*rinv;
693 float rinv6 = rinv2*rinv2*rinv2;
695 /* Calculate the Coulomb force * r */
696 float velec = scale_factor*qq*rinv;
698 /* Calculate the LJ force * r and add it to the Coulomb part */
699 float fr = (12.0f*c12*rinv6 - 6.0f*c6)*rinv6 + velec;
/* fr is F*r; multiplying by 1/r^2 gives F/r, so finvr*dr[m] is the force. */
701 float finvr = fr * rinv2;
703 svmul_gpu(finvr, dr, f);
707 for (int m = 0; m < DIM; m++)
709 atomicAdd(&gm_f[ai][m], f[m]);
710 atomicAdd(&gm_f[aj][m], -f[m]);
/* Shift forces for the virial, only for non-central periodic images. */
711 if (calcVir && fshift_index != CENTRAL)
713 atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
714 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f[m]);
/* LJ energy (c12/r^12 - c6/r^6) and Coulomb energy accumulated separately. */
720 *vtotVdw_loc += (c12*rinv6 - c6)*rinv6;
721 *vtotElec_loc += velec;
/* Main bonded kernel: one thread per interaction, with all GPU-resident
 * interaction types concatenated into one 1-D thread range. Each thread
 * locates its function-type slot j via the [fTypeRangeStart, fTypeRangeEnd]
 * intervals and dispatches to the matching *_gpu worker. Per-thread energies
 * and the shared-memory shift forces are reduced atomically at the end.
 * Expected launch: 1-D grid, 1-D blocks of TPB_BONDED threads (see
 * launchKernel below); requires blockDim.x >= SHIFTS for the calcVir path. */
729 template <bool calcVir, bool calcEner>
731 void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
733 assert(blockDim.y == 1 && blockDim.z == 1);
734 const int tid = blockIdx.x*blockDim.x+threadIdx.x;
736 float vtotVdw_loc = 0;
737 float vtotElec_loc = 0;
/* Per-block shift-force accumulator; one fvec per periodic shift vector. */
738 __shared__ fvec sm_fShiftLoc[SHIFTS];
/* First SHIFTS threads zero the shared buffer (shared memory starts
 * uninitialized). NOTE(review): the __syncthreads() between this zeroing and
 * the first use is on elided lines — confirm in the full file. */
742 if (threadIdx.x < SHIFTS)
744 sm_fShiftLoc[threadIdx.x][XX] = 0.0f;
745 sm_fShiftLoc[threadIdx.x][YY] = 0.0f;
746 sm_fShiftLoc[threadIdx.x][ZZ] = 0.0f;
752 bool threadComputedPotential = false;
/* Find which fType interval this thread falls into; intervals are inclusive. */
754 for (int j = 0; j < numFTypesOnGpu; j++)
756 if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
758 const int numBonds = kernelParams.numFTypeBonds[j];
/* fTypeTid is the interaction index local to this fType's range. */
759 int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
760 const t_iatom *iatoms = kernelParams.d_iatoms[j];
761 fType = kernelParams.fTypesOnGpu[j];
764 threadComputedPotential = true;
/* Dispatch on fType; the switch/case lines themselves are elided here. */
770 bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
771 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
774 angles_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
775 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
778 urey_bradley_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
779 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
783 pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
784 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
787 rbdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
788 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
791 idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
792 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
795 pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
796 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
797 kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
/* Reduce per-thread energies into the global per-fType totals. The pair
 * kernel's LJ/Coulomb parts go into dedicated F_LJ14/F_COUL14 slots. */
804 if (threadComputedPotential)
806 float *vtotVdw = kernelParams.d_vTot + F_LJ14;
807 float *vtotElec = kernelParams.d_vTot + F_COUL14;
808 atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
809 atomicAdd(vtotVdw, vtotVdw_loc);
810 atomicAdd(vtotElec, vtotElec_loc);
812 /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
816 if (threadIdx.x < SHIFTS)
818 fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
824 /*-------------------------------- End CUDA kernels-----------------------------*/
/* Host-side launcher for exec_kernel_gpu<calcVir, calcEner>.
 *
 * Sets up PBC data from the force record, early-returns when no bonded work
 * is scheduled on the GPU, sizes a 1-D grid of TPB_BONDED-thread blocks to
 * cover every interaction thread index, and launches asynchronously on the
 * member stream stream_. */
827 template <bool calcVir, bool calcEner>
829 GpuBonded::Impl::launchKernel(const t_forcerec *fr,
832 GMX_ASSERT(haveInteractions_,
833 "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
/* The kernel's virial path has the first SHIFTS threads of each block own one
 * shift vector each, so blocks must be at least SHIFTS threads wide. */
834 static_assert(TPB_BONDED >= SHIFTS, "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
837 setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
/* Last valid global thread index = end of the last fType's range. */
839 int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
/* Negative end means no interactions are assigned to the GPU: nothing to do. */
841 if (fTypeRangeEnd < 0)
846 KernelLaunchConfig config;
847 config.blockSize[0] = TPB_BONDED;
848 config.blockSize[1] = 1;
849 config.blockSize[2] = 1;
/* Enough blocks to cover thread indices 0..fTypeRangeEnd (ceil division over
 * fTypeRangeEnd+1 threads). */
850 config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED)/TPB_BONDED;
851 config.gridSize[1] = 1;
852 config.gridSize[2] = 1;
853 config.stream = stream_;
855 auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
/* 1-4 Coulomb prefactor: electrostatics factor scaled by the 1-4 fudge. */
856 kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;
857 kernelParams_.pbcAiuc = pbcAiuc;
859 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
861 launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
865 GpuBonded::launchKernel(const t_forcerec *fr,
866 const gmx::ForceFlags &forceFlags,
869 if (forceFlags.computeEnergy)
871 // When we need the energy, we also need the virial
872 impl_->launchKernel<true, true>
875 else if (forceFlags.computeVirial)
877 impl_->launchKernel<true, false>
882 impl_->launchKernel<false, false>