2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements CUDA bonded functionality
39 * \author Jon Vincent <jvincent@nvidia.com>
40 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41 * \author Berk Hess <hess@kth.se>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Alan Gray <alang@nvidia.com>
44 * \author Mark Abraham <mark.j.abraham@gmail.com>
46 * \ingroup module_listed_forces
53 #include <math_constants.h>
55 #include "gromacs/gpu_utils/cudautils.cuh"
56 #include "gromacs/gpu_utils/devicebuffer.h"
57 #include "gromacs/gpu_utils/gpu_vec.cuh"
58 #include "gromacs/gpu_utils/gputraits.cuh"
59 #include "gromacs/listed_forces/gpubonded.h"
60 #include "gromacs/listed_forces/listed_forces.h"
61 #include "gromacs/math/units.h"
62 #include "gromacs/mdlib/force_flags.h"
63 #include "gromacs/mdtypes/enerdata.h"
64 #include "gromacs/mdtypes/forcerec.h"
65 #include "gromacs/mdtypes/group.h"
66 #include "gromacs/mdtypes/mdatom.h"
67 #include "gromacs/pbcutil/ishift.h"
68 #include "gromacs/pbcutil/pbc.h"
69 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
70 #include "gromacs/topology/idef.h"
71 #include "gromacs/topology/ifunc.h"
72 #include "gromacs/utility/gmxassert.h"
74 #include "gpubonded_impl.h"
// Number of CUDA threads per block used for the bonded kernel launches.
// Must be >= SHIFTS: the kernel zeroes and reduces the shared-memory
// shift-force accumulator using one thread per shift vector (this is
// enforced by a static_assert in GpuBonded::Impl::launchKernel).
80 //CUDA threads per block
81 #define TPB_BONDED 256
83 /*-------------------------------- CUDA kernels-------------------------------- */
84 /*------------------------------------------------------------------------------*/
87 /*---------------- BONDED CUDA kernels--------------*/
/*! \brief Harmonic potential evaluated on the device.
 *
 * \param[in]  kA  Force constant.
 * \param[in]  xA  Equilibrium value of the coordinate.
 * \param[in]  x   Actual value of the coordinate.
 * \param[out] V   Potential energy contribution.
 * \param[out] F   Force-related derivative term.
 *
 * NOTE(review): the body after the 0.5f constant is elided in this
 * excerpt; presumably it computes V = 0.5*k*(x-xA)^2 and the matching
 * derivative — confirm against the full source.
 */
90 __device__ __forceinline__
91 static void harmonic_gpu(const float kA, const float xA, const float x, float *V, float *F)
93     constexpr float half = 0.5f;
/*! \brief Computes forces (and energy/virial when requested) for harmonic bonds.
 *
 * One thread handles one bond interaction \p i. Per-atom forces are
 * accumulated with atomicAdd into \p force; when \c calcVir, shift forces
 * are accumulated into the shared-memory buffer \p sm_fShiftLoc.
 *
 * NOTE(review): several lines are elided in this excerpt (e.g. the
 * function qualifier, the i < nBonds guard, and the declarations of
 * dx, vbond and fbond) — comments below cover only the visible code.
 */
103 template <bool calcVir, bool calcEner>
105 void bonds_gpu(const int i, float *vtot_loc, const int nBonds,
106 const t_iatom forceatoms[], const t_iparams forceparams[],
107 const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
108 const PbcAiuc pbcAiuc)
// A bond entry is 3 ints: parameter-type index followed by the two atoms.
112 int type = forceatoms[3*i];
113 int ai = forceatoms[3*i + 1];
114 int aj = forceatoms[3*i + 2];
116 /* dx = xi - xj, corrected for periodic boundary conditions. */
// ki is the shift-vector index of the applied PBC correction (virial only).
118 int ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dx);
120 float dr2 = iprod_gpu(dx, dx);
// NOTE(review): this is the double-precision sqrt overload; sqrtf would
// avoid a float->double promotion here — flag for follow-up.
121 float dr = sqrt(dr2);
// Harmonic bond: force constant krA, equilibrium length rA (A-state only).
125 harmonic_gpu(forceparams[type].harmonic.krA,
126 forceparams[type].harmonic.rA,
// Convert F*dr into the per-component scale factor F/dr via 1/sqrt(dr2).
136 fbond *= rsqrtf(dr2);
139 for (int m = 0; m < DIM; m++)
141 float fij = fbond*dx[m];
// Newton's third law: equal and opposite contributions on ai and aj.
142 atomicAdd(&force[ai][m], fij);
143 atomicAdd(&force[aj][m], -fij);
// Shift forces are needed only for the virial, and only when the pair
// crossed a periodic boundary (ki != CENTRAL).
144 if (calcVir && ki != CENTRAL)
146 atomicAdd(&sm_fShiftLoc[ki][m], fij);
147 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fij);
/*! \brief Computes the angle between bonds i-j and j-k.
 *
 * \tparam    returnShift Whether PBC shift indices are needed (virial).
 * \param[in]  xi,xj,xk   Coordinates (and charge in .w) of the three atoms.
 * \param[in]  pbcAiuc    PBC data.
 * \param[out] r_ij,r_kj  PBC-corrected bond vectors.
 * \param[out] costh      Cosine of the angle.
 *
 * NOTE(review): the tail of the parameter list (t1/t2 output shift
 * indices) and the return statement are elided in this excerpt; the
 * visible acosf(*costh) result is presumably returned — confirm.
 */
154 template <bool returnShift>
155 __device__ __forceinline__
156 static float bond_angle_gpu(const float4 xi, const float4 xj, const float4 xk,
157 const PbcAiuc &pbcAiuc,
158 fvec r_ij, fvec r_kj, float *costh,
160 /* Return value is the angle between the bonds i-j and j-k */
// t1/t2 record which periodic shift was applied to each bond vector.
162     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
163     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
165     *costh = cos_angle_gpu(r_ij, r_kj);
166     float th = acosf(*costh);
/*! \brief Computes forces for harmonic angle interactions.
 *
 * One thread handles one angle interaction \p i over atoms ai-aj-ak.
 * Forces are accumulated atomically into \p force; when \c calcVir,
 * shift forces go into the shared buffer \p sm_fShiftLoc.
 *
 * NOTE(review): declarations of r_ij, r_kj, cos_theta, t1, t2, dVdt,
 * f_i/f_j/f_k and several braces are elided in this excerpt.
 */
171 template <bool calcVir, bool calcEner>
173 void angles_gpu(const int i, float *vtot_loc, const int nBonds,
174 const t_iatom forceatoms[], const t_iparams forceparams[],
175 const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
176 const PbcAiuc pbcAiuc)
// An angle entry is 4 ints: parameter-type index plus three atom indices.
180 int type = forceatoms[4*i];
181 int ai = forceatoms[4*i + 1];
182 int aj = forceatoms[4*i + 2];
183 int ak = forceatoms[4*i + 3];
191 bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
192 r_ij, r_kj, &cos_theta, &t1, &t2);
// Harmonic in the angle; equilibrium value stored in degrees, hence DEG2RAD.
196 harmonic_gpu(forceparams[type].harmonic.krA,
197 forceparams[type].harmonic.rA*DEG2RAD,
// Guard against a degenerate (collinear) configuration where
// 1 - cos^2 would be <= 0 and rsqrtf would blow up.
205 float cos_theta2 = cos_theta*cos_theta;
206 if (cos_theta2 < 1.0f)
// st = dV/dtheta / sin(theta); used to project onto the bond vectors.
208 float st = dVdt*rsqrtf(1.0f - cos_theta2);
209 float sth = st*cos_theta;
210 float nrij2 = iprod_gpu(r_ij, r_ij);
211 float nrkj2 = iprod_gpu(r_kj, r_kj);
213 float nrij_1 = rsqrtf(nrij2);
214 float nrkj_1 = rsqrtf(nrkj2);
216 float cik = st*nrij_1*nrkj_1;
217 float cii = sth*nrij_1*nrij_1;
218 float ckk = sth*nrkj_1*nrkj_1;
224 for (int m = 0; m < DIM; m++)
226 f_i[m] = -(cik*r_kj[m] - cii*r_ij[m]);
227 f_k[m] = -(cik*r_ij[m] - ckk*r_kj[m]);
// The central atom takes the balancing force so the total is zero.
228 f_j[m] = -f_i[m] - f_k[m];
229 atomicAdd(&force[ai][m], f_i[m]);
230 atomicAdd(&force[aj][m], f_j[m]);
231 atomicAdd(&force[ak][m], f_k[m]);
// Shift-force accumulation for the virial (t1/t2 from bond_angle_gpu).
234 atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
235 atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
236 atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
/*! \brief Computes forces for Urey-Bradley interactions.
 *
 * A Urey-Bradley term is a harmonic angle over ai-aj-ak plus a harmonic
 * 1-3 "bond" directly between ai and ak. One thread handles one
 * interaction \p i. Only the A-state parameters are read here.
 *
 * NOTE(review): declarations of r_ij/r_kj/r_ik, cos_theta, t1/t2, va,
 * dVdt, vbond, fbond, f_i/f_j/f_k and several braces/guards are elided
 * in this excerpt.
 */
244 template <bool calcVir, bool calcEner>
246 void urey_bradley_gpu(const int i, float *vtot_loc, const int nBonds,
247 const t_iatom forceatoms[], const t_iparams forceparams[],
248 const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
249 const PbcAiuc pbcAiuc)
// Entry layout: parameter-type index plus three atom indices.
253 int type = forceatoms[4*i];
254 int ai = forceatoms[4*i+1];
255 int aj = forceatoms[4*i+2];
256 int ak = forceatoms[4*i+3];
// A-state parameters: angle (degrees -> radians), angle force constant,
// 1-3 equilibrium distance and 1-3 force constant.
258 float th0A = forceparams[type].u_b.thetaA*DEG2RAD;
259 float kthA = forceparams[type].u_b.kthetaA;
260 float r13A = forceparams[type].u_b.r13A;
261 float kUBA = forceparams[type].u_b.kUBA;
268 float theta = bond_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], pbcAiuc,
269 r_ij, r_kj, &cos_theta, &t1, &t2);
// Angle part: harmonic in theta.
273 harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
// 1-3 distance part: PBC-corrected vector from ai to ak.
281 int ki = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[ak], r_ik);
283 float dr2 = iprod_gpu(r_ik, r_ik);
// dr = sqrt(dr2) computed as dr2 * 1/sqrt(dr2) using the fast rsqrtf.
284 float dr = dr2*rsqrtf(dr2);
288 harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
// Angle force distribution; skip degenerate (collinear) configurations.
290 float cos_theta2 = cos_theta*cos_theta;
291 if (cos_theta2 < 1.0f)
293 float st = dVdt*rsqrtf(1.0f - cos_theta2);
294 float sth = st*cos_theta;
296 float nrkj2 = iprod_gpu(r_kj, r_kj);
297 float nrij2 = iprod_gpu(r_ij, r_ij);
299 float cik = st*rsqrtf(nrkj2*nrij2);
300 float cii = sth/nrij2;
301 float ckk = sth/nrkj2;
307 for (int m = 0; m < DIM; m++)
309 f_i[m] = -(cik*r_kj[m]-cii*r_ij[m]);
310 f_k[m] = -(cik*r_ij[m]-ckk*r_kj[m]);
311 f_j[m] = -f_i[m]-f_k[m];
312 atomicAdd(&force[ai][m], f_i[m]);
313 atomicAdd(&force[aj][m], f_j[m]);
314 atomicAdd(&force[ak][m], f_k[m]);
// Shift forces for the virial (angle part).
317 atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
318 atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
319 atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
324 /* Time for the bond calculations */
// 1-3 bond part, same pattern as bonds_gpu: scale F*dr by 1/dr.
332 fbond *= rsqrtf(dr2);
335 for (int m = 0; m < DIM; m++)
337 float fik = fbond*r_ik[m];
338 atomicAdd(&force[ai][m], fik);
339 atomicAdd(&force[ak][m], -fik);
// Shift force for the 1-3 pair only when it crossed a periodic boundary.
341 if (calcVir && ki != CENTRAL)
343 atomicAdd(&sm_fShiftLoc[ki][m], fik);
344 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fik);
/*! \brief Computes the dihedral angle over atoms i-j-k-l.
 *
 * \tparam    returnShift Whether PBC shift indices are needed (virial).
 * \param[out] r_ij,r_kj,r_kl PBC-corrected bond vectors.
 * \param[out] m,n            Plane normals (cross products) of the two planes.
 * \param[out] t1,t2,t3       Shift indices of the three PBC corrections.
 *
 * NOTE(review): the return statement is elided in this excerpt; the
 * visible sign computation suggests sign*phi is returned — confirm.
 */
177 template <bool returnShift, typename T>
352 __device__ __forceinline__
353 static float dih_angle_gpu(const T xi, const T xj, const T xk, const T xl,
354 const PbcAiuc &pbcAiuc,
355 fvec r_ij, fvec r_kj, fvec r_kl, fvec m, fvec n,
356 int *t1, int *t2, int *t3)
358     *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
359     *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
360     *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, r_kl);
// m = r_ij x r_kj and n = r_kj x r_kl are normals of the two planes;
// the dihedral is the angle between them.
362     cprod_gpu(r_ij, r_kj, m);
363     cprod_gpu(r_kj, r_kl, n);
364     float phi = gmx_angle_gpu(m, n);
// The sign of r_ij . n disambiguates the rotation direction.
365     float ipr = iprod_gpu(r_ij, n);
366     float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
/*! \brief Proper-dihedral (cosine) potential and its derivative.
 *
 * V = cpA * (1 + cos(mult*phi - phi0)), with phi0 = phiA in degrees.
 * \param[out] V Potential energy contribution.
 * \param[out] F Derivative term, -cpA*mult*sin(mdphi).
 *
 * NOTE(review): the declaration of mdphi and the computation of sdphi
 * (presumably sinf(mdphi)) are elided in this excerpt — confirm.
 */
373 __device__ __forceinline__
374 static void dopdihs_gpu(const float cpA, const float phiA, const int mult,
375 const float phi, float *V, float *F)
// Phase offset phiA is stored in degrees; convert before subtracting.
379     mdphi = mult*phi - phiA*DEG2RAD;
381     *V = cpA * (1.0f + cosf(mdphi));
382     *F = -cpA*mult*sdphi;
/*! \brief Distributes a dihedral force -dV/dphi (= \p ddphi) onto atoms i,j,k,l.
 *
 * Uses the standard lever-arm construction on the plane normals m and n.
 * Forces are accumulated atomically into \p force; when \c calcVir,
 * shift forces go into \p fshift (t3 is recomputed from the j-l vector).
 *
 * NOTE(review): declarations of f_i/f_j/f_k/f_l, uvec/vvec/svec, dx_jl
 * and surrounding braces are elided in this excerpt.
 */
385 template <bool calcVir>
387 static void do_dih_fup_gpu(const int i, const int j, const int k, const int l,
388 const float ddphi, const fvec r_ij, const fvec r_kj, const fvec r_kl,
389 const fvec m, const fvec n, fvec force[], fvec fshift[],
390 const PbcAiuc &pbcAiuc,
391 const float4 xq[], const int t1, const int t2, const int gmx_unused t3)
393     float iprm = iprod_gpu(m, m);
394     float iprn = iprod_gpu(n, n);
395     float nrkj2 = iprod_gpu(r_kj, r_kj);
// Skip near-degenerate (collinear) dihedrals where the normals vanish;
// tolerance scales with |r_kj|^2 and machine epsilon.
396     float toler = nrkj2*GMX_REAL_EPS;
397     if ((iprm > toler) && (iprn > toler))
399         float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
400         float nrkj_2 = nrkj_1*nrkj_1;
401         float nrkj = nrkj2*nrkj_1;
// Outer-atom forces are along the plane normals, scaled by ddphi.
402         float a = -ddphi*nrkj/iprm;
404         svmul_gpu(a, m, f_i);
405         float b = ddphi*nrkj/iprn;
407         svmul_gpu(b, n, f_l);
// p, q are the projections used to split the reaction forces onto j and k.
408         float p = iprod_gpu(r_ij, r_kj);
410         float q = iprod_gpu(r_kl, r_kj);
413         svmul_gpu(p, f_i, uvec);
415         svmul_gpu(q, f_l, vvec);
417         fvec_sub_gpu(uvec, vvec, svec);
419         fvec_sub_gpu(f_i, svec, f_j);
421         fvec_add_gpu(f_l, svec, f_k);
423         for (int m = 0; (m < DIM); m++)
// Sign convention: f_j and f_k are subtracted (they are reaction forces).
425             atomicAdd(&force[i][m], f_i[m]);
426             atomicAdd(&force[j][m], -f_j[m]);
427             atomicAdd(&force[k][m], -f_k[m]);
428             atomicAdd(&force[l][m], f_l[m]);
// For the virial, recompute the shift index of the j-l vector; the t3
// passed in (from dih_angle_gpu) refers to k-l and is unused here.
434             int t3 = pbcDxAiuc<calcVir>(pbcAiuc, xq[l], xq[j], dx_jl);
437             for (int m = 0; (m < DIM); m++)
439                 atomicAdd(&fshift[t1][m], f_i[m]);
440                 atomicAdd(&fshift[CENTRAL][m], -f_j[m]);
441                 atomicAdd(&fshift[t2][m], -f_k[m]);
442                 atomicAdd(&fshift[t3][m], f_l[m]);
/*! \brief Computes forces for proper dihedrals (cosine potential).
 *
 * One thread handles one dihedral \p i: computes the angle, evaluates the
 * periodic cosine potential via dopdihs_gpu, then spreads the force with
 * do_dih_fup_gpu.
 *
 * NOTE(review): declarations of r_ij/r_kj/r_kl, m, n, t1-t3, and the
 * vtot accumulation are elided in this excerpt.
 */
448 template <bool calcVir, bool calcEner>
450 void pdihs_gpu(const int i, float *vtot_loc, const int nBonds,
451 const t_iatom forceatoms[], const t_iparams forceparams[],
452 const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
453 const PbcAiuc pbcAiuc)
// A dihedral entry is 5 ints: parameter-type index plus four atom indices.
457 int type = forceatoms[5*i];
458 int ai = forceatoms[5*i + 1];
459 int aj = forceatoms[5*i + 2];
460 int ak = forceatoms[5*i + 3];
461 int al = forceatoms[5*i + 4];
472 dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
473 r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
// Periodic potential: amplitude cpA, phase phiA (degrees), multiplicity.
477 dopdihs_gpu(forceparams[type].pdihs.cpA,
478 forceparams[type].pdihs.phiA,
479 forceparams[type].pdihs.mult,
487 do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
488 ddphi, r_ij, r_kj, r_kl, m, n,
489 f, sm_fShiftLoc, pbcAiuc,
/*! \brief Computes forces for Ryckaert-Bellemans dihedrals.
 *
 * The RB potential is a polynomial in cos(psi) with coefficients rbcA[0..5]
 * (polymer convention: psi = phi - pi). One thread handles one dihedral.
 *
 * NOTE(review): declarations of phi, ddphi, cosfac, rbp, the lower-order
 * polynomial terms and the energy accumulation are elided in this excerpt.
 */
495 template <bool calcVir, bool calcEner>
497 void rbdihs_gpu(const int i, float *vtot_loc, const int nBonds,
498 const t_iatom forceatoms[], const t_iparams forceparams[],
499 const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
500 const PbcAiuc pbcAiuc)
// Named float constants keep the polynomial terms single precision.
502 constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
// Entry layout: parameter-type index plus four atom indices.
506 int type = forceatoms[5*i];
507 int ai = forceatoms[5*i+1];
508 int aj = forceatoms[5*i+2];
509 int ak = forceatoms[5*i+3];
510 int al = forceatoms[5*i+4];
521 dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
522 r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
524 /* Change to polymer convention */
534 float cos_phi = cosf(phi);
535 /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
536 float sin_phi = sinf(phi);
// Load the six A-state RB coefficients for this dihedral type.
538 float parm[NR_RBDIHS];
539 for (int j = 0; j < NR_RBDIHS; j++)
541 parm[j] = forceparams[type].rbdihs.rbcA[j];
543 /* Calculate cosine powers */
544 /* Calculate the energy */
545 /* Calculate the derivative */
// dV/dcos(psi) built up term by term: each contributes n*rbp*cos^(n-1).
558 ddphi += c2*rbp*cosfac;
565 ddphi += c3*rbp*cosfac;
572 ddphi += c4*rbp*cosfac;
579 ddphi += c5*rbp*cosfac;
// Chain rule: d/dphi = -sin(phi) * d/dcos(phi).
586 ddphi = -ddphi*sin_phi;
588 do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
589 ddphi, r_ij, r_kj, r_kl, m, n,
590 f, sm_fShiftLoc, pbcAiuc,
/*! \brief Wraps an angle difference into the periodic interval [-pi, pi).
 *
 * \param[in,out] dp Angle difference in radians; adjusted in place by
 *                   +/- 2*pi when it falls outside the interval.
 */
599 __device__ __forceinline__
600 static void make_dp_periodic_gpu(float *dp)
602     /* dp cannot be outside (-pi,pi) */
603     if (*dp >= CUDART_PI_F)
605         *dp -= 2.0f*CUDART_PI_F;
607     else if (*dp < -CUDART_PI_F)
609         *dp += 2.0f*CUDART_PI_F;
/*! \brief Computes forces for harmonic improper dihedrals.
 *
 * V = 0.5*k*(phi - phi0)^2 with the difference wrapped into [-pi, pi)
 * to avoid the periodicity discontinuity. One thread per interaction.
 *
 * NOTE(review): declarations of r_ij/r_kj/r_kl, m, n, t1-t3 and phi are
 * elided in this excerpt; phi presumably comes from dih_angle_gpu.
 */
613 template <bool calcVir, bool calcEner>
615 void idihs_gpu(const int i, float *vtot_loc, const int nBonds,
616 const t_iatom forceatoms[], const t_iparams forceparams[],
617 const float4 xq[], fvec f[], fvec sm_fShiftLoc[],
618 const PbcAiuc pbcAiuc)
// Entry layout: parameter-type index plus four atom indices.
622 int type = forceatoms[5*i];
623 int ai = forceatoms[5*i + 1];
624 int aj = forceatoms[5*i + 2];
625 int ak = forceatoms[5*i + 3];
626 int al = forceatoms[5*i + 4];
637 dih_angle_gpu<calcVir>(xq[ai], xq[aj], xq[ak], xq[al], pbcAiuc,
638 r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);
640 /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
641 * force changes if we just apply a normal harmonic.
642 * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
643 * This means we will never have the periodicity problem, unless
644 * the dihedral is Pi away from phiO, which is very unlikely due to
// A-state force constant and equilibrium angle (stored in degrees).
647 float kA = forceparams[type].harmonic.krA;
648 float pA = forceparams[type].harmonic.rA;
650 float phi0 = pA*DEG2RAD;
652 float dp = phi - phi0;
// Wrap phi - phi0 into [-pi, pi) before applying the harmonic.
654 make_dp_periodic_gpu(&dp);
656 float ddphi = -kA*dp;
// Note the extra minus sign on ddphi in the call: -(-kA*dp) = kA*dp.
658 do_dih_fup_gpu<calcVir>(ai, aj, ak, al,
659 -ddphi, r_ij, r_kj, r_kl, m, n,
660 f, sm_fShiftLoc, pbcAiuc,
// Energy: -0.5*(-kA*dp)*dp = 0.5*kA*dp^2.
665 *vtot_loc += -0.5f*ddphi*dp;
/*! \brief Computes LJ and scaled-Coulomb forces for 1-4 pair interactions.
 *
 * One thread handles one pair \p i. Charges are read from the .w
 * component of the float4 coordinates; the Coulomb term is scaled by
 * \p scale_factor (epsfac * fudgeQQ, set by the host launcher). LJ and
 * electrostatic energies accumulate into separate locations.
 *
 * NOTE(review): the declarations of dr and f and surrounding braces are
 * elided in this excerpt.
 */
670 template <bool calcVir, bool calcEner>
672 void pairs_gpu(const int i, const int nBonds,
673 const t_iatom iatoms[], const t_iparams iparams[],
674 const float4 xq[], fvec force[], fvec sm_fShiftLoc[],
675 const PbcAiuc pbcAiuc,
676 const float scale_factor,
677 float *vtotVdw_loc, float *vtotElec_loc)
// A pair entry is 3 ints: parameter-type index plus the two atom indices.
681 int itype = iatoms[3*i];
682 int ai = iatoms[3*i + 1];
683 int aj = iatoms[3*i + 2];
// Charge product from the .w components; A-state LJ 1-4 parameters.
685 float qq = xq[ai].w*xq[aj].w;
686 float c6 = iparams[itype].lj14.c6A;
687 float c12 = iparams[itype].lj14.c12A;
689 /* Do we need to apply full periodic boundary conditions? */
691 int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, xq[ai], xq[aj], dr);
693 float r2 = norm2_gpu(dr);
694 float rinv = rsqrtf(r2);
695 float rinv2 = rinv*rinv;
696 float rinv6 = rinv2*rinv2*rinv2;
698 /* Calculate the Coulomb force * r */
699 float velec = scale_factor*qq*rinv;
701 /* Calculate the LJ force * r and add it to the Coulomb part */
702 float fr = (12.0f*c12*rinv6 - 6.0f*c6)*rinv6 + velec;
// finvr = (F*r)/r^2 = F/r, so finvr*dr gives the Cartesian force vector.
704 float finvr = fr * rinv2;
706 svmul_gpu(finvr, dr, f);
710 for (int m = 0; m < DIM; m++)
// Equal and opposite per-atom force accumulation.
712 atomicAdd(&force[ai][m], f[m]);
713 atomicAdd(&force[aj][m], -f[m]);
// Shift forces only when the pair crossed a periodic boundary.
714 if (calcVir && fshift_index != CENTRAL)
716 atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
717 atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f[m]);
// LJ 12-6 energy and scaled Coulomb energy accumulate separately so the
// caller can report them under F_LJ14 and F_COUL14.
723 *vtotVdw_loc += (c12*rinv6 - c6)*rinv6;
724 *vtotElec_loc += velec;
/*! \brief Top-level bonded kernel: dispatches one thread per interaction.
 *
 * Threads of the (1-D) grid are partitioned into contiguous index ranges,
 * one range per interaction type present on the GPU; each thread finds
 * its range via ftypeRangeStart/End and runs the matching per-type
 * device function. Shift forces accumulate in shared memory and are
 * reduced to global memory by the first SHIFTS threads of each block
 * (hence TPB_BONDED >= SHIFTS, asserted by the host launcher).
 *
 * NOTE(review): the __global__ qualifier, the declarations of vtot_loc,
 * ftype and nFtypesOnGpu, the switch over ftype, and the __syncthreads()
 * barriers that must separate the shared-memory zeroing, use and final
 * reduction are all elided in this excerpt — confirm against full source.
 */
732 template <bool calcVir, bool calcEner>
734 void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
// The kernel assumes a 1-D thread block.
736     assert(blockDim.y == 1 && blockDim.z == 1);
737     const int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
// Per-thread energy accumulators for the 1-4 pair terms.
739     float vtotVdw_loc = 0;
740     float vtotElec_loc = 0;
// Block-shared shift-force accumulator, zeroed by the first SHIFTS threads.
741     __shared__ fvec sm_fShiftLoc[SHIFTS];
745     if (threadIdx.x < SHIFTS)
747         sm_fShiftLoc[threadIdx.x][XX] = 0.0f;
748         sm_fShiftLoc[threadIdx.x][YY] = 0.0f;
749         sm_fShiftLoc[threadIdx.x][ZZ] = 0.0f;
755     bool threadComputedPotential = false;
// Find which interaction-type range this thread falls into.
757     for (int j = 0; j < nFtypesOnGpu; j++)
759         if (threadIndex >= kernelParams.ftypeRangeStart[j] && threadIndex <= kernelParams.ftypeRangeEnd[j])
761             const int nBonds = kernelParams.nrFTypeBonds[j];
// Interaction index local to this type's range.
763             int localThreadIndex = threadIndex - kernelParams.ftypeRangeStart[j];
764             const t_iatom *iatoms = kernelParams.iatoms[j];
765             ftype = kernelParams.ftypesOnGpu[j];
768             threadComputedPotential = true;
// Per-type dispatch (case labels elided in this excerpt).
774             bonds_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
775             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
778             angles_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
779             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
782             urey_bradley_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
783             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
787             pdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
788             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
791             rbdihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
792             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
795             idihs_gpu<calcVir, calcEner>(localThreadIndex, &vtot_loc, nBonds, iatoms, kernelParams.forceparamsDevice,
796             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc);
799             pairs_gpu<calcVir, calcEner>(localThreadIndex, nBonds, iatoms, kernelParams.forceparamsDevice,
800             kernelParams.xqDevice, kernelParams.forceDevice, sm_fShiftLoc, kernelParams.pbcAiuc,
801             kernelParams.scaleFactor, &vtotVdw_loc, &vtotElec_loc);
// Reduce per-thread potentials into the per-ftype slots of vtotDevice;
// the 1-4 LJ and Coulomb parts go into their dedicated F_LJ14/F_COUL14 slots.
808     if (threadComputedPotential)
810         float *vtotVdw = kernelParams.vtotDevice + F_LJ14;
811         float *vtotElec = kernelParams.vtotDevice + F_COUL14;
812         atomicAdd(kernelParams.vtotDevice + ftype, vtot_loc);
813         atomicAdd(vtotVdw, vtotVdw_loc);
814         atomicAdd(vtotElec, vtotElec_loc);
816     /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
820     if (threadIdx.x < SHIFTS)
822         fvec_inc_atomic(kernelParams.fshiftDevice[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
828 /*-------------------------------- End CUDA kernels-----------------------------*/
/*! \brief Host-side launcher for the templated bonded kernel.
 *
 * Sets up PBC data, sizes a 1-D grid to cover the last interaction
 * thread index, fills in the runtime kernel parameters (1-4 Coulomb
 * scale factor and PBC), and launches exec_kernel_gpu on the bonded
 * stream.
 *
 * NOTE(review): the return type, remaining parameters, the early-return
 * body for the no-work case, and the stream/pbcAiuc declarations are
 * elided in this excerpt.
 */
831 template <bool calcVir, bool calcEner>
833 GpuBonded::Impl::launchKernel(const t_forcerec *fr,
836     GMX_ASSERT(haveInteractions_,
837                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
// The kernel zeroes/reduces SHIFTS shared shift vectors with the first
// SHIFTS threads of a block, so the block must be at least that big.
838     static_assert(TPB_BONDED >= SHIFTS, "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
841     setPbcAiuc(fr->bMolPBC ? ePBC2npbcdim(fr->ePBC) : 0, box, &pbcAiuc);
// Inclusive index of the last thread that has bonded work.
843     int ftypeRangeEnd = kernelParams_.ftypeRangeEnd[nFtypesOnGpu - 1];
// Negative end index means no bonded work on the GPU (body elided).
845     if (ftypeRangeEnd < 0)
850     KernelLaunchConfig config;
851     config.blockSize[0] = TPB_BONDED;
852     config.blockSize[1] = 1;
853     config.blockSize[2] = 1;
// ceil((ftypeRangeEnd + 1) / TPB_BONDED): enough blocks to cover the
// inclusive last thread index.
854     config.gridSize[0] = (ftypeRangeEnd + TPB_BONDED)/TPB_BONDED;
855     config.gridSize[1] = 1;
856     config.gridSize[2] = 1;
857     config.stream = stream;
859     auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
// 1-4 Coulomb scaling: electrostatics prefactor times the fudge factor.
860     kernelParams_.scaleFactor = fr->ic->epsfac*fr->fudgeQQ;
861     kernelParams_.pbcAiuc = pbcAiuc;
863     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
865     launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
869 GpuBonded::launchKernel(const t_forcerec *fr,
873 if (forceFlags & GMX_FORCE_ENERGY)
875 // When we need the energy, we also need the virial
876 impl_->launchKernel<true, true>
879 else if (forceFlags & GMX_FORCE_VIRIAL)
881 impl_->launchKernel<true, false>
886 impl_->launchKernel<false, false>