/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief Implements CUDA bonded functionality
 *
 * \author Jon Vincent <jvincent@nvidia.com>
 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
 * \author Berk Hess <hess@kth.se>
 * \author Szilárd Páll <pall.szilard@gmail.com>
 * \author Alan Gray <alang@nvidia.com>
 * \author Mark Abraham <mark.j.abraham@gmail.com>
 *
 * \ingroup module_listed_forces
 */
#include "gmxpre.h"

#include <cassert>

#include <math_constants.h>

#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/gpu_vec.cuh"
#include "gromacs/listed_forces/gpubonded.h"
#include "gromacs/math/units.h"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
#include "gromacs/utility/gmxassert.h"

#include "gpubonded_impl.h"
// CUDA threads per block
#define TPB_BONDED 256

/*-------------------------------- CUDA kernels-------------------------------- */
/*------------------------------------------------------------------------------*/

#define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)

/*---------------- BONDED CUDA kernels--------------*/
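/*! \brief Harmonic potential and its derivative:
 *         V(x) = 0.5*k*(x - x0)^2 and F(x) = -dV/dx = -k*(x - x0).
 */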
__device__ __forceinline__ static void
harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
{
    constexpr float half = 0.5f;
    float           dx   = x - xA;
    float           dx2  = dx * dx;

    *F = -kA * dx;
    *V = half * kA * dx2;
}
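/*! \brief Computes forces (and, when calcEner is set, the potential) for harmonic bonds.
 *
 * One thread processes one bond. Atoms can be shared between bonds, so force
 * contributions are accumulated with atomics.
 */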
template<bool calcVir, bool calcEner>
__device__ void bonds_gpu(const int       i,
                          float*          vtot_loc,
                          const int       numBonds,
                          const t_iatom   d_forceatoms[],
                          const t_iparams d_forceparams[],
                          const float4    gm_xq[],
                          fvec            gm_f[],
                          fvec            sm_fShiftLoc[],
                          const PbcAiuc   pbcAiuc)
{
    if (i < numBonds)
    {
        int3 bondData = *(int3*)(d_forceatoms + 3 * i);
        int  type     = bondData.x;
        int  ai       = bondData.y;
        int  aj       = bondData.z;

        /* dx = xi - xj, corrected for periodic boundary conditions. */
        fvec dx;
        int  ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);

        float dr2 = iprod_gpu(dx, dx);
        float dr  = sqrt(dr2);

        float vbond, fbond;
        harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr,
                     &vbond, &fbond);

        if (calcEner)
        {
            *vtot_loc += vbond;
        }

        if (dr2 != 0.0f)
        {
            fbond *= rsqrtf(dr2);

            for (int m = 0; m < DIM; m++)
            {
                float fij = fbond * dx[m];
                atomicAdd(&gm_f[ai][m], fij);
                atomicAdd(&gm_f[aj][m], -fij);
                if (calcVir && ki != CENTRAL)
                {
                    atomicAdd(&sm_fShiftLoc[ki][m], fij);
                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fij);
                }
            }
        }
    }
}
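/*! \brief Returns the angle between bonds i-j and j-k.
 *
 * Also returns the cosine of the angle in *costh, which callers reuse when
 * applying the chain rule through cos(theta), and the PBC shift indices of
 * the two bond vectors in *t1 and *t2 for the virial.
 */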
template<bool returnShift>
__device__ __forceinline__ static float bond_angle_gpu(const float4   xi,
                                                       const float4   xj,
                                                       const float4   xk,
                                                       const PbcAiuc& pbcAiuc,
                                                       fvec           r_ij,
                                                       fvec           r_kj,
                                                       float*         costh,
                                                       int*           t1,
                                                       int*           t2)
{
    /* Return value is the angle between the bonds i-j and j-k */
    *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
    *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);

    *costh   = cos_angle_gpu(r_ij, r_kj);
    float th = acosf(*costh);

    return th;
}
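/*! \brief Computes forces for harmonic angles, V(theta) = 0.5*k*(theta - theta0)^2.
 *
 * The Cartesian forces follow from the chain rule through cos(theta); the
 * 1/sin(theta) factor appears below as rsqrtf(1 - cos^2(theta)), which is why
 * the near-linear case cos^2(theta) >= 1 is skipped.
 */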
template<bool calcVir, bool calcEner>
__device__ void angles_gpu(const int       i,
                           float*          vtot_loc,
                           const int       numBonds,
                           const t_iatom   d_forceatoms[],
                           const t_iparams d_forceparams[],
                           const float4    gm_xq[],
                           fvec            gm_f[],
                           fvec            sm_fShiftLoc[],
                           const PbcAiuc   pbcAiuc)
{
    if (i < numBonds)
    {
        int4 angleData = *(int4*)(d_forceatoms + 4 * i);
        int  type      = angleData.x;
        int  ai        = angleData.y;
        int  aj        = angleData.z;
        int  ak        = angleData.w;

        fvec  r_ij, r_kj;
        float cos_theta;
        int   t1, t2;
        float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, r_ij, r_kj,
                                              &cos_theta, &t1, &t2);

        float va, dVdt;
        harmonic_gpu(d_forceparams[type].harmonic.krA,
                     d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F, theta, &va, &dVdt);

        if (calcEner)
        {
            *vtot_loc += va;
        }

        float cos_theta2 = cos_theta * cos_theta;
        if (cos_theta2 < 1.0f)
        {
            float st    = dVdt * rsqrtf(1.0f - cos_theta2);
            float sth   = st * cos_theta;
            float nrij2 = iprod_gpu(r_ij, r_ij);
            float nrkj2 = iprod_gpu(r_kj, r_kj);

            float nrij_1 = rsqrtf(nrij2);
            float nrkj_1 = rsqrtf(nrkj2);

            float cik = st * nrij_1 * nrkj_1;
            float cii = sth * nrij_1 * nrij_1;
            float ckk = sth * nrkj_1 * nrkj_1;

            fvec f_i, f_j, f_k;
            for (int m = 0; m < DIM; m++)
            {
                f_i[m] = -(cik * r_kj[m] - cii * r_ij[m]);
                f_k[m] = -(cik * r_ij[m] - ckk * r_kj[m]);
                f_j[m] = -f_i[m] - f_k[m];
                atomicAdd(&gm_f[ai][m], f_i[m]);
                atomicAdd(&gm_f[aj][m], f_j[m]);
                atomicAdd(&gm_f[ak][m], f_k[m]);
                if (calcVir)
                {
                    atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
                    atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
                }
            }
        }
    }
}
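/*! \brief Computes forces for Urey-Bradley terms: a harmonic angle i-j-k plus
 *         a harmonic 1-3 correction "bond" between the outer atoms i and k.
 */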
template<bool calcVir, bool calcEner>
__device__ void urey_bradley_gpu(const int       i,
                                 float*          vtot_loc,
                                 const int       numBonds,
                                 const t_iatom   d_forceatoms[],
                                 const t_iparams d_forceparams[],
                                 const float4    gm_xq[],
                                 fvec            gm_f[],
                                 fvec            sm_fShiftLoc[],
                                 const PbcAiuc   pbcAiuc)
{
    if (i < numBonds)
    {
        int4 ubData = *(int4*)(d_forceatoms + 4 * i);
        int  type   = ubData.x;
        int  ai     = ubData.y;
        int  aj     = ubData.z;
        int  ak     = ubData.w;

        float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
        float kthA = d_forceparams[type].u_b.kthetaA;
        float r13A = d_forceparams[type].u_b.r13A;
        float kUBA = d_forceparams[type].u_b.kUBA;

        fvec  r_ij, r_kj;
        float cos_theta;
        int   t1, t2;
        float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, r_ij, r_kj,
                                              &cos_theta, &t1, &t2);

        float va, dVdt;
        harmonic_gpu(kthA, th0A, theta, &va, &dVdt);

        if (calcEner)
        {
            *vtot_loc += va;
        }

        fvec r_ik;
        int  ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);

        float dr2 = iprod_gpu(r_ik, r_ik);
        float dr  = dr2 * rsqrtf(dr2);

        float vbond, fbond;
        harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);

        float cos_theta2 = cos_theta * cos_theta;
        if (cos_theta2 < 1.0f)
        {
            float st  = dVdt * rsqrtf(1.0f - cos_theta2);
            float sth = st * cos_theta;

            float nrkj2 = iprod_gpu(r_kj, r_kj);
            float nrij2 = iprod_gpu(r_ij, r_ij);

            float cik = st * rsqrtf(nrkj2 * nrij2);
            float cii = sth / nrij2;
            float ckk = sth / nrkj2;

            fvec f_i, f_j, f_k;
            for (int m = 0; m < DIM; m++)
            {
                f_i[m] = -(cik * r_kj[m] - cii * r_ij[m]);
                f_k[m] = -(cik * r_ij[m] - ckk * r_kj[m]);
                f_j[m] = -f_i[m] - f_k[m];
                atomicAdd(&gm_f[ai][m], f_i[m]);
                atomicAdd(&gm_f[aj][m], f_j[m]);
                atomicAdd(&gm_f[ak][m], f_k[m]);
                if (calcVir)
                {
                    atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], f_j[m]);
                    atomicAdd(&sm_fShiftLoc[t2][m], f_k[m]);
                }
            }
        }

        /* Time for the bond calculations */
        if (dr2 != 0.0f)
        {
            if (calcEner)
            {
                *vtot_loc += vbond;
            }

            fbond *= rsqrtf(dr2);

            for (int m = 0; m < DIM; m++)
            {
                float fik = fbond * r_ik[m];
                atomicAdd(&gm_f[ai][m], fik);
                atomicAdd(&gm_f[ak][m], -fik);

                if (calcVir && ki != CENTRAL)
                {
                    atomicAdd(&sm_fShiftLoc[ki][m], fik);
                    atomicAdd(&sm_fShiftLoc[CENTRAL][m], -fik);
                }
            }
        }
    }
}
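/*! \brief Computes the dihedral angle phi of atoms i-j-k-l.
 *
 * m = r_ij x r_kj and n = r_kj x r_kl are the normals of the two planes; the
 * magnitude of phi comes from the angle between them and its sign from the
 * projection of r_ij onto n. The three PBC shift indices are returned for
 * the virial.
 */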
template<bool returnShift, typename T>
__device__ __forceinline__ static float dih_angle_gpu(const T        xi,
                                                      const T        xj,
                                                      const T        xk,
                                                      const T        xl,
                                                      const PbcAiuc& pbcAiuc,
                                                      fvec           r_ij,
                                                      fvec           r_kj,
                                                      fvec           r_kl,
                                                      fvec           m,
                                                      fvec           n,
                                                      int*           t1,
                                                      int*           t2,
                                                      int*           t3)
{
    *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, r_ij);
    *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, r_kj);
    *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, r_kl);

    cprod_gpu(r_ij, r_kj, m);
    cprod_gpu(r_kj, r_kl, n);
    float phi  = gmx_angle_gpu(m, n);
    float ipr  = iprod_gpu(r_ij, n);
    float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
    phi        = sign * phi;

    return phi;
}
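/*! \brief Periodic proper-dihedral potential, V(phi) = cp*(1 + cos(mult*phi - phi0)),
 *         returning the energy in *v and the scalar derivative term that
 *         do_dih_fup_gpu() spreads onto the atomic forces in *f.
 */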
__device__ __forceinline__ static void
dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
{
    float mdphi, sdphi;

    mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
    sdphi = sinf(mdphi);
    *v    = cpA * (1.0f + cosf(mdphi));
    *f    = -cpA * mult * sdphi;
}
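/*! \brief Spreads the dihedral force over the four atoms.
 *
 * The forces on the outer atoms i and l act along the plane normals m and n;
 * the inner atoms j and k receive the balancing contributions via the
 * projection vector svec, so that total force and torque are conserved.
 * Near-collinear configurations (|m|^2 or |n|^2 below the tolerance) are
 * skipped, since the dihedral force is ill-defined there.
 */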
template<bool calcVir>
__device__ static void do_dih_fup_gpu(const int            i,
                                      const int            j,
                                      const int            k,
                                      const int            l,
                                      const float          ddphi,
                                      const fvec           r_ij,
                                      const fvec           r_kj,
                                      const fvec           r_kl,
                                      const fvec           m,
                                      const fvec           n,
                                      fvec                 gm_f[],
                                      fvec                 sm_fShiftLoc[],
                                      const PbcAiuc&       pbcAiuc,
                                      const float4         gm_xq[],
                                      const int            t1,
                                      const int            t2,
                                      const int gmx_unused t3)
{
    float iprm  = iprod_gpu(m, m);
    float iprn  = iprod_gpu(n, n);
    float nrkj2 = iprod_gpu(r_kj, r_kj);
    float toler = nrkj2 * GMX_REAL_EPS;
    if ((iprm > toler) && (iprn > toler))
    {
        float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
        float nrkj_2 = nrkj_1 * nrkj_1;
        float nrkj   = nrkj2 * nrkj_1;
        float a      = -ddphi * nrkj / iprm;
        fvec  f_i;
        svmul_gpu(a, m, f_i);
        float b = ddphi * nrkj / iprn;
        fvec  f_l;
        svmul_gpu(b, n, f_l);
        float p = iprod_gpu(r_ij, r_kj);
        p *= nrkj_2;
        float q = iprod_gpu(r_kl, r_kj);
        q *= nrkj_2;
        fvec uvec;
        svmul_gpu(p, f_i, uvec);
        fvec vvec;
        svmul_gpu(q, f_l, vvec);
        fvec svec;
        fvec_sub_gpu(uvec, vvec, svec);
        fvec f_j;
        fvec_sub_gpu(f_i, svec, f_j);
        fvec f_k;
        fvec_add_gpu(f_l, svec, f_k);
        for (int m = 0; (m < DIM); m++)
        {
            atomicAdd(&gm_f[i][m], f_i[m]);
            atomicAdd(&gm_f[j][m], -f_j[m]);
            atomicAdd(&gm_f[k][m], -f_k[m]);
            atomicAdd(&gm_f[l][m], f_l[m]);
        }

        if (calcVir)
        {
            fvec dx_jl;
            int  t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);

            for (int m = 0; (m < DIM); m++)
            {
                atomicAdd(&sm_fShiftLoc[t1][m], f_i[m]);
                atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f_j[m]);
                atomicAdd(&sm_fShiftLoc[t2][m], -f_k[m]);
                atomicAdd(&sm_fShiftLoc[t3][m], f_l[m]);
            }
        }
    }
}
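/*! \brief Computes forces for proper dihedrals. */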
template<bool calcVir, bool calcEner>
__device__ void pdihs_gpu(const int       i,
                          float*          vtot_loc,
                          const int       numBonds,
                          const t_iatom   d_forceatoms[],
                          const t_iparams d_forceparams[],
                          const float4    gm_xq[],
                          fvec            gm_f[],
                          fvec            sm_fShiftLoc[],
                          const PbcAiuc   pbcAiuc)
{
    if (i < numBonds)
    {
        int type = d_forceatoms[5 * i];
        int ai   = d_forceatoms[5 * i + 1];
        int aj   = d_forceatoms[5 * i + 2];
        int ak   = d_forceatoms[5 * i + 3];
        int al   = d_forceatoms[5 * i + 4];

        fvec  r_ij, r_kj, r_kl, m, n;
        int   t1, t2, t3;
        float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                           r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);

        float vpd, ddphi;
        dopdihs_gpu(d_forceparams[type].pdihs.cpA, d_forceparams[type].pdihs.phiA,
                    d_forceparams[type].pdihs.mult, phi, &vpd, &ddphi);

        if (calcEner)
        {
            *vtot_loc += vpd;
        }

        do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
                                pbcAiuc, gm_xq, t1, t2, t3);
    }
}
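/*! \brief Computes forces for Ryckaert-Bellemans dihedrals,
 *         V(psi) = sum_{n=0..5} C_n cos^n(psi) with psi = phi - 180 deg.
 */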
template<bool calcVir, bool calcEner>
__device__ void rbdihs_gpu(const int       i,
                           float*          vtot_loc,
                           const int       numBonds,
                           const t_iatom   d_forceatoms[],
                           const t_iparams d_forceparams[],
                           const float4    gm_xq[],
                           fvec            gm_f[],
                           fvec            sm_fShiftLoc[],
                           const PbcAiuc   pbcAiuc)
{
    constexpr float c0 = 0.0f, c1 = 1.0f;

    if (i < numBonds)
    {
        int type = d_forceatoms[5 * i];
        int ai   = d_forceatoms[5 * i + 1];
        int aj   = d_forceatoms[5 * i + 2];
        int ak   = d_forceatoms[5 * i + 3];
        int al   = d_forceatoms[5 * i + 4];

        fvec  r_ij, r_kj, r_kl, m, n;
        int   t1, t2, t3;
        float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                           r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);

        /* Change to polymer convention */
        if (phi < c0)
        {
            phi += CUDART_PI_F;
        }
        else
        {
            phi -= CUDART_PI_F;
        }
        float cos_phi = cosf(phi);
        /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
        float sin_phi = sinf(phi);

        float parm[NR_RBDIHS];
        for (int j = 0; j < NR_RBDIHS; j++)
        {
            parm[j] = d_forceparams[type].rbdihs.rbcA[j];
        }

        /* Calculate the cosine powers, the energy sum_n parm[n]*cos^n(phi)
         * and its derivative with respect to cos(phi) in a single pass. */
        float v      = parm[0];
        float ddphi  = c0;
        float cosfac = c1;
        for (int j = 1; j < NR_RBDIHS; j++)
        {
            float rbp = parm[j];
            ddphi += static_cast<float>(j) * rbp * cosfac;
            cosfac *= cos_phi;
            if (calcEner)
            {
                v += cosfac * rbp;
            }
        }

        /* Chain rule: d cos(phi)/d phi = -sin(phi) */
        ddphi = -ddphi * sin_phi;

        do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
                                pbcAiuc, gm_xq, t1, t2, t3);
        if (calcEner)
        {
            *vtot_loc += v;
        }
    }
}
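/*! \brief Wraps an angle difference back into [-pi, pi).
 *
 * Assumes the input deviates from that interval by less than 2*pi, which
 * holds for the difference of two angles that each lie within (-pi, pi].
 */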
__device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
{
    /* dp cannot be outside (-pi,pi) */
    if (*dp >= CUDART_PI_F)
    {
        *dp -= 2.0f * CUDART_PI_F;
    }
    else if (*dp < -CUDART_PI_F)
    {
        *dp += 2.0f * CUDART_PI_F;
    }
}
template<bool calcVir, bool calcEner>
__device__ void idihs_gpu(const int       i,
                          float*          vtot_loc,
                          const int       numBonds,
                          const t_iatom   d_forceatoms[],
                          const t_iparams d_forceparams[],
                          const float4    gm_xq[],
                          fvec            gm_f[],
                          fvec            sm_fShiftLoc[],
                          const PbcAiuc   pbcAiuc)
{
    if (i < numBonds)
    {
        int type = d_forceatoms[5 * i];
        int ai   = d_forceatoms[5 * i + 1];
        int aj   = d_forceatoms[5 * i + 2];
        int ak   = d_forceatoms[5 * i + 3];
        int al   = d_forceatoms[5 * i + 4];

        fvec  r_ij, r_kj, r_kl, m, n;
        int   t1, t2, t3;
        float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
                                           r_ij, r_kj, r_kl, m, n, &t1, &t2, &t3);

        /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
         * force changes if we just apply a normal harmonic.
         * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
         * This means we will never have the periodicity problem, unless
         * the dihedral is Pi away from phi0, which is very unlikely due to
         * the potential.
         */
        float kA = d_forceparams[type].harmonic.krA;
        float pA = d_forceparams[type].harmonic.rA;

        float phi0 = pA * CUDA_DEG2RAD_F;

        float dp = phi - phi0;

        make_dp_periodic_gpu(&dp);

        float ddphi = -kA * dp;

        do_dih_fup_gpu<calcVir>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
                                pbcAiuc, gm_xq, t1, t2, t3);

        if (calcEner)
        {
            *vtot_loc += -0.5f * ddphi * dp;
        }
    }
}
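/*! \brief Computes 1-4 pair interactions: Lennard-Jones plus Coulomb, with the
 *         Coulomb term scaled by scale_factor. The LJ and electrostatic
 *         energies are accumulated separately, as they are reported under
 *         different energy terms (F_LJ14 and F_COUL14).
 */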
template<bool calcVir, bool calcEner>
__device__ void pairs_gpu(const int       i,
                          const int       numBonds,
                          const t_iatom   d_forceatoms[],
                          const t_iparams iparams[],
                          const float4    gm_xq[],
                          fvec            gm_f[],
                          fvec            sm_fShiftLoc[],
                          const PbcAiuc   pbcAiuc,
                          const float     scale_factor,
                          float*          vtotVdw_loc,
                          float*          vtotElec_loc)
{
    if (i < numBonds)
    {
        int3 pairData = *(int3*)(d_forceatoms + 3 * i);
        int  type     = pairData.x;
        int  ai       = pairData.y;
        int  aj       = pairData.z;

        float qq  = gm_xq[ai].w * gm_xq[aj].w;
        float c6  = iparams[type].lj14.c6A;
        float c12 = iparams[type].lj14.c12A;

        /* Do we need to apply full periodic boundary conditions? */
        fvec dr;
        int  fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);

        float r2    = norm2_gpu(dr);
        float rinv  = rsqrtf(r2);
        float rinv2 = rinv * rinv;
        float rinv6 = rinv2 * rinv2 * rinv2;

        /* Calculate the Coulomb force * r */
        float velec = scale_factor * qq * rinv;

        /* Calculate the LJ force * r and add it to the Coulomb part */
        float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;

        float finvr = fr * rinv2;
        fvec  f;
        svmul_gpu(finvr, dr, f);

        /* Add the forces */
        for (int m = 0; m < DIM; m++)
        {
            atomicAdd(&gm_f[ai][m], f[m]);
            atomicAdd(&gm_f[aj][m], -f[m]);
            if (calcVir && fshift_index != CENTRAL)
            {
                atomicAdd(&sm_fShiftLoc[fshift_index][m], f[m]);
                atomicAdd(&sm_fShiftLoc[CENTRAL][m], -f[m]);
            }
        }

        if (calcEner)
        {
            *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
            *vtotElec_loc += velec;
        }
    }
}
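/*! \brief Fused kernel computing all GPU-supported listed interactions.
 *
 * The interaction types occupy consecutive ranges of the thread index space
 * (fTypeRangeStart/fTypeRangeEnd), so each thread finds its type with a
 * short linear search and then handles exactly one interaction. Shift forces
 * are first reduced in shared memory and only afterwards accumulated to
 * global memory by the first SHIFTS threads of each block.
 */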
template<bool calcVir, bool calcEner>
__global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
{
    assert(blockDim.y == 1 && blockDim.z == 1);
    const int tid          = blockIdx.x * blockDim.x + threadIdx.x;
    float     vtot_loc     = 0;
    float     vtotVdw_loc  = 0;
    float     vtotElec_loc = 0;
    __shared__ fvec sm_fShiftLoc[SHIFTS];

    if (calcVir)
    {
        if (threadIdx.x < SHIFTS)
        {
            sm_fShiftLoc[threadIdx.x][XX] = 0.0f;
            sm_fShiftLoc[threadIdx.x][YY] = 0.0f;
            sm_fShiftLoc[threadIdx.x][ZZ] = 0.0f;
        }
        __syncthreads();
    }

    int  fType;
    bool threadComputedPotential = false;

    for (int j = 0; j < numFTypesOnGpu; j++)
    {
        if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
        {
            const int      numBonds = kernelParams.numFTypeBonds[j];
            int            fTypeTid = tid - kernelParams.fTypeRangeStart[j];
            const t_iatom* iatoms   = kernelParams.d_iatoms[j];
            fType                   = kernelParams.fTypesOnGpu[j];
            if (calcEner)
            {
                threadComputedPotential = true;
            }

            switch (fType)
            {
                case F_BONDS:
                    bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
                                                 kernelParams.d_forceParams, kernelParams.d_xq,
                                                 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                case F_ANGLES:
                    angles_gpu<calcVir, calcEner>(
                            fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
                            kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                case F_UREY_BRADLEY:
                    urey_bradley_gpu<calcVir, calcEner>(
                            fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
                            kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                case F_PDIHS:
                case F_PIDIHS:
                    pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
                                                 kernelParams.d_forceParams, kernelParams.d_xq,
                                                 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                case F_RBDIHS:
                    rbdihs_gpu<calcVir, calcEner>(
                            fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
                            kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                case F_IDIHS:
                    idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
                                                 kernelParams.d_forceParams, kernelParams.d_xq,
                                                 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
                    break;
                case F_LJ14:
                    pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
                                                 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc,
                                                 kernelParams.pbcAiuc, kernelParams.scaleFactor,
                                                 &vtotVdw_loc, &vtotElec_loc);
                    break;
            }
            break;
        }
    }

    if (threadComputedPotential)
    {
        float* vtotVdw  = kernelParams.d_vTot + F_LJ14;
        float* vtotElec = kernelParams.d_vTot + F_COUL14;
        atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
        atomicAdd(vtotVdw, vtotVdw_loc);
        atomicAdd(vtotElec, vtotElec_loc);
    }
    /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
    if (calcVir)
    {
        __syncthreads();
        if (threadIdx.x < SHIFTS)
        {
            fvec_inc_atomic(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
        }
    }
}
/*-------------------------------- End CUDA kernels-----------------------------*/
template<bool calcVir, bool calcEner>
void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
{
    GMX_ASSERT(haveInteractions_,
               "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
    static_assert(TPB_BONDED >= SHIFTS,
                  "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");

    PbcAiuc pbcAiuc;
    setPbcAiuc(fr->bMolPBC ? numPbcDimensions(fr->pbcType) : 0, box, &pbcAiuc);

    int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];

    if (fTypeRangeEnd < 0)
    {
        return;
    }

    KernelLaunchConfig config;
    config.blockSize[0] = TPB_BONDED;
    config.blockSize[1] = 1;
    config.blockSize[2] = 1;
    /* fTypeRangeEnd is an inclusive thread index, so this rounds
     * (fTypeRangeEnd + 1) threads up to a whole number of blocks. */
    config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
    config.gridSize[1] = 1;
    config.gridSize[2] = 1;
    config.stream      = stream_;

    auto kernelPtr            = exec_kernel_gpu<calcVir, calcEner>;
    kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
    kernelParams_.pbcAiuc     = pbcAiuc;

    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);

    launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
}
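/*! \brief Selects the templated kernel flavor matching the per-step workload
 *         flags and launches it.
 */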
void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
{
    if (stepWork.computeEnergy)
    {
        // When we need the energy, we also need the virial
        impl_->launchKernel<true, true>(fr, box);
    }
    else if (stepWork.computeVirial)
    {
        impl_->launchKernel<true, false>(fr, box);
    }
    else
    {
        impl_->launchKernel<false, false>(fr, box);
    }
}