2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements CUDA bonded functionality
39 * \author Jon Vincent <jvincent@nvidia.com>
40 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41 * \author Berk Hess <hess@kth.se>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Alan Gray <alang@nvidia.com>
44 * \author Mark Abraham <mark.j.abraham@gmail.com>
46 * \ingroup module_listed_forces
53 #include <math_constants.h>
55 #include "gromacs/gpu_utils/cudautils.cuh"
56 #include "gromacs/gpu_utils/typecasts.cuh"
57 #include "gromacs/gpu_utils/vectype_ops.cuh"
58 #include "gromacs/listed_forces/gpubonded.h"
59 #include "gromacs/math/units.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/forcerec.h"
62 #include "gromacs/mdtypes/interaction_const.h"
63 #include "gromacs/mdtypes/simulation_workload.h"
64 #include "gromacs/pbcutil/pbc.h"
65 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
66 #include "gromacs/utility/gmxassert.h"
68 #include "gpubonded_impl.h"
74 // CUDA threads per block
75 #define TPB_BONDED 256
77 /*-------------------------------- CUDA kernels-------------------------------- */
78 /*------------------------------------------------------------------------------*/
80 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
82 /*---------------- BONDED CUDA kernels--------------*/
/*! \brief Harmonic potential helper shared by bond, angle and U-B kernels.
 *
 * \param[in]  kA  Force constant.
 * \param[in]  xA  Reference (equilibrium) value.
 * \param[in]  x   Current value (distance or angle).
 * \param[out] V   Potential energy.
 * \param[out] F   Scalar force factor.
 *
 * NOTE(review): the actual computation is elided in this view; the `half`
 * constant below suggests the standard 0.5*k*(x-xA)^2 form — confirm the
 * exact expression and the sign convention of *F in the full source.
 */
85 __device__ __forceinline__ static void
86 harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
88     constexpr float half = 0.5f;
/*! \brief Computes the force of harmonic bond number \p i and accumulates it
 *         into \p gm_f (and into the shared shift-force buffer when calcVir).
 *
 * NOTE(review): several lines (ai/aj extraction from bondData, dx declaration,
 * energy accumulation guarded by calcEner) are elided in this view.
 */
98 template<bool calcVir, bool calcEner>
99 __device__ void bonds_gpu(const int i,
102 const t_iatom d_forceatoms[],
103 const t_iparams d_forceparams[],
104 const float4 gm_xq[],
106 float3 sm_fShiftLoc[],
107 const PbcAiuc pbcAiuc)
/* Each bond entry is 3 ints {type, ai, aj}; load them with one int3 read. */
111 int3 bondData = *(int3*)(d_forceatoms + 3 * i);
112 int type = bondData.x;
116 /* dx = xi - xj, corrected for periodic boundary conditions. */
/* ki is the shift index of the periodic image used (CENTRAL when no shift). */
118 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
120 float dr2 = norm2(dx);
121 float dr = sqrt(dr2);
125 harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
/* Scale by 1/dr so that fbond*dx below is the force along the bond vector. */
134 fbond *= rsqrtf(dr2);
136 float3 fij = fbond * dx;
/* Atomics: multiple bonds (and threads) may touch the same atom. */
137 atomicAdd(&gm_f[ai], fij);
138 atomicAdd(&gm_f[aj], -fij);
/* Shift forces (for the virial) only matter when the pair crosses the box. */
139 if (calcVir && ki != CENTRAL)
141 atomicAdd(&sm_fShiftLoc[ki], fij);
142 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
/*! \brief Returns the angle between bonds i-j and j-k.
 *
 * Also writes out the PBC-corrected bond vectors *r_ij and *r_kj, their
 * periodic shift indices *t1/*t2 (used for the virial when returnShift is
 * set), and cos(theta) in *costh for reuse by the caller.
 * NOTE(review): the out-parameter declarations are elided in this view.
 */
148 template<bool returnShift>
149 __device__ __forceinline__ static float bond_angle_gpu(const float4 xi,
152 const PbcAiuc& pbcAiuc,
158 /* Return value is the angle between the bonds i-j and j-k */
160 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
161 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
/* cos(theta) is cached for the caller, which needs it for the force. */
163 *costh = cos_angle(*r_ij, *r_kj);
164 float th = acosf(*costh);
/*! \brief Computes the force of harmonic angle number \p i (atoms ai-aj-ak)
 *         and accumulates it into \p gm_f (and shift forces when calcVir).
 *
 * NOTE(review): declarations of r_ij/r_kj/cos_theta/t1/t2 and the calcEner
 * energy accumulation are elided in this view.
 */
169 template<bool calcVir, bool calcEner>
170 __device__ void angles_gpu(const int i,
173 const t_iatom d_forceatoms[],
174 const t_iparams d_forceparams[],
175 const float4 gm_xq[],
177 float3 sm_fShiftLoc[],
178 const PbcAiuc pbcAiuc)
/* Each angle entry is 4 ints {type, ai, aj, ak}; load with one int4 read. */
182 int4 angleData = *(int4*)(d_forceatoms + 4 * i);
183 int type = angleData.x;
184 int ai = angleData.y;
185 int aj = angleData.z;
186 int ak = angleData.w;
193 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
194 &r_kj, &cos_theta, &t1, &t2);
/* Harmonic in theta; the stored reference angle rA is in degrees. */
198 harmonic_gpu(d_forceparams[type].harmonic.krA,
199 d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F, theta, &va, &dVdt);
/* At theta = 0 or pi (cos^2 == 1) the force direction is undefined: skip. */
206 float cos_theta2 = cos_theta * cos_theta;
207 if (cos_theta2 < 1.0f)
/* st = dV/dtheta / sin(theta); sth = st * cos(theta). */
209 float st = dVdt * rsqrtf(1.0f - cos_theta2);
210 float sth = st * cos_theta;
211 float nrij2 = norm2(r_ij);
212 float nrkj2 = norm2(r_kj);
214 float nrij_1 = rsqrtf(nrij2);
215 float nrkj_1 = rsqrtf(nrkj2);
217 float cik = st * nrij_1 * nrkj_1;
218 float cii = sth * nrij_1 * nrij_1;
219 float ckk = sth * nrkj_1 * nrkj_1;
/* Forces on the outer atoms; the central atom gets minus their sum. */
221 float3 f_i = cii * r_ij - cik * r_kj;
222 float3 f_k = ckk * r_kj - cik * r_ij;
223 float3 f_j = -f_i - f_k;
225 atomicAdd(&gm_f[ai], f_i);
226 atomicAdd(&gm_f[aj], f_j);
227 atomicAdd(&gm_f[ak], f_k);
/* Shift-force accumulation (calcVir branch; the guard is elided here). */
231 atomicAdd(&sm_fShiftLoc[t1], f_i);
232 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
233 atomicAdd(&sm_fShiftLoc[t2], f_k);
/*! \brief Computes the force of Urey-Bradley term \p i: a harmonic angle
 *         (ai-aj-ak) plus a harmonic 1-3 "bond" between ai and ak.
 *
 * NOTE(review): type/ai/aj/ak extraction from ubData, several declarations,
 * and the calcEner energy accumulation are elided in this view.
 */
239 template<bool calcVir, bool calcEner>
240 __device__ void urey_bradley_gpu(const int i,
243 const t_iatom d_forceatoms[],
244 const t_iparams d_forceparams[],
245 const float4 gm_xq[],
247 float3 sm_fShiftLoc[],
248 const PbcAiuc pbcAiuc)
/* U-B entry is 4 ints {type, ai, aj, ak}; load with one int4 read. */
252 int4 ubData = *(int4*)(d_forceatoms + 4 * i);
/* Angle part parameters; thetaA is stored in degrees. */
258 float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
259 float kthA = d_forceparams[type].u_b.kthetaA;
/* 1-3 distance part parameters. */
260 float r13A = d_forceparams[type].u_b.r13A;
261 float kUBA = d_forceparams[type].u_b.kUBA;
268 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
269 &r_kj, &cos_theta, &t1, &t2);
273 harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
/* 1-3 vector ai->ak; ki is its periodic shift index. */
281 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
283 float dr2 = norm2(r_ik);
/* dr = sqrt(dr2), computed as dr2 * rsqrt(dr2). */
284 float dr = dr2 * rsqrtf(dr2);
288 harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
/* Angle force, same scheme as angles_gpu; skip when sin(theta) == 0. */
290 float cos_theta2 = cos_theta * cos_theta;
291 if (cos_theta2 < 1.0f)
293 float st = dVdt * rsqrtf(1.0f - cos_theta2);
294 float sth = st * cos_theta;
296 float nrkj2 = norm2(r_kj);
297 float nrij2 = norm2(r_ij);
299 float cik = st * rsqrtf(nrkj2 * nrij2);
300 float cii = sth / nrij2;
301 float ckk = sth / nrkj2;
303 float3 f_i = cii * r_ij - cik * r_kj;
304 float3 f_k = ckk * r_kj - cik * r_ij;
305 float3 f_j = -f_i - f_k;
307 atomicAdd(&gm_f[ai], f_i);
308 atomicAdd(&gm_f[aj], f_j);
309 atomicAdd(&gm_f[ak], f_k);
/* Shift-force accumulation (calcVir branch; the guard is elided here). */
313 atomicAdd(&sm_fShiftLoc[t1], f_i);
314 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
315 atomicAdd(&sm_fShiftLoc[t2], f_k);
319 /* Time for the bond calculations */
/* Scale by 1/dr so that fbond*r_ik below is the force along the 1-3 vector. */
327 fbond *= rsqrtf(dr2);
329 float3 fik = fbond * r_ik;
330 atomicAdd(&gm_f[ai], fik);
331 atomicAdd(&gm_f[ak], -fik);
333 if (calcVir && ki != CENTRAL)
335 atomicAdd(&sm_fShiftLoc[ki], fik);
336 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
/*! \brief Returns the dihedral angle of atoms i-j-k-l.
 *
 * Writes out the three PBC-corrected bond vectors (*r_ij, *r_kj, *r_kl),
 * their shift indices (*t1-*t3), and the two plane normals *m = r_ij x r_kj
 * and *n = r_kj x r_kl needed by do_dih_fup_gpu.
 * NOTE(review): the final `return sign * phi;` (or similar) is elided here.
 */
342 template<bool returnShift, typename T>
343 __device__ __forceinline__ static float dih_angle_gpu(const T xi,
347 const PbcAiuc& pbcAiuc,
357 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
358 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
359 *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
/* Normals of the (i,j,k) and (j,k,l) planes. */
361 *m = cprod(*r_ij, *r_kj);
362 *n = cprod(*r_kj, *r_kl);
363 float phi = gmx_angle(*m, *n);
/* The sign of phi is taken from the projection of r_ij onto n. */
364 float ipr = iprod(*r_ij, *n);
365 float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
/*! \brief Proper-dihedral potential: V = cpA*(1 + cos(mult*phi - phiA)) with
 *         phiA given in degrees.
 *
 * \param[out] v  Potential energy.
 * \param[out] f  -dV/dphi.
 * NOTE(review): the declaration/computation of sdphi (presumably
 * sin(mdphi)) is elided in this view — confirm in the full source.
 */
372 __device__ __forceinline__ static void
373 dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
377 mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
379 *v = cpA * (1.0f + cosf(mdphi));
380 *f = -cpA * mult * sdphi;
/*! \brief Distributes the dihedral force -dV/dphi (= \p ddphi) over the four
 *         atoms i,j,k,l and accumulates shift forces when calcVir is set.
 *
 * NOTE(review): the parameter list (j, k, l, ddphi, the bond vectors, m, n,
 * t1, t2) and the computation of f_i/f_l are partially elided in this view.
 */
383 template<bool calcVir>
384 __device__ static void do_dih_fup_gpu(const int i,
395 float3 sm_fShiftLoc[],
396 const PbcAiuc& pbcAiuc,
397 const float4 gm_xq[],
/* The t3 parameter is unused; it is recomputed below for the j-l shift. */
400 const int gmx_unused t3)
402 float iprm = norm2(m);
403 float iprn = norm2(n);
404 float nrkj2 = norm2(r_kj);
/* Skip near-degenerate geometries where the plane normals vanish. */
405 float toler = nrkj2 * GMX_REAL_EPS;
406 if ((iprm > toler) && (iprn > toler))
408 float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
409 float nrkj_2 = nrkj_1 * nrkj_1;
410 float nrkj = nrkj2 * nrkj_1;
411 float a = -ddphi * nrkj / iprm;
413 float b = ddphi * nrkj / iprn;
415 float p = iprod(r_ij, r_kj);
417 float q = iprod(r_kl, r_kj);
/* svec redistributes part of the outer-atom forces onto j and k. */
419 float3 uvec = p * f_i;
420 float3 vvec = q * f_l;
421 float3 svec = uvec - vvec;
422 float3 f_j = f_i - svec;
423 float3 f_k = f_l + svec;
425 atomicAdd(&gm_f[i], f_i);
426 atomicAdd(&gm_f[j], -f_j);
427 atomicAdd(&gm_f[k], -f_k);
428 atomicAdd(&gm_f[l], f_l);
/* Local t3 (shift of the j-l pair) shadows the unused parameter t3. */
433 int t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
435 atomicAdd(&sm_fShiftLoc[t1], f_i);
436 atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
437 atomicAdd(&sm_fShiftLoc[t2], -f_k);
438 atomicAdd(&sm_fShiftLoc[t3], f_l);
/*! \brief Computes the force of proper dihedral number \p i and accumulates
 *         it via do_dih_fup_gpu.
 *
 * NOTE(review): declarations of the intermediate vectors/shift indices and
 * the calcEner energy accumulation are elided in this view.
 */
443 template<bool calcVir, bool calcEner>
444 __device__ void pdihs_gpu(const int i,
447 const t_iatom d_forceatoms[],
448 const t_iparams d_forceparams[],
449 const float4 gm_xq[],
451 float3 sm_fShiftLoc[],
452 const PbcAiuc pbcAiuc)
/* Dihedral entry is 5 ints: {type, ai, aj, ak, al}. */
456 int type = d_forceatoms[5 * i];
457 int ai = d_forceatoms[5 * i + 1];
458 int aj = d_forceatoms[5 * i + 2];
459 int ak = d_forceatoms[5 * i + 3];
460 int al = d_forceatoms[5 * i + 4];
470 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
471 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
/* Cosine-series potential; ddphi receives -dV/dphi. */
475 dopdihs_gpu(d_forceparams[type].pdihs.cpA, d_forceparams[type].pdihs.phiA,
476 d_forceparams[type].pdihs.mult, phi, &vpd, &ddphi);
483 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
484 pbcAiuc, gm_xq, t1, t2, t3);
/*! \brief Computes the force of Ryckaert-Bellemans dihedral number \p i
 *         (cosine power series in the polymer convention).
 *
 * NOTE(review): the polymer-convention shift of phi, the cosfac power
 * updates between the ddphi lines, and the energy accumulation are elided
 * in this view.
 */
488 template<bool calcVir, bool calcEner>
489 __device__ void rbdihs_gpu(const int i,
492 const t_iatom d_forceatoms[],
493 const t_iparams d_forceparams[],
494 const float4 gm_xq[],
496 float3 sm_fShiftLoc[],
497 const PbcAiuc pbcAiuc)
/* Series coefficients used as the d/dcos(phi) multipliers below. */
499 constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
/* Dihedral entry is 5 ints: {type, ai, aj, ak, al}. */
503 int type = d_forceatoms[5 * i];
504 int ai = d_forceatoms[5 * i + 1];
505 int aj = d_forceatoms[5 * i + 2];
506 int ak = d_forceatoms[5 * i + 3];
507 int al = d_forceatoms[5 * i + 4];
517 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[ak], gm_xq[ak], gm_xq[al], pbcAiuc,
518 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
520 /* Change to polymer convention */
529 float cos_phi = cosf(phi);
530 /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
531 float sin_phi = sinf(phi);
/* Copy the six RB coefficients for this type into registers. */
533 float parm[NR_RBDIHS];
534 for (int j = 0; j < NR_RBDIHS; j++)
536 parm[j] = d_forceparams[type].rbdihs.rbcA[j];
538 /* Calculate cosine powers */
539 /* Calculate the energy */
540 /* Calculate the derivative */
/* ddphi accumulates d(sum parm[j]*cos^j)/dcos(phi), term by term. */
546 ddphi += rbp * cosfac;
553 ddphi += c2 * rbp * cosfac;
560 ddphi += c3 * rbp * cosfac;
567 ddphi += c4 * rbp * cosfac;
574 ddphi += c5 * rbp * cosfac;
/* Chain rule: dV/dphi = -(dV/dcos(phi)) * sin(phi). */
581 ddphi = -ddphi * sin_phi;
583 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
584 pbcAiuc, gm_xq, t1, t2, t3);
/*! \brief Wraps the angle difference *dp back into the interval [-pi, pi).
 *
 * A single correction step is applied in each direction; values already in
 * range are left untouched.
 */
__device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
{
    /* dp cannot be outside (-pi,pi) */
    const float twoPi = 2.0f * CUDART_PI_F;
    float       value = *dp;
    if (value >= CUDART_PI_F)
    {
        value -= twoPi;
    }
    else if (value < -CUDART_PI_F)
    {
        value += twoPi;
    }
    *dp = value;
}
/*! \brief Computes the force of improper (harmonic) dihedral number \p i.
 *
 * NOTE(review): declarations of the intermediate vectors/shift indices are
 * elided in this view.
 */
605 template<bool calcVir, bool calcEner>
606 __device__ void idihs_gpu(const int i,
609 const t_iatom d_forceatoms[],
610 const t_iparams d_forceparams[],
611 const float4 gm_xq[],
613 float3 sm_fShiftLoc[],
614 const PbcAiuc pbcAiuc)
/* Dihedral entry is 5 ints: {type, ai, aj, ak, al}. */
618 int type = d_forceatoms[5 * i];
619 int ai = d_forceatoms[5 * i + 1];
620 int aj = d_forceatoms[5 * i + 2];
621 int ak = d_forceatoms[5 * i + 3];
622 int al = d_forceatoms[5 * i + 4];
632 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
633 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
635 /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
636 * force changes if we just apply a normal harmonic.
637 * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
638 * This means we will never have the periodicity problem, unless
639 * the dihedral is Pi away from phi0, which is very unlikely due to
642 float kA = d_forceparams[type].harmonic.krA;
643 float pA = d_forceparams[type].harmonic.rA;
/* Reference angle is stored in degrees. */
645 float phi0 = pA * CUDA_DEG2RAD_F;
647 float dp = phi - phi0;
649 make_dp_periodic_gpu(&dp);
651 float ddphi = -kA * dp;
/* Note the sign flip: -ddphi = kA*dp is passed as -dV/dphi. */
653 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
654 pbcAiuc, gm_xq, t1, t2, t3);
/* Energy: -0.5*ddphi*dp == 0.5*kA*dp^2 (calcEner branch; guard elided). */
658 *vtot_loc += -0.5f * ddphi * dp;
/*! \brief Computes the LJ + Coulomb force of 1-4 pair number \p i, with the
 *         Coulomb part scaled by \p scale_factor (Fudge QQ).
 *
 * \param[in,out] vtotVdw_loc  Per-thread LJ energy accumulator (elided name
 *                             in the parameter list of this view).
 * \param[in,out] vtotElec_loc Per-thread Coulomb energy accumulator.
 */
663 template<bool calcVir, bool calcEner>
664 __device__ void pairs_gpu(const int i,
666 const t_iatom d_forceatoms[],
667 const t_iparams iparams[],
668 const float4 gm_xq[],
670 float3 sm_fShiftLoc[],
671 const PbcAiuc pbcAiuc,
672 const float scale_factor,
678 // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
/* Pair entry is 3 ints {type, ai, aj}; load with one int3 read. */
679 int3 pairData = *(int3*)(d_forceatoms + 3 * i);
680 int type = pairData.x;
/* Charges are stored in the .w component of the xq float4. */
684 float qq = gm_xq[ai].w * gm_xq[aj].w;
685 float c6 = iparams[type].lj14.c6A;
686 float c12 = iparams[type].lj14.c12A;
688 /* Do we need to apply full periodic boundary conditions? */
690 int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
692 float r2 = norm2(dr);
693 float rinv = rsqrtf(r2);
694 float rinv2 = rinv * rinv;
695 float rinv6 = rinv2 * rinv2 * rinv2;
697 /* Calculate the Coulomb force * r */
698 float velec = scale_factor * qq * rinv;
700 /* Calculate the LJ force * r and add it to the Coulomb part */
701 float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
/* fr is force*r; divide by r^2 to get force per unit dr. */
703 float finvr = fr * rinv2;
704 float3 f = finvr * dr;
707 atomicAdd(&gm_f[ai], f);
708 atomicAdd(&gm_f[aj], -f);
709 if (calcVir && fshift_index != CENTRAL)
711 atomicAdd(&sm_fShiftLoc[fshift_index], f);
712 atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
/* Energy accumulation (calcEner branch; guard elided in this view). */
717 *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
718 *vtotElec_loc += velec;
/*! \brief Top-level bonded kernel: each thread handles one interaction of one
 *         function type, selected by which [fTypeRangeStart, fTypeRangeEnd]
 *         interval the global thread index falls into.
 *
 * Launched with TPB_BONDED threads per block; requires blockDim.x >= SHIFTS
 * so the shared shift-force buffer can be initialized and written back by the
 * first SHIFTS threads (enforced by a static_assert at the launch site).
 * NOTE(review): the switch over fType, the vtot_loc declaration, and the
 * __syncthreads() barriers around the shared-memory init/readback are elided
 * in this view — confirm the barriers in the full source.
 */
726 template<bool calcVir, bool calcEner>
727 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
729 assert(blockDim.y == 1 && blockDim.z == 1);
730 const int tid = blockIdx.x * blockDim.x + threadIdx.x;
/* Per-thread energy accumulators, reduced via atomics at the end. */
732 float vtotVdw_loc = 0;
733 float vtotElec_loc = 0;
/* Block-local shift forces; reduced to global memory once per block. */
734 __shared__ float3 sm_fShiftLoc[SHIFTS];
738 if (threadIdx.x < SHIFTS)
740 sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
746 bool threadComputedPotential = false;
/* Find which function type this thread's tid maps to; ranges are disjoint. */
748 for (int j = 0; j < numFTypesOnGpu; j++)
750 if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
752 const int numBonds = kernelParams.numFTypeBonds[j];
/* fTypeTid is the interaction index within this function type. */
753 int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
754 const t_iatom* iatoms = kernelParams.d_iatoms[j];
755 fType = kernelParams.fTypesOnGpu[j];
758 threadComputedPotential = true;
/* Dispatch to the per-interaction device function (case labels elided). */
764 bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
765 kernelParams.d_forceParams, kernelParams.d_xq,
766 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
769 angles_gpu<calcVir, calcEner>(
770 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
771 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
774 urey_bradley_gpu<calcVir, calcEner>(
775 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
776 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
780 pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
781 kernelParams.d_forceParams, kernelParams.d_xq,
782 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
785 rbdihs_gpu<calcVir, calcEner>(
786 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
787 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
790 idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
791 kernelParams.d_forceParams, kernelParams.d_xq,
792 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
795 pairs_gpu<calcVir, calcEner>(fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
796 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc,
797 kernelParams.pbcAiuc, kernelParams.scaleFactor,
798 &vtotVdw_loc, &vtotElec_loc);
/* Reduce per-thread energies into the global per-fType totals. */
805 if (threadComputedPotential)
807 float* vtotVdw = kernelParams.d_vTot + F_LJ14;
808 float* vtotElec = kernelParams.d_vTot + F_COUL14;
809 atomicAdd(kernelParams.d_vTot + fType, vtot_loc);
810 atomicAdd(vtotVdw, vtotVdw_loc);
811 atomicAdd(vtotElec, vtotElec_loc);
813 /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
817 if (threadIdx.x < SHIFTS)
819 atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
825 /*-------------------------------- End CUDA kernels-----------------------------*/
/*! \brief Host-side launcher for exec_kernel_gpu with the given virial/energy
 *         template flags; grid size covers all interactions of all fTypes.
 *
 * NOTE(review): the PbcAiuc declaration and the early-return body for the
 * no-work case are elided in this view.
 */
828 template<bool calcVir, bool calcEner>
829 void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
831 GMX_ASSERT(haveInteractions_,
832 "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
/* The kernel's shared shift-force init/readback uses the first SHIFTS
 * threads of a block, so the block must be at least that large. */
833 static_assert(TPB_BONDED >= SHIFTS,
834 "TPB_BONDED must be >= SHIFTS for the virial kernel (calcVir=true)");
837 setPbcAiuc(fr->bMolPBC ? numPbcDimensions(fr->pbcType) : 0, box, &pbcAiuc);
/* Highest thread index used by any fType; inclusive. */
839 int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
/* Negative end means no bonded work this step. */
841 if (fTypeRangeEnd < 0)
846 KernelLaunchConfig config;
847 config.blockSize[0] = TPB_BONDED;
848 config.blockSize[1] = 1;
849 config.blockSize[2] = 1;
/* fTypeRangeEnd is an inclusive index, so fTypeRangeEnd+1 threads are
 * needed: (end + TPB)/TPB == ceil((end + 1)/TPB). */
850 config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
851 config.gridSize[1] = 1;
852 config.gridSize[2] = 1;
853 config.stream = stream_;
855 auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
/* 1-4 Coulomb scaling: epsfac * fudgeQQ, consumed by pairs_gpu. */
856 kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;
857 kernelParams_.pbcAiuc = pbcAiuc;
859 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
861 launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
/*! \brief Public entry point: picks the kernel template instantiation from
 *         the step's workload flags and forwards to the implementation.
 *
 * NOTE(review): the final else branch's braces / function close are past the
 * end of this view.
 */
864 void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
866 if (stepWork.computeEnergy)
868 // When we need the energy, we also need the virial
869 impl_->launchKernel<true, true>(fr, box);
/* Virial without energy: skip the energy accumulation work. */
871 else if (stepWork.computeVirial)
873 impl_->launchKernel<true, false>(fr, box);
/* Forces only. */
877 impl_->launchKernel<false, false>(fr, box);