2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements CUDA bonded functionality
39 * \author Jon Vincent <jvincent@nvidia.com>
40 * \author Magnus Lundborg <lundborg.magnus@gmail.com>
41 * \author Berk Hess <hess@kth.se>
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \author Alan Gray <alang@nvidia.com>
44 * \author Mark Abraham <mark.j.abraham@gmail.com>
46 * \ingroup module_listed_forces
53 #include <math_constants.h>
55 #include "gromacs/gpu_utils/cudautils.cuh"
56 #include "gromacs/gpu_utils/typecasts.cuh"
57 #include "gromacs/gpu_utils/vectype_ops.cuh"
58 #include "gromacs/listed_forces/gpubonded.h"
59 #include "gromacs/math/units.h"
60 #include "gromacs/mdlib/force_flags.h"
61 #include "gromacs/mdtypes/interaction_const.h"
62 #include "gromacs/mdtypes/simulation_workload.h"
63 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
64 #include "gromacs/timing/wallcycle.h"
65 #include "gromacs/utility/gmxassert.h"
67 #include "gpubonded_impl.h"
73 /*-------------------------------- CUDA kernels-------------------------------- */
74 /*------------------------------------------------------------------------------*/
76 #define CUDA_DEG2RAD_F (CUDART_PI_F / 180.0f)
78 /*---------------- BONDED CUDA kernels--------------*/
/* Evaluates a harmonic potential term: given force constant kA and
 * reference value xA, writes the energy to *V and a force factor to *F
 * for the coordinate x.
 * NOTE(review): the function body is truncated in this view of the file;
 * only the 0.5f prefactor constant is visible below — confirm against the
 * full source. */
81 __device__ __forceinline__ static void
82 harmonic_gpu(const float kA, const float xA, const float x, float* V, float* F)
84     constexpr float half = 0.5f;
/* Harmonic bond interaction for bond index i.
 * Reads the (type, ai, aj) triplet from d_forceatoms, computes the
 * PBC-corrected bond vector dx and length dr, evaluates the harmonic term
 * via harmonic_gpu(), and accumulates per-atom forces into gm_f with
 * atomics. When calcVir is set, shift forces go into the shared-memory
 * sm_fShiftLoc accumulator.
 * NOTE(review): braces, the ai/aj extraction from bondData, and the
 * calcEner energy accumulation are missing from this view. */
94 template<bool calcVir, bool calcEner>
95 __device__ void bonds_gpu(const int i,
98 const t_iatom d_forceatoms[],
99 const t_iparams d_forceparams[],
100 const float4 gm_xq[],
102 float3 sm_fShiftLoc[],
103 const PbcAiuc pbcAiuc)
/* Each bond entry is 3 ints: interaction type + two atom indices. */
107 int3 bondData = *(int3*)(d_forceatoms + 3 * i);
108 int type = bondData.x;
112 /* dx = xi - xj, corrected for periodic boundary conditions. */
/* ki is the shift index of the periodic image used (for the virial). */
114 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dx);
116 float dr2 = norm2(dx);
117 float dr = sqrt(dr2);
121 harmonic_gpu(d_forceparams[type].harmonic.krA, d_forceparams[type].harmonic.rA, dr, &vbond, &fbond);
/* Scale by 1/dr so that fbond*dx below yields the Cartesian force. */
130 fbond *= rsqrtf(dr2);
132 float3 fij = fbond * dx;
133 atomicAdd(&gm_f[ai], fij);
134 atomicAdd(&gm_f[aj], -fij);
/* Shift-force accumulation for the virial; CENTRAL means no PBC shift. */
135 if (calcVir && ki != CENTRAL)
137 atomicAdd(&sm_fShiftLoc[ki], fij);
138 atomicAdd(&sm_fShiftLoc[CENTRAL], -fij);
/* Computes the angle between bonds i-j and j-k under PBC.
 * Outputs: *r_ij and *r_kj bond vectors, *costh = cos(theta), and the
 * shift indices *t1/*t2 from the PBC correction (used for the virial
 * when returnShift is set). Returns theta in radians.
 * NOTE(review): the trailing return statement is not visible in this
 * view of the file. */
144 template<bool returnShift>
145 __device__ __forceinline__ static float bond_angle_gpu(const float4 xi,
148 const PbcAiuc& pbcAiuc,
154 /* Return value is the angle between the bonds i-j and j-k */
156 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
157 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
159 *costh = cos_angle(*r_ij, *r_kj);
160 float th = acosf(*costh);
/* Harmonic angle interaction for angle index i.
 * Reads (type, ai, aj, ak) from d_forceatoms, computes the angle via
 * bond_angle_gpu(), evaluates the harmonic angle term (reference angle
 * converted from degrees to radians), and distributes forces to the
 * three atoms. The cos_theta2 < 1 guard avoids division by zero in
 * rsqrtf(1 - cos^2) for collinear configurations.
 * NOTE(review): braces and the calcEner energy accumulation are missing
 * from this view. */
165 template<bool calcVir, bool calcEner>
166 __device__ void angles_gpu(const int i,
169 const t_iatom d_forceatoms[],
170 const t_iparams d_forceparams[],
171 const float4 gm_xq[],
173 float3 sm_fShiftLoc[],
174 const PbcAiuc pbcAiuc)
/* Each angle entry is 4 ints: type + three atom indices. */
178 int4 angleData = *(int4*)(d_forceatoms + 4 * i);
179 int type = angleData.x;
180 int ai = angleData.y;
181 int aj = angleData.z;
182 int ak = angleData.w;
189 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
190 &r_kj, &cos_theta, &t1, &t2);
194 harmonic_gpu(d_forceparams[type].harmonic.krA,
195 d_forceparams[type].harmonic.rA * CUDA_DEG2RAD_F, theta, &va, &dVdt);
202 float cos_theta2 = cos_theta * cos_theta;
203 if (cos_theta2 < 1.0f)
/* st = -dV/dtheta / sin(theta); sth = st * cos(theta). */
205 float st = dVdt * rsqrtf(1.0f - cos_theta2);
206 float sth = st * cos_theta;
207 float nrij2 = norm2(r_ij);
208 float nrkj2 = norm2(r_kj);
210 float nrij_1 = rsqrtf(nrij2);
211 float nrkj_1 = rsqrtf(nrkj2);
213 float cik = st * nrij_1 * nrkj_1;
214 float cii = sth * nrij_1 * nrij_1;
215 float ckk = sth * nrkj_1 * nrkj_1;
/* f_j closes the triple so the net force on the angle is zero. */
217 float3 f_i = cii * r_ij - cik * r_kj;
218 float3 f_k = ckk * r_kj - cik * r_ij;
219 float3 f_j = -f_i - f_k;
221 atomicAdd(&gm_f[ai], f_i);
222 atomicAdd(&gm_f[aj], f_j);
223 atomicAdd(&gm_f[ak], f_k);
/* Shift forces for the virial (guarded by calcVir in the full source). */
227 atomicAdd(&sm_fShiftLoc[t1], f_i);
228 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
229 atomicAdd(&sm_fShiftLoc[t2], f_k);
/* Urey-Bradley interaction for entry i: a harmonic angle term on
 * ai-aj-ak plus a harmonic 1-3 "bond" term between ai and ak.
 * The angle part mirrors angles_gpu(); the 1-3 part mirrors bonds_gpu().
 * NOTE(review): braces, atom-index extraction from ubData, and the
 * calcEner energy accumulations are missing from this view. */
235 template<bool calcVir, bool calcEner>
236 __device__ void urey_bradley_gpu(const int i,
239 const t_iatom d_forceatoms[],
240 const t_iparams d_forceparams[],
241 const float4 gm_xq[],
243 float3 sm_fShiftLoc[],
244 const PbcAiuc pbcAiuc)
/* Each U-B entry is 4 ints: type + three atom indices. */
248 int4 ubData = *(int4*)(d_forceatoms + 4 * i);
/* Angle parameters (theta in degrees -> radians) and 1-3 bond parameters. */
254 float th0A = d_forceparams[type].u_b.thetaA * CUDA_DEG2RAD_F;
255 float kthA = d_forceparams[type].u_b.kthetaA;
256 float r13A = d_forceparams[type].u_b.r13A;
257 float kUBA = d_forceparams[type].u_b.kUBA;
264 float theta = bond_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], pbcAiuc, &r_ij,
265 &r_kj, &cos_theta, &t1, &t2);
269 harmonic_gpu(kthA, th0A, theta, &va, &dVdt);
/* 1-3 distance between ai and ak, PBC-corrected. */
277 int ki = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[ak], r_ik);
279 float dr2 = norm2(r_ik);
/* dr = dr2 / sqrt(dr2) == sqrt(dr2), computed via rsqrtf. */
280 float dr = dr2 * rsqrtf(dr2);
284 harmonic_gpu(kUBA, r13A, dr, &vbond, &fbond);
/* Angle force distribution; guard avoids rsqrtf(0) at collinearity. */
286 float cos_theta2 = cos_theta * cos_theta;
287 if (cos_theta2 < 1.0f)
289 float st = dVdt * rsqrtf(1.0f - cos_theta2);
290 float sth = st * cos_theta;
292 float nrkj2 = norm2(r_kj);
293 float nrij2 = norm2(r_ij);
295 float cik = st * rsqrtf(nrkj2 * nrij2);
296 float cii = sth / nrij2;
297 float ckk = sth / nrkj2;
299 float3 f_i = cii * r_ij - cik * r_kj;
300 float3 f_k = ckk * r_kj - cik * r_ij;
301 float3 f_j = -f_i - f_k;
303 atomicAdd(&gm_f[ai], f_i);
304 atomicAdd(&gm_f[aj], f_j);
305 atomicAdd(&gm_f[ak], f_k);
309 atomicAdd(&sm_fShiftLoc[t1], f_i);
310 atomicAdd(&sm_fShiftLoc[CENTRAL], f_j);
311 atomicAdd(&sm_fShiftLoc[t2], f_k);
315 /* Time for the bond calculations */
/* Scale by 1/dr so fbond*r_ik gives the Cartesian 1-3 force. */
323 fbond *= rsqrtf(dr2);
325 float3 fik = fbond * r_ik;
326 atomicAdd(&gm_f[ai], fik);
327 atomicAdd(&gm_f[ak], -fik);
329 if (calcVir && ki != CENTRAL)
331 atomicAdd(&sm_fShiftLoc[ki], fik);
332 atomicAdd(&sm_fShiftLoc[CENTRAL], -fik);
/* Computes the dihedral angle phi for atoms i-j-k-l under PBC.
 * Outputs the three bond vectors r_ij/r_kj/r_kl, the plane normals
 * m = r_ij x r_kj and n = r_kj x r_kl, and shift indices t1/t2/t3.
 * The sign of phi is taken from iprod(r_ij, n).
 * NOTE(review): the trailing return (sign-adjusted phi) is not visible
 * in this view of the file. */
338 template<bool returnShift, typename T>
339 __device__ __forceinline__ static float dih_angle_gpu(const T xi,
343 const PbcAiuc& pbcAiuc,
353 *t1 = pbcDxAiuc<returnShift>(pbcAiuc, xi, xj, *r_ij);
354 *t2 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xj, *r_kj);
355 *t3 = pbcDxAiuc<returnShift>(pbcAiuc, xk, xl, *r_kl);
357 *m = cprod(*r_ij, *r_kj);
358 *n = cprod(*r_kj, *r_kl);
359 float phi = gmx_angle(*m, *n);
360 float ipr = iprod(*r_ij, *n);
361 float sign = (ipr < 0.0f) ? -1.0f : 1.0f;
/* Proper-dihedral potential: V = cpA*(1 + cos(mult*phi - phi0)) with
 * phi0 = phiA in degrees converted to radians. Writes energy to *v and
 * the derivative factor -cpA*mult*sin(mdphi) to *f.
 * NOTE(review): the declarations of mdphi and sdphi (presumably
 * sdphi = sinf(mdphi)) are missing from this view — confirm against the
 * full source. */
368 __device__ __forceinline__ static void
369 dopdihs_gpu(const float cpA, const float phiA, const int mult, const float phi, float* v, float* f)
373 mdphi = mult * phi - phiA * CUDA_DEG2RAD_F;
375 *v = cpA * (1.0f + cosf(mdphi));
376 *f = -cpA * mult * sdphi;
/* Distributes the dihedral force -dV/dphi (= ddphi) onto the four atoms
 * i, j, k, l, given the bond vectors and plane normals m, n from
 * dih_angle_gpu(). Skips near-degenerate geometries where either normal
 * is shorter than toler (relative to |r_kj|^2 * GMX_REAL_EPS).
 * When calcVir, also accumulates shift forces, recomputing t3 from the
 * j-l vector.
 * NOTE(review): the declarations of f_i and f_l (scaled m and n) are
 * missing from this view. */
379 template<bool calcVir>
380 __device__ static void do_dih_fup_gpu(const int i,
391 float3 sm_fShiftLoc[],
392 const PbcAiuc& pbcAiuc,
393 const float4 gm_xq[],
396 const int gmx_unused t3)
398 float iprm = norm2(m);
399 float iprn = norm2(n);
400 float nrkj2 = norm2(r_kj);
/* Tolerance scales with |r_kj|^2 to stay dimensionally consistent. */
401 float toler = nrkj2 * GMX_REAL_EPS;
402 if ((iprm > toler) && (iprn > toler))
404 float nrkj_1 = rsqrtf(nrkj2); // replacing std::invsqrt call
405 float nrkj_2 = nrkj_1 * nrkj_1;
406 float nrkj = nrkj2 * nrkj_1;
407 float a = -ddphi * nrkj / iprm;
409 float b = ddphi * nrkj / iprn;
411 float p = iprod(r_ij, r_kj);
413 float q = iprod(r_kl, r_kj);
/* svec projects the end-atom forces onto the inner atoms j and k. */
415 float3 uvec = p * f_i;
416 float3 vvec = q * f_l;
417 float3 svec = uvec - vvec;
418 float3 f_j = f_i - svec;
419 float3 f_k = f_l + svec;
421 atomicAdd(&gm_f[i], f_i);
422 atomicAdd(&gm_f[j], -f_j);
423 atomicAdd(&gm_f[k], -f_k);
424 atomicAdd(&gm_f[l], f_l);
/* Shadow t3 with the shift index of the j-l image for the virial. */
429 int t3 = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[l], gm_xq[j], dx_jl);
431 atomicAdd(&sm_fShiftLoc[t1], f_i);
432 atomicAdd(&sm_fShiftLoc[CENTRAL], -f_j);
433 atomicAdd(&sm_fShiftLoc[t2], -f_k);
434 atomicAdd(&sm_fShiftLoc[t3], f_l);
/* Proper dihedral interaction for entry i: reads 5 ints (type + four
 * atom indices), computes phi via dih_angle_gpu(), evaluates the cosine
 * potential via dopdihs_gpu(), and applies forces via do_dih_fup_gpu().
 * NOTE(review): braces and the calcEner energy accumulation are missing
 * from this view. */
439 template<bool calcVir, bool calcEner>
440 __device__ void pdihs_gpu(const int i,
443 const t_iatom d_forceatoms[],
444 const t_iparams d_forceparams[],
445 const float4 gm_xq[],
447 float3 sm_fShiftLoc[],
448 const PbcAiuc pbcAiuc)
452 int type = d_forceatoms[5 * i];
453 int ai = d_forceatoms[5 * i + 1];
454 int aj = d_forceatoms[5 * i + 2];
455 int ak = d_forceatoms[5 * i + 3];
456 int al = d_forceatoms[5 * i + 4];
466 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
467 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
471 dopdihs_gpu(d_forceparams[type].pdihs.cpA, d_forceparams[type].pdihs.phiA,
472 d_forceparams[type].pdihs.mult, phi, &vpd, &ddphi);
479 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
480 pbcAiuc, gm_xq, t1, t2, t3);
/* Ryckaert-Bellemans dihedral for entry i: a 5th-order polynomial in
 * cos(psi) (polymer convention). Loads the NR_RBDIHS coefficients,
 * accumulates the derivative ddphi term-by-term, converts it via
 * ddphi *= -sin(phi) (chain rule d cos/d phi), and applies forces via
 * do_dih_fup_gpu().
 * NOTE(review): the cosfac power updates and energy accumulation lines
 * between the ddphi terms are missing from this view. */
484 template<bool calcVir, bool calcEner>
485 __device__ void rbdihs_gpu(const int i,
488 const t_iatom d_forceatoms[],
489 const t_iparams d_forceparams[],
490 const float4 gm_xq[],
492 float3 sm_fShiftLoc[],
493 const PbcAiuc pbcAiuc)
495 constexpr float c0 = 0.0f, c1 = 1.0f, c2 = 2.0f, c3 = 3.0f, c4 = 4.0f, c5 = 5.0f;
499 int type = d_forceatoms[5 * i];
500 int ai = d_forceatoms[5 * i + 1];
501 int aj = d_forceatoms[5 * i + 2];
502 int ak = d_forceatoms[5 * i + 3];
503 int al = d_forceatoms[5 * i + 4];
513 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
514 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
516 /* Change to polymer convention */
525 float cos_phi = cosf(phi);
526 /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
527 float sin_phi = sinf(phi);
529 float parm[NR_RBDIHS];
530 for (int j = 0; j < NR_RBDIHS; j++)
532 parm[j] = d_forceparams[type].rbdihs.rbcA[j];
534 /* Calculate cosine powers */
535 /* Calculate the energy */
536 /* Calculate the derivative */
/* d/dcos of sum_k parm[k]*cos^k: each term contributes k*rbp*cos^(k-1). */
542 ddphi += rbp * cosfac;
549 ddphi += c2 * rbp * cosfac;
556 ddphi += c3 * rbp * cosfac;
563 ddphi += c4 * rbp * cosfac;
570 ddphi += c5 * rbp * cosfac;
/* Chain rule: dV/dphi = (dV/dcos) * (-sin(phi)). */
577 ddphi = -ddphi * sin_phi;
579 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
580 pbcAiuc, gm_xq, t1, t2, t3);
/* Wraps an angle difference *dp into the half-open interval [-pi, pi)
 * by adding/subtracting one period of 2*pi. Assumes the input is within
 * one period of the target interval (a single correction suffices). */
588 __device__ __forceinline__ static void make_dp_periodic_gpu(float* dp)
590 /* dp cannot be outside (-pi,pi) */
591 if (*dp >= CUDART_PI_F)
593 *dp -= 2.0f * CUDART_PI_F;
595 else if (*dp < -CUDART_PI_F)
597 *dp += 2.0f * CUDART_PI_F;
/* Improper (harmonic) dihedral for entry i. Computes dp = phi - phi0,
 * wraps it into (-pi, pi) via make_dp_periodic_gpu() to avoid the
 * periodicity discontinuity, then applies a plain harmonic force
 * (ddphi = -kA*dp) via do_dih_fup_gpu() and accumulates the energy
 * V = 0.5*kA*dp^2 (written as -0.5*ddphi*dp).
 * NOTE(review): braces and the calcEner guard around the vtot_loc
 * update are missing from this view. */
601 template<bool calcVir, bool calcEner>
602 __device__ void idihs_gpu(const int i,
605 const t_iatom d_forceatoms[],
606 const t_iparams d_forceparams[],
607 const float4 gm_xq[],
609 float3 sm_fShiftLoc[],
610 const PbcAiuc pbcAiuc)
614 int type = d_forceatoms[5 * i];
615 int ai = d_forceatoms[5 * i + 1];
616 int aj = d_forceatoms[5 * i + 2];
617 int ak = d_forceatoms[5 * i + 3];
618 int al = d_forceatoms[5 * i + 4];
628 float phi = dih_angle_gpu<calcVir>(gm_xq[ai], gm_xq[aj], gm_xq[ak], gm_xq[al], pbcAiuc,
629 &r_ij, &r_kj, &r_kl, &m, &n, &t1, &t2, &t3);
631 /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
632 * force changes if we just apply a normal harmonic.
633 * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
634 * This means we will never have the periodicity problem, unless
635 * the dihedral is Pi away from phiO, which is very unlikely due to
638 float kA = d_forceparams[type].harmonic.krA;
639 float pA = d_forceparams[type].harmonic.rA;
641 float phi0 = pA * CUDA_DEG2RAD_F;
643 float dp = phi - phi0;
645 make_dp_periodic_gpu(&dp);
647 float ddphi = -kA * dp;
/* Note: -ddphi is passed, matching the sign convention of the energy
 * accumulation below. */
649 do_dih_fup_gpu<calcVir>(ai, aj, ak, al, -ddphi, r_ij, r_kj, r_kl, m, n, gm_f, sm_fShiftLoc,
650 pbcAiuc, gm_xq, t1, t2, t3);
654 *vtot_loc += -0.5f * ddphi * dp;
/* 1-4 pair (LJ-14 + scaled Coulomb) interaction for entry i.
 * Reads (type, ai, aj), charges come from the w component of gm_xq,
 * LJ parameters from iparams[type].lj14. Computes F*r for both terms,
 * converts to a Cartesian force, and accumulates per-atom and shift
 * forces. Vdw and electrostatic energies are accumulated into separate
 * *vtotVdw_loc / *vtotElec_loc pointers.
 * NOTE(review): braces and the calcEner guard around the energy updates
 * are missing from this view. */
659 template<bool calcVir, bool calcEner>
660 __device__ void pairs_gpu(const int i,
662 const t_iatom d_forceatoms[],
663 const t_iparams iparams[],
664 const float4 gm_xq[],
666 float3 sm_fShiftLoc[],
667 const PbcAiuc pbcAiuc,
668 const float scale_factor,
674 // TODO this should be made into a separate type, the GPU and CPU sizes should be compared
675 int3 pairData = *(int3*)(d_forceatoms + 3 * i);
676 int type = pairData.x;
/* Charge product: the .w component of xq holds the atomic charge. */
680 float qq = gm_xq[ai].w * gm_xq[aj].w;
681 float c6 = iparams[type].lj14.c6A;
682 float c12 = iparams[type].lj14.c12A;
684 /* Do we need to apply full periodic boundary conditions? */
686 int fshift_index = pbcDxAiuc<calcVir>(pbcAiuc, gm_xq[ai], gm_xq[aj], dr);
688 float r2 = norm2(dr);
689 float rinv = rsqrtf(r2);
690 float rinv2 = rinv * rinv;
691 float rinv6 = rinv2 * rinv2 * rinv2;
693 /* Calculate the Coulomb force * r */
694 float velec = scale_factor * qq * rinv;
696 /* Calculate the LJ force * r and add it to the Coulomb part */
697 float fr = (12.0f * c12 * rinv6 - 6.0f * c6) * rinv6 + velec;
/* Divide F*r by r^2 to get the factor for the Cartesian force F/r * dr. */
699 float finvr = fr * rinv2;
700 float3 f = finvr * dr;
703 atomicAdd(&gm_f[ai], f);
704 atomicAdd(&gm_f[aj], -f);
705 if (calcVir && fshift_index != CENTRAL)
707 atomicAdd(&sm_fShiftLoc[fshift_index], f);
708 atomicAdd(&sm_fShiftLoc[CENTRAL], -f);
/* LJ energy (c12/r^12 - c6/r^6) and Coulomb energy accumulation. */
713 *vtotVdw_loc += (c12 * rinv6 - c6) * rinv6;
714 *vtotElec_loc += velec;
/* Fused bonded-forces kernel: each thread handles one interaction of
 * one function type. Thread ranges per fType are given by
 * kernelParams.fTypeRangeStart/End; a thread finds its type by scanning
 * the numFTypesOnGpu ranges and dispatches to the matching *_gpu device
 * function. Dynamic shared memory layout (carved from sm_dynamicShmem):
 * SHIFTS float3 shift-force accumulators, then (when energies are
 * needed) numWarps floats each for vTot, vTotVdw and vTotElec partials.
 * Requires a 1-D block (asserted) and, for the warp reductions,
 * blockDim.x a multiple of warpSize.
 * NOTE(review): braces, the fType/vtot_loc declarations, the switch on
 * fType, and the __syncthreads() before the shift-force writeback are
 * missing from this view. */
722 template<bool calcVir, bool calcEner>
723 __global__ void exec_kernel_gpu(BondedCudaKernelParameters kernelParams)
725 assert(blockDim.y == 1 && blockDim.z == 1);
726 const int tid = blockIdx.x * blockDim.x + threadIdx.x;
728 float vtotVdw_loc = 0;
729 float vtotElec_loc = 0;
/* Carve the shift-force slot out of the dynamic shared-memory buffer. */
731 extern __shared__ char sm_dynamicShmem[];
732 char* sm_nextSlotPtr = sm_dynamicShmem;
733 float3* sm_fShiftLoc = (float3*)sm_nextSlotPtr;
734 sm_nextSlotPtr += SHIFTS * sizeof(float3);
/* First SHIFTS threads zero the shared shift-force accumulators. */
738 if (threadIdx.x < SHIFTS)
740 sm_fShiftLoc[threadIdx.x] = make_float3(0.0f, 0.0f, 0.0f);
746 bool threadComputedPotential = false;
748 for (int j = 0; j < numFTypesOnGpu; j++)
750 if (tid >= kernelParams.fTypeRangeStart[j] && tid <= kernelParams.fTypeRangeEnd[j])
752 const int numBonds = kernelParams.numFTypeBonds[j];
753 int fTypeTid = tid - kernelParams.fTypeRangeStart[j];
754 const t_iatom* iatoms = kernelParams.d_iatoms[j];
755 fType = kernelParams.fTypesOnGpu[j];
758 threadComputedPotential = true;
/* Dispatch to the per-interaction-type device function. */
764 bonds_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
765 kernelParams.d_forceParams, kernelParams.d_xq,
766 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
769 angles_gpu<calcVir, calcEner>(
770 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
771 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
774 urey_bradley_gpu<calcVir, calcEner>(
775 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
776 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
780 pdihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
781 kernelParams.d_forceParams, kernelParams.d_xq,
782 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
785 rbdihs_gpu<calcVir, calcEner>(
786 fTypeTid, &vtot_loc, numBonds, iatoms, kernelParams.d_forceParams,
787 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
790 idihs_gpu<calcVir, calcEner>(fTypeTid, &vtot_loc, numBonds, iatoms,
791 kernelParams.d_forceParams, kernelParams.d_xq,
792 kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc);
795 pairs_gpu<calcVir, calcEner>(
796 fTypeTid, numBonds, iatoms, kernelParams.d_forceParams,
797 kernelParams.d_xq, kernelParams.d_f, sm_fShiftLoc, kernelParams.pbcAiuc,
798 kernelParams.electrostaticsScaleFactor, &vtotVdw_loc, &vtotElec_loc);
805 if (threadComputedPotential)
807 float* vtotVdw = kernelParams.d_vTot + F_LJ14;
808 float* vtotElec = kernelParams.d_vTot + F_COUL14;
810 // Stage atomic accumulation through shared memory:
811 // each warp will accumulate its own partial sum
812 // and then a single thread per warp will accumulate this to the global sum
814 int numWarps = blockDim.x / warpSize;
815 int warpId = threadIdx.x / warpSize;
817 // Shared memory variables to hold block-local partial sum
818 float* sm_vTot = (float*)sm_nextSlotPtr;
819 sm_nextSlotPtr += numWarps * sizeof(float);
820 float* sm_vTotVdw = (float*)sm_nextSlotPtr;
821 sm_nextSlotPtr += numWarps * sizeof(float);
822 float* sm_vTotElec = (float*)sm_nextSlotPtr;
824 if (threadIdx.x % warpSize == 0)
826 // One thread per warp initializes to zero
827 sm_vTot[warpId] = 0.;
828 sm_vTotVdw[warpId] = 0.;
829 sm_vTotElec[warpId] = 0.;
831 __syncwarp(); // All threads in warp must wait for initialization
833 // Perform warp-local accumulation in shared memory
834 atomicAdd(sm_vTot + warpId, vtot_loc);
835 atomicAdd(sm_vTotVdw + warpId, vtotVdw_loc);
836 atomicAdd(sm_vTotElec + warpId, vtotElec_loc);
838 __syncwarp(); // Ensure all threads in warp have completed
839 if (threadIdx.x % warpSize == 0)
840 { // One thread per warp accumulates partial sum into global sum
841 atomicAdd(kernelParams.d_vTot + fType, sm_vTot[warpId]);
842 atomicAdd(vtotVdw, sm_vTotVdw[warpId]);
843 atomicAdd(vtotElec, sm_vTotElec[warpId]);
846 /* Accumulate shift vectors from shared memory to global memory on the first SHIFTS threads of the block. */
850 if (threadIdx.x < SHIFTS)
852 atomicAdd(kernelParams.d_fShift[threadIdx.x], sm_fShiftLoc[threadIdx.x]);
858 /*-------------------------------- End CUDA kernels-----------------------------*/
/* Host-side launcher for exec_kernel_gpu<calcVir, calcEner>.
 * Asserts that bonded GPU work was scheduled, wraps the launch in
 * launch-GPU wallcycle counters, and returns early (without launching)
 * when the last fType range end is negative, i.e. there is no work.
 * NOTE(review): braces and the early-return body are missing from this
 * view of the file. */
861 template<bool calcVir, bool calcEner>
862 void GpuBonded::Impl::launchKernel()
864 GMX_ASSERT(haveInteractions_,
865 "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
867 wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
868 wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
/* End of the last populated thread range; negative means nothing to do. */
870 int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
872 if (fTypeRangeEnd < 0)
877 auto kernelPtr = exec_kernel_gpu<calcVir, calcEner>;
879 const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig_, &kernelParams_);
881 launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr,
882 "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
884 wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
885 wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
/* Public entry point: selects the kernel template instantiation from
 * the step workload. Energy implies virial (calcVir=calcEner=true);
 * virial-only uses <true, false>; otherwise forces only <false, false>.
 * NOTE(review): closing braces fall outside this view of the file. */
888 void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
890 if (stepWork.computeEnergy)
892 // When we need the energy, we also need the virial
893 impl_->launchKernel<true, true>();
895 else if (stepWork.computeVirial)
897 impl_->launchKernel<true, false>();
901 impl_->launchKernel<false, false>();