2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "kernel_gpu_ref.h"
44 #include "gromacs/math/functions.h"
45 #include "gromacs/math/utilities.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/mdtypes/interaction_const.h"
48 #include "gromacs/mdtypes/md_enums.h"
49 #include "gromacs/mdtypes/simulation_workload.h"
50 #include "gromacs/nbnxm/atomdata.h"
51 #include "gromacs/nbnxm/nbnxm.h"
52 #include "gromacs/nbnxm/pairlist.h"
53 #include "gromacs/pbcutil/ishift.h"
54 #include "gromacs/utility/fatalerror.h"
56 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
58 void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl,
59 const nbnxn_atomdata_t* nbat,
60 const interaction_const_t* iconst,
62 const gmx::StepWorkload& stepWork,
64 gmx::ArrayRef<real> f,
71 const nbnxn_excl_t* excl[2];
73 if (nbl->na_ci != c_clSize)
76 "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
82 if (clearF == enbvClearFYes)
90 const bool bEwald = EEL_FULL(iconst->eeltype);
92 const real rcut2 = iconst->rcoulomb * iconst->rcoulomb;
93 const real rvdw2 = iconst->rvdw * iconst->rvdw;
95 const real rlist2 = nbl->rlist * nbl->rlist;
97 const int* type = nbat->params().type.data();
98 const real facel = iconst->epsfac;
99 const real* shiftvec = shift_vec[0];
100 const real* vdwparam = nbat->params().nbfp.data();
101 const int ntype = nbat->params().numTypes;
103 const real* x = nbat->x().data();
109 for (const nbnxn_sci_t& nbln : nbl->sci)
111 const int ish3 = 3 * nbln.shift;
112 const real shX = shiftvec[ish3];
113 const real shY = shiftvec[ish3 + 1];
114 const real shZ = shiftvec[ish3 + 2];
115 const int cj4_ind0 = nbln.cj4_ind_start;
116 const int cj4_ind1 = nbln.cj4_ind_end;
117 const int sci = nbln.sci;
121 if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
123 /* we have the diagonal:
124 * add the charge self interaction energy term
126 for (int im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
128 const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
129 for (int ic = 0; ic < c_clSize; ic++)
131 const int ia = ci * c_clSize + ic;
132 real iq = x[ia * nbat->xstride + 3];
138 vctot *= -facel * 0.5 * iconst->reactionFieldShift;
142 /* last factor 1/sqrt(pi) */
143 vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
147 for (int cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
149 excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
150 excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
152 for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
154 const int cj = nbl->cj4[cj4_ind].cj[jm];
156 for (int im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
158 /* We're only using the first imask,
159 * but here imei[1].imask is identical.
161 if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
164 const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
166 bool within_rlist = false;
168 for (int ic = 0; ic < c_clSize; ic++)
170 const int ia = ci * c_clSize + ic;
172 const int is = ia * nbat->xstride;
173 const int ifs = ia * nbat->fstride;
174 const real ix = shX + x[is + 0];
175 const real iy = shY + x[is + 1];
176 const real iz = shZ + x[is + 2];
177 const real iq = facel * x[is + 3];
178 const int nti = ntype * 2 * type[ia];
184 for (int jc = 0; jc < c_clSize; jc++)
186 const int ja = cj * c_clSize + jc;
188 if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
193 constexpr int clusterPerSplit =
194 c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
195 const real int_bit = static_cast<real>(
196 (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
197 >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
200 int js = ja * nbat->xstride;
201 int jfs = ja * nbat->fstride;
202 const real jx = x[js + 0];
203 const real jy = x[js + 1];
204 const real jz = x[js + 2];
205 const real dx = ix - jx;
206 const real dy = iy - jy;
207 const real dz = iz - jz;
208 real rsq = dx * dx + dy * dy + dz * dz;
218 if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
223 // Ensure distance do not become so small that r^-12 overflows
224 rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
226 const real rinv = gmx::invsqrt(rsq);
227 const real rinvsq = rinv * rinv;
229 const real qq = iq * x[js + 3];
233 const real krsq = iconst->reactionFieldCoefficient * rsq;
234 fscal = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
235 if (stepWork.computeEnergy)
237 vcoul = qq * (int_bit * rinv + krsq - iconst->reactionFieldShift);
242 const real r = rsq * rinv;
243 const real rt = r * iconst->coulombEwaldTables->scale;
244 const int n0 = static_cast<int>(rt);
245 const real eps = rt - static_cast<real>(n0);
246 const real* Ftab = iconst->coulombEwaldTables->tableF.data();
248 const real fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
250 fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
252 if (stepWork.computeEnergy)
255 * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
256 - int_bit * iconst->sh_ewald);
262 const int tj = nti + 2 * type[ja];
264 /* Vanilla Lennard-Jones cutoff */
265 const real c6 = vdwparam[tj];
266 const real c12 = vdwparam[tj + 1];
268 const real rinvsix = int_bit * rinvsq * rinvsq * rinvsq;
269 const real Vvdw_disp = c6 * rinvsix;
270 const real Vvdw_rep = c12 * rinvsix * rinvsix;
271 fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
273 if (stepWork.computeEnergy)
278 (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
280 + int_bit * c6 * iconst->dispersion_shift.cpot)
285 real tx = fscal * dx;
286 real ty = fscal * dy;
287 real tz = fscal * dz;
299 fshift[ish3] = fshift[ish3] + fix;
300 fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
301 fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
303 /* Count in half work-units.
304 * In CUDA one work-unit is 2 warps.
306 if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
316 within_rlist = false;
325 if (stepWork.computeEnergy)
328 Vc[ggid] = Vc[ggid] + vctot;
329 Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
336 "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
341 nhwu_pruned / static_cast<double>(nhwu));
342 fprintf(debug, "generic kernel pair interactions: %d\n", nhwu * nbl->na_ci / 2 * nbl->na_ci);
344 "generic kernel post-prune pair interactions: %d\n",
345 nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
346 fprintf(debug, "generic kernel non-zero pair interactions: %d\n", npair_tot);
348 "ratio non-zero/post-prune pair interactions: %4.2f\n",
349 npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));