cd0ee7f70ebc5ed8269f2c9ec2646de3d4a0a4e0
[alexxy/gromacs.git] / src / gromacs / nbnxm / kernels_reference / kernel_gpu_ref.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 #include "gmxpre.h"
37
38 #include "kernel_gpu_ref.h"
39
40 #include <cmath>
41
42 #include <algorithm>
43
44 #include "gromacs/math/functions.h"
45 #include "gromacs/math/utilities.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/mdtypes/interaction_const.h"
48 #include "gromacs/mdtypes/md_enums.h"
49 #include "gromacs/mdtypes/simulation_workload.h"
50 #include "gromacs/nbnxm/atomdata.h"
51 #include "gromacs/nbnxm/nbnxm.h"
52 #include "gromacs/nbnxm/pairlist.h"
53 #include "gromacs/pbcutil/ishift.h"
54 #include "gromacs/utility/fatalerror.h"
55
56 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
57
58 void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu*    nbl,
59                           const nbnxn_atomdata_t*    nbat,
60                           const interaction_const_t* iconst,
61                           rvec*                      shift_vec,
62                           const gmx::StepWorkload&   stepWork,
63                           int                        clearF,
64                           gmx::ArrayRef<real>        f,
65                           real*                      fshift,
66                           real*                      Vc,
67                           real*                      Vvdw)
68 {
69     real                fscal = NAN;
70     real                vcoul = 0;
71     const nbnxn_excl_t* excl[2];
72
73     if (nbl->na_ci != c_clSize)
74     {
75         gmx_fatal(FARGS,
76                   "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
77                   "be %d",
78                   nbl->na_ci,
79                   c_clSize);
80     }
81
82     if (clearF == enbvClearFYes)
83     {
84         for (real& elem : f)
85         {
86             elem = 0;
87         }
88     }
89
90     const bool bEwald = EEL_FULL(iconst->eeltype);
91
92     const real rcut2 = iconst->rcoulomb * iconst->rcoulomb;
93     const real rvdw2 = iconst->rvdw * iconst->rvdw;
94
95     const real rlist2 = nbl->rlist * nbl->rlist;
96
97     const int*  type     = nbat->params().type.data();
98     const real  facel    = iconst->epsfac;
99     const real* shiftvec = shift_vec[0];
100     const real* vdwparam = nbat->params().nbfp.data();
101     const int   ntype    = nbat->params().numTypes;
102
103     const real* x = nbat->x().data();
104
105     int npair_tot   = 0;
106     int nhwu        = 0;
107     int nhwu_pruned = 0;
108
109     for (const nbnxn_sci_t& nbln : nbl->sci)
110     {
111         const int  ish3     = 3 * nbln.shift;
112         const real shX      = shiftvec[ish3];
113         const real shY      = shiftvec[ish3 + 1];
114         const real shZ      = shiftvec[ish3 + 2];
115         const int  cj4_ind0 = nbln.cj4_ind_start;
116         const int  cj4_ind1 = nbln.cj4_ind_end;
117         const int  sci      = nbln.sci;
118         real       vctot    = 0;
119         real       Vvdwtot  = 0;
120
121         if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
122         {
123             /* we have the diagonal:
124              * add the charge self interaction energy term
125              */
126             for (int im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
127             {
128                 const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
129                 for (int ic = 0; ic < c_clSize; ic++)
130                 {
131                     const int ia = ci * c_clSize + ic;
132                     real      iq = x[ia * nbat->xstride + 3];
133                     vctot += iq * iq;
134                 }
135             }
136             if (!bEwald)
137             {
138                 vctot *= -facel * 0.5 * iconst->c_rf;
139             }
140             else
141             {
142                 /* last factor 1/sqrt(pi) */
143                 vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
144             }
145         }
146
147         for (int cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
148         {
149             excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
150             excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
151
152             for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
153             {
154                 const int cj = nbl->cj4[cj4_ind].cj[jm];
155
156                 for (int im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
157                 {
158                     /* We're only using the first imask,
159                      * but here imei[1].imask is identical.
160                      */
161                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
162                         & 1)
163                     {
164                         const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
165
166                         bool within_rlist = false;
167                         int  npair        = 0;
168                         for (int ic = 0; ic < c_clSize; ic++)
169                         {
170                             const int ia = ci * c_clSize + ic;
171
172                             const int  is  = ia * nbat->xstride;
173                             const int  ifs = ia * nbat->fstride;
174                             const real ix  = shX + x[is + 0];
175                             const real iy  = shY + x[is + 1];
176                             const real iz  = shZ + x[is + 2];
177                             const real iq  = facel * x[is + 3];
178                             const int  nti = ntype * 2 * type[ia];
179
180                             real fix = 0;
181                             real fiy = 0;
182                             real fiz = 0;
183
184                             for (int jc = 0; jc < c_clSize; jc++)
185                             {
186                                 const int ja = cj * c_clSize + jc;
187
188                                 if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
189                                 {
190                                     continue;
191                                 }
192
193                                 constexpr int clusterPerSplit =
194                                         c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
195                                 const real int_bit = static_cast<real>(
196                                         (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
197                                          >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
198                                         & 1);
199
200                                 int        js  = ja * nbat->xstride;
201                                 int        jfs = ja * nbat->fstride;
202                                 const real jx  = x[js + 0];
203                                 const real jy  = x[js + 1];
204                                 const real jz  = x[js + 2];
205                                 const real dx  = ix - jx;
206                                 const real dy  = iy - jy;
207                                 const real dz  = iz - jz;
208                                 real       rsq = dx * dx + dy * dy + dz * dz;
209                                 if (rsq < rlist2)
210                                 {
211                                     within_rlist = true;
212                                 }
213                                 if (rsq >= rcut2)
214                                 {
215                                     continue;
216                                 }
217
218                                 if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
219                                 {
220                                     npair++;
221                                 }
222
223                                 // Ensure distance do not become so small that r^-12 overflows
224                                 rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
225
226                                 const real rinv   = gmx::invsqrt(rsq);
227                                 const real rinvsq = rinv * rinv;
228
229                                 const real qq = iq * x[js + 3];
230                                 if (!bEwald)
231                                 {
232                                     /* Reaction-field */
233                                     const real krsq = iconst->k_rf * rsq;
234                                     fscal           = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
235                                     if (stepWork.computeEnergy)
236                                     {
237                                         vcoul = qq * (int_bit * rinv + krsq - iconst->c_rf);
238                                     }
239                                 }
240                                 else
241                                 {
242                                     const real  r    = rsq * rinv;
243                                     const real  rt   = r * iconst->coulombEwaldTables->scale;
244                                     const int   n0   = static_cast<int>(rt);
245                                     const real  eps  = rt - static_cast<real>(n0);
246                                     const real* Ftab = iconst->coulombEwaldTables->tableF.data();
247
248                                     const real fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
249
250                                     fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
251
252                                     if (stepWork.computeEnergy)
253                                     {
254                                         vcoul = qq
255                                                 * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
256                                                    - int_bit * iconst->sh_ewald);
257                                     }
258                                 }
259
260                                 if (rsq < rvdw2)
261                                 {
262                                     const int tj = nti + 2 * type[ja];
263
264                                     /* Vanilla Lennard-Jones cutoff */
265                                     const real c6  = vdwparam[tj];
266                                     const real c12 = vdwparam[tj + 1];
267
268                                     const real rinvsix   = int_bit * rinvsq * rinvsq * rinvsq;
269                                     const real Vvdw_disp = c6 * rinvsix;
270                                     const real Vvdw_rep  = c12 * rinvsix * rinvsix;
271                                     fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
272
273                                     if (stepWork.computeEnergy)
274                                     {
275                                         vctot += vcoul;
276
277                                         Vvdwtot +=
278                                                 (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
279                                                 - (Vvdw_disp
280                                                    + int_bit * c6 * iconst->dispersion_shift.cpot)
281                                                           / 6;
282                                     }
283                                 }
284
285                                 real tx = fscal * dx;
286                                 real ty = fscal * dy;
287                                 real tz = fscal * dz;
288                                 fix     = fix + tx;
289                                 fiy     = fiy + ty;
290                                 fiz     = fiz + tz;
291                                 f[jfs + 0] -= tx;
292                                 f[jfs + 1] -= ty;
293                                 f[jfs + 2] -= tz;
294                             }
295
296                             f[ifs + 0] += fix;
297                             f[ifs + 1] += fiy;
298                             f[ifs + 2] += fiz;
299                             fshift[ish3]     = fshift[ish3] + fix;
300                             fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
301                             fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
302
303                             /* Count in half work-units.
304                              * In CUDA one work-unit is 2 warps.
305                              */
306                             if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
307                             {
308                                 npair_tot += npair;
309
310                                 nhwu++;
311                                 if (within_rlist)
312                                 {
313                                     nhwu_pruned++;
314                                 }
315
316                                 within_rlist = false;
317                                 npair        = 0;
318                             }
319                         }
320                     }
321                 }
322             }
323         }
324
325         if (stepWork.computeEnergy)
326         {
327             const int ggid = 0;
328             Vc[ggid]       = Vc[ggid] + vctot;
329             Vvdw[ggid]     = Vvdw[ggid] + Vvdwtot;
330         }
331     }
332
333     if (debug)
334     {
335         fprintf(debug,
336                 "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
337                 nbl->na_ci,
338                 nbl->na_ci,
339                 nhwu,
340                 nhwu_pruned,
341                 nhwu_pruned / static_cast<double>(nhwu));
342         fprintf(debug, "generic kernel pair interactions:            %d\n", nhwu * nbl->na_ci / 2 * nbl->na_ci);
343         fprintf(debug,
344                 "generic kernel post-prune pair interactions: %d\n",
345                 nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
346         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n", npair_tot);
347         fprintf(debug,
348                 "ratio non-zero/post-prune pair interactions: %4.2f\n",
349                 npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));
350     }
351 }