Apply re-formatting to C++ in src/ tree.
[alexxy/gromacs.git] / src / gromacs / nbnxm / kernels_reference / kernel_gpu_ref.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 #include "gmxpre.h"
37
38 #include "kernel_gpu_ref.h"
39
40 #include <cmath>
41
42 #include <algorithm>
43
44 #include "gromacs/math/functions.h"
45 #include "gromacs/math/utilities.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/mdtypes/interaction_const.h"
48 #include "gromacs/mdtypes/md_enums.h"
49 #include "gromacs/mdtypes/simulation_workload.h"
50 #include "gromacs/nbnxm/atomdata.h"
51 #include "gromacs/nbnxm/nbnxm.h"
52 #include "gromacs/nbnxm/pairlist.h"
53 #include "gromacs/pbcutil/ishift.h"
54 #include "gromacs/utility/fatalerror.h"
55
56 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
57
58 void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu*    nbl,
59                           const nbnxn_atomdata_t*    nbat,
60                           const interaction_const_t* iconst,
61                           rvec*                      shift_vec,
62                           const gmx::StepWorkload&   stepWork,
63                           int                        clearF,
64                           gmx::ArrayRef<real>        f,
65                           real*                      fshift,
66                           real*                      Vc,
67                           real*                      Vvdw)
68 {
69     gmx_bool            bEwald;
70     const real*         Ftab = nullptr;
71     real                rcut2, rvdw2, rlist2;
72     int                 ntype;
73     real                facel;
74     int                 ish3;
75     int                 sci;
76     int                 cj4_ind0, cj4_ind1, cj4_ind;
77     int                 ci, cj;
78     int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
79     int                 n0;
80     int                 ggid;
81     real                shX, shY, shZ;
82     real                fscal, tx, ty, tz;
83     real                rinvsq;
84     real                iq;
85     real                qq, vcoul = 0, krsq, vctot;
86     int                 nti;
87     int                 tj;
88     real                rt, r, eps;
89     real                rinvsix;
90     real                Vvdwtot;
91     real                Vvdw_rep, Vvdw_disp;
92     real                ix, iy, iz, fix, fiy, fiz;
93     real                jx, jy, jz;
94     real                dx, dy, dz, rsq, rinv;
95     real                int_bit;
96     real                fexcl;
97     real                c6, c12;
98     const nbnxn_excl_t* excl[2];
99
100     int npair_tot, npair;
101     int nhwu, nhwu_pruned;
102
103     if (nbl->na_ci != c_clSize)
104     {
105         gmx_fatal(FARGS,
106                   "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
107                   "be %d",
108                   nbl->na_ci,
109                   c_clSize);
110     }
111
112     if (clearF == enbvClearFYes)
113     {
114         for (real& elem : f)
115         {
116             elem = 0;
117         }
118     }
119
120     bEwald = EEL_FULL(iconst->eeltype);
121     if (bEwald)
122     {
123         Ftab = iconst->coulombEwaldTables->tableF.data();
124     }
125
126     rcut2 = iconst->rcoulomb * iconst->rcoulomb;
127     rvdw2 = iconst->rvdw * iconst->rvdw;
128
129     rlist2 = nbl->rlist * nbl->rlist;
130
131     const int* type      = nbat->params().type.data();
132     facel                = iconst->epsfac;
133     const real* shiftvec = shift_vec[0];
134     const real* vdwparam = nbat->params().nbfp.data();
135     ntype                = nbat->params().numTypes;
136
137     const real* x = nbat->x().data();
138
139     npair_tot   = 0;
140     nhwu        = 0;
141     nhwu_pruned = 0;
142
143     for (const nbnxn_sci_t& nbln : nbl->sci)
144     {
145         ish3     = 3 * nbln.shift;
146         shX      = shiftvec[ish3];
147         shY      = shiftvec[ish3 + 1];
148         shZ      = shiftvec[ish3 + 2];
149         cj4_ind0 = nbln.cj4_ind_start;
150         cj4_ind1 = nbln.cj4_ind_end;
151         sci      = nbln.sci;
152         vctot    = 0;
153         Vvdwtot  = 0;
154
155         if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
156         {
157             /* we have the diagonal:
158              * add the charge self interaction energy term
159              */
160             for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
161             {
162                 ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
163                 for (ic = 0; ic < c_clSize; ic++)
164                 {
165                     ia = ci * c_clSize + ic;
166                     iq = x[ia * nbat->xstride + 3];
167                     vctot += iq * iq;
168                 }
169             }
170             if (!bEwald)
171             {
172                 vctot *= -facel * 0.5 * iconst->c_rf;
173             }
174             else
175             {
176                 /* last factor 1/sqrt(pi) */
177                 vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
178             }
179         }
180
181         for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
182         {
183             excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
184             excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
185
186             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
187             {
188                 cj = nbl->cj4[cj4_ind].cj[jm];
189
190                 for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
191                 {
192                     /* We're only using the first imask,
193                      * but here imei[1].imask is identical.
194                      */
195                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
196                         & 1)
197                     {
198                         gmx_bool within_rlist;
199
200                         ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
201
202                         within_rlist = FALSE;
203                         npair        = 0;
204                         for (ic = 0; ic < c_clSize; ic++)
205                         {
206                             ia = ci * c_clSize + ic;
207
208                             is  = ia * nbat->xstride;
209                             ifs = ia * nbat->fstride;
210                             ix  = shX + x[is + 0];
211                             iy  = shY + x[is + 1];
212                             iz  = shZ + x[is + 2];
213                             iq  = facel * x[is + 3];
214                             nti = ntype * 2 * type[ia];
215
216                             fix = 0;
217                             fiy = 0;
218                             fiz = 0;
219
220                             for (jc = 0; jc < c_clSize; jc++)
221                             {
222                                 ja = cj * c_clSize + jc;
223
224                                 if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
225                                 {
226                                     continue;
227                                 }
228
229                                 constexpr int clusterPerSplit =
230                                         c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
231                                 int_bit = static_cast<real>(
232                                         (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
233                                          >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
234                                         & 1);
235
236                                 js  = ja * nbat->xstride;
237                                 jfs = ja * nbat->fstride;
238                                 jx  = x[js + 0];
239                                 jy  = x[js + 1];
240                                 jz  = x[js + 2];
241                                 dx  = ix - jx;
242                                 dy  = iy - jy;
243                                 dz  = iz - jz;
244                                 rsq = dx * dx + dy * dy + dz * dz;
245                                 if (rsq < rlist2)
246                                 {
247                                     within_rlist = TRUE;
248                                 }
249                                 if (rsq >= rcut2)
250                                 {
251                                     continue;
252                                 }
253
254                                 if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
255                                 {
256                                     npair++;
257                                 }
258
259                                 // Ensure distance do not become so small that r^-12 overflows
260                                 rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
261
262                                 rinv   = gmx::invsqrt(rsq);
263                                 rinvsq = rinv * rinv;
264
265                                 qq = iq * x[js + 3];
266                                 if (!bEwald)
267                                 {
268                                     /* Reaction-field */
269                                     krsq  = iconst->k_rf * rsq;
270                                     fscal = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
271                                     if (stepWork.computeEnergy)
272                                     {
273                                         vcoul = qq * (int_bit * rinv + krsq - iconst->c_rf);
274                                     }
275                                 }
276                                 else
277                                 {
278                                     r   = rsq * rinv;
279                                     rt  = r * iconst->coulombEwaldTables->scale;
280                                     n0  = static_cast<int>(rt);
281                                     eps = rt - static_cast<real>(n0);
282
283                                     fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
284
285                                     fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
286
287                                     if (stepWork.computeEnergy)
288                                     {
289                                         vcoul = qq
290                                                 * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
291                                                    - int_bit * iconst->sh_ewald);
292                                     }
293                                 }
294
295                                 if (rsq < rvdw2)
296                                 {
297                                     tj = nti + 2 * type[ja];
298
299                                     /* Vanilla Lennard-Jones cutoff */
300                                     c6  = vdwparam[tj];
301                                     c12 = vdwparam[tj + 1];
302
303                                     rinvsix   = int_bit * rinvsq * rinvsq * rinvsq;
304                                     Vvdw_disp = c6 * rinvsix;
305                                     Vvdw_rep  = c12 * rinvsix * rinvsix;
306                                     fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
307
308                                     if (stepWork.computeEnergy)
309                                     {
310                                         vctot += vcoul;
311
312                                         Vvdwtot +=
313                                                 (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
314                                                 - (Vvdw_disp
315                                                    + int_bit * c6 * iconst->dispersion_shift.cpot)
316                                                           / 6;
317                                     }
318                                 }
319
320                                 tx  = fscal * dx;
321                                 ty  = fscal * dy;
322                                 tz  = fscal * dz;
323                                 fix = fix + tx;
324                                 fiy = fiy + ty;
325                                 fiz = fiz + tz;
326                                 f[jfs + 0] -= tx;
327                                 f[jfs + 1] -= ty;
328                                 f[jfs + 2] -= tz;
329                             }
330
331                             f[ifs + 0] += fix;
332                             f[ifs + 1] += fiy;
333                             f[ifs + 2] += fiz;
334                             fshift[ish3]     = fshift[ish3] + fix;
335                             fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
336                             fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
337
338                             /* Count in half work-units.
339                              * In CUDA one work-unit is 2 warps.
340                              */
341                             if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
342                             {
343                                 npair_tot += npair;
344
345                                 nhwu++;
346                                 if (within_rlist)
347                                 {
348                                     nhwu_pruned++;
349                                 }
350
351                                 within_rlist = FALSE;
352                                 npair        = 0;
353                             }
354                         }
355                     }
356                 }
357             }
358         }
359
360         if (stepWork.computeEnergy)
361         {
362             ggid       = 0;
363             Vc[ggid]   = Vc[ggid] + vctot;
364             Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
365         }
366     }
367
368     if (debug)
369     {
370         fprintf(debug,
371                 "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
372                 nbl->na_ci,
373                 nbl->na_ci,
374                 nhwu,
375                 nhwu_pruned,
376                 nhwu_pruned / static_cast<double>(nhwu));
377         fprintf(debug, "generic kernel pair interactions:            %d\n", nhwu * nbl->na_ci / 2 * nbl->na_ci);
378         fprintf(debug,
379                 "generic kernel post-prune pair interactions: %d\n",
380                 nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
381         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n", npair_tot);
382         fprintf(debug,
383                 "ratio non-zero/post-prune pair interactions: %4.2f\n",
384                 npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));
385     }
386 }