16a439711295b359e5276ee0508ad8462e0b13a4
[alexxy/gromacs.git] / src / gromacs / nbnxm / kernels_reference / kernel_gpu_ref.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 #include "gmxpre.h"
36
37 #include "kernel_gpu_ref.h"
38
39 #include <cmath>
40
41 #include <algorithm>
42
43 #include "gromacs/math/functions.h"
44 #include "gromacs/math/utilities.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/mdlib/ppforceworkload.h"
47 #include "gromacs/mdtypes/md_enums.h"
48 #include "gromacs/nbnxm/atomdata.h"
49 #include "gromacs/nbnxm/nbnxm.h"
50 #include "gromacs/nbnxm/pairlist.h"
51 #include "gromacs/pbcutil/ishift.h"
52 #include "gromacs/utility/fatalerror.h"
53
54 static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
55 static const int c_clSize          = c_nbnxnGpuClusterSize;
56
57 void
58 nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
59                      const nbnxn_atomdata_t     *nbat,
60                      const interaction_const_t  *iconst,
61                      rvec                       *shift_vec,
62                      const gmx::ForceFlags      &forceFlags,
63                      int                         clearF,
64                      gmx::ArrayRef<real>         f,
65                      real  *                     fshift,
66                      real  *                     Vc,
67                      real  *                     Vvdw)
68 {
69     gmx_bool            bEwald;
70     const real         *Ftab = nullptr;
71     real                rcut2, rvdw2, rlist2;
72     int                 ntype;
73     real                facel;
74     int                 ish3;
75     int                 sci;
76     int                 cj4_ind0, cj4_ind1, cj4_ind;
77     int                 ci, cj;
78     int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
79     int                 n0;
80     int                 ggid;
81     real                shX, shY, shZ;
82     real                fscal, tx, ty, tz;
83     real                rinvsq;
84     real                iq;
85     real                qq, vcoul = 0, krsq, vctot;
86     int                 nti;
87     int                 tj;
88     real                rt, r, eps;
89     real                rinvsix;
90     real                Vvdwtot;
91     real                Vvdw_rep, Vvdw_disp;
92     real                ix, iy, iz, fix, fiy, fiz;
93     real                jx, jy, jz;
94     real                dx, dy, dz, rsq, rinv;
95     real                int_bit;
96     real                fexcl;
97     real                c6, c12;
98     const nbnxn_excl_t *excl[2];
99
100     int                 npair_tot, npair;
101     int                 nhwu, nhwu_pruned;
102
103     if (nbl->na_ci != c_clSize)
104     {
105         gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl->na_ci, c_clSize);
106     }
107
108     if (clearF == enbvClearFYes)
109     {
110         for (real &elem : f)
111         {
112             elem = 0;
113         }
114     }
115
116     bEwald = EEL_FULL(iconst->eeltype);
117     if (bEwald)
118     {
119         Ftab = iconst->coulombEwaldTables->tableF.data();
120     }
121
122     rcut2                = iconst->rcoulomb*iconst->rcoulomb;
123     rvdw2                = iconst->rvdw*iconst->rvdw;
124
125     rlist2               = nbl->rlist*nbl->rlist;
126
127     const int  *type     = nbat->params().type.data();
128     facel                = iconst->epsfac;
129     const real *shiftvec = shift_vec[0];
130     const real *vdwparam = nbat->params().nbfp.data();
131     ntype                = nbat->params().numTypes;
132
133     const real *x        = nbat->x().data();
134
135     npair_tot   = 0;
136     nhwu        = 0;
137     nhwu_pruned = 0;
138
139     for (const nbnxn_sci_t &nbln : nbl->sci)
140     {
141         ish3             = 3*nbln.shift;
142         shX              = shiftvec[ish3];
143         shY              = shiftvec[ish3+1];
144         shZ              = shiftvec[ish3+2];
145         cj4_ind0         = nbln.cj4_ind_start;
146         cj4_ind1         = nbln.cj4_ind_end;
147         sci              = nbln.sci;
148         vctot            = 0;
149         Vvdwtot          = 0;
150
151         if (nbln.shift == CENTRAL &&
152             nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
153         {
154             /* we have the diagonal:
155              * add the charge self interaction energy term
156              */
157             for (im = 0; im < c_numClPerSupercl; im++)
158             {
159                 ci = sci*c_numClPerSupercl + im;
160                 for (ic = 0; ic < c_clSize; ic++)
161                 {
162                     ia     = ci*c_clSize + ic;
163                     iq     = x[ia*nbat->xstride+3];
164                     vctot += iq*iq;
165                 }
166             }
167             if (!bEwald)
168             {
169                 vctot *= -facel*0.5*iconst->c_rf;
170             }
171             else
172             {
173                 /* last factor 1/sqrt(pi) */
174                 vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
175             }
176         }
177
178         for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
179         {
180             excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
181             excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
182
183             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
184             {
185                 cj               = nbl->cj4[cj4_ind].cj[jm];
186
187                 for (im = 0; im < c_numClPerSupercl; im++)
188                 {
189                     /* We're only using the first imask,
190                      * but here imei[1].imask is identical.
191                      */
192                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*c_numClPerSupercl + im)) & 1)
193                     {
194                         gmx_bool within_rlist;
195
196                         ci               = sci*c_numClPerSupercl + im;
197
198                         within_rlist     = FALSE;
199                         npair            = 0;
200                         for (ic = 0; ic < c_clSize; ic++)
201                         {
202                             ia               = ci*c_clSize + ic;
203
204                             is               = ia*nbat->xstride;
205                             ifs              = ia*nbat->fstride;
206                             ix               = shX + x[is+0];
207                             iy               = shY + x[is+1];
208                             iz               = shZ + x[is+2];
209                             iq               = facel*x[is+3];
210                             nti              = ntype*2*type[ia];
211
212                             fix              = 0;
213                             fiy              = 0;
214                             fiz              = 0;
215
216                             for (jc = 0; jc < c_clSize; jc++)
217                             {
218                                 ja               = cj*c_clSize + jc;
219
220                                 if (nbln.shift == CENTRAL &&
221                                     ci == cj && ja <= ia)
222                                 {
223                                     continue;
224                                 }
225
226                                 constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
227                                 int_bit = static_cast<real>((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
228                                                              >> (jm*c_numClPerSupercl + im)) & 1);
229
230                                 js               = ja*nbat->xstride;
231                                 jfs              = ja*nbat->fstride;
232                                 jx               = x[js+0];
233                                 jy               = x[js+1];
234                                 jz               = x[js+2];
235                                 dx               = ix - jx;
236                                 dy               = iy - jy;
237                                 dz               = iz - jz;
238                                 rsq              = dx*dx + dy*dy + dz*dz;
239                                 if (rsq < rlist2)
240                                 {
241                                     within_rlist = TRUE;
242                                 }
243                                 if (rsq >= rcut2)
244                                 {
245                                     continue;
246                                 }
247
248                                 if (type[ia] != ntype-1 && type[ja] != ntype-1)
249                                 {
250                                     npair++;
251                                 }
252
253                                 // Ensure distance do not become so small that r^-12 overflows
254                                 rsq              = std::max(rsq, NBNXN_MIN_RSQ);
255
256                                 rinv             = gmx::invsqrt(rsq);
257                                 rinvsq           = rinv*rinv;
258
259                                 qq               = iq*x[js+3];
260                                 if (!bEwald)
261                                 {
262                                     /* Reaction-field */
263                                     krsq  = iconst->k_rf*rsq;
264                                     fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
265                                     if (forceFlags.computeEnergy)
266                                     {
267                                         vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
268                                     }
269                                 }
270                                 else
271                                 {
272                                     r     = rsq*rinv;
273                                     rt    = r*iconst->coulombEwaldTables->scale;
274                                     n0    = static_cast<int>(rt);
275                                     eps   = rt - static_cast<real>(n0);
276
277                                     fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
278
279                                     fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
280
281                                     if (forceFlags.computeEnergy)
282                                     {
283                                         vcoul = qq*((int_bit - std::erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
284                                     }
285                                 }
286
287                                 if (rsq < rvdw2)
288                                 {
289                                     tj        = nti + 2*type[ja];
290
291                                     /* Vanilla Lennard-Jones cutoff */
292                                     c6        = vdwparam[tj];
293                                     c12       = vdwparam[tj+1];
294
295                                     rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
296                                     Vvdw_disp = c6*rinvsix;
297                                     Vvdw_rep  = c12*rinvsix*rinvsix;
298                                     fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;
299
300                                     if (forceFlags.computeEnergy)
301                                     {
302                                         vctot   += vcoul;
303
304                                         Vvdwtot +=
305                                             (Vvdw_rep + int_bit*c12*iconst->repulsion_shift.cpot)/12 -
306                                             (Vvdw_disp + int_bit*c6*iconst->dispersion_shift.cpot)/6;
307                                     }
308                                 }
309
310                                 tx        = fscal*dx;
311                                 ty        = fscal*dy;
312                                 tz        = fscal*dz;
313                                 fix       = fix + tx;
314                                 fiy       = fiy + ty;
315                                 fiz       = fiz + tz;
316                                 f[jfs+0] -= tx;
317                                 f[jfs+1] -= ty;
318                                 f[jfs+2] -= tz;
319                             }
320
321                             f[ifs+0]        += fix;
322                             f[ifs+1]        += fiy;
323                             f[ifs+2]        += fiz;
324                             fshift[ish3]     = fshift[ish3]   + fix;
325                             fshift[ish3+1]   = fshift[ish3+1] + fiy;
326                             fshift[ish3+2]   = fshift[ish3+2] + fiz;
327
328                             /* Count in half work-units.
329                              * In CUDA one work-unit is 2 warps.
330                              */
331                             if ((ic+1) % (c_clSize/c_nbnxnGpuClusterpairSplit) == 0)
332                             {
333                                 npair_tot += npair;
334
335                                 nhwu++;
336                                 if (within_rlist)
337                                 {
338                                     nhwu_pruned++;
339                                 }
340
341                                 within_rlist = FALSE;
342                                 npair        = 0;
343                             }
344                         }
345                     }
346                 }
347             }
348         }
349
350         if (forceFlags.computeEnergy)
351         {
352             ggid             = 0;
353             Vc[ggid]         = Vc[ggid]   + vctot;
354             Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
355         }
356     }
357
358     if (debug)
359     {
360         fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
361                 nbl->na_ci, nbl->na_ci,
362                 nhwu, nhwu_pruned, nhwu_pruned/static_cast<double>(nhwu));
363         fprintf(debug, "generic kernel pair interactions:            %d\n",
364                 nhwu*nbl->na_ci/2*nbl->na_ci);
365         fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
366                 nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
367         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n",
368                 npair_tot);
369         fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
370                 npair_tot/static_cast<double>(nhwu_pruned*gmx::exactDiv(nbl->na_ci, 2)*nbl->na_ci));
371     }
372 }