Move some verlet headers to mdlib
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_kernels / nbnxn_kernel_gpu_ref.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 #include "config.h"
36
37 #include <math.h>
38
39 #include "types/simple.h"
40 #include "typedefs.h"
41 #include "force.h"
42
43 #include "gromacs/math/utilities.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/pbcutil/ishift.h"
46
47 #include "nbnxn_kernel_gpu_ref.h"
48 #include "../nbnxn_consts.h"
49 #include "nbnxn_kernel_common.h"
50 #include "gromacs/mdlib/nb_verlet.h"
51
52 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
53 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
54
55 void
56 nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t     *nbl,
57                      const nbnxn_atomdata_t     *nbat,
58                      const interaction_const_t  *iconst,
59                      rvec                       *shift_vec,
60                      int                         force_flags,
61                      int                         clearF,
62                      real  *                     f,
63                      real  *                     fshift,
64                      real  *                     Vc,
65                      real  *                     Vvdw)
66 {
67     const nbnxn_sci_t  *nbln;
68     const real         *x;
69     gmx_bool            bEner;
70     gmx_bool            bEwald;
71     const real         *Ftab = NULL;
72     real                rcut2, rvdw2, rlist2;
73     int                 ntype;
74     real                facel;
75     int                 n;
76     int                 ish3;
77     int                 sci;
78     int                 cj4_ind0, cj4_ind1, cj4_ind;
79     int                 ci, cj;
80     int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
81     int                 n0;
82     int                 ggid;
83     real                shX, shY, shZ;
84     real                fscal, tx, ty, tz;
85     real                rinvsq;
86     real                iq;
87     real                qq, vcoul = 0, krsq, vctot;
88     int                 nti;
89     int                 tj;
90     real                rt, r, eps;
91     real                rinvsix;
92     real                Vvdwtot;
93     real                Vvdw_rep, Vvdw_disp;
94     real                ix, iy, iz, fix, fiy, fiz;
95     real                jx, jy, jz;
96     real                dx, dy, dz, rsq, rinv;
97     int                 int_bit;
98     real                fexcl;
99     real                c6, c12, cexp1, cexp2, br;
100     const real       *  shiftvec;
101     real       *        vdwparam;
102     int       *         shift;
103     int       *         type;
104     const nbnxn_excl_t *excl[2];
105
106     int                 npair_tot, npair;
107     int                 nhwu, nhwu_pruned;
108
109     if (nbl->na_ci != CL_SIZE)
110     {
111         gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl->na_ci, CL_SIZE);
112     }
113
114     if (clearF == enbvClearFYes)
115     {
116         clear_f(nbat, 0, f);
117     }
118
119     bEner = (force_flags & GMX_FORCE_ENERGY);
120
121     bEwald = EEL_FULL(iconst->eeltype);
122     if (bEwald)
123     {
124         Ftab = iconst->tabq_coul_F;
125     }
126
127     rcut2               = iconst->rcoulomb*iconst->rcoulomb;
128     rvdw2               = iconst->rvdw*iconst->rvdw;
129
130     rlist2              = nbl->rlist*nbl->rlist;
131
132     type                = nbat->type;
133     facel               = iconst->epsfac;
134     shiftvec            = shift_vec[0];
135     vdwparam            = nbat->nbfp;
136     ntype               = nbat->ntype;
137
138     x = nbat->x;
139
140     npair_tot   = 0;
141     nhwu        = 0;
142     nhwu_pruned = 0;
143
144     for (n = 0; n < nbl->nsci; n++)
145     {
146         nbln = &nbl->sci[n];
147
148         ish3             = 3*nbln->shift;
149         shX              = shiftvec[ish3];
150         shY              = shiftvec[ish3+1];
151         shZ              = shiftvec[ish3+2];
152         cj4_ind0         = nbln->cj4_ind_start;
153         cj4_ind1         = nbln->cj4_ind_end;
154         sci              = nbln->sci;
155         vctot            = 0;
156         Vvdwtot          = 0;
157
158         if (nbln->shift == CENTRAL &&
159             nbl->cj4[cj4_ind0].cj[0] == sci*NCL_PER_SUPERCL)
160         {
161             /* we have the diagonal:
162              * add the charge self interaction energy term
163              */
164             for (im = 0; im < NCL_PER_SUPERCL; im++)
165             {
166                 ci = sci*NCL_PER_SUPERCL + im;
167                 for (ic = 0; ic < CL_SIZE; ic++)
168                 {
169                     ia     = ci*CL_SIZE + ic;
170                     iq     = x[ia*nbat->xstride+3];
171                     vctot += iq*iq;
172                 }
173             }
174             if (!bEwald)
175             {
176                 vctot *= -facel*0.5*iconst->c_rf;
177             }
178             else
179             {
180                 /* last factor 1/sqrt(pi) */
181                 vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
182             }
183         }
184
185         for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
186         {
187             excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
188             excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
189
190             for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
191             {
192                 cj               = nbl->cj4[cj4_ind].cj[jm];
193
194                 for (im = 0; im < NCL_PER_SUPERCL; im++)
195                 {
196                     /* We're only using the first imask,
197                      * but here imei[1].imask is identical.
198                      */
199                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*NCL_PER_SUPERCL+im)) & 1)
200                     {
201                         gmx_bool within_rlist;
202
203                         ci               = sci*NCL_PER_SUPERCL + im;
204
205                         within_rlist     = FALSE;
206                         npair            = 0;
207                         for (ic = 0; ic < CL_SIZE; ic++)
208                         {
209                             ia               = ci*CL_SIZE + ic;
210
211                             is               = ia*nbat->xstride;
212                             ifs              = ia*nbat->fstride;
213                             ix               = shX + x[is+0];
214                             iy               = shY + x[is+1];
215                             iz               = shZ + x[is+2];
216                             iq               = facel*x[is+3];
217                             nti              = ntype*2*type[ia];
218
219                             fix              = 0;
220                             fiy              = 0;
221                             fiz              = 0;
222
223                             for (jc = 0; jc < CL_SIZE; jc++)
224                             {
225                                 ja               = cj*CL_SIZE + jc;
226
227                                 if (nbln->shift == CENTRAL &&
228                                     ci == cj && ja <= ia)
229                                 {
230                                     continue;
231                                 }
232
233                                 int_bit = ((excl[jc>>2]->pair[(jc & 3)*CL_SIZE+ic] >> (jm*NCL_PER_SUPERCL+im)) & 1);
234
235                                 js               = ja*nbat->xstride;
236                                 jfs              = ja*nbat->fstride;
237                                 jx               = x[js+0];
238                                 jy               = x[js+1];
239                                 jz               = x[js+2];
240                                 dx               = ix - jx;
241                                 dy               = iy - jy;
242                                 dz               = iz - jz;
243                                 rsq              = dx*dx + dy*dy + dz*dz;
244                                 if (rsq < rlist2)
245                                 {
246                                     within_rlist = TRUE;
247                                 }
248                                 if (rsq >= rcut2)
249                                 {
250                                     continue;
251                                 }
252
253                                 if (type[ia] != ntype-1 && type[ja] != ntype-1)
254                                 {
255                                     npair++;
256                                 }
257
258                                 /* avoid NaN for excluded pairs at r=0 */
259                                 rsq             += (1.0 - int_bit)*NBNXN_AVOID_SING_R2_INC;
260
261                                 rinv             = gmx_invsqrt(rsq);
262                                 rinvsq           = rinv*rinv;
263                                 fscal            = 0;
264
265                                 qq               = iq*x[js+3];
266                                 if (!bEwald)
267                                 {
268                                     /* Reaction-field */
269                                     krsq  = iconst->k_rf*rsq;
270                                     fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
271                                     if (bEner)
272                                     {
273                                         vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
274                                     }
275                                 }
276                                 else
277                                 {
278                                     r     = rsq*rinv;
279                                     rt    = r*iconst->tabq_scale;
280                                     n0    = rt;
281                                     eps   = rt - n0;
282
283                                     fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
284
285                                     fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
286
287                                     if (bEner)
288                                     {
289                                         vcoul = qq*((int_bit - gmx_erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
290                                     }
291                                 }
292
293                                 if (rsq < rvdw2)
294                                 {
295                                     tj        = nti + 2*type[ja];
296
297                                     /* Vanilla Lennard-Jones cutoff */
298                                     c6        = vdwparam[tj];
299                                     c12       = vdwparam[tj+1];
300
301                                     rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
302                                     Vvdw_disp = c6*rinvsix;
303                                     Vvdw_rep  = c12*rinvsix*rinvsix;
304                                     fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;
305
306                                     if (bEner)
307                                     {
308                                         vctot   += vcoul;
309
310                                         Vvdwtot +=
311                                             (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 -
312                                             (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6;
313                                     }
314                                 }
315
316                                 tx        = fscal*dx;
317                                 ty        = fscal*dy;
318                                 tz        = fscal*dz;
319                                 fix       = fix + tx;
320                                 fiy       = fiy + ty;
321                                 fiz       = fiz + tz;
322                                 f[jfs+0] -= tx;
323                                 f[jfs+1] -= ty;
324                                 f[jfs+2] -= tz;
325                             }
326
327                             f[ifs+0]        += fix;
328                             f[ifs+1]        += fiy;
329                             f[ifs+2]        += fiz;
330                             fshift[ish3]     = fshift[ish3]   + fix;
331                             fshift[ish3+1]   = fshift[ish3+1] + fiy;
332                             fshift[ish3+2]   = fshift[ish3+2] + fiz;
333
334                             /* Count in half work-units.
335                              * In CUDA one work-unit is 2 warps.
336                              */
337                             if ((ic+1) % (CL_SIZE/2) == 0)
338                             {
339                                 npair_tot += npair;
340
341                                 nhwu++;
342                                 if (within_rlist)
343                                 {
344                                     nhwu_pruned++;
345                                 }
346
347                                 within_rlist = FALSE;
348                                 npair        = 0;
349                             }
350                         }
351                     }
352                 }
353             }
354         }
355
356         if (bEner)
357         {
358             ggid             = 0;
359             Vc[ggid]         = Vc[ggid]   + vctot;
360             Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
361         }
362     }
363
364     if (debug)
365     {
366         fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
367                 nbl->na_ci, nbl->na_ci,
368                 nhwu, nhwu_pruned, nhwu_pruned/(double)nhwu);
369         fprintf(debug, "generic kernel pair interactions:            %d\n",
370                 nhwu*nbl->na_ci/2*nbl->na_ci);
371         fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
372                 nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
373         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n",
374                 npair_tot);
375         fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
376                 npair_tot/(double)(nhwu_pruned*nbl->na_ci/2*nbl->na_ci));
377     }
378 }