2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 #include "nbnxn_kernel_gpu_ref.h"
43 #include "gromacs/legacyheaders/force.h"
44 #include "gromacs/legacyheaders/typedefs.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/utilities.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/mdlib/nb_verlet.h"
49 #include "gromacs/mdlib/nbnxn_consts.h"
50 #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h"
51 #include "gromacs/pbcutil/ishift.h"
53 #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
54 #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
57 nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t *nbl,
58 const nbnxn_atomdata_t *nbat,
59 const interaction_const_t *iconst,
68 const nbnxn_sci_t *nbln;
72 const real *Ftab = NULL;
73 real rcut2, rvdw2, rlist2;
79 int cj4_ind0, cj4_ind1, cj4_ind;
81 int ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
85 real fscal, tx, ty, tz;
88 real qq, vcoul = 0, krsq, vctot;
94 real Vvdw_rep, Vvdw_disp;
95 real ix, iy, iz, fix, fiy, fiz;
97 real dx, dy, dz, rsq, rinv;
100 real c6, c12, cexp1, cexp2, br;
101 const real * shiftvec;
105 const nbnxn_excl_t *excl[2];
107 int npair_tot, npair;
108 int nhwu, nhwu_pruned;
110 if (nbl->na_ci != CL_SIZE)
112 gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl->na_ci, CL_SIZE);
115 if (clearF == enbvClearFYes)
120 bEner = (force_flags & GMX_FORCE_ENERGY);
122 bEwald = EEL_FULL(iconst->eeltype);
125 Ftab = iconst->tabq_coul_F;
128 rcut2 = iconst->rcoulomb*iconst->rcoulomb;
129 rvdw2 = iconst->rvdw*iconst->rvdw;
131 rlist2 = nbl->rlist*nbl->rlist;
134 facel = iconst->epsfac;
135 shiftvec = shift_vec[0];
136 vdwparam = nbat->nbfp;
145 for (n = 0; n < nbl->nsci; n++)
149 ish3 = 3*nbln->shift;
150 shX = shiftvec[ish3];
151 shY = shiftvec[ish3+1];
152 shZ = shiftvec[ish3+2];
153 cj4_ind0 = nbln->cj4_ind_start;
154 cj4_ind1 = nbln->cj4_ind_end;
159 if (nbln->shift == CENTRAL &&
160 nbl->cj4[cj4_ind0].cj[0] == sci*NCL_PER_SUPERCL)
162 /* we have the diagonal:
163 * add the charge self interaction energy term
165 for (im = 0; im < NCL_PER_SUPERCL; im++)
167 ci = sci*NCL_PER_SUPERCL + im;
168 for (ic = 0; ic < CL_SIZE; ic++)
170 ia = ci*CL_SIZE + ic;
171 iq = x[ia*nbat->xstride+3];
177 vctot *= -facel*0.5*iconst->c_rf;
181 /* last factor 1/sqrt(pi) */
182 vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
186 for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
188 excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
189 excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
191 for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
193 cj = nbl->cj4[cj4_ind].cj[jm];
195 for (im = 0; im < NCL_PER_SUPERCL; im++)
197 /* We're only using the first imask,
198 * but here imei[1].imask is identical.
200 if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*NCL_PER_SUPERCL+im)) & 1)
202 gmx_bool within_rlist;
204 ci = sci*NCL_PER_SUPERCL + im;
206 within_rlist = FALSE;
208 for (ic = 0; ic < CL_SIZE; ic++)
210 ia = ci*CL_SIZE + ic;
212 is = ia*nbat->xstride;
213 ifs = ia*nbat->fstride;
218 nti = ntype*2*type[ia];
224 for (jc = 0; jc < CL_SIZE; jc++)
226 ja = cj*CL_SIZE + jc;
228 if (nbln->shift == CENTRAL &&
229 ci == cj && ja <= ia)
234 int_bit = ((excl[jc>>2]->pair[(jc & 3)*CL_SIZE+ic] >> (jm*NCL_PER_SUPERCL+im)) & 1);
236 js = ja*nbat->xstride;
237 jfs = ja*nbat->fstride;
244 rsq = dx*dx + dy*dy + dz*dz;
254 if (type[ia] != ntype-1 && type[ja] != ntype-1)
259 /* avoid NaN for excluded pairs at r=0 */
260 rsq += (1.0 - int_bit)*NBNXN_AVOID_SING_R2_INC;
262 rinv = gmx_invsqrt(rsq);
270 krsq = iconst->k_rf*rsq;
271 fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
274 vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
280 rt = r*iconst->tabq_scale;
284 fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
286 fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
290 vcoul = qq*((int_bit - gmx_erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
296 tj = nti + 2*type[ja];
298 /* Vanilla Lennard-Jones cutoff */
300 c12 = vdwparam[tj+1];
302 rinvsix = int_bit*rinvsq*rinvsq*rinvsq;
303 Vvdw_disp = c6*rinvsix;
304 Vvdw_rep = c12*rinvsix*rinvsix;
305 fscal += (Vvdw_rep - Vvdw_disp)*rinvsq;
312 (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 -
313 (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6;
331 fshift[ish3] = fshift[ish3] + fix;
332 fshift[ish3+1] = fshift[ish3+1] + fiy;
333 fshift[ish3+2] = fshift[ish3+2] + fiz;
335 /* Count in half work-units.
336 * In CUDA one work-unit is 2 warps.
338 if ((ic+1) % (CL_SIZE/2) == 0)
348 within_rlist = FALSE;
360 Vc[ggid] = Vc[ggid] + vctot;
361 Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
367 fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
368 nbl->na_ci, nbl->na_ci,
369 nhwu, nhwu_pruned, nhwu_pruned/(double)nhwu);
370 fprintf(debug, "generic kernel pair interactions: %d\n",
371 nhwu*nbl->na_ci/2*nbl->na_ci);
372 fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
373 nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
374 fprintf(debug, "generic kernel non-zero pair interactions: %d\n",
376 fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
377 npair_tot/(double)(nhwu_pruned*nbl->na_ci/2*nbl->na_ci));