1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROningen Mixture of Alchemy and Childrens' Stories
42 #include "types/simple.h"
47 #include "nbnxn_kernel_gpu_ref.h"
48 #include "../nbnxn_consts.h"
49 #include "nbnxn_kernel_common.h"
51 #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
52 #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
55 nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t *nbl,
56 const nbnxn_atomdata_t *nbat,
57 const interaction_const_t *iconst,
66 const nbnxn_sci_t *nbln;
70 const real *Ftab=NULL;
71 real rcut2,rvdw2,rlist2;
77 int cj4_ind0,cj4_ind1,cj4_ind;
79 int ic,jc,ia,ja,is,ifs,js,jfs,im,jm;
86 real qq,vcoul=0,krsq,vctot;
92 real Vvdw_rep,Vvdw_disp;
93 real ix,iy,iz,fix,fiy,fiz;
95 real dx,dy,dz,rsq,rinv;
98 real c6,c12,cexp1,cexp2,br;
99 const real * shiftvec;
103 const nbnxn_excl_t *excl[2];
106 int nhwu,nhwu_pruned;
108 if (nbl->na_ci != CL_SIZE)
110 gmx_fatal(FARGS,"The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d",nbl->na_ci,CL_SIZE);
113 if (clearF == enbvClearFYes)
118 bEner = (force_flags & GMX_FORCE_ENERGY);
120 bEwald = EEL_FULL(iconst->eeltype);
123 Ftab = iconst->tabq_coul_F;
126 rcut2 = iconst->rcoulomb*iconst->rcoulomb;
127 rvdw2 = iconst->rvdw*iconst->rvdw;
129 rlist2 = nbl->rlist*nbl->rlist;
132 facel = iconst->epsfac;
133 shiftvec = shift_vec[0];
134 vdwparam = nbat->nbfp;
143 for(n=0; n<nbl->nsci; n++)
147 ish3 = 3*nbln->shift;
148 shX = shiftvec[ish3];
149 shY = shiftvec[ish3+1];
150 shZ = shiftvec[ish3+2];
151 cj4_ind0 = nbln->cj4_ind_start;
152 cj4_ind1 = nbln->cj4_ind_end;
157 if (nbln->shift == CENTRAL &&
158 nbl->cj4[cj4_ind0].cj[0] == sci*NCL_PER_SUPERCL)
160 /* we have the diagonal:
161 * add the charge self interaction energy term
163 for(im=0; im<NCL_PER_SUPERCL; im++)
165 ci = sci*NCL_PER_SUPERCL + im;
166 for (ic=0; ic<CL_SIZE; ic++)
168 ia = ci*CL_SIZE + ic;
169 iq = x[ia*nbat->xstride+3];
175 vctot *= -facel*0.5*iconst->c_rf;
179 /* last factor 1/sqrt(pi) */
180 vctot *= -facel*iconst->ewaldcoeff*M_1_SQRTPI;
184 for(cj4_ind=cj4_ind0; (cj4_ind<cj4_ind1); cj4_ind++)
186 excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
187 excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
189 for(jm=0; jm<4; jm++)
191 cj = nbl->cj4[cj4_ind].cj[jm];
193 for(im=0; im<NCL_PER_SUPERCL; im++)
195 /* We're only using the first imask,
196 * but here imei[1].imask is identical.
198 if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*NCL_PER_SUPERCL+im)) & 1)
200 gmx_bool within_rlist;
202 ci = sci*NCL_PER_SUPERCL + im;
204 within_rlist = FALSE;
206 for(ic=0; ic<CL_SIZE; ic++)
208 ia = ci*CL_SIZE + ic;
210 is = ia*nbat->xstride;
211 ifs = ia*nbat->fstride;
216 nti = ntype*2*type[ia];
222 for(jc=0; jc<CL_SIZE; jc++)
224 ja = cj*CL_SIZE + jc;
226 if (nbln->shift == CENTRAL &&
227 ci == cj && ja <= ia)
232 int_bit = ((excl[jc>>2]->pair[(jc & 3)*CL_SIZE+ic] >> (jm*NCL_PER_SUPERCL+im)) & 1);
234 js = ja*nbat->xstride;
235 jfs = ja*nbat->fstride;
242 rsq = dx*dx + dy*dy + dz*dz;
252 if (type[ia] != ntype-1 && type[ja] != ntype-1)
257 /* avoid NaN for excluded pairs at r=0 */
258 rsq += (1.0 - int_bit)*NBNXN_AVOID_SING_R2_INC;
260 rinv = gmx_invsqrt(rsq);
268 krsq = iconst->k_rf*rsq;
269 fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
272 vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
278 rt = r*iconst->tabq_scale;
282 fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
284 fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
288 vcoul = qq*((int_bit - gmx_erf(iconst->ewaldcoeff*r))*rinv - int_bit*iconst->sh_ewald);
294 tj = nti + 2*type[ja];
296 /* Vanilla Lennard-Jones cutoff */
298 c12 = vdwparam[tj+1];
300 rinvsix = int_bit*rinvsq*rinvsq*rinvsq;
301 Vvdw_disp = c6*rinvsix;
302 Vvdw_rep = c12*rinvsix*rinvsix;
303 fscal += (Vvdw_rep - Vvdw_disp)*rinvsq;
310 (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 -
311 (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6;
329 fshift[ish3] = fshift[ish3] + fix;
330 fshift[ish3+1] = fshift[ish3+1] + fiy;
331 fshift[ish3+2] = fshift[ish3+2] + fiz;
333 /* Count in half work-units.
334 * In CUDA one work-unit is 2 warps.
336 if ((ic+1) % (CL_SIZE/2) == 0)
346 within_rlist = FALSE;
358 Vc[ggid] = Vc[ggid] + vctot;
359 Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
365 fprintf(debug,"number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
366 nbl->na_ci,nbl->na_ci,
367 nhwu,nhwu_pruned,nhwu_pruned/(double)nhwu);
368 fprintf(debug,"generic kernel pair interactions: %d\n",
369 nhwu*nbl->na_ci/2*nbl->na_ci);
370 fprintf(debug,"generic kernel post-prune pair interactions: %d\n",
371 nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
372 fprintf(debug,"generic kernel non-zero pair interactions: %d\n",
374 fprintf(debug,"ratio non-zero/post-prune pair interactions: %4.2f\n",
375 npair_tot/(double)(nhwu_pruned*nbl->na_ci/2*nbl->na_ci));