static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
static const int c_clSize = c_nbnxnGpuClusterSize;
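/* Reference CPU implementation of the non-bonded kernel for the GPU pairlist
 * layout: i-superclusters of c_numClPerSupercl clusters, each of c_clSize atoms,
 * are paired against j-clusters stored in groups of four (cj4). The kernel
 * accumulates Coulomb (reaction-field or Ewald) and Lennard-Jones forces into
 * f and fshift and, when stepWork requests it, energies into Vc and Vvdw.
 */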
-void
-nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu *nbl,
- const nbnxn_atomdata_t *nbat,
- const interaction_const_t *iconst,
- rvec *shift_vec,
- const gmx::StepWorkload &stepWork,
- int clearF,
- gmx::ArrayRef<real> f,
- real * fshift,
- real * Vc,
- real * Vvdw)
+void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu* nbl,
+ const nbnxn_atomdata_t* nbat,
+ const interaction_const_t* iconst,
+ rvec* shift_vec,
+ const gmx::StepWorkload& stepWork,
+ int clearF,
+ gmx::ArrayRef<real> f,
+ real* fshift,
+ real* Vc,
+ real* Vvdw)
{
gmx_bool bEwald;
- const real *Ftab = nullptr;
+ const real* Ftab = nullptr;
real rcut2, rvdw2, rlist2;
int ntype;
real facel;
real int_bit;
real fexcl;
real c6, c12;
- const nbnxn_excl_t *excl[2];
+ const nbnxn_excl_t* excl[2];
- int npair_tot, npair;
- int nhwu, nhwu_pruned;
+ int npair_tot, npair;
+ int nhwu, nhwu_pruned;
if (nbl->na_ci != c_clSize)
{
- gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl->na_ci, c_clSize);
+ gmx_fatal(FARGS,
+ "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
+ "be %d",
+ nbl->na_ci, c_clSize);
}
if (clearF == enbvClearFYes)
{
- for (real &elem : f)
+ for (real& elem : f)
{
elem = 0;
}
}

bEwald = EEL_FULL(iconst->eeltype);
if (bEwald)
{
Ftab = iconst->coulombEwaldTables->tableF.data();
}
- rcut2 = iconst->rcoulomb*iconst->rcoulomb;
- rvdw2 = iconst->rvdw*iconst->rvdw;
+ rcut2 = iconst->rcoulomb * iconst->rcoulomb;
+ rvdw2 = iconst->rvdw * iconst->rvdw;
- rlist2 = nbl->rlist*nbl->rlist;
+ rlist2 = nbl->rlist * nbl->rlist;
- const int *type = nbat->params().type.data();
+ const int* type = nbat->params().type.data();
facel = iconst->epsfac;
- const real *shiftvec = shift_vec[0];
- const real *vdwparam = nbat->params().nbfp.data();
+ const real* shiftvec = shift_vec[0];
+ const real* vdwparam = nbat->params().nbfp.data();
ntype = nbat->params().numTypes;
- const real *x = nbat->x().data();
+ const real* x = nbat->x().data();
npair_tot = 0;
nhwu = 0;
nhwu_pruned = 0;
- for (const nbnxn_sci_t &nbln : nbl->sci)
+ for (const nbnxn_sci_t& nbln : nbl->sci)
{
- ish3 = 3*nbln.shift;
- shX = shiftvec[ish3];
- shY = shiftvec[ish3+1];
- shZ = shiftvec[ish3+2];
- cj4_ind0 = nbln.cj4_ind_start;
- cj4_ind1 = nbln.cj4_ind_end;
- sci = nbln.sci;
- vctot = 0;
- Vvdwtot = 0;
-
- if (nbln.shift == CENTRAL &&
- nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
+ ish3 = 3 * nbln.shift;
+ shX = shiftvec[ish3];
+ shY = shiftvec[ish3 + 1];
+ shZ = shiftvec[ish3 + 2];
+ cj4_ind0 = nbln.cj4_ind_start;
+ cj4_ind1 = nbln.cj4_ind_end;
+ sci = nbln.sci;
+ vctot = 0;
+ Vvdwtot = 0;
+
+ if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_numClPerSupercl)
{
/* we have the diagonal:
* add the charge self interaction energy term
*/
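/* For Ewald this is -facel*ewaldcoeff_q/sqrt(pi) * sum(q_i^2), for
 * reaction-field it is -facel*0.5*c_rf * sum(q_i^2); the loops below first
 * accumulate sum(q_i^2) in vctot and then apply the prefactor.
 */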
for (im = 0; im < c_numClPerSupercl; im++)
{
- ci = sci*c_numClPerSupercl + im;
+ ci = sci * c_numClPerSupercl + im;
for (ic = 0; ic < c_clSize; ic++)
{
- ia = ci*c_clSize + ic;
- iq = x[ia*nbat->xstride+3];
- vctot += iq*iq;
+ ia = ci * c_clSize + ic;
+ iq = x[ia * nbat->xstride + 3];
+ vctot += iq * iq;
}
}
if (!bEwald)
{
- vctot *= -facel*0.5*iconst->c_rf;
+ vctot *= -facel * 0.5 * iconst->c_rf;
}
else
{
/* last factor 1/sqrt(pi) */
- vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
+ vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
}
}
for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
{
- excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
- excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
+ excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
+ excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
- cj = nbl->cj4[cj4_ind].cj[jm];
+ cj = nbl->cj4[cj4_ind].cj[jm];
for (im = 0; im < c_numClPerSupercl; im++)
{
/* We're only using the first imask,
* but here imei[1].imask is identical.
*/
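/* Bit (jm*c_numClPerSupercl + im) of imask is set when i-cluster im of
 * this supercluster interacts with j-cluster jm of the current group.
 */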
- if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*c_numClPerSupercl + im)) & 1)
+ if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_numClPerSupercl + im)) & 1)
{
gmx_bool within_rlist;
- ci = sci*c_numClPerSupercl + im;
+ ci = sci * c_numClPerSupercl + im;
- within_rlist = FALSE;
- npair = 0;
+ within_rlist = FALSE;
+ npair = 0;
for (ic = 0; ic < c_clSize; ic++)
{
- ia = ci*c_clSize + ic;
+ ia = ci * c_clSize + ic;
- is = ia*nbat->xstride;
- ifs = ia*nbat->fstride;
- ix = shX + x[is+0];
- iy = shY + x[is+1];
- iz = shZ + x[is+2];
- iq = facel*x[is+3];
- nti = ntype*2*type[ia];
+ is = ia * nbat->xstride;
+ ifs = ia * nbat->fstride;
+ ix = shX + x[is + 0];
+ iy = shY + x[is + 1];
+ iz = shZ + x[is + 2];
+ iq = facel * x[is + 3];
+ nti = ntype * 2 * type[ia];
- fix = 0;
- fiy = 0;
- fiz = 0;
+ fix = 0;
+ fiy = 0;
+ fiz = 0;
for (jc = 0; jc < c_clSize; jc++)
{
- ja = cj*c_clSize + jc;
+ ja = cj * c_clSize + jc;
- if (nbln.shift == CENTRAL &&
- ci == cj && ja <= ia)
+ if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
{
continue;
}
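/* Extract the per-pair exclusion bit: exclusion masks are stored per
 * cluster-pair split, so jc selects the excl entry and the row within it,
 * and the (jm,im) bit yields int_bit, 0 for an excluded pair and 1 otherwise.
 */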
- constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
- int_bit = static_cast<real>((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
- >> (jm*c_numClPerSupercl + im)) & 1);
-
- js = ja*nbat->xstride;
- jfs = ja*nbat->fstride;
- jx = x[js+0];
- jy = x[js+1];
- jz = x[js+2];
- dx = ix - jx;
- dy = iy - jy;
- dz = iz - jz;
- rsq = dx*dx + dy*dy + dz*dz;
+ constexpr int clusterPerSplit =
+ c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
+ int_bit = static_cast<real>(
+ (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
+ >> (jm * c_numClPerSupercl + im))
+ & 1);
+
+ js = ja * nbat->xstride;
+ jfs = ja * nbat->fstride;
+ jx = x[js + 0];
+ jy = x[js + 1];
+ jz = x[js + 2];
+ dx = ix - jx;
+ dy = iy - jy;
+ dz = iz - jz;
+ rsq = dx * dx + dy * dy + dz * dz;
/* prune only pairs within rlist */
if (rsq < rlist2)
{
within_rlist = TRUE;
}

if (rsq >= rcut2)
{
continue;
}
- if (type[ia] != ntype-1 && type[ja] != ntype-1)
+ if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
{
npair++;
}
// Ensure the distance does not become so small that r^-12 overflows
- rsq = std::max(rsq, NBNXN_MIN_RSQ);
+ rsq = std::max(rsq, NBNXN_MIN_RSQ);
- rinv = gmx::invsqrt(rsq);
- rinvsq = rinv*rinv;
+ rinv = gmx::invsqrt(rsq);
+ rinvsq = rinv * rinv;
- qq = iq*x[js+3];
+ qq = iq * x[js + 3];
if (!bEwald)
{
/* Reaction-field */
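/* F/r = qq*(1/r^3 - 2*k_rf) and V = qq*(1/r + k_rf*r^2 - c_rf);
 * int_bit zeroes the 1/r term for excluded pairs while keeping the
 * reaction-field part they still contribute.
 */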
- krsq = iconst->k_rf*rsq;
- fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
+ krsq = iconst->k_rf * rsq;
+ fscal = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
if (stepWork.computeEnergy)
{
- vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
+ vcoul = qq * (int_bit * rinv + krsq - iconst->c_rf);
}
}
else
{
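/* Tabulated Ewald: rt is r in table units, n0 the lower table index and
 * eps the fractional offset; the force correction fexcl is obtained by
 * linear interpolation in Ftab.
 */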
- r = rsq*rinv;
- rt = r*iconst->coulombEwaldTables->scale;
- n0 = static_cast<int>(rt);
- eps = rt - static_cast<real>(n0);
+ r = rsq * rinv;
+ rt = r * iconst->coulombEwaldTables->scale;
+ n0 = static_cast<int>(rt);
+ eps = rt - static_cast<real>(n0);
- fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
+ fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
- fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
+ fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
if (stepWork.computeEnergy)
{
- vcoul = qq*((int_bit - std::erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
+ vcoul = qq
+ * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
+ - int_bit * iconst->sh_ewald);
}
}
if (rsq < rvdw2)
{
- tj = nti + 2*type[ja];
+ tj = nti + 2 * type[ja];
/* Vanilla Lennard-Jones cutoff */
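/* The nbfp parameters are stored premultiplied (6*C6 and 12*C12), so
 * F/r = (Vvdw_rep - Vvdw_disp)/r^2 needs no extra factors, while the
 * energy below divides by 12 and 6 to recover C12/r^12 - C6/r^6.
 */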
- c6 = vdwparam[tj];
- c12 = vdwparam[tj+1];
+ c6 = vdwparam[tj];
+ c12 = vdwparam[tj + 1];
- rinvsix = int_bit*rinvsq*rinvsq*rinvsq;
- Vvdw_disp = c6*rinvsix;
- Vvdw_rep = c12*rinvsix*rinvsix;
- fscal += (Vvdw_rep - Vvdw_disp)*rinvsq;
+ rinvsix = int_bit * rinvsq * rinvsq * rinvsq;
+ Vvdw_disp = c6 * rinvsix;
+ Vvdw_rep = c12 * rinvsix * rinvsix;
+ fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
if (stepWork.computeEnergy)
{
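/* The cpot terms shift the LJ potential to zero at the cut-off
 * (potential-shift modifier); int_bit keeps excluded pairs from
 * contributing to the shifted energy.
 */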
- vctot += vcoul;
+ vctot += vcoul;
Vvdwtot +=
- (Vvdw_rep + int_bit*c12*iconst->repulsion_shift.cpot)/12 -
- (Vvdw_disp + int_bit*c6*iconst->dispersion_shift.cpot)/6;
+ (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
+ - (Vvdw_disp
+ + int_bit * c6 * iconst->dispersion_shift.cpot)
+ / 6;
}
}
- tx = fscal*dx;
- ty = fscal*dy;
- tz = fscal*dz;
- fix = fix + tx;
- fiy = fiy + ty;
- fiz = fiz + tz;
- f[jfs+0] -= tx;
- f[jfs+1] -= ty;
- f[jfs+2] -= tz;
+ tx = fscal * dx;
+ ty = fscal * dy;
+ tz = fscal * dz;
+ fix = fix + tx;
+ fiy = fiy + ty;
+ fiz = fiz + tz;
+ f[jfs + 0] -= tx;
+ f[jfs + 1] -= ty;
+ f[jfs + 2] -= tz;
}
- f[ifs+0] += fix;
- f[ifs+1] += fiy;
- f[ifs+2] += fiz;
- fshift[ish3] = fshift[ish3] + fix;
- fshift[ish3+1] = fshift[ish3+1] + fiy;
- fshift[ish3+2] = fshift[ish3+2] + fiz;
+ f[ifs + 0] += fix;
+ f[ifs + 1] += fiy;
+ f[ifs + 2] += fiz;
+ fshift[ish3] = fshift[ish3] + fix;
+ fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
+ fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
/* Count in half work-units.
* In CUDA one work-unit is 2 warps.
*/
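/* One half work-unit spans c_clSize/c_nbnxnGpuClusterpairSplit i-rows;
 * nhwu counts all such units and nhwu_pruned those containing a pair
 * within rlist, for the statistics reported at the end.
 */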
- if ((ic+1) % (c_clSize/c_nbnxnGpuClusterpairSplit) == 0)
+ if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
{
npair_tot += npair;
if (stepWork.computeEnergy)
{
- ggid = 0;
- Vc[ggid] = Vc[ggid] + vctot;
- Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
+ ggid = 0;
+ Vc[ggid] = Vc[ggid] + vctot;
+ Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
}
}
if (debug)
{
fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
- nbl->na_ci, nbl->na_ci,
- nhwu, nhwu_pruned, nhwu_pruned/static_cast<double>(nhwu));
+ nbl->na_ci, nbl->na_ci, nhwu, nhwu_pruned, nhwu_pruned / static_cast<double>(nhwu));
fprintf(debug, "generic kernel pair interactions: %d\n",
- nhwu*nbl->na_ci/2*nbl->na_ci);
+ nhwu * nbl->na_ci / 2 * nbl->na_ci);
fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
- nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
- fprintf(debug, "generic kernel non-zero pair interactions: %d\n",
- npair_tot);
+ nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
+ fprintf(debug, "generic kernel non-zero pair interactions: %d\n", npair_tot);
fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
- npair_tot/static_cast<double>(nhwu_pruned*gmx::exactDiv(nbl->na_ci, 2)*nbl->na_ci));
+ npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));
}
}