src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 #include "gmxpre.h"
  37
  38 #include "kernel_gpu_ref.h"
  39
  40 #include <cmath>
  41
  42 #include <algorithm>
  43
  44 #include "gromacs/math/functions.h"
  45 #include "gromacs/math/units.h"
  46 #include "gromacs/math/utilities.h"
  47 #include "gromacs/math/vec.h"
  48 #include "gromacs/mdtypes/interaction_const.h"
  49 #include "gromacs/mdtypes/md_enums.h"
  50 #include "gromacs/mdtypes/simulation_workload.h"
  51 #include "gromacs/nbnxm/atomdata.h"
  52 #include "gromacs/nbnxm/nbnxm.h"
  53 #include "gromacs/nbnxm/pairlist.h"
  54 #include "gromacs/pbcutil/ishift.h"
  55 #include "gromacs/utility/fatalerror.h"
  56
  57 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
  58
  59 void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu*    nbl,
  60                           const nbnxn_atomdata_t*    nbat,
  61                           const interaction_const_t* iconst,
  62                           rvec*                      shift_vec,
  63                           const gmx::StepWorkload&   stepWork,
  64                           int                        clearF,
  65                           gmx::ArrayRef<real>        f,
  66                           real*                      fshift,
  67                           real*                      Vc,
  68                           real*                      Vvdw)
  69 {
  70     real                fscal = NAN;
  71     real                vcoul = 0;
  72     const nbnxn_excl_t* excl[2];
  73
  74     if (nbl->na_ci != c_clSize)
  75     {
  76         gmx_fatal(FARGS,
  77                   "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
  78                   "be %d",
  79                   nbl->na_ci,
  80                   c_clSize);
  81     }
  82
  83     if (clearF == enbvClearFYes)
  84     {
  85         for (real& elem : f)
  86         {
  87             elem = 0;
  88         }
  89     }
  90
  91     const bool bEwald = EEL_FULL(iconst->eeltype);
  92
  93     const real rcut2 = iconst->rcoulomb * iconst->rcoulomb;
  94     const real rvdw2 = iconst->rvdw * iconst->rvdw;
  95
  96     const real rlist2 = nbl->rlist * nbl->rlist;
  97
  98     const int*  type     = nbat->params().type.data();
  99     const real  facel    = iconst->epsfac;
 100     const real* shiftvec = shift_vec[0];
 101     const real* vdwparam = nbat->params().nbfp.data();
 102     const int   ntype    = nbat->params().numTypes;
 103
 104     const real* x = nbat->x().data();
 105
 106     int npair_tot   = 0;
 107     int nhwu        = 0;
 108     int nhwu_pruned = 0;
 109
 110     for (const nbnxn_sci_t& nbln : nbl->sci)
 111     {
 112         const int  ish3     = 3 * nbln.shift;
 113         const real shX      = shiftvec[ish3];
 114         const real shY      = shiftvec[ish3 + 1];
 115         const real shZ      = shiftvec[ish3 + 2];
 116         const int  cj4_ind0 = nbln.cj4_ind_start;
 117         const int  cj4_ind1 = nbln.cj4_ind_end;
 118         const int  sci      = nbln.sci;
 119         real       vctot    = 0;
 120         real       Vvdwtot  = 0;
 121
 122         if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
 123         {
 124             /* we have the diagonal:
 125              * add the charge self interaction energy term
 126              */
 127             for (int im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
 128             {
 129                 const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
 130                 for (int ic = 0; ic < c_clSize; ic++)
 131                 {
 132                     const int ia = ci * c_clSize + ic;
 133                     real      iq = x[ia * nbat->xstride + 3];
 134                     vctot += iq * iq;
 135                 }
 136             }
 137             if (!bEwald)
 138             {
 139                 vctot *= -facel * 0.5 * iconst->reactionFieldShift;
 140             }
 141             else
 142             {
 143                 /* last factor 1/sqrt(pi) */
 144                 vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
 145             }
 146         }
 147
 148         for (int cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
 149         {
 150             excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
 151             excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 152
 153             for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 154             {
 155                 const int cj = nbl->cj4[cj4_ind].cj[jm];
 156
 157                 for (int im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
 158                 {
 159                     /* We're only using the first imask,
 160                      * but here imei[1].imask is identical.
 161                      */
 162                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
 163                         & 1)
 164                     {
 165                         const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
 166
 167                         bool within_rlist = false;
 168                         int  npair        = 0;
 169                         for (int ic = 0; ic < c_clSize; ic++)
 170                         {
 171                             const int ia = ci * c_clSize + ic;
 172
 173                             const int  is  = ia * nbat->xstride;
 174                             const int  ifs = ia * nbat->fstride;
 175                             const real ix  = shX + x[is + 0];
 176                             const real iy  = shY + x[is + 1];
 177                             const real iz  = shZ + x[is + 2];
 178                             const real iq  = facel * x[is + 3];
 179                             const int  nti = ntype * 2 * type[ia];
 180
 181                             real fix = 0;
 182                             real fiy = 0;
 183                             real fiz = 0;
 184
 185                             for (int jc = 0; jc < c_clSize; jc++)
 186                             {
 187                                 const int ja = cj * c_clSize + jc;
 188
 189                                 if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
 190                                 {
 191                                     continue;
 192                                 }
 193
 194                                 constexpr int clusterPerSplit =
 195                                         c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
 196                                 const real int_bit = static_cast<real>(
 197                                         (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
 198                                          >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
 199                                         & 1);
 200
 201                                 int        js  = ja * nbat->xstride;
 202                                 int        jfs = ja * nbat->fstride;
 203                                 const real jx  = x[js + 0];
 204                                 const real jy  = x[js + 1];
 205                                 const real jz  = x[js + 2];
 206                                 const real dx  = ix - jx;
 207                                 const real dy  = iy - jy;
 208                                 const real dz  = iz - jz;
 209                                 real       rsq = dx * dx + dy * dy + dz * dz;
 210                                 if (rsq < rlist2)
 211                                 {
 212                                     within_rlist = true;
 213                                 }
 214                                 if (rsq >= rcut2)
 215                                 {
 216                                     continue;
 217                                 }
 218
 219                                 if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
 220                                 {
 221                                     npair++;
 222                                 }
 223
 224                                 // Ensure distance do not become so small that r^-12 overflows
 225                                 rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
 226
 227                                 const real rinv   = gmx::invsqrt(rsq);
 228                                 const real rinvsq = rinv * rinv;
 229
 230                                 const real qq = iq * x[js + 3];
 231                                 if (!bEwald)
 232                                 {
 233                                     /* Reaction-field */
 234                                     const real krsq = iconst->reactionFieldCoefficient * rsq;
 235                                     fscal           = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
 236                                     if (stepWork.computeEnergy)
 237                                     {
 238                                         vcoul = qq * (int_bit * rinv + krsq - iconst->reactionFieldShift);
 239                                     }
 240                                 }
 241                                 else
 242                                 {
 243                                     const real  r    = rsq * rinv;
 244                                     const real  rt   = r * iconst->coulombEwaldTables->scale;
 245                                     const int   n0   = static_cast<int>(rt);
 246                                     const real  eps  = rt - static_cast<real>(n0);
 247                                     const real* Ftab = iconst->coulombEwaldTables->tableF.data();
 248
 249                                     const real fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
 250
 251                                     fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
 252
 253                                     if (stepWork.computeEnergy)
 254                                     {
 255                                         vcoul = qq
 256                                                 * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
 257                                                    - int_bit * iconst->sh_ewald);
 258                                     }
 259                                 }
 260
 261                                 if (rsq < rvdw2)
 262                                 {
 263                                     const int tj = nti + 2 * type[ja];
 264
 265                                     /* Vanilla Lennard-Jones cutoff */
 266                                     const real c6  = vdwparam[tj];
 267                                     const real c12 = vdwparam[tj + 1];
 268
 269                                     const real rinvsix   = int_bit * rinvsq * rinvsq * rinvsq;
 270                                     const real Vvdw_disp = c6 * rinvsix;
 271                                     const real Vvdw_rep  = c12 * rinvsix * rinvsix;
 272                                     fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
 273
 274                                     if (stepWork.computeEnergy)
 275                                     {
 276                                         vctot += vcoul;
 277
 278                                         Vvdwtot +=
 279                                                 (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
 280                                                 - (Vvdw_disp
 281                                                    + int_bit * c6 * iconst->dispersion_shift.cpot)
 282                                                           / 6;
 283                                     }
 284                                 }
 285
 286                                 real tx = fscal * dx;
 287                                 real ty = fscal * dy;
 288                                 real tz = fscal * dz;
 289                                 fix     = fix + tx;
 290                                 fiy     = fiy + ty;
 291                                 fiz     = fiz + tz;
 292                                 f[jfs + 0] -= tx;
 293                                 f[jfs + 1] -= ty;
 294                                 f[jfs + 2] -= tz;
 295                             }
 296
 297                             f[ifs + 0] += fix;
 298                             f[ifs + 1] += fiy;
 299                             f[ifs + 2] += fiz;
 300                             fshift[ish3]     = fshift[ish3] + fix;
 301                             fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
 302                             fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
 303
 304                             /* Count in half work-units.
 305                              * In CUDA one work-unit is 2 warps.
 306                              */
 307                             if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
 308                             {
 309                                 npair_tot += npair;
 310
 311                                 nhwu++;
 312                                 if (within_rlist)
 313                                 {
 314                                     nhwu_pruned++;
 315                                 }
 316
 317                                 within_rlist = false;
 318                                 npair        = 0;
 319                             }
 320                         }
 321                     }
 322                 }
 323             }
 324         }
 325
 326         if (stepWork.computeEnergy)
 327         {
 328             const int ggid = 0;
 329             Vc[ggid]       = Vc[ggid] + vctot;
 330             Vvdw[ggid]     = Vvdw[ggid] + Vvdwtot;
 331         }
 332     }
 333
 334     if (debug)
 335     {
 336         fprintf(debug,
 337                 "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
 338                 nbl->na_ci,
 339                 nbl->na_ci,
 340                 nhwu,
 341                 nhwu_pruned,
 342                 nhwu_pruned / static_cast<double>(nhwu));
 343         fprintf(debug, "generic kernel pair interactions:            %d\n", nhwu * nbl->na_ci / 2 * nbl->na_ci);
 344         fprintf(debug,
 345                 "generic kernel post-prune pair interactions: %d\n",
 346                 nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
 347         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n", npair_tot);
 348         fprintf(debug,
 349                 "ratio non-zero/post-prune pair interactions: %4.2f\n",
 350                 npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));
 351     }
 352 }