src/gromacs/nbnxm/kernels_reference/kernel_gpu_ref.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 #include "gmxpre.h"
  37
  38 #include "kernel_gpu_ref.h"
  39
  40 #include <cmath>
  41
  42 #include <algorithm>
  43
  44 #include "gromacs/math/functions.h"
  45 #include "gromacs/math/utilities.h"
  46 #include "gromacs/math/vec.h"
  47 #include "gromacs/mdtypes/interaction_const.h"
  48 #include "gromacs/mdtypes/md_enums.h"
  49 #include "gromacs/mdtypes/simulation_workload.h"
  50 #include "gromacs/nbnxm/atomdata.h"
  51 #include "gromacs/nbnxm/nbnxm.h"
  52 #include "gromacs/nbnxm/pairlist.h"
  53 #include "gromacs/pbcutil/ishift.h"
  54 #include "gromacs/utility/fatalerror.h"
  55
   //! Number of atoms per cluster assumed by this kernel (alias of c_nbnxnGpuClusterSize); the kernel aborts when the pairlist's na_ci differs from it.
   56 static constexpr int c_clSize = c_nbnxnGpuClusterSize;
  57
  58 void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu*    nbl,
  59                           const nbnxn_atomdata_t*    nbat,
  60                           const interaction_const_t* iconst,
  61                           rvec*                      shift_vec,
  62                           const gmx::StepWorkload&   stepWork,
  63                           int                        clearF,
  64                           gmx::ArrayRef<real>        f,
  65                           real*                      fshift,
  66                           real*                      Vc,
  67                           real*                      Vvdw)
  68 {
  69     gmx_bool            bEwald;
  70     const real*         Ftab = nullptr;
  71     real                rcut2, rvdw2, rlist2;
  72     int                 ntype;
  73     real                facel;
  74     int                 ish3;
  75     int                 sci;
  76     int                 cj4_ind0, cj4_ind1, cj4_ind;
  77     int                 ci, cj;
  78     int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
  79     int                 n0;
  80     int                 ggid;
  81     real                shX, shY, shZ;
  82     real                fscal, tx, ty, tz;
  83     real                rinvsq;
  84     real                iq;
  85     real                qq, vcoul = 0, krsq, vctot;
  86     int                 nti;
  87     int                 tj;
  88     real                rt, r, eps;
  89     real                rinvsix;
  90     real                Vvdwtot;
  91     real                Vvdw_rep, Vvdw_disp;
  92     real                ix, iy, iz, fix, fiy, fiz;
  93     real                jx, jy, jz;
  94     real                dx, dy, dz, rsq, rinv;
  95     real                int_bit;
  96     real                fexcl;
  97     real                c6, c12;
  98     const nbnxn_excl_t* excl[2];
  99
 100     int npair_tot, npair;
 101     int nhwu, nhwu_pruned;
 102
 103     if (nbl->na_ci != c_clSize)
 104     {
 105         gmx_fatal(FARGS,
 106                   "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
 107                   "be %d",
 108                   nbl->na_ci, c_clSize);
 109     }
 110
 111     if (clearF == enbvClearFYes)
 112     {
 113         for (real& elem : f)
 114         {
 115             elem = 0;
 116         }
 117     }
 118
 119     bEwald = EEL_FULL(iconst->eeltype);
 120     if (bEwald)
 121     {
 122         Ftab = iconst->coulombEwaldTables->tableF.data();
 123     }
 124
 125     rcut2 = iconst->rcoulomb * iconst->rcoulomb;
 126     rvdw2 = iconst->rvdw * iconst->rvdw;
 127
 128     rlist2 = nbl->rlist * nbl->rlist;
 129
 130     const int* type      = nbat->params().type.data();
 131     facel                = iconst->epsfac;
 132     const real* shiftvec = shift_vec[0];
 133     const real* vdwparam = nbat->params().nbfp.data();
 134     ntype                = nbat->params().numTypes;
 135
 136     const real* x = nbat->x().data();
 137
 138     npair_tot   = 0;
 139     nhwu        = 0;
 140     nhwu_pruned = 0;
 141
 142     for (const nbnxn_sci_t& nbln : nbl->sci)
 143     {
 144         ish3     = 3 * nbln.shift;
 145         shX      = shiftvec[ish3];
 146         shY      = shiftvec[ish3 + 1];
 147         shZ      = shiftvec[ish3 + 2];
 148         cj4_ind0 = nbln.cj4_ind_start;
 149         cj4_ind1 = nbln.cj4_ind_end;
 150         sci      = nbln.sci;
 151         vctot    = 0;
 152         Vvdwtot  = 0;
 153
 154         if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
 155         {
 156             /* we have the diagonal:
 157              * add the charge self interaction energy term
 158              */
 159             for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
 160             {
 161                 ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
 162                 for (ic = 0; ic < c_clSize; ic++)
 163                 {
 164                     ia = ci * c_clSize + ic;
 165                     iq = x[ia * nbat->xstride + 3];
 166                     vctot += iq * iq;
 167                 }
 168             }
 169             if (!bEwald)
 170             {
 171                 vctot *= -facel * 0.5 * iconst->c_rf;
 172             }
 173             else
 174             {
 175                 /* last factor 1/sqrt(pi) */
 176                 vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
 177             }
 178         }
 179
 180         for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
 181         {
 182             excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
 183             excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 184
 185             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 186             {
 187                 cj = nbl->cj4[cj4_ind].cj[jm];
 188
 189                 for (im = 0; im < c_nbnxnGpuNumClusterPerSupercluster; im++)
 190                 {
 191                     /* We're only using the first imask,
 192                      * but here imei[1].imask is identical.
 193                      */
 194                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
 195                         & 1)
 196                     {
 197                         gmx_bool within_rlist;
 198
 199                         ci = sci * c_nbnxnGpuNumClusterPerSupercluster + im;
 200
 201                         within_rlist = FALSE;
 202                         npair        = 0;
 203                         for (ic = 0; ic < c_clSize; ic++)
 204                         {
 205                             ia = ci * c_clSize + ic;
 206
 207                             is  = ia * nbat->xstride;
 208                             ifs = ia * nbat->fstride;
 209                             ix  = shX + x[is + 0];
 210                             iy  = shY + x[is + 1];
 211                             iz  = shZ + x[is + 2];
 212                             iq  = facel * x[is + 3];
 213                             nti = ntype * 2 * type[ia];
 214
 215                             fix = 0;
 216                             fiy = 0;
 217                             fiz = 0;
 218
 219                             for (jc = 0; jc < c_clSize; jc++)
 220                             {
 221                                 ja = cj * c_clSize + jc;
 222
 223                                 if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
 224                                 {
 225                                     continue;
 226                                 }
 227
 228                                 constexpr int clusterPerSplit =
 229                                         c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
 230                                 int_bit = static_cast<real>(
 231                                         (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
 232                                          >> (jm * c_nbnxnGpuNumClusterPerSupercluster + im))
 233                                         & 1);
 234
 235                                 js  = ja * nbat->xstride;
 236                                 jfs = ja * nbat->fstride;
 237                                 jx  = x[js + 0];
 238                                 jy  = x[js + 1];
 239                                 jz  = x[js + 2];
 240                                 dx  = ix - jx;
 241                                 dy  = iy - jy;
 242                                 dz  = iz - jz;
 243                                 rsq = dx * dx + dy * dy + dz * dz;
 244                                 if (rsq < rlist2)
 245                                 {
 246                                     within_rlist = TRUE;
 247                                 }
 248                                 if (rsq >= rcut2)
 249                                 {
 250                                     continue;
 251                                 }
 252
 253                                 if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
 254                                 {
 255                                     npair++;
 256                                 }
 257
 258                                 // Ensure distance do not become so small that r^-12 overflows
 259                                 rsq = std::max(rsq, c_nbnxnMinDistanceSquared);
 260
 261                                 rinv   = gmx::invsqrt(rsq);
 262                                 rinvsq = rinv * rinv;
 263
 264                                 qq = iq * x[js + 3];
 265                                 if (!bEwald)
 266                                 {
 267                                     /* Reaction-field */
 268                                     krsq  = iconst->k_rf * rsq;
 269                                     fscal = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
 270                                     if (stepWork.computeEnergy)
 271                                     {
 272                                         vcoul = qq * (int_bit * rinv + krsq - iconst->c_rf);
 273                                     }
 274                                 }
 275                                 else
 276                                 {
 277                                     r   = rsq * rinv;
 278                                     rt  = r * iconst->coulombEwaldTables->scale;
 279                                     n0  = static_cast<int>(rt);
 280                                     eps = rt - static_cast<real>(n0);
 281
 282                                     fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
 283
 284                                     fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
 285
 286                                     if (stepWork.computeEnergy)
 287                                     {
 288                                         vcoul = qq
 289                                                 * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
 290                                                    - int_bit * iconst->sh_ewald);
 291                                     }
 292                                 }
 293
 294                                 if (rsq < rvdw2)
 295                                 {
 296                                     tj = nti + 2 * type[ja];
 297
 298                                     /* Vanilla Lennard-Jones cutoff */
 299                                     c6  = vdwparam[tj];
 300                                     c12 = vdwparam[tj + 1];
 301
 302                                     rinvsix   = int_bit * rinvsq * rinvsq * rinvsq;
 303                                     Vvdw_disp = c6 * rinvsix;
 304                                     Vvdw_rep  = c12 * rinvsix * rinvsix;
 305                                     fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
 306
 307                                     if (stepWork.computeEnergy)
 308                                     {
 309                                         vctot += vcoul;
 310
 311                                         Vvdwtot +=
 312                                                 (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
 313                                                 - (Vvdw_disp
 314                                                    + int_bit * c6 * iconst->dispersion_shift.cpot)
 315                                                           / 6;
 316                                     }
 317                                 }
 318
 319                                 tx  = fscal * dx;
 320                                 ty  = fscal * dy;
 321                                 tz  = fscal * dz;
 322                                 fix = fix + tx;
 323                                 fiy = fiy + ty;
 324                                 fiz = fiz + tz;
 325                                 f[jfs + 0] -= tx;
 326                                 f[jfs + 1] -= ty;
 327                                 f[jfs + 2] -= tz;
 328                             }
 329
 330                             f[ifs + 0] += fix;
 331                             f[ifs + 1] += fiy;
 332                             f[ifs + 2] += fiz;
 333                             fshift[ish3]     = fshift[ish3] + fix;
 334                             fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
 335                             fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
 336
 337                             /* Count in half work-units.
 338                              * In CUDA one work-unit is 2 warps.
 339                              */
 340                             if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
 341                             {
 342                                 npair_tot += npair;
 343
 344                                 nhwu++;
 345                                 if (within_rlist)
 346                                 {
 347                                     nhwu_pruned++;
 348                                 }
 349
 350                                 within_rlist = FALSE;
 351                                 npair        = 0;
 352                             }
 353                         }
 354                     }
 355                 }
 356             }
 357         }
 358
 359         if (stepWork.computeEnergy)
 360         {
 361             ggid       = 0;
 362             Vc[ggid]   = Vc[ggid] + vctot;
 363             Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
 364         }
 365     }
 366
 367     if (debug)
 368     {
 369         fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
 370                 nbl->na_ci, nbl->na_ci, nhwu, nhwu_pruned, nhwu_pruned / static_cast<double>(nhwu));
 371         fprintf(debug, "generic kernel pair interactions:            %d\n",
 372                 nhwu * nbl->na_ci / 2 * nbl->na_ci);
 373         fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
 374                 nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
 375         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n", npair_tot);
 376         fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
 377                 npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));
 378     }
 379 }