src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #include "gmxpre.h"
  36
  37 #include "nbnxn_kernel_gpu_ref.h"
  38
  39 #include "config.h"
  40
  41 #include <cmath>
  42
  43 #include <algorithm>
  44
  45 #include "gromacs/math/functions.h"
  46 #include "gromacs/math/utilities.h"
  47 #include "gromacs/math/vec.h"
  48 #include "gromacs/mdlib/force_flags.h"
  49 #include "gromacs/mdlib/nb_verlet.h"
  50 #include "gromacs/mdlib/nbnxn_consts.h"
  51 #include "gromacs/mdtypes/md_enums.h"
  52 #include "gromacs/pbcutil/ishift.h"
  53 #include "gromacs/utility/fatalerror.h"
  54
  55 #include "nbnxn_kernel_common.h"
  56
  57 static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
  58 static const int c_clSize          = c_nbnxnGpuClusterSize;
  59
  60 void
  61 nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
  62                      const nbnxn_atomdata_t     *nbat,
  63                      const interaction_const_t  *iconst,
  64                      rvec                       *shift_vec,
  65                      int                         force_flags,
  66                      int                         clearF,
  67                      gmx::ArrayRef<real>         f,
  68                      real  *                     fshift,
  69                      real  *                     Vc,
  70                      real  *                     Vvdw)
  71 {
  72     const nbnxn_sci_t  *nbln;
  73     gmx_bool            bEner;
  74     gmx_bool            bEwald;
  75     const real         *Ftab = nullptr;
  76     real                rcut2, rvdw2, rlist2;
  77     int                 ntype;
  78     real                facel;
  79     int                 n;
  80     int                 ish3;
  81     int                 sci;
  82     int                 cj4_ind0, cj4_ind1, cj4_ind;
  83     int                 ci, cj;
  84     int                 ic, jc, ia, ja, is, ifs, js, jfs, im, jm;
  85     int                 n0;
  86     int                 ggid;
  87     real                shX, shY, shZ;
  88     real                fscal, tx, ty, tz;
  89     real                rinvsq;
  90     real                iq;
  91     real                qq, vcoul = 0, krsq, vctot;
  92     int                 nti;
  93     int                 tj;
  94     real                rt, r, eps;
  95     real                rinvsix;
  96     real                Vvdwtot;
  97     real                Vvdw_rep, Vvdw_disp;
  98     real                ix, iy, iz, fix, fiy, fiz;
  99     real                jx, jy, jz;
 100     real                dx, dy, dz, rsq, rinv;
 101     int                 int_bit;
 102     real                fexcl;
 103     real                c6, c12;
 104     const nbnxn_excl_t *excl[2];
 105
 106     int                 npair_tot, npair;
 107     int                 nhwu, nhwu_pruned;
 108
 109     if (nbl->na_ci != c_clSize)
 110     {
 111         gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl->na_ci, c_clSize);
 112     }
 113
 114     if (clearF == enbvClearFYes)
 115     {
 116         clear_f(nbat, 0, f.data());
 117     }
 118
 119     bEner = ((force_flags & GMX_FORCE_ENERGY) != 0);
 120
 121     bEwald = EEL_FULL(iconst->eeltype);
 122     if (bEwald)
 123     {
 124         Ftab = iconst->tabq_coul_F;
 125     }
 126
 127     rcut2                = iconst->rcoulomb*iconst->rcoulomb;
 128     rvdw2                = iconst->rvdw*iconst->rvdw;
 129
 130     rlist2               = nbl->rlist*nbl->rlist;
 131
 132     const int  *type     = nbat->params().type.data();
 133     facel                = iconst->epsfac;
 134     const real *shiftvec = shift_vec[0];
 135     const real *vdwparam = nbat->params().nbfp.data();
 136     ntype                = nbat->params().numTypes;
 137
 138     const real *x        = nbat->x().data();
 139
 140     npair_tot   = 0;
 141     nhwu        = 0;
 142     nhwu_pruned = 0;
 143
 144     for (n = 0; n < nbl->nsci; n++)
 145     {
 146         nbln = &nbl->sci[n];
 147
 148         ish3             = 3*nbln->shift;
 149         shX              = shiftvec[ish3];
 150         shY              = shiftvec[ish3+1];
 151         shZ              = shiftvec[ish3+2];
 152         cj4_ind0         = nbln->cj4_ind_start;
 153         cj4_ind1         = nbln->cj4_ind_end;
 154         sci              = nbln->sci;
 155         vctot            = 0;
 156         Vvdwtot          = 0;
 157
 158         if (nbln->shift == CENTRAL &&
 159             nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
 160         {
 161             /* we have the diagonal:
 162              * add the charge self interaction energy term
 163              */
 164             for (im = 0; im < c_numClPerSupercl; im++)
 165             {
 166                 ci = sci*c_numClPerSupercl + im;
 167                 for (ic = 0; ic < c_clSize; ic++)
 168                 {
 169                     ia     = ci*c_clSize + ic;
 170                     iq     = x[ia*nbat->xstride+3];
 171                     vctot += iq*iq;
 172                 }
 173             }
 174             if (!bEwald)
 175             {
 176                 vctot *= -facel*0.5*iconst->c_rf;
 177             }
 178             else
 179             {
 180                 /* last factor 1/sqrt(pi) */
 181                 vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
 182             }
 183         }
 184
 185         for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
 186         {
 187             excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
 188             excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 189
 190             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 191             {
 192                 cj               = nbl->cj4[cj4_ind].cj[jm];
 193
 194                 for (im = 0; im < c_numClPerSupercl; im++)
 195                 {
 196                     /* We're only using the first imask,
 197                      * but here imei[1].imask is identical.
 198                      */
 199                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*c_numClPerSupercl + im)) & 1)
 200                     {
 201                         gmx_bool within_rlist;
 202
 203                         ci               = sci*c_numClPerSupercl + im;
 204
 205                         within_rlist     = FALSE;
 206                         npair            = 0;
 207                         for (ic = 0; ic < c_clSize; ic++)
 208                         {
 209                             ia               = ci*c_clSize + ic;
 210
 211                             is               = ia*nbat->xstride;
 212                             ifs              = ia*nbat->fstride;
 213                             ix               = shX + x[is+0];
 214                             iy               = shY + x[is+1];
 215                             iz               = shZ + x[is+2];
 216                             iq               = facel*x[is+3];
 217                             nti              = ntype*2*type[ia];
 218
 219                             fix              = 0;
 220                             fiy              = 0;
 221                             fiz              = 0;
 222
 223                             for (jc = 0; jc < c_clSize; jc++)
 224                             {
 225                                 ja               = cj*c_clSize + jc;
 226
 227                                 if (nbln->shift == CENTRAL &&
 228                                     ci == cj && ja <= ia)
 229                                 {
 230                                     continue;
 231                                 }
 232
 233                                 constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
 234                                 int_bit = ((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
 235                                             >> (jm*c_numClPerSupercl + im)) & 1);
 236
 237                                 js               = ja*nbat->xstride;
 238                                 jfs              = ja*nbat->fstride;
 239                                 jx               = x[js+0];
 240                                 jy               = x[js+1];
 241                                 jz               = x[js+2];
 242                                 dx               = ix - jx;
 243                                 dy               = iy - jy;
 244                                 dz               = iz - jz;
 245                                 rsq              = dx*dx + dy*dy + dz*dz;
 246                                 if (rsq < rlist2)
 247                                 {
 248                                     within_rlist = TRUE;
 249                                 }
 250                                 if (rsq >= rcut2)
 251                                 {
 252                                     continue;
 253                                 }
 254
 255                                 if (type[ia] != ntype-1 && type[ja] != ntype-1)
 256                                 {
 257                                     npair++;
 258                                 }
 259
 260                                 // Ensure distance do not become so small that r^-12 overflows
 261                                 rsq              = std::max(rsq, NBNXN_MIN_RSQ);
 262
 263                                 rinv             = gmx::invsqrt(rsq);
 264                                 rinvsq           = rinv*rinv;
 265
 266                                 qq               = iq*x[js+3];
 267                                 if (!bEwald)
 268                                 {
 269                                     /* Reaction-field */
 270                                     krsq  = iconst->k_rf*rsq;
 271                                     fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
 272                                     if (bEner)
 273                                     {
 274                                         vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
 275                                     }
 276                                 }
 277                                 else
 278                                 {
 279                                     r     = rsq*rinv;
 280                                     rt    = r*iconst->tabq_scale;
 281                                     n0    = static_cast<int>(rt);
 282                                     eps   = rt - n0;
 283
 284                                     fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
 285
 286                                     fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
 287
 288                                     if (bEner)
 289                                     {
 290                                         vcoul = qq*((int_bit - std::erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
 291                                     }
 292                                 }
 293
 294                                 if (rsq < rvdw2)
 295                                 {
 296                                     tj        = nti + 2*type[ja];
 297
 298                                     /* Vanilla Lennard-Jones cutoff */
 299                                     c6        = vdwparam[tj];
 300                                     c12       = vdwparam[tj+1];
 301
 302                                     rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
 303                                     Vvdw_disp = c6*rinvsix;
 304                                     Vvdw_rep  = c12*rinvsix*rinvsix;
 305                                     fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;
 306
 307                                     if (bEner)
 308                                     {
 309                                         vctot   += vcoul;
 310
 311                                         Vvdwtot +=
 312                                             (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 -
 313                                             (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6;
 314                                     }
 315                                 }
 316
 317                                 tx        = fscal*dx;
 318                                 ty        = fscal*dy;
 319                                 tz        = fscal*dz;
 320                                 fix       = fix + tx;
 321                                 fiy       = fiy + ty;
 322                                 fiz       = fiz + tz;
 323                                 f[jfs+0] -= tx;
 324                                 f[jfs+1] -= ty;
 325                                 f[jfs+2] -= tz;
 326                             }
 327
 328                             f[ifs+0]        += fix;
 329                             f[ifs+1]        += fiy;
 330                             f[ifs+2]        += fiz;
 331                             fshift[ish3]     = fshift[ish3]   + fix;
 332                             fshift[ish3+1]   = fshift[ish3+1] + fiy;
 333                             fshift[ish3+2]   = fshift[ish3+2] + fiz;
 334
 335                             /* Count in half work-units.
 336                              * In CUDA one work-unit is 2 warps.
 337                              */
 338                             if ((ic+1) % (c_clSize/c_nbnxnGpuClusterpairSplit) == 0)
 339                             {
 340                                 npair_tot += npair;
 341
 342                                 nhwu++;
 343                                 if (within_rlist)
 344                                 {
 345                                     nhwu_pruned++;
 346                                 }
 347
 348                                 within_rlist = FALSE;
 349                                 npair        = 0;
 350                             }
 351                         }
 352                     }
 353                 }
 354             }
 355         }
 356
 357         if (bEner)
 358         {
 359             ggid             = 0;
 360             Vc[ggid]         = Vc[ggid]   + vctot;
 361             Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
 362         }
 363     }
 364
 365     if (debug)
 366     {
 367         fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
 368                 nbl->na_ci, nbl->na_ci,
 369                 nhwu, nhwu_pruned, nhwu_pruned/static_cast<double>(nhwu));
 370         fprintf(debug, "generic kernel pair interactions:            %d\n",
 371                 nhwu*nbl->na_ci/2*nbl->na_ci);
 372         fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
 373                 nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
 374         fprintf(debug, "generic kernel non-zero pair interactions:   %d\n",
 375                 npair_tot);
 376         fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
 377                 npair_tot/static_cast<double>(nhwu_pruned*gmx::exactDiv(nbl->na_ci, 2)*nbl->na_ci));
 378     }
 379 }