src/gromacs/nbnxm/kerneldispatch.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "gromacs/gmxlib/nrnb.h"
  39 #include "gromacs/math/vectypes.h"
  40 #include "gromacs/mdlib/force_flags.h"
  41 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  42 #include "gromacs/mdtypes/enerdata.h"
  43 #include "gromacs/mdtypes/interaction_const.h"
  44 #include "gromacs/mdtypes/md_enums.h"
  45 #include "gromacs/nbnxm/gpu_data_mgmt.h"
  46 #include "gromacs/nbnxm/nbnxm.h"
  47 #include "gromacs/nbnxm/nbnxm_simd.h"
  48 #include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
  49 #include "gromacs/simd/simd.h"
  50 #include "gromacs/utility/gmxassert.h"
  51 #include "gromacs/utility/real.h"
  52
  53 #include "kernel_common.h"
  54 #define INCLUDE_KERNELFUNCTION_TABLES
  55 #include "gromacs/nbnxm/kernels_reference/kernel_ref.h"
  56 #ifdef GMX_NBNXN_SIMD_2XNN
  57 #include "gromacs/nbnxm/kernels_simd_2xmm/kernels.h"
  58 #endif
  59 #ifdef GMX_NBNXN_SIMD_4XN
  60 #include "gromacs/nbnxm/kernels_simd_4xm/kernels.h"
  61 #endif
  62 #undef INCLUDE_FUNCTION_TABLES
  63
  64 /*! \brief Clears the energy group output buffers
  65  *
  66  * \param[in,out] out  nbnxn kernel output struct
  67  */
  68 static void clearGroupEnergies(nbnxn_atomdata_output_t *out)
  69 {
  70     std::fill(out->Vvdw.begin(), out->Vvdw.end(), 0.0_real);
  71     std::fill(out->Vc.begin(), out->Vc.end(), 0.0_real);
  72     std::fill(out->VSvdw.begin(), out->VSvdw.end(), 0.0_real);
  73     std::fill(out->VSc.begin(), out->VSc.end(), 0.0_real);
  74 }
  75
  76 /*! \brief Reduce the group-pair energy buffers produced by a SIMD kernel
  77  * to single terms in the output buffers.
  78  *
  79  * The SIMD kernels produce a large number of energy buffer in SIMD registers
  80  * to avoid scattered reads and writes.
  81  *
  82  * \tparam        unrollj         The unroll size for j-particles in the SIMD kernel
  83  * \param[in]     numGroups       The number of energy groups
  84  * \param[in]     numGroups_2log  Log2 of numGroups, rounded up
  85  * \param[in,out] out             Struct with energy buffers
  86  */
  87 template <int unrollj> static void
  88 reduceGroupEnergySimdBuffers(int                       numGroups,
  89                              int                       numGroups_2log,
  90                              nbnxn_atomdata_output_t  *out)
  91 {
  92     const int                 unrollj_half     = unrollj/2;
  93     /* Energies are stored in SIMD registers with size 2^numGroups_2log */
  94     const int                 numGroupsStorage = (1 << numGroups_2log);
  95
  96     const real * gmx_restrict vVdwSimd     = out->VSvdw.data();
  97     const real * gmx_restrict vCoulombSimd = out->VSc.data();
  98     real * gmx_restrict       vVdw         = out->Vvdw.data();
  99     real * gmx_restrict       vCoulomb     = out->Vc.data();
 100
 101     /* The size of the SIMD energy group buffer array is:
 102      * numGroups*numGroups*numGroupsStorage*unrollj_half*simd_width
 103      */
 104     for (int i = 0; i < numGroups; i++)
 105     {
 106         for (int j1 = 0; j1 < numGroups; j1++)
 107         {
 108             for (int j0 = 0; j0 < numGroups; j0++)
 109             {
 110                 int c = ((i*numGroups + j1)*numGroupsStorage + j0)*unrollj_half*unrollj;
 111                 for (int s = 0; s < unrollj_half; s++)
 112                 {
 113                     vVdw    [i*numGroups + j0] += vVdwSimd    [c + 0];
 114                     vVdw    [i*numGroups + j1] += vVdwSimd    [c + 1];
 115                     vCoulomb[i*numGroups + j0] += vCoulombSimd[c + 0];
 116                     vCoulomb[i*numGroups + j1] += vCoulombSimd[c + 1];
 117                     c                          += unrollj + 2;
 118                 }
 119             }
 120         }
 121     }
 122 }
 123
 124 /*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels.
 125  *
 126  * OpenMP parallelization is performed within this function.
 127  * Energy reduction, but not force and shift force reduction, is performed
 128  * within this function.
 129  *
 130  * \param[in]     pairlistSet   Pairlists with local or non-local interactions to compute
 131  * \param[in]     kernelSetup   The non-bonded kernel setup
 132  * \param[in,out] nbat          The atomdata for the interactions
 133  * \param[in]     ic            Non-bonded interaction constants
 134  * \param[in]     shiftVectors  The PBC shift vectors
 135  * \param[in]     forceFlags    Flags that tell what to compute
 136  * \param[in]     clearF        Enum that tells if to clear the force output buffer
 137  * \param[out]    fshift        Shift force output buffer
 138  * \param[out]    vCoulomb      Output buffer for Coulomb energies
 139  * \param[out]    vVdw          Output buffer for Van der Waals energies
 140  */
 141 static void
 142 nbnxn_kernel_cpu(const nbnxn_pairlist_set_t     &pairlistSet,
 143                  const Nbnxm::KernelSetup       &kernelSetup,
 144                  nbnxn_atomdata_t               *nbat,
 145                  const interaction_const_t      &ic,
 146                  rvec                           *shiftVectors,
 147                  int                             forceFlags,
 148                  int                             clearF,
 149                  real                           *fshift,
 150                  real                           *vCoulomb,
 151                  real                           *vVdw)
 152 {
 153
 154     int                      coulkt;
 155     if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
 156     {
 157         coulkt = coulktRF;
 158     }
 159     else
 160     {
 161         if (kernelSetup.ewaldExclusionType == Nbnxm::EwaldExclusionType::Table)
 162         {
 163             if (ic.rcoulomb == ic.rvdw)
 164             {
 165                 coulkt = coulktTAB;
 166             }
 167             else
 168             {
 169                 coulkt = coulktTAB_TWIN;
 170             }
 171         }
 172         else
 173         {
 174             if (ic.rcoulomb == ic.rvdw)
 175             {
 176                 coulkt = coulktEWALD;
 177             }
 178             else
 179             {
 180                 coulkt = coulktEWALD_TWIN;
 181             }
 182         }
 183     }
 184
 185     const nbnxn_atomdata_t::Params &nbatParams = nbat->params();
 186
 187     int vdwkt = 0;
 188     if (ic.vdwtype == evdwCUT)
 189     {
 190         switch (ic.vdw_modifier)
 191         {
 192             case eintmodNONE:
 193             case eintmodPOTSHIFT:
 194                 switch (nbatParams.comb_rule)
 195                 {
 196                     case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break;
 197                     case ljcrLB:   vdwkt = vdwktLJCUT_COMBLB;   break;
 198                     case ljcrNONE: vdwkt = vdwktLJCUT_COMBNONE; break;
 199                     default:
 200                         GMX_RELEASE_ASSERT(false, "Unknown combination rule");
 201                 }
 202                 break;
 203             case eintmodFORCESWITCH:
 204                 vdwkt = vdwktLJFORCESWITCH;
 205                 break;
 206             case eintmodPOTSWITCH:
 207                 vdwkt = vdwktLJPOTSWITCH;
 208                 break;
 209             default:
 210                 GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction modifier");
 211         }
 212     }
 213     else if (ic.vdwtype == evdwPME)
 214     {
 215         if (ic.ljpme_comb_rule == eljpmeGEOM)
 216         {
 217             vdwkt = vdwktLJEWALDCOMBGEOM;
 218         }
 219         else
 220         {
 221             vdwkt = vdwktLJEWALDCOMBLB;
 222             /* At setup we (should have) selected the C reference kernel */
 223             GMX_RELEASE_ASSERT(kernelSetup.kernelType == Nbnxm::KernelType::Cpu4x4_PlainC, "Only the C reference nbnxn SIMD kernel supports LJ-PME with LB combination rules");
 224         }
 225     }
 226     else
 227     {
 228         GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
 229     }
 230
 231     int                        nnbl = pairlistSet.nnbl;
 232     NbnxnPairlistCpu * const * nbl  = pairlistSet.nbl;
 233
 234     int gmx_unused             nthreads = gmx_omp_nthreads_get(emntNonbonded);
 235 #pragma omp parallel for schedule(static) num_threads(nthreads)
 236     for (int nb = 0; nb < nnbl; nb++)
 237     {
 238         // Presently, the kernels do not call C++ code that can throw,
 239         // so no need for a try/catch pair in this OpenMP region.
 240         nbnxn_atomdata_output_t *out = &nbat->out[nb];
 241
 242         if (clearF == enbvClearFYes)
 243         {
 244             clear_f(nbat, nb, out->f.data());
 245         }
 246
 247         real *fshift_p;
 248         if ((forceFlags & GMX_FORCE_VIRIAL) && nnbl == 1)
 249         {
 250             fshift_p = fshift;
 251         }
 252         else
 253         {
 254             fshift_p = out->fshift.data();
 255
 256             if (clearF == enbvClearFYes)
 257             {
 258                 clear_fshift(fshift_p);
 259             }
 260         }
 261
 262         if (!(forceFlags & GMX_FORCE_ENERGY))
 263         {
 264             /* Don't calculate energies */
 265             switch (kernelSetup.kernelType)
 266             {
 267                 case Nbnxm::KernelType::Cpu4x4_PlainC:
 268                     nbnxn_kernel_noener_ref[coulkt][vdwkt](nbl[nb], nbat,
 269                                                            &ic,
 270                                                            shiftVectors,
 271                                                            out->f.data(),
 272                                                            fshift_p);
 273                     break;
 274 #ifdef GMX_NBNXN_SIMD_2XNN
 275                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
 276                     nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
 277                                                                  &ic,
 278                                                                  shiftVectors,
 279                                                                  out->f.data(),
 280                                                                  fshift_p);
 281                     break;
 282 #endif
 283 #ifdef GMX_NBNXN_SIMD_4XN
 284                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
 285                     nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
 286                                                                 &ic,
 287                                                                 shiftVectors,
 288                                                                 out->f.data(),
 289                                                                 fshift_p);
 290                     break;
 291 #endif
 292                 default:
 293                     GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
 294             }
 295         }
 296         else if (out->Vvdw.size() == 1)
 297         {
 298             /* A single energy group (pair) */
 299             out->Vvdw[0] = 0;
 300             out->Vc[0]   = 0;
 301
 302             switch (kernelSetup.kernelType)
 303             {
 304                 case Nbnxm::KernelType::Cpu4x4_PlainC:
 305                     nbnxn_kernel_ener_ref[coulkt][vdwkt](nbl[nb], nbat,
 306                                                          &ic,
 307                                                          shiftVectors,
 308                                                          out->f.data(),
 309                                                          fshift_p,
 310                                                          out->Vvdw.data(),
 311                                                          out->Vc.data());
 312                     break;
 313 #ifdef GMX_NBNXN_SIMD_2XNN
 314                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
 315                     nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
 316                                                                &ic,
 317                                                                shiftVectors,
 318                                                                out->f.data(),
 319                                                                fshift_p,
 320                                                                out->Vvdw.data(),
 321                                                                out->Vc.data());
 322                     break;
 323 #endif
 324 #ifdef GMX_NBNXN_SIMD_4XN
 325                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
 326                     nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
 327                                                               &ic,
 328                                                               shiftVectors,
 329                                                               out->f.data(),
 330                                                               fshift_p,
 331                                                               out->Vvdw.data(),
 332                                                               out->Vc.data());
 333                     break;
 334 #endif
 335                 default:
 336                     GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
 337             }
 338         }
 339         else
 340         {
 341             /* Calculate energy group contributions */
 342             clearGroupEnergies(out);
 343
 344             int unrollj = 0;
 345
 346             switch (kernelSetup.kernelType)
 347             {
 348                 case Nbnxm::KernelType::Cpu4x4_PlainC:
 349                     unrollj = c_nbnxnCpuIClusterSize;
 350                     nbnxn_kernel_energrp_ref[coulkt][vdwkt](nbl[nb], nbat,
 351                                                             &ic,
 352                                                             shiftVectors,
 353                                                             out->f.data(),
 354                                                             fshift_p,
 355                                                             out->Vvdw.data(),
 356                                                             out->Vc.data());
 357                     break;
 358 #ifdef GMX_NBNXN_SIMD_2XNN
 359                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
 360                     unrollj = GMX_SIMD_REAL_WIDTH/2;
 361                     nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](nbl[nb], nbat,
 362                                                                   &ic,
 363                                                                   shiftVectors,
 364                                                                   out->f.data(),
 365                                                                   fshift_p,
 366                                                                   out->VSvdw.data(),
 367                                                                   out->VSc.data());
 368                     break;
 369 #endif
 370 #ifdef GMX_NBNXN_SIMD_4XN
 371                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
 372                     unrollj = GMX_SIMD_REAL_WIDTH;
 373                     nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](nbl[nb], nbat,
 374                                                                  &ic,
 375                                                                  shiftVectors,
 376                                                                  out->f.data(),
 377                                                                  fshift_p,
 378                                                                  out->VSvdw.data(),
 379                                                                  out->VSc.data());
 380                     break;
 381 #endif
 382                 default:
 383                     GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
 384             }
 385
 386             if (kernelSetup.kernelType != Nbnxm::KernelType::Cpu4x4_PlainC)
 387             {
 388                 switch (unrollj)
 389                 {
 390                     case 2:
 391                         reduceGroupEnergySimdBuffers<2>(nbatParams.nenergrp,
 392                                                         nbatParams.neg_2log,
 393                                                         out);
 394                         break;
 395                     case 4:
 396                         reduceGroupEnergySimdBuffers<4>(nbatParams.nenergrp,
 397                                                         nbatParams.neg_2log,
 398                                                         out);
 399                         break;
 400                     case 8:
 401                         reduceGroupEnergySimdBuffers<8>(nbatParams.nenergrp,
 402                                                         nbatParams.neg_2log,
 403                                                         out);
 404                         break;
 405                     default:
 406                         GMX_RELEASE_ASSERT(false, "Unsupported j-unroll size");
 407                 }
 408             }
 409         }
 410     }
 411
 412     if (forceFlags & GMX_FORCE_ENERGY)
 413     {
 414         reduce_energies_over_lists(nbat, nnbl, vVdw, vCoulomb);
 415     }
 416 }
 417
 418 static void accountFlops(t_nrnb                           *nrnb,
 419                          const nonbonded_verlet_t         &nbv,
 420                          const Nbnxm::InteractionLocality  iLocality,
 421                          const interaction_const_t        &ic,
 422                          const int                         forceFlags)
 423 {
 424     const nbnxn_pairlist_set_t &pairlistSet     = nbv.pairlistSet(iLocality);
 425     const bool                  usingGpuKernels = nbv.useGpu();
 426
 427     int enr_nbnxn_kernel_ljc;
 428     if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
 429     {
 430         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
 431     }
 432     else if ((!usingGpuKernels && nbv.kernelSetup().ewaldExclusionType == Nbnxm::EwaldExclusionType::Analytical) ||
 433              (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
 434     {
 435         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
 436     }
 437     else
 438     {
 439         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
 440     }
 441     int enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
 442     if (forceFlags & GMX_FORCE_ENERGY)
 443     {
 444         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
 445         enr_nbnxn_kernel_ljc += 1;
 446         enr_nbnxn_kernel_lj  += 1;
 447     }
 448
 449     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
 450              pairlistSet.natpair_ljq);
 451     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
 452              pairlistSet.natpair_lj);
 453     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
 454     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
 455              pairlistSet.natpair_q);
 456
 457     const bool calcEnergy = ((forceFlags & GMX_FORCE_ENERGY) != 0);
 458     if (ic.vdw_modifier == eintmodFORCESWITCH)
 459     {
 460         /* We add up the switch cost separately */
 461         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW + (calcEnergy ? 1 : 0),
 462                  pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
 463     }
 464     if (ic.vdw_modifier == eintmodPOTSWITCH)
 465     {
 466         /* We add up the switch cost separately */
 467         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW + (calcEnergy ? 1 : 0),
 468                  pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
 469     }
 470     if (ic.vdwtype == evdwPME)
 471     {
 472         /* We add up the LJ Ewald cost separately */
 473         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD + (calcEnergy ? 1 : 0),
 474                  pairlistSet.natpair_ljq + pairlistSet.natpair_lj);
 475     }
 476 }
 477
 478 void NbnxnDispatchKernel(nonbonded_verlet_t        *nbv,
 479                          Nbnxm::InteractionLocality iLocality,
 480                          const interaction_const_t &ic,
 481                          int                        forceFlags,
 482                          int                        clearF,
 483                          t_forcerec                *fr,
 484                          gmx_enerdata_t            *enerd,
 485                          t_nrnb                    *nrnb)
 486 {
 487     const nbnxn_pairlist_set_t &pairlistSet = nbv->pairlistSet(iLocality);
 488
 489     switch (nbv->kernelSetup().kernelType)
 490     {
 491         case Nbnxm::KernelType::Cpu4x4_PlainC:
 492         case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
 493         case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
 494             nbnxn_kernel_cpu(pairlistSet,
 495                              nbv->kernelSetup(),
 496                              nbv->nbat,
 497                              ic,
 498                              fr->shift_vec,
 499                              forceFlags,
 500                              clearF,
 501                              fr->fshift[0],
 502                              enerd->grpp.ener[egCOULSR],
 503                              fr->bBHAM ?
 504                              enerd->grpp.ener[egBHAMSR] :
 505                              enerd->grpp.ener[egLJSR]);
 506             break;
 507
 508         case Nbnxm::KernelType::Gpu8x8x8:
 509             Nbnxm::gpu_launch_kernel(nbv->gpu_nbv, forceFlags, iLocality);
 510             break;
 511
 512         case Nbnxm::KernelType::Cpu8x8x8_PlainC:
 513             nbnxn_kernel_gpu_ref(pairlistSet.nblGpu[0],
 514                                  nbv->nbat, &ic,
 515                                  fr->shift_vec,
 516                                  forceFlags,
 517                                  clearF,
 518                                  nbv->nbat->out[0].f,
 519                                  fr->fshift[0],
 520                                  enerd->grpp.ener[egCOULSR],
 521                                  fr->bBHAM ?
 522                                  enerd->grpp.ener[egBHAMSR] :
 523                                  enerd->grpp.ener[egLJSR]);
 524             break;
 525
 526         default:
 527             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
 528
 529     }
 530
 531     accountFlops(nrnb, *nbv, iLocality, ic, forceFlags);
 532 }