src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  CUDA non-bonded kernel used through preprocessor-based code generation
  39  *  of multiple kernel flavors, see nbnxn_cuda_kernels.cuh.
  40  *
  41  *  NOTE: No include fence as it is meant to be included multiple times.
  42  *
  43  *  \author Szilárd Páll <pall.szilard@gmail.com>
  44  *  \author Berk Hess <hess@kth.se>
  45  *  \ingroup module_nbnxm
  46  */
  47
  48 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  49 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  50 #include "gromacs/math/utilities.h"
  51 #include "gromacs/pbcutil/ishift.h"
  52 /* Note that floating-point constants in CUDA code should be suffixed
  53  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  54  * code that is in double precision.
  55  */
  56
  57 #if defined EL_EWALD_ANA || defined EL_EWALD_TAB
  58 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  59 #    define EL_EWALD_ANY
  60 #endif
  61
  62 #if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD \
  63         || (defined EL_CUTOFF && defined CALC_ENERGIES)
  64 /* Macro to control the calculation of exclusion forces in the kernel
  65  * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
  66  * energy terms.
  67  *
  68  * Note: convenience macro, needs to be undef-ed at the end of the file.
  69  */
  70 #    define EXCLUSION_FORCES
  71 #endif
  72
  73 #if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
  74 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  75 #    define LJ_EWALD
  76 #endif
  77
  78 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
  79 #    define LJ_COMB
  80 #endif
  81
  82 /*
  83    Kernel launch parameters:
  84     - #blocks   = #pair lists, blockId = pair list Id
  85     - #threads  = NTHREAD_Z * c_clSize^2
  86     - shmem     = see nbnxn_cuda.cu:calc_shmem_required_nonbonded()
  87
  88     Each thread calculates an i force-component taking one pair of i-j atoms.
  89  */
  90
  91 /**@{*/
  92 /*! \brief Compute capability dependent definition of kernel launch configuration parameters.
  93  *
  94  * NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
  95  * warp-pairs per block.
  96  *
  97  * - On CC 3.0-3.5, and >=5.0 NTHREAD_Z == 1, translating to 64 th/block with 16
  98  * blocks/multiproc, is the fastest even though this setup gives low occupancy
  99  * (except on 6.0).
 100  * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
 101  * per multiprocessor is reduced proportionally to get the original number of max
 102  * threads in flight (and slightly lower performance).
 103  * - On CC 3.7 there are enough registers to double the number of threads; using
 104  * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
 105  * with low-register use).
 106  *
 107  * Note that the current kernel implementation only supports NTHREAD_Z > 1 with
 108  * shuffle-based reduction, hence CC >= 3.0.
 109  *
 110  *
 111  * NOTEs on Volta / CUDA 9 extensions:
 112  *
 113  * - While active thread masks are required for the warp collectives
 114  *   (we use any and shfl), the kernel is designed such that all conditions
 115  *   (other than the inner-most distance check) including loop trip counts
 116  *   are warp-synchronous. Therefore, we don't need ballot to compute the
 117  *   active masks as these are all full-warp masks.
 118  *
 119  * - TODO: reconsider the use of __syncwarp(): its only role is currently to prevent
 120  *   WAR hazard due to the cj preload; we should try to replace it with direct
 121  *   loads (which may be faster given the improved L1 on Volta).
 122  */
 123
 124 /* Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z
 125  * determines the number of threads per block and it is chosen such that
 126  * 16 blocks/multiprocessor can be kept in flight.
 127  * - CC 3.0,3.5, and >=5.0: NTHREAD_Z=1, (64, 16) bounds
 128  * - CC 3.7:                NTHREAD_Z=2, (128, 16) bounds
 129  *
 130  * Note: convenience macros, need to be undef-ed at the end of the file.
 131  */
 132 #if GMX_PTX_ARCH == 370
 133 #    define NTHREAD_Z (2)
 134 #    define MIN_BLOCKS_PER_MP (16)
 135 #else
 136 #    define NTHREAD_Z (1)
 137 #    define MIN_BLOCKS_PER_MP (16)
 138 #endif /* GMX_PTX_ARCH == 370 */
 139 #define THREADS_PER_BLOCK (c_clSize * c_clSize * NTHREAD_Z)
 140
 141 #if GMX_PTX_ARCH >= 350
 142 /**@}*/
 143 __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
 144 #else
 145 __launch_bounds__(THREADS_PER_BLOCK)
 146 #endif /* GMX_PTX_ARCH >= 350 */
 147 #ifdef PRUNE_NBL
 148 #    ifdef CALC_ENERGIES
 149         __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_cuda)
 150 #    else
 151         __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_cuda)
 152 #    endif /* CALC_ENERGIES */
 153 #else
 154 #    ifdef CALC_ENERGIES
 155         __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_cuda)
 156 #    else
 157         __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 158 #    endif /* CALC_ENERGIES */
 159 #endif     /* PRUNE_NBL */
 160                 (const cu_atomdata_t atdat, const cu_nbparam_t nbparam, const cu_plist_t plist, bool bCalcFshift)
 161 #ifdef FUNCTION_DECLARATION_ONLY
 162                         ; /* Only do function declaration, omit the function body. */
 163 #else
 164 {
 165     /* convenience variables */
 166     const nbnxn_sci_t* pl_sci = plist.sci;
 167 #    ifndef PRUNE_NBL
 168     const
 169 #    endif
 170             nbnxn_cj4_t* pl_cj4      = plist.cj4;
 171     const nbnxn_excl_t*  excl        = plist.excl;
 172 #    ifndef LJ_COMB
 173     const int*           atom_types  = atdat.atom_types;
 174     int                  ntypes      = atdat.ntypes;
 175 #    else
 176     const float2* lj_comb = atdat.lj_comb;
 177     float2        ljcp_i, ljcp_j;
 178 #    endif
 179     const float4*        xq          = atdat.xq;
 180     float3*              f           = atdat.f;
 181     const float3*        shift_vec   = atdat.shift_vec;
 182     float                rcoulomb_sq = nbparam.rcoulomb_sq;
 183 #    ifdef VDW_CUTOFF_CHECK
 184     float                rvdw_sq     = nbparam.rvdw_sq;
 185     float                vdw_in_range;
 186 #    endif
 187 #    ifdef LJ_EWALD
 188     float                lje_coeff2, lje_coeff6_6;
 189 #    endif
 190 #    ifdef EL_RF
 191     float                two_k_rf    = nbparam.two_k_rf;
 192 #    endif
 193 #    ifdef EL_EWALD_ANA
 194     float                beta2       = nbparam.ewald_beta * nbparam.ewald_beta;
 195     float                beta3       = nbparam.ewald_beta * nbparam.ewald_beta * nbparam.ewald_beta;
 196 #    endif
 197 #    ifdef PRUNE_NBL
 198     float                rlist_sq    = nbparam.rlistOuter_sq;
 199 #    endif
 200
 201 #    ifdef CALC_ENERGIES
 202 #        ifdef EL_EWALD_ANY
 203     float                beta        = nbparam.ewald_beta;
 204     float                ewald_shift = nbparam.sh_ewald;
 205 #        else
 206     float c_rf = nbparam.c_rf;
 207 #        endif /* EL_EWALD_ANY */
 208     float*               e_lj        = atdat.e_lj;
 209     float*               e_el        = atdat.e_el;
 210 #    endif     /* CALC_ENERGIES */
 211
 212     /* thread/block/warp id-s */
 213     unsigned int tidxi = threadIdx.x;
 214     unsigned int tidxj = threadIdx.y;
 215     unsigned int tidx  = threadIdx.y * blockDim.x + threadIdx.x;
 216 #    if NTHREAD_Z == 1
 217     unsigned int tidxz = 0;
 218 #    else
 219     unsigned int  tidxz = threadIdx.z;
 220 #    endif
 221     unsigned int bidx  = blockIdx.x;
 222     unsigned int widx  = tidx / warp_size; /* warp index */
 223
 224     int          sci, ci, cj, ai, aj, cij4_start, cij4_end;
 225 #    ifndef LJ_COMB
 226     int          typei, typej;
 227 #    endif
 228     int          i, jm, j4, wexcl_idx;
 229     float        qi, qj_f, r2, inv_r, inv_r2;
 230 #    if !defined LJ_COMB_LB || defined CALC_ENERGIES
 231     float        inv_r6, c6, c12;
 232 #    endif
 233 #    ifdef LJ_COMB_LB
 234     float        sigma, epsilon;
 235 #    endif
 236     float        int_bit, F_invr;
 237 #    ifdef CALC_ENERGIES
 238     float        E_lj, E_el;
 239 #    endif
 240 #    if defined CALC_ENERGIES || defined LJ_POT_SWITCH
 241     float        E_lj_p;
 242 #    endif
 243     unsigned int wexcl, imask, mask_ji;
 244     float4       xqbuf;
 245     float3       xi, xj, rv, f_ij, fcj_buf;
 246     float3       fci_buf[c_numClPerSupercl]; /* i force buffer */
 247     nbnxn_sci_t  nb_sci;
 248
 249     /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
 250     const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
 251
 252     /*********************************************************************
 253      * Set up shared memory pointers.
 254      * sm_nextSlotPtr should always be updated to point to the "next slot",
 255      * that is past the last point where data has been stored.
 256      */
 257     extern __shared__ char sm_dynamicShmem[];
 258     char*                  sm_nextSlotPtr = sm_dynamicShmem;
 259     static_assert(sizeof(char) == 1,
 260                   "The shared memory offset calculation assumes that char is 1 byte");
 261
 262     /* shmem buffer for i x+q pre-loading */
 263     float4* xqib = (float4*)sm_nextSlotPtr;
 264     sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*xqib));
 265
 266     /* shmem buffer for cj, for each warp separately */
 267     int* cjs = (int*)(sm_nextSlotPtr);
 268     /* the cjs buffer's use expects a base pointer offset for pairs of warps in the j-concurrent execution */
 269     cjs += tidxz * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize;
 270     sm_nextSlotPtr += (NTHREAD_Z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(*cjs));
 271
 272 #    ifndef LJ_COMB
 273     /* shmem buffer for i atom-type pre-loading */
 274     int* atib = (int*)sm_nextSlotPtr;
 275     sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*atib));
 276 #    else
 277     /* shmem buffer for i-atom LJ combination rule parameters */
 278     float2* ljcpib = (float2*)sm_nextSlotPtr;
 279     sm_nextSlotPtr += (c_numClPerSupercl * c_clSize * sizeof(*ljcpib));
 280 #    endif
 281     /*********************************************************************/
 282
 283     nb_sci     = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
 284     sci        = nb_sci.sci;           /* super-cluster */
 285     cij4_start = nb_sci.cj4_ind_start; /* first ...*/
 286     cij4_end   = nb_sci.cj4_ind_end;   /* and last index of j clusters */
 287
 288     if (tidxz == 0)
 289     {
 290         /* Pre-load i-atom x and q into shared memory */
 291         ci = sci * c_numClPerSupercl + tidxj;
 292         ai = ci * c_clSize + tidxi;
 293
 294         float* shiftptr = (float*)&shift_vec[nb_sci.shift];
 295         xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
 296         xqbuf.w *= nbparam.epsfac;
 297         xqib[tidxj * c_clSize + tidxi] = xqbuf;
 298
 299 #    ifndef LJ_COMB
 300         /* Pre-load the i-atom types into shared memory */
 301         atib[tidxj * c_clSize + tidxi] = atom_types[ai];
 302 #    else
 303         /* Pre-load the LJ combination parameters into shared memory */
 304         ljcpib[tidxj * c_clSize + tidxi] = lj_comb[ai];
 305 #    endif
 306     }
 307     __syncthreads();
 308
 309     for (i = 0; i < c_numClPerSupercl; i++)
 310     {
 311         fci_buf[i] = make_float3(0.0f);
 312     }
 313
 314 #    ifdef LJ_EWALD
 315     /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */
 316     lje_coeff2   = nbparam.ewaldcoeff_lj * nbparam.ewaldcoeff_lj;
 317     lje_coeff6_6 = lje_coeff2 * lje_coeff2 * lje_coeff2 * c_oneSixth;
 318 #    endif
 319
 320
 321 #    ifdef CALC_ENERGIES
 322     E_lj         = 0.0f;
 323     E_el         = 0.0f;
 324
 325 #        ifdef EXCLUSION_FORCES /* Ewald or RF */
 326     if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci * c_numClPerSupercl)
 327     {
 328         /* we have the diagonal: add the charge and LJ self interaction energy term */
 329         for (i = 0; i < c_numClPerSupercl; i++)
 330         {
 331 #            if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
 332             qi = xqib[i * c_clSize + tidxi].w;
 333             E_el += qi * qi;
 334 #            endif
 335
 336 #            ifdef LJ_EWALD
 337 #                if DISABLE_CUDA_TEXTURES
 338             E_lj += LDG(
 339                     &nbparam.nbfp[atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2]);
 340 #                else
 341             E_lj += tex1Dfetch<float>(
 342                     nbparam.nbfp_texobj,
 343                     atom_types[(sci * c_numClPerSupercl + i) * c_clSize + tidxi] * (ntypes + 1) * 2);
 344 #                endif
 345 #            endif
 346         }
 347
 348         /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
 349 #            ifdef LJ_EWALD
 350         E_lj /= c_clSize * NTHREAD_Z;
 351         E_lj *= 0.5f * c_oneSixth * lje_coeff6_6;
 352 #            endif
 353
 354 #            if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
 355         /* Correct for epsfac^2 due to adding qi^2 */
 356         E_el /= nbparam.epsfac * c_clSize * NTHREAD_Z;
 357 #                if defined EL_RF || defined EL_CUTOFF
 358         E_el *= -0.5f * c_rf;
 359 #                else
 360         E_el *= -beta * M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 361 #                endif
 362 #            endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
 363     }
 364 #        endif     /* EXCLUSION_FORCES */
 365
 366 #    endif /* CALC_ENERGIES */
 367
 368 #    ifdef EXCLUSION_FORCES
 369     const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
 370 #    endif
 371
 372     /* loop over the j clusters = seen by any of the atoms in the current super-cluster;
 373      * The loop stride NTHREAD_Z ensures that consecutive warps-pairs are assigned
 374      * consecutive j4's entries.
 375      */
 376     for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
 377     {
 378         wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
 379         imask     = pl_cj4[j4].imei[widx].imask;
 380         wexcl     = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];
 381
 382 #    ifndef PRUNE_NBL
 383         if (imask)
 384 #    endif
 385         {
 386             /* Pre-load cj into shared memory on both warps separately */
 387             if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
 388             {
 389                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = pl_cj4[j4].cj[tidxi];
 390             }
 391             __syncwarp(c_fullWarpMask);
 392
 393             /* Unrolling this loop
 394                - with pruning leads to register spilling;
 395                - on Kepler and later it is much slower;
 396                Tested with up to nvcc 7.5 */
 397             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 398             {
 399                 if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
 400                 {
 401                     mask_ji = (1U << (jm * c_numClPerSupercl));
 402
 403                     cj = cjs[jm + (tidxj & 4) * c_nbnxnGpuJgroupSize / c_splitClSize];
 404                     aj = cj * c_clSize + tidxj;
 405
 406                     /* load j atom data */
 407                     xqbuf = xq[aj];
 408                     xj    = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
 409                     qj_f  = xqbuf.w;
 410 #    ifndef LJ_COMB
 411                     typej = atom_types[aj];
 412 #    else
 413                     ljcp_j = lj_comb[aj];
 414 #    endif
 415
 416                     fcj_buf = make_float3(0.0f);
 417
 418 #    if !defined PRUNE_NBL
 419 #        pragma unroll 8
 420 #    endif
 421                     for (i = 0; i < c_numClPerSupercl; i++)
 422                     {
 423                         if (imask & mask_ji)
 424                         {
 425                             ci = sci * c_numClPerSupercl + i; /* i cluster index */
 426
 427                             /* all threads load an atom from i cluster ci into shmem! */
 428                             xqbuf = xqib[i * c_clSize + tidxi];
 429                             xi    = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
 430
 431                             /* distance between i and j atoms */
 432                             rv = xi - xj;
 433                             r2 = norm2(rv);
 434
 435 #    ifdef PRUNE_NBL
 436                             /* If _none_ of the atoms pairs are in cutoff range,
 437                                the bit corresponding to the current
 438                                cluster-pair in imask gets set to 0. */
 439                             if (!__any_sync(c_fullWarpMask, r2 < rlist_sq))
 440                             {
 441                                 imask &= ~mask_ji;
 442                             }
 443 #    endif
 444
 445                             int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
 446
 447                             /* cutoff & exclusion check */
 448 #    ifdef EXCLUSION_FORCES
 449                             if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 450 #    else
 451                             if ((r2 < rcoulomb_sq) * int_bit)
 452 #    endif
 453                             {
 454                                 /* load the rest of the i-atom parameters */
 455                                 qi = xqbuf.w;
 456
 457 #    ifndef LJ_COMB
 458                                 /* LJ 6*C6 and 12*C12 */
 459                                 typei = atib[i * c_clSize + tidxi];
 460                                 fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej);
 461 #    else
 462                                 ljcp_i       = ljcpib[i * c_clSize + tidxi];
 463 #        ifdef LJ_COMB_GEOM
 464                                 c6           = ljcp_i.x * ljcp_j.x;
 465                                 c12          = ljcp_i.y * ljcp_j.y;
 466 #        else
 467                                 /* LJ 2^(1/6)*sigma and 12*epsilon */
 468                                 sigma   = ljcp_i.x + ljcp_j.x;
 469                                 epsilon = ljcp_i.y * ljcp_j.y;
 470 #            if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 471                                 convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
 472 #            endif
 473 #        endif /* LJ_COMB_GEOM */
 474 #    endif     /* LJ_COMB */
 475
 476                                 // Ensure distance do not become so small that r^-12 overflows
 477                                 r2 = max(r2, NBNXN_MIN_RSQ);
 478
 479                                 inv_r  = rsqrt(r2);
 480                                 inv_r2 = inv_r * inv_r;
 481 #    if !defined LJ_COMB_LB || defined CALC_ENERGIES
 482                                 inv_r6 = inv_r2 * inv_r2 * inv_r2;
 483 #        ifdef EXCLUSION_FORCES
 484                                 /* We could mask inv_r2, but with Ewald
 485                                  * masking both inv_r6 and F_invr is faster */
 486                                 inv_r6 *= int_bit;
 487 #        endif /* EXCLUSION_FORCES */
 488
 489                                 F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
 490 #        if defined CALC_ENERGIES || defined LJ_POT_SWITCH
 491                                 E_lj_p = int_bit
 492                                          * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot) * c_oneTwelveth
 493                                             - c6 * (inv_r6 + nbparam.dispersion_shift.cpot) * c_oneSixth);
 494 #        endif
 495 #    else /* !LJ_COMB_LB || CALC_ENERGIES */
 496                                 float sig_r  = sigma * inv_r;
 497                                 float sig_r2 = sig_r * sig_r;
 498                                 float sig_r6 = sig_r2 * sig_r2 * sig_r2;
 499 #        ifdef EXCLUSION_FORCES
 500                                 sig_r6 *= int_bit;
 501 #        endif /* EXCLUSION_FORCES */
 502
 503                                 F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
 504 #    endif     /* !LJ_COMB_LB || CALC_ENERGIES */
 505
 506 #    ifdef LJ_FORCE_SWITCH
 507 #        ifdef CALC_ENERGIES
 508                                 calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
 509 #        else
 510                                 calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
 511 #        endif /* CALC_ENERGIES */
 512 #    endif     /* LJ_FORCE_SWITCH */
 513
 514
 515 #    ifdef LJ_EWALD
 516 #        ifdef LJ_EWALD_COMB_GEOM
 517 #            ifdef CALC_ENERGIES
 518                                 calculate_lj_ewald_comb_geom_F_E(nbparam, typei, typej, r2, inv_r2,
 519                                                                  lje_coeff2, lje_coeff6_6, int_bit,
 520                                                                  &F_invr, &E_lj_p);
 521 #            else
 522                                 calculate_lj_ewald_comb_geom_F(nbparam, typei, typej, r2, inv_r2,
 523                                                                lje_coeff2, lje_coeff6_6, &F_invr);
 524 #            endif /* CALC_ENERGIES */
 525 #        elif defined LJ_EWALD_COMB_LB
 526                                 calculate_lj_ewald_comb_LB_F_E(nbparam, typei, typej, r2, inv_r2,
 527                                                                lje_coeff2, lje_coeff6_6,
 528 #            ifdef CALC_ENERGIES
 529                                                                int_bit, &F_invr, &E_lj_p
 530 #            else
 531                                                                0, &F_invr, nullptr
 532 #            endif /* CALC_ENERGIES */
 533                                 );
 534 #        endif     /* LJ_EWALD_COMB_GEOM */
 535 #    endif         /* LJ_EWALD */
 536
 537 #    ifdef LJ_POT_SWITCH
 538 #        ifdef CALC_ENERGIES
 539                                 calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p);
 540 #        else
 541                                 calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p);
 542 #        endif /* CALC_ENERGIES */
 543 #    endif     /* LJ_POT_SWITCH */
 544
 545 #    ifdef VDW_CUTOFF_CHECK
 546                                 /* Separate VDW cut-off check to enable twin-range cut-offs
 547                                  * (rvdw < rcoulomb <= rlist)
 548                                  */
 549                                 vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
 550                                 F_invr *= vdw_in_range;
 551 #        ifdef CALC_ENERGIES
 552                                 E_lj_p *= vdw_in_range;
 553 #        endif
 554 #    endif /* VDW_CUTOFF_CHECK */
 555
 556 #    ifdef CALC_ENERGIES
 557                                 E_lj += E_lj_p;
 558 #    endif
 559
 560
 561 #    ifdef EL_CUTOFF
 562 #        ifdef EXCLUSION_FORCES
 563                                 F_invr += qi * qj_f * int_bit * inv_r2 * inv_r;
 564 #        else
 565                                 F_invr += qi * qj_f * inv_r2 * inv_r;
 566 #        endif
 567 #    endif
 568 #    ifdef EL_RF
 569                                 F_invr += qi * qj_f * (int_bit * inv_r2 * inv_r - two_k_rf);
 570 #    endif
 571 #    if defined   EL_EWALD_ANA
 572                                 F_invr += qi * qj_f
 573                                           * (int_bit * inv_r2 * inv_r + pmecorrF(beta2 * r2) * beta3);
 574 #    elif defined EL_EWALD_TAB
 575                                 F_invr += qi * qj_f
 576                                           * (int_bit * inv_r2
 577                                              - interpolate_coulomb_force_r(nbparam, r2 * inv_r))
 578                                           * inv_r;
 579 #    endif /* EL_EWALD_ANA/TAB */
 580
 581 #    ifdef CALC_ENERGIES
 582 #        ifdef EL_CUTOFF
 583                                 E_el += qi * qj_f * (int_bit * inv_r - c_rf);
 584 #        endif
 585 #        ifdef EL_RF
 586                                 E_el += qi * qj_f * (int_bit * inv_r + 0.5f * two_k_rf * r2 - c_rf);
 587 #        endif
 588 #        ifdef EL_EWALD_ANY
 589                                 /* 1.0f - erff is faster than erfcf */
 590                                 E_el += qi * qj_f
 591                                         * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
 592 #        endif /* EL_EWALD_ANY */
 593 #    endif
 594                                 f_ij = rv * F_invr;
 595
 596                                 /* accumulate j forces in registers */
 597                                 fcj_buf -= f_ij;
 598
 599                                 /* accumulate i forces in registers */
 600                                 fci_buf[i] += f_ij;
 601                             }
 602                         }
 603
 604                         /* shift the mask bit by 1 */
 605                         mask_ji += mask_ji;
 606                     }
 607
 608                     /* reduce j forces */
 609                     reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj, c_fullWarpMask);
 610                 }
 611             }
 612 #    ifdef PRUNE_NBL
 613             /* Update the imask with the new one which does not contain the
 614                out of range clusters anymore. */
 615             pl_cj4[j4].imei[widx].imask = imask;
 616 #    endif
 617         }
 618         // avoid shared memory WAR hazards between loop iterations
 619         __syncwarp(c_fullWarpMask);
 620     }
 621
 622     /* skip central shifts when summing shift forces */
 623     if (nb_sci.shift == CENTRAL)
 624     {
 625         bCalcFshift = false;
 626     }
 627
 628     float fshift_buf = 0.0f;
 629
 630     /* reduce i forces */
 631     for (i = 0; i < c_numClPerSupercl; i++)
 632     {
 633         ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
 634         reduce_force_i_warp_shfl(fci_buf[i], f, &fshift_buf, bCalcFshift, tidxj, ai, c_fullWarpMask);
 635     }
 636
 637     /* add up local shift forces into global mem, tidxj indexes x,y,z */
 638     if (bCalcFshift && (tidxj & 3) < 3)
 639     {
 640         atomicAdd(&(atdat.fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
 641     }
 642
 643 #    ifdef CALC_ENERGIES
 644     /* reduce the energies over warps and store into global memory */
 645     reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx, c_fullWarpMask);
 646 #    endif
 647 }
 648 #endif /* FUNCTION_DECLARATION_ONLY */
 649
 650 #undef NTHREAD_Z
 651 #undef MIN_BLOCKS_PER_MP
 652 #undef THREADS_PER_BLOCK
 653
 654 #undef EL_EWALD_ANY
 655 #undef EXCLUSION_FORCES
 656 #undef LJ_EWALD
 657
 658 #undef LJ_COMB