src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
   5  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36 /*! \internal \file
  37  *  \brief
  38  *  Utility constant and function declaration for the OpenCL non-bonded kernels.
  39  *  This header should be included once at the top level, just before the
  40  *  kernels are included (has to be preceded by nbnxn_ocl_types.h).
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_nbnxm
  44  */
  45
  46 #define GMX_DOUBLE 0
  47
  48 #include "gromacs/gpu_utils/device_utils.clh"
  49 #include "gromacs/gpu_utils/vectype_ops.clh"
  50 #include "gromacs/pbcutil/ishift.h"
  51
  52 #include "nbnxm_ocl_consts.h"
  53
  54 #define CL_SIZE (c_nbnxnGpuClusterSize)
  55
  56 #define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
  57
  58 #if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
  59 /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
  60  * Note that this should precede the kernel_utils include.
  61  */
  62 #    define USE_CJ_PREFETCH 1
  63 #else
  64 #    define USE_CJ_PREFETCH 0
  65 #endif
  66
  67 #if defined cl_intel_subgroups || defined cl_khr_subgroups \
  68         || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
  69 #    define HAVE_SUBGROUP 1
  70 #else
  71 #    define HAVE_SUBGROUP 0
  72 #endif
  73
  74 #ifdef cl_intel_subgroups
  75 #    define HAVE_INTEL_SUBGROUP 1
  76 #else
  77 #    define HAVE_INTEL_SUBGROUP 0
  78 #endif
  79
  80 #if defined _INTEL_SOURCE_
  81 #    define SUBGROUP_SIZE 8
  82 #elif defined _AMD_SOURCE_
  83 #    define SUBGROUP_SIZE 64
  84 #else
  85 #    define SUBGROUP_SIZE 32
  86 #endif
  87
  88 #define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
  89 #define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
  90 #define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
  91
  92 /* 1.0 / sqrt(M_PI) */
  93 #define M_FLOAT_1_SQRTPI 0.564189583547756F
  94
  95 //-------------------
  96
  97 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  98 #    define NBNXN_OPENCL_KERNEL_UTILS_CLH
  99
 100 #    if CL_SIZE == 8
 101 #        define WARP_SIZE_LOG2 (5)
 102 #        define CL_SIZE_LOG2 (3)
 103 #    elif CL_SIZE == 4
 104 #        define WARP_SIZE_LOG2 (3)
 105 #        define CL_SIZE_LOG2 (2)
 106 #    else
 107 #        error unsupported CL_SIZE
 108 #    endif
 109
 110 #    define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
 111 #    define FBUF_STRIDE (CL_SIZE_SQ)
 112
 113 #    define ONE_SIXTH_F 0.16666667F
 114 #    define ONE_TWELVETH_F 0.08333333F
 115 #    define HALF_F 0.5F
 116
 117
 118 #    ifdef __GNUC__
 119 /* GCC, clang */
 120 #        define gmx_unused __attribute__((unused))
 121 #    else
 122 #        define gmx_unused
 123 #    endif
 124
 125 /*! \brief Single precision floating point short vector type (as rvec in the CPU codebase).
 126  *   Currently only used to avoid float3 register arrays.
 127  */
 128 typedef float fvec[3];
 129
 130 // Data structures shared between OpenCL device code and OpenCL host code
 131 // TODO: review, improve
 132 // Replaced real by float for now, to avoid including any other header
 133 typedef struct
 134 {
 135     float c2;
 136     float c3;
 137     float cpot;
 138 } shift_consts_t;
 139
 140 /* Used with potential switching:
 141  * rsw        = max(r - r_switch, 0)
 142  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 143  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 144  * force      = force*dsw - potential*sw
 145  * potential *= sw
 146  */
 147 typedef struct
 148 {
 149     float c3;
 150     float c4;
 151     float c5;
 152 } switch_consts_t;
 153
 154 // Data structure shared between the OpenCL device code and OpenCL host code
 155 // Must not contain OpenCL objects (buffers)
 156 typedef struct cl_nbparam_params
 157 {
 158
 159     //! type of electrostatics, takes values from #ElecType
 160     int elecType;
 161     //! type of VdW impl., takes values from #VdwType
 162     int vdwType;
 163
 164     //! charge multiplication factor
 165     float epsfac;
 166     //! Reaction-field/plain cutoff electrostatics const.
 167     float c_rf;
 168     //! Reaction-field electrostatics constant
 169     float two_k_rf;
 170     //! Ewald/PME parameter
 171     float ewald_beta;
 172     //! Ewald/PME correction term substracted from the direct-space potential
 173     float sh_ewald;
 174     //! LJ-Ewald/PME correction term added to the correction potential
 175     float sh_lj_ewald;
 176     //! LJ-Ewald/PME coefficient
 177     float ewaldcoeff_lj;
 178
 179     //! Coulomb cut-off squared
 180     float rcoulomb_sq;
 181
 182     //! VdW cut-off squared
 183     float rvdw_sq;
 184     //! VdW switched cut-off
 185     float rvdw_switch;
 186     //! Full, outer pair-list cut-off squared
 187     float rlistOuter_sq;
 188     //! Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds
 189     float rlistInner_sq;
 190
 191     //! VdW shift dispersion constants
 192     shift_consts_t dispersion_shift;
 193     //! VdW shift repulsion constants
 194     shift_consts_t repulsion_shift;
 195     //! VdW switch constants
 196     switch_consts_t vdw_switch;
 197
 198     /* Ewald Coulomb force table data - accessed through texture memory */
 199     //! table scale/spacing
 200     float coulomb_tab_scale;
 201 } cl_nbparam_params_t;
 202
 203 typedef struct
 204 {
 205     //! i-super-cluster
 206     int sci;
 207     //! Shift vector index plus possible flags
 208     int shift;
 209     //! Start index into cj4
 210     int cj4_ind_start;
 211     //! End index into cj4
 212     int cj4_ind_end;
 213 } nbnxn_sci_t;
 214
 215 typedef struct
 216 {
 217     //! The i-cluster interactions mask for 1 warp
 218     unsigned int imask;
 219     //! Index into the exclusion array for 1 warp
 220     int excl_ind;
 221 } nbnxn_im_ei_t;
 222
 223 typedef struct
 224 {
 225     //! The 4 j-clusters
 226     int cj[4];
 227     //! The i-cluster mask data       for 2 warps
 228     nbnxn_im_ei_t imei[2];
 229 } nbnxn_cj4_t;
 230
 231
 232 typedef struct
 233 {
 234     unsigned int pair[CL_SIZE * CL_SIZE / 2]; /* Topology exclusion interaction bits for one warp,
 235                                                * each unsigned has bitS for 4*8 i clusters
 236                                                */
 237 } nbnxn_excl_t;
 238
 239 /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
 240 __constant unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
 241
 242 /*! Minimum single precision threshold for r^2 to avoid r^-12 overflow. */
 243 __constant float c_nbnxnMinDistanceSquared = NBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT;
 244
 245 gmx_opencl_inline void preloadCj4Generic(__local int*        sm_cjPreload,
 246                                          const __global int* gm_cj,
 247                                          int                 tidxi,
 248                                          int                 tidxj,
 249                                          bool gmx_unused iMaskCond)
 250 {
 251     /* Pre-load cj into shared memory */
 252 #    if defined _AMD_SOURCE_ // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 253     if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 254     {
 255         sm_cjPreload[tidxi] = gm_cj[tidxi];
 256     }
 257 #    else
 258     const int c_clSize                   = CL_SIZE;
 259     const int c_nbnxnGpuClusterpairSplit = 2;
 260     const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;
 261     if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
 262     {
 263         sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = gm_cj[tidxi];
 264     }
 265 #    endif
 266 }
 267
 268
 269 #    if USE_SUBGROUP_PRELOAD
 270 gmx_opencl_inline int preloadCj4Subgroup(const __global int* gm_cj)
 271 {
 272     // loads subgroup-size # of elements (8) instead of the 4 required
 273     // equivalent to *cjs = *gm_cj
 274     return intel_sub_group_block_read((const __global uint*)gm_cj);
 275 }
 276 #    endif // USE_SUBGROUP_PRELOAD
 277
 278 #    if USE_SUBGROUP_PRELOAD
 279 typedef size_t CjType;
 280 #    else
 281 typedef __local int* CjType;
 282 #    endif
 283
 284 /*! \brief Preload cj4
 285  *
 286  * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 287  * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 288  * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 289  *
 290  * It is the caller's responsibility to make sure that data is consumed only when
 291  * it's ready. This function does not call a barrier.
 292  */
 293 gmx_opencl_inline void preloadCj4(CjType gmx_unused* cjs,
 294                                   const __global int gmx_unused* gm_cj,
 295                                   int gmx_unused tidxi,
 296                                   int gmx_unused tidxj,
 297                                   bool gmx_unused iMaskCond)
 298 {
 299 #    if USE_SUBGROUP_PRELOAD
 300     *cjs = preloadCj4Subgroup(gm_cj);
 301 #    elif USE_CJ_PREFETCH
 302     preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
 303 #    else
 304     // nothing to do
 305 #    endif
 306 }
 307
 308 gmx_opencl_inline int loadCjPreload(__local int* sm_cjPreload, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
 309 {
 310 #    if defined _AMD_SOURCE_
 311     int         warpLoadOffset = 0; // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 312 #    else
 313     const int c_clSize = CL_SIZE;
 314     const int c_nbnxnGpuClusterpairSplit = 2;
 315     const int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
 316     int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize / c_splitClSize;
 317 #    endif
 318     return sm_cjPreload[jm + warpLoadOffset];
 319 }
 320
 321 /* \brief Load a cj given a jm index.
 322  *
 323  * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 324  */
 325 gmx_opencl_inline int
 326 loadCj(CjType cjs, const __global int gmx_unused* gm_cj, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
 327 {
 328 #    if USE_SUBGROUP_PRELOAD
 329     return sub_group_broadcast(cjs, jm);
 330 #    elif USE_CJ_PREFETCH
 331     return loadCjPreload(cjs, jm, tidxi, tidxj);
 332 #    else
 333     return gm_cj[jm];
 334 #    endif
 335 }
 336
 337 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 338 gmx_opencl_inline float2 convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon)
 339 {
 340     const float  sigma2 = sigma * sigma;
 341     const float  sigma6 = sigma2 * sigma2 * sigma2;
 342     const float  c6     = epsilon * sigma6;
 343     const float2 c6c12  = (float2)(c6,           /* c6 */
 344                                   c6 * sigma6); /* c12 */
 345     return c6c12;
 346 }
 347
 348
 349 /*! Apply force switch,  force + energy version. */
 350 gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbparam,
 351                                                 float                      c6,
 352                                                 float                      c12,
 353                                                 float                      inv_r,
 354                                                 float                      r2,
 355                                                 float*                     F_invr)
 356 {
 357     /* force switch constants */
 358     const float disp_shift_V2 = nbparam->dispersion_shift.c2;
 359     const float disp_shift_V3 = nbparam->dispersion_shift.c3;
 360     const float repu_shift_V2 = nbparam->repulsion_shift.c2;
 361     const float repu_shift_V3 = nbparam->repulsion_shift.c3;
 362
 363     const float r        = r2 * inv_r;
 364     float       r_switch = r - nbparam->rvdw_switch;
 365     r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;
 366
 367     *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
 368                + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
 369 }
 370
 371 /*! Apply force switch, force-only version. */
 372 gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbparam,
 373                                                   float                      c6,
 374                                                   float                      c12,
 375                                                   float                      inv_r,
 376                                                   float                      r2,
 377                                                   float*                     F_invr,
 378                                                   float*                     E_lj)
 379 {
 380     /* force switch constants */
 381     const float disp_shift_V2 = nbparam->dispersion_shift.c2;
 382     const float disp_shift_V3 = nbparam->dispersion_shift.c3;
 383     const float repu_shift_V2 = nbparam->repulsion_shift.c2;
 384     const float repu_shift_V3 = nbparam->repulsion_shift.c3;
 385
 386     const float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
 387     const float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
 388     const float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
 389     const float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;
 390
 391     const float r        = r2 * inv_r;
 392     float       r_switch = r - nbparam->rvdw_switch;
 393     r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;
 394
 395     *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
 396                + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
 397     *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
 398              - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
 399 }
 400
 401 /*! Apply potential switch, force-only version. */
 402 gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* nbparam,
 403                                                     float                      inv_r,
 404                                                     float                      r2,
 405                                                     float*                     F_invr,
 406                                                     const float*               E_lj)
 407 {
 408     /* potential switch constants */
 409     const float switch_V3 = nbparam->vdw_switch.c3;
 410     const float switch_V4 = nbparam->vdw_switch.c4;
 411     const float switch_V5 = nbparam->vdw_switch.c5;
 412     const float switch_F2 = nbparam->vdw_switch.c3;
 413     const float switch_F3 = nbparam->vdw_switch.c4;
 414     const float switch_F4 = nbparam->vdw_switch.c5;
 415
 416     const float r        = r2 * inv_r;
 417     const float r_switch = r - nbparam->rvdw_switch;
 418
 419     /* Unlike in the F+E kernel, conditional is faster here */
 420     if (r_switch > 0.0F)
 421     {
 422         const float sw = 1.0F
 423                          + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
 424                                    * r_switch * r_switch;
 425         const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
 426
 427         *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
 428     }
 429 }
 430
 431 /*! Apply potential switch, force + energy version. */
 432 gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* nbparam,
 433                                                       float                      inv_r,
 434                                                       float                      r2,
 435                                                       float*                     F_invr,
 436                                                       float*                     E_lj)
 437 {
 438     /* potential switch constants */
 439     const float switch_V3 = nbparam->vdw_switch.c3;
 440     const float switch_V4 = nbparam->vdw_switch.c4;
 441     const float switch_V5 = nbparam->vdw_switch.c5;
 442     const float switch_F2 = nbparam->vdw_switch.c3;
 443     const float switch_F3 = nbparam->vdw_switch.c4;
 444     const float switch_F4 = nbparam->vdw_switch.c5;
 445
 446     const float r        = r2 * inv_r;
 447     float       r_switch = r - nbparam->rvdw_switch;
 448     r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;
 449
 450     /* Unlike in the F-only kernel, masking is faster here */
 451     const float sw =
 452             1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
 453     const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
 454
 455     *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
 456     *E_lj *= sw;
 457 }
 458
 459 /*! Calculate LJ-PME grid force contribution with
 460  *  geometric combination rule.
 461  */
 462 gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb,
 463                                                       int                     typei,
 464                                                       int                     typej,
 465                                                       float                   r2,
 466                                                       float                   inv_r2,
 467                                                       float                   lje_coeff2,
 468                                                       float                   lje_coeff6_6,
 469                                                       float*                  F_invr)
 470 {
 471     const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
 472
 473     /* Recalculate inv_r6 without exclusion mask */
 474     const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
 475     const float cr2       = lje_coeff2 * r2;
 476     const float expmcr2   = exp(-cr2);
 477     const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;
 478
 479     /* Subtract the grid force from the total LJ force */
 480     *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
 481 }
 482
 483 /*! Calculate LJ-PME grid force + energy contribution with
 484  *  geometric combination rule.
 485  */
 486 gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*    nbfp_comb,
 487                                                         const cl_nbparam_params_t* nbparam,
 488                                                         int                        typei,
 489                                                         int                        typej,
 490                                                         float                      r2,
 491                                                         float                      inv_r2,
 492                                                         float                      lje_coeff2,
 493                                                         float                      lje_coeff6_6,
 494                                                         float                      int_bit,
 495                                                         float*                     F_invr,
 496                                                         float*                     E_lj)
 497 {
 498     const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
 499
 500     /* Recalculate inv_r6 without exclusion mask */
 501     const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
 502     const float cr2       = lje_coeff2 * r2;
 503     const float expmcr2   = exp(-cr2);
 504     const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;
 505
 506     /* Subtract the grid force from the total LJ force */
 507     *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
 508
 509     /* Shift should be applied only to real LJ pairs */
 510     const float sh_mask = nbparam->sh_lj_ewald * int_bit;
 511     *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
 512 }
 513
 514 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 515  *  Lorentz-Berthelot combination rule.
 516  *  We use a single F+E kernel with conditional because the performance impact
 517  *  of this is pretty small and LB on the CPU is anyway very slow.
 518  */
 519 gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*    nbfp_comb,
 520                                                       const cl_nbparam_params_t* nbparam,
 521                                                       int                        typei,
 522                                                       int                        typej,
 523                                                       float                      r2,
 524                                                       float                      inv_r2,
 525                                                       float                      lje_coeff2,
 526                                                       float                      lje_coeff6_6,
 527                                                       float                      int_bit,
 528                                                       bool                       with_E_lj,
 529                                                       float*                     F_invr,
 530                                                       float*                     E_lj)
 531 {
 532     /* sigma and epsilon are scaled to give 6*C6 */
 533     const float sigma = nbfp_comb[2 * typei] + nbfp_comb[2 * typej];
 534
 535     const float epsilon = nbfp_comb[2 * typei + 1] * nbfp_comb[2 * typej + 1];
 536
 537     const float sigma2 = sigma * sigma;
 538     const float c6grid = epsilon * sigma2 * sigma2 * sigma2;
 539
 540     /* Recalculate inv_r6 without exclusion mask */
 541     const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
 542     const float cr2       = lje_coeff2 * r2;
 543     const float expmcr2   = exp(-cr2);
 544     const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;
 545
 546     /* Subtract the grid force from the total LJ force */
 547     *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
 548
 549     if (with_E_lj)
 550     {
 551
 552         /* Shift should be applied only to real LJ pairs */
 553         const float sh_mask = nbparam->sh_lj_ewald * int_bit;
 554         *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
 555     }
 556 }
 557
 558 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 559  *  Original idea: from the OpenMM project
 560  */
 561 gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coulomb_tab, float r, float scale)
 562 {
 563     float normalized = scale * r;
 564     int   index      = (int)normalized;
 565     float fract2     = normalized - (float)index;
 566     float fract1     = 1.0F - fract2;
 567
 568     return fract1 * coulomb_tab[index] + fract2 * coulomb_tab[index + 1];
 569 }
 570
 571 /*! Calculate analytical Ewald correction term. */
 572 gmx_opencl_inline float pmecorrF(float z2)
 573 {
 574     const float FN6 = -1.7357322914161492954e-8F;
 575     const float FN5 = 1.4703624142580877519e-6F;
 576     const float FN4 = -0.000053401640219807709149F;
 577     const float FN3 = 0.0010054721316683106153F;
 578     const float FN2 = -0.019278317264888380590F;
 579     const float FN1 = 0.069670166153766424023F;
 580     const float FN0 = -0.75225204789749321333F;
 581
 582     const float FD4 = 0.0011193462567257629232F;
 583     const float FD3 = 0.014866955030185295499F;
 584     const float FD2 = 0.11583842382862377919F;
 585     const float FD1 = 0.50736591960530292870F;
 586     const float FD0 = 1.0F;
 587
 588     const float z4 = z2 * z2;
 589
 590     float polyFD0 = FD4 * z4 + FD2;
 591     float polyFD1 = FD3 * z4 + FD1;
 592     polyFD0       = polyFD0 * z4 + FD0;
 593     polyFD0       = polyFD1 * z2 + polyFD0;
 594
 595     polyFD0 = 1.0F / polyFD0;
 596
 597     float polyFN0 = FN6 * z4 + FN4;
 598     float polyFN1 = FN5 * z4 + FN3;
 599     polyFN0       = polyFN0 * z4 + FN2;
 600     polyFN1       = polyFN1 * z4 + FN1;
 601     polyFN0       = polyFN0 * z4 + FN0;
 602     polyFN0       = polyFN1 * z2 + polyFN0;
 603
 604     return polyFN0 * polyFD0;
 605 }
 606
 607 #    if REDUCE_SHUFFLE
 608 gmx_opencl_inline void
 609 reduce_force_j_shfl(float3 fin, __global float* fout, int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
 610 {
 611     /* Only does reduction over 4 elements in cluster. Needs to be changed
 612      * for CL_SIZE>4. See CUDA code for required code */
 613     fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
 614     fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, 1);
 615     fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
 616     if ((tidxi & 1) == 1)
 617     {
 618         fin.x = fin.y;
 619     }
 620     fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
 621     fin.z += intel_sub_group_shuffle_up(fin.z, fin.z, 2);
 622     if (tidxi == 2)
 623     {
 624         fin.x = fin.z;
 625     }
 626     if (tidxi < 3)
 627     {
 628         atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
 629     }
 630 }
 631 #    endif
 632
 633 gmx_opencl_inline void
 634 reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
 635 {
 636     int tidx                      = tidxi + tidxj * CL_SIZE;
 637     f_buf[tidx]                   = fcj_buf.x;
 638     f_buf[FBUF_STRIDE + tidx]     = fcj_buf.y;
 639     f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
 640
 641     /* Split the reduction between the first 3 column threads
 642        Threads with column id 0 will do the reduction for (float3).x components
 643        Threads with column id 1 will do the reduction for (float3).y components
 644        Threads with column id 2 will do the reduction for (float3).z components.
 645        The reduction is performed for each line tidxj of f_buf. */
 646     if (tidxi < 3)
 647     {
 648         float f = 0.0F;
 649         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 650         {
 651             f += f_buf[FBUF_STRIDE * tidxi + j];
 652         }
 653
 654         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 655     }
 656 }
 657
 658 /*! Final j-force reduction
 659  */
 660 gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf,
 661                                       float3                    fcj_buf,
 662                                       __global float*           fout,
 663                                       int                       tidxi,
 664                                       int                       tidxj,
 665                                       int                       aidx)
 666 {
 667 #    if REDUCE_SHUFFLE
 668     reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
 669 #    else
 670     reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
 671 #    endif
 672 }
 673
 674 #    if REDUCE_SHUFFLE
 675 gmx_opencl_inline void reduce_force_i_and_shift_shfl(__private fvec  fci_buf[],
 676                                                      __global float* fout,
 677                                                      bool            bCalcFshift,
 678                                                      int             tidxi,
 679                                                      int             tidxj,
 680                                                      int             sci,
 681                                                      int             shift,
 682                                                      __global float* fshift)
 683 {
 684     /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
 685      * for CL_SIZE>4.*/
 686     float2 fshift_buf = 0;
 687     for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
 688     {
 689         int    aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
 690         float3 fin  = (float3)(fci_buf[ci_offset][0], fci_buf[ci_offset][1], fci_buf[ci_offset][2]);
 691         fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
 692         fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
 693         fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
 694
 695         if (tidxj & 1)
 696         {
 697             fin.x = fin.y;
 698         }
 699         /* Threads 0,1 and 2,3 increment x,y for their warp */
 700         atomicAdd_g_f(&fout[3 * aidx + (tidxj & 1)], fin.x);
 701         if (bCalcFshift)
 702         {
 703             fshift_buf[0] += fin.x;
 704         }
 705         /* Threads 0 and 2 increment z for their warp */
 706         if ((tidxj & 1) == 0)
 707         {
 708             atomicAdd_g_f(&fout[3 * aidx + 2], fin.z);
 709             if (bCalcFshift)
 710             {
 711                 fshift_buf[1] += fin.z;
 712             }
 713         }
 714     }
 715     /* add up local shift forces into global mem */
 716     if (bCalcFshift)
 717     {
 718         // Threads 0,1 and 2,3 update x,y
 719         atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);
 720         // Threads 0 and 2 update z
 721         if ((tidxj & 1) == 0)
 722         {
 723             atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
 724         }
 725     }
 726 }
 727 #    endif
 728
 729 /*! Final i-force reduction; this implementation works only with power of two
 730  *  array sizes.
 731  */
 732 gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
 733                                                      __private fvec  fci_buf[],
 734                                                      __global float* fout,
 735                                                      bool            bCalcFshift,
 736                                                      int             tidxi,
 737                                                      int             tidxj,
 738                                                      int             sci,
 739                                                      int             shift,
 740                                                      __global float* fshift)
 741 {
 742     float fshift_buf = 0;
 743     for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
 744     {
 745         int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
 746         int tidx = tidxi + tidxj * CL_SIZE;
 747         /* store i forces in shmem */
 748         f_buf[tidx]                   = fci_buf[ci_offset][0];
 749         f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset][1];
 750         f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset][2];
 751         barrier(CLK_LOCAL_MEM_FENCE);
 752
 753         /* Reduce the initial CL_SIZE values for each i atom to half
 754          * every step by using CL_SIZE * i threads.
 755          * Can't just use i as loop variable because than nvcc refuses to unroll.
 756          */
 757         int i = CL_SIZE / 2;
 758         for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
 759         {
 760             if (tidxj < i)
 761             {
 762
 763                 f_buf[tidxj * CL_SIZE + tidxi] += f_buf[(tidxj + i) * CL_SIZE + tidxi];
 764                 f_buf[FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
 765                         f_buf[FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 766                 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
 767                         f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 768             }
 769             i >>= 1;
 770         }
 771         /* needed because
 772          * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 773          * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 774          * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
 775          */
 776         barrier(CLK_LOCAL_MEM_FENCE);
 777
 778         /* i == 1, last reduction step, writing to global mem */
 779         /* Split the reduction between the first 3 line threads
 780            Threads with line id 0 will do the reduction for (float3).x components
 781            Threads with line id 1 will do the reduction for (float3).y components
 782            Threads with line id 2 will do the reduction for (float3).z components. */
 783         if (tidxj < 3)
 784         {
 785             float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 786
 787             atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 788
 789             if (bCalcFshift)
 790             {
 791                 fshift_buf += f;
 792             }
 793         }
 794     }
 795     /* add up local shift forces into global mem */
 796     if (bCalcFshift)
 797     {
 798         /* Only threads with tidxj < 3 will update fshift.
 799            The threads performing the update, must be the same as the threads
 800            storing the reduction result above.
 801          */
 802         if (tidxj < 3)
 803         {
 804             atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
 805         }
 806     }
 807 }
 808
 809 /*! Final i-force reduction
 810  */
 811 gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
 812                                                 __private fvec  fci_buf[],
 813                                                 __global float* f,
 814                                                 bool            bCalcFshift,
 815                                                 int             tidxi,
 816                                                 int             tidxj,
 817                                                 int             sci,
 818                                                 int             shift,
 819                                                 __global float* fshift)
 820 {
 821 #    if REDUCE_SHUFFLE
 822     reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
 823 #    else
 824     reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
 825 #    endif
 826 }
 827
 828
 829 #    if REDUCE_SHUFFLE
 830 gmx_opencl_inline void reduce_energy_shfl(float                    E_lj,
 831                                           float                    E_el,
 832                                           volatile __global float* e_lj,
 833                                           volatile __global float* e_el,
 834                                           int                      tidx)
 835 {
 836     E_lj = sub_group_reduce_add(E_lj);
 837     E_el = sub_group_reduce_add(E_el);
 838     /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
 839      * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
 840      * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
 841      * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
 842      * (by supporting c_nbnxnGpuClusterpairSplit=1). */
 843     if (tidx == 0 || tidx == WARP_SIZE)
 844     {
 845         atomicAdd_g_f(e_lj, E_lj);
 846         atomicAdd_g_f(e_el, E_el);
 847     }
 848 }
 849 #    endif
 850
 851 /*! Energy reduction; this implementation works only with power of two
 852  *  array sizes.
 853  */
 854 gmx_opencl_inline void reduce_energy_pow2(volatile __local float*  buf,
 855                                           volatile __global float* e_lj,
 856                                           volatile __global float* e_el,
 857                                           int                      tidx)
 858 {
 859     int i = WARP_SIZE / 2;
 860
 861     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 862     for (int j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 863     {
 864         if (tidx < i)
 865         {
 866             buf[tidx] += buf[tidx + i];
 867             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 868         }
 869         i >>= 1;
 870     }
 871
 872     /* last reduction step, writing to global mem */
 873     if (tidx == 0)
 874     {
 875         float e1 = buf[tidx] + buf[tidx + i];
 876         float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 877
 878         atomicAdd_g_f(e_lj, e1);
 879         atomicAdd_g_f(e_el, e2);
 880     }
 881 }
 882
 883 gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf,
 884                                      float                              E_lj,
 885                                      float                              E_el,
 886                                      volatile __global float*           e_lj,
 887                                      volatile __global float*           e_el,
 888                                      int                                tidx)
 889 {
 890 #    if REDUCE_SHUFFLE
 891     reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
 892 #    else
 893     /* flush the energies to shmem and reduce them */
 894     buf[tidx] = E_lj;
 895     buf[FBUF_STRIDE + tidx] = E_el;
 896     reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
 897 #    endif
 898 }
 899
 900 gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
 901 {
 902     if (pred)
 903     {
 904         warp_any[widx] = 1;
 905     }
 906
 907     bool ret = warp_any[widx];
 908
 909     warp_any[widx] = 0;
 910
 911     return ret;
 912 }
 913
 914 //! Returns a true if predicate is true for any work item in warp
 915 gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
 916 {
 917 #    if USE_SUBGROUP_ANY
 918     return sub_group_any(pred);
 919 #    else
 920     return gmx_sub_group_any_localmem(warp_any, widx, pred);
 921 #    endif
 922 }
 923
 924 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */