/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"
#include "gromacs/nbnxm/constants.h"
#include "gromacs/pbcutil/ishift.h"

#include "nbnxm_ocl_consts.h"
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster

#define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
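/* A "warp" here is one half of a cluster-pair tile: with the default CL_SIZE = 8
 * this gives WARP_SIZE = 8 * 8 / 2 = 32 (one NVIDIA warp), and with CL_SIZE = 4
 * it gives 4 * 4 / 2 = 8 work items. */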
#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
 * Note that this should precede the kernel_utils include.
 */
#    define USE_CJ_PREFETCH 1
#else
#    define USE_CJ_PREFETCH 0
#endif
#if defined cl_intel_subgroups || defined cl_khr_subgroups \
        || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
#    define HAVE_SUBGROUP 1
#else
#    define HAVE_SUBGROUP 0
#endif
#ifdef cl_intel_subgroups
#    define HAVE_INTEL_SUBGROUP 1
#else
#    define HAVE_INTEL_SUBGROUP 0
#endif
#if defined _INTEL_SOURCE_
#    define SUBGROUP_SIZE 8
#elif defined _AMD_SOURCE_
#    define SUBGROUP_SIZE 64
#else
#    define SUBGROUP_SIZE 32
#endif
#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756F
#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
#    define NBNXN_OPENCL_KERNEL_UTILS_CLH

#    if CL_SIZE == 8
#        define WARP_SIZE_LOG2 (5)
#        define CL_SIZE_LOG2 (3)
#    elif CL_SIZE == 4
#        define WARP_SIZE_LOG2 (3)
#        define CL_SIZE_LOG2 (2)
#    else
#        error unsupported CL_SIZE
#    endif
#    define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#    define FBUF_STRIDE (CL_SIZE_SQ)

#    define ONE_SIXTH_F 0.16666667F
#    define ONE_TWELVETH_F 0.08333333F
#    define HALF_F 0.5F
#    ifdef __GNUC__
/* GCC, clang, and some ICC pretending to be GCC */
#        define gmx_unused __attribute__((unused))
#    else
#        define gmx_unused
#    endif
// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
typedef struct
{
    float c2;
    float c3;
    float cpot;
} shift_consts_t;
/* Used with potential switching:
 * rsw        = max(r - r_switch, 0)
 * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * force      = force*sw - potential*dsw
 * potential *= sw
 */
typedef struct
{
    float c3;
    float c4;
    float c5;
} switch_consts_t;
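/* For reference, a sketch of how the host side is assumed to fill switch_consts_t
 * (mirroring the CPU reference implementation, with d = rvdw - rvdw_switch):
 *
 *     c3 = -10 / (d*d*d);
 *     c4 =  15 / (d*d*d*d);
 *     c5 =  -6 / (d*d*d*d*d);
 *
 * so that sw runs smoothly from 1 at r_switch to 0 at the cut-off. */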
// Data structure shared between the OpenCL device code and OpenCL host code
// Must not contain OpenCL objects (buffers)
typedef struct cl_nbparam_params
{

    int eeltype; /**< type of electrostatics, takes values from #eelOcl */
    int vdwtype; /**< type of VdW impl., takes values from #evdwOcl */

    float epsfac;        /**< charge multiplication factor */
    float c_rf;          /**< Reaction-field/plain cutoff electrostatics const. */
    float two_k_rf;      /**< Reaction-field electrostatics constant */
    float ewald_beta;    /**< Ewald/PME parameter */
    float sh_ewald;      /**< Ewald/PME correction term subtracted from the direct-space potential */
    float sh_lj_ewald;   /**< LJ-Ewald/PME correction term added to the correction potential */
    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */

    float rcoulomb_sq; /**< Coulomb cut-off squared */

    float rvdw_sq;       /**< VdW cut-off squared */
    float rvdw_switch;   /**< VdW switched cut-off */
    float rlistOuter_sq; /**< Full, outer pair-list cut-off squared */
    float rlistInner_sq; /**< Inner, dynamic pruned pair-list cut-off squared.
                          *   XXX: this is only needed in the pruning kernels, but for now
                          *   we also pass it to the nonbonded kernels. */

    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants */
    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants */
    switch_consts_t vdw_switch;       /**< VdW switch constants */

    /* Ewald Coulomb force table data - accessed through texture memory */
    float coulomb_tab_scale; /**< table scale/spacing */
} cl_nbparam_params_t;
typedef struct
{
    int sci;           /* i-super-cluster */
    int shift;         /* Shift vector index plus possible flags */
    int cj4_ind_start; /* Start index into cj4 */
    int cj4_ind_end;   /* End index into cj4 */
} nbnxn_sci_t;

typedef struct
{
    unsigned int imask;    /* The i-cluster interactions mask for 1 warp */
    int          excl_ind; /* Index into the exclusion array for 1 warp */
} nbnxn_im_ei_t;

typedef struct
{
    int           cj[4];   /* The 4 j-clusters */
    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
} nbnxn_cj4_t;

typedef struct
{
    unsigned int pair[CL_SIZE * CL_SIZE / 2]; /* Topology exclusion interaction bits for one warp,
                                               * each unsigned int holds the bits for 4*8 i-clusters */
} nbnxn_excl_t;
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
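/* Example: with NCL_PER_SUPERCL = 8 the mask is (1U << 8) - 1U = 0xFF, i.e. one bit
 * per i-cluster in the super-cluster; bit k set means i-cluster k still interacts. */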
gmx_opencl_inline void preloadCj4Generic(__local int*        sm_cjPreload,
                                         const __global int* gm_cj,
                                         int                 tidxi,
                                         int                 tidxj,
                                         bool gmx_unused     iMaskCond)
{
    /* Pre-load cj into shared memory.
     * (Note the bitwise &/| on the conditions, which avoids short-circuit branching.) */
#    if defined _AMD_SOURCE_ // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
    {
        sm_cjPreload[tidxi] = gm_cj[tidxi];
    }
#    else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;
    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
    {
        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = gm_cj[tidxi];
    }
#    endif
}
#    if USE_SUBGROUP_PRELOAD
gmx_opencl_inline int preloadCj4Subgroup(const __global int* gm_cj)
{
    // loads subgroup-size # of elements (8) instead of the 4 required
    // equivalent to *cjs = *gm_cj
    return intel_sub_group_block_read((const __global uint*)gm_cj);
}
#    endif // USE_SUBGROUP_PRELOAD
#    if USE_SUBGROUP_PRELOAD
typedef size_t CjType;
#    else
typedef __local int* CjType;
#    endif
/*! \brief Preload cj4
 *
 * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 * - For Intel (/USE_SUBGROUP_PRELOAD) loads into private memory (/registers) instead of local memory
 *
 * It is the caller's responsibility to make sure that data is consumed only when
 * it's ready. This function does not call a barrier.
 */
gmx_opencl_inline void preloadCj4(CjType gmx_unused*             cjs,
                                  const __global int gmx_unused* gm_cj,
                                  int gmx_unused                 tidxi,
                                  int gmx_unused                 tidxj,
                                  bool gmx_unused                iMaskCond)
{
#    if USE_SUBGROUP_PRELOAD
    *cjs = preloadCj4Subgroup(gm_cj);
#    elif USE_CJ_PREFETCH
    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
#    else
    // nothing to do: loadCj() will read straight from global memory
#    endif
}
gmx_opencl_inline int loadCjPreload(__local int* sm_cjPreload, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#    if defined _AMD_SOURCE_
    int warpLoadOffset = 0; // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
#    else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;
    int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize / c_splitClSize;
#    endif
    return sm_cjPreload[jm + warpLoadOffset];
}
/*! \brief Load a cj given a jm index.
 *
 * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 */
gmx_opencl_inline int
loadCj(CjType cjs, const __global int gmx_unused* gm_cj, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#    if USE_SUBGROUP_PRELOAD
    return sub_group_broadcast(cjs, jm);
#    elif USE_CJ_PREFETCH
    return loadCjPreload(cjs, jm, tidxi, tidxj);
#    else
    return gm_cj[jm];
#    endif
}
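/* Usage sketch (hypothetical kernel fragment; pl_cj4 and imask are illustrative names,
 * and in the USE_CJ_PREFETCH path cjs must first be pointed at a __local buffer):
 *
 *     CjType cjs;
 *     preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0U);
 *     // ...synchronize as required before consuming the preloaded data...
 *     for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 *     {
 *         const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
 *         // ...process j-cluster cj...
 *     }
 */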
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
gmx_opencl_inline void convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon, float* c6, float* c12)
{
    float sigma2, sigma6;

    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
    *c12   = *c6 * sigma6;
}
/*! Apply force switch, force-only version. */
gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbparam,
                                                float c6, float c12, float inv_r, float r2,
                                                float* F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}
/*! Apply force switch, force + energy version. */
gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                  float c6, float c12, float inv_r, float r2,
                                                  float* F_invr, float* E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
    float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
    float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
    float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
    *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
             - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
}
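/* Note on the F2/F3 constants above: the energy correction is the integral of the
 * force correction over r, hence the /3 and /4 factors on the rsw^3 and rsw^4 terms. */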
/*! Apply potential switch, force-only version. */
gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* nbparam,
                                                    float inv_r, float r2,
                                                    float* F_invr, const float* E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3 * nbparam->vdw_switch.c3;
    float switch_F3 = 4 * nbparam->vdw_switch.c4;
    float switch_F4 = 5 * nbparam->vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0F)
    {
        sw = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    }
}
/*! Apply potential switch, force + energy version. */
gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                      float inv_r, float r2,
                                                      float* F_invr, float* E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3 * nbparam->vdw_switch.c3;
    float switch_F3 = 4 * nbparam->vdw_switch.c4;
    float switch_F4 = 5 * nbparam->vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0F ? r_switch : 0.0F;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    *E_lj *= sw;
}
/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb_climg2d,
                                                      int typei, int typej,
                                                      float r2, float inv_r2,
                                                      float lje_coeff2, float lje_coeff6_6,
                                                      float* F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

    c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = exp(-cr2);
    poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
}
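/* In the LJ-PME functions here, poly = 1 + cr2 + cr2^2/2 is the second-order Taylor
 * expansion of exp(cr2), so exp(-cr2)*poly -> 1 smoothly as r -> 0; this cancels the
 * 1/r^6 divergence of the subtracted grid contribution at short distances. */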
/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*    nbfp_comb_climg2d,
                                                        const cl_nbparam_params_t* nbparam,
                                                        int typei, int typej,
                                                        float r2, float inv_r2,
                                                        float lje_coeff2, float lje_coeff6_6,
                                                        float int_bit,
                                                        float* F_invr, float* E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

    c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = exp(-cr2);
    poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask = nbparam->sh_lj_ewald * int_bit;
    *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*    nbfp_comb_climg2d,
                                                      const cl_nbparam_params_t* nbparam,
                                                      int typei, int typej,
                                                      float r2, float inv_r2,
                                                      float lje_coeff2, float lje_coeff6_6,
                                                      float int_bit,
                                                      float* F_invr, float* E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
    sigma   = nbfp_comb_climg2d[2 * typei] + nbfp_comb_climg2d[2 * typej];
    epsilon = nbfp_comb_climg2d[2 * typei + 1] * nbfp_comb_climg2d[2 * typej + 1];

    sigma2 = sigma * sigma;
    c6grid = epsilon * sigma2 * sigma2 * sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = exp(-cr2);
    poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    if (E_lj != NULL)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam->sh_lj_ewald * int_bit;
        *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
    }
}
/*! Interpolate Ewald coulomb force from the coulomb_tab_climg2d table.
 *  Original idea: from the OpenMM project
 */
gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coulomb_tab_climg2d,
                                                    float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int)normalized;
    float fract2     = normalized - (float)index;
    float fract1     = 1.0F - fract2;

    return fract1 * coulomb_tab_climg2d[index] + fract2 * coulomb_tab_climg2d[index + 1];
}
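/* Usage sketch (hypothetical; mirrors how the nonbonded kernels are expected to call it):
 *
 *     float f_table = interpolate_coulomb_force_r(coulomb_tab_climg2d,
 *                                                 r2 * inv_r, nbparam->coulomb_tab_scale);
 *
 * i.e. the table is indexed by the scaled distance and linearly interpolated between
 * the two nearest entries. */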
/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8F;
    const float FN5 = 1.4703624142580877519e-6F;
    const float FN4 = -0.000053401640219807709149F;
    const float FN3 = 0.0010054721316683106153F;
    const float FN2 = -0.019278317264888380590F;
    const float FN1 = 0.069670166153766424023F;
    const float FN0 = -0.75225204789749321333F;

    const float FD4 = 0.0011193462567257629232F;
    const float FD3 = 0.014866955030185295499F;
    const float FD2 = 0.11583842382862377919F;
    const float FD1 = 0.50736591960530292870F;
    const float FD0 = 1.0F;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = z2 * z2;

    polyFD0 = FD4 * z4 + FD2;
    polyFD1 = FD3 * z4 + FD1;
    polyFD0 = polyFD0 * z4 + FD0;
    polyFD0 = polyFD1 * z2 + polyFD0;

    polyFD0 = 1.0F / polyFD0;

    polyFN0 = FN6 * z4 + FN4;
    polyFN1 = FN5 * z4 + FN3;
    polyFN0 = polyFN0 * z4 + FN2;
    polyFN1 = polyFN1 * z4 + FN1;
    polyFN0 = polyFN0 * z4 + FN0;
    polyFN0 = polyFN1 * z2 + polyFN0;

    return polyFN0 * polyFD0;
}
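/* pmecorrF evaluates a rational polynomial approximation in z2 = (beta*r)^2.
 * In the nonbonded kernels the result is assumed to be used along the lines of
 *
 *     F_invr += qi * qj * (int_bit * inv_r2 * inv_r + pmecorrF(beta2 * r2) * beta3);
 *
 * i.e. it supplies the analytical Ewald real-space correction to 1/r^3 without a table. */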
#    if REDUCE_SHUFFLE
gmx_opencl_inline void
reduce_force_j_shfl(float3 fin, __global float* fout, int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
    /* Only does reduction over 4 elements in cluster. Needs to be changed
     * for CL_SIZE>4. See CUDA code for required code */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((tidxi & 1) == 1)
    {
        fin.x = fin.y;
    }
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up(fin.z, fin.z, 2);
    if (tidxi & 2)
    {
        fin.x = fin.z;
    }
    if (tidxi < 3)
    {
        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
    }
}
#    endif
gmx_opencl_inline void
reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
{
    int tidx = tidxi + tidxj * CL_SIZE;
    f_buf[tidx]                   = fcj_buf.x;
    f_buf[FBUF_STRIDE + tidx]     = fcj_buf.y;
    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;

    /* Split the reduction between the first 3 column threads
       Threads with column id 0 will do the reduction for (float3).x components
       Threads with column id 1 will do the reduction for (float3).y components
       Threads with column id 2 will do the reduction for (float3).z components.
       The reduction is performed for each line tidxj of f_buf. */
    if (tidxi < 3)
    {
        float f = 0.0F;
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f += f_buf[FBUF_STRIDE * tidxi + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
    }
}
/*! Final j-force reduction
 */
gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf, float3 fcj_buf,
                                      __global float* fout, int tidxi, int tidxj, int aidx)
{
#    if REDUCE_SHUFFLE
    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
#    else
    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
#    endif
}
#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float* fout,
                                                     bool bCalcFshift, int tidxi, int tidxj,
                                                     int sci, int shift, __global float* fshift)
{
    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
     * for CL_SIZE>4. */
    float2 fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
    {
        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
        float3 fin  = fci_buf[ci_offset];
        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
        fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
        if (tidxj & 1)
        {
            fin.x = fin.y;
        }
        /* Threads 0,1 and 2,3 increment x,y for their warp */
        atomicAdd_g_f(&fout[3 * aidx + (tidxj & 1)], fin.x);
        if (bCalcFshift)
        {
            fshift_buf[0] += fin.x;
        }
        /* Threads 0 and 2 increment z for their warp */
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&fout[3 * aidx + 2], fin.z);
            if (bCalcFshift)
            {
                fshift_buf[1] += fin.z;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        // Threads 0,1 and 2,3 update x,y
        atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);
        // Threads 0 and 2 update z
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
        }
    }
}
#    endif
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf, float3* fci_buf,
                                                     __global float* fout, bool bCalcFshift,
                                                     int tidxi, int tidxj, int sci, int shift,
                                                     __global float* fshift)
{
    float fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
    {
        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
        int tidx = tidxi + tidxj * CL_SIZE;
        /* store i forces in shmem */
        f_buf[tidx]                   = fci_buf[ci_offset].x;
        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset].y;
        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Reduce the initial CL_SIZE values for each i atom to half
         * every step by using CL_SIZE * i threads.
         * Can't just use i as loop variable because then nvcc refuses to unroll.
         */
        int i = CL_SIZE / 2;
        for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
        {
            if (tidxj < i)
            {
                f_buf[tidxj * CL_SIZE + tidxi] += f_buf[(tidxj + i) * CL_SIZE + tidxi];
                f_buf[FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            }
            i >>= 1;
        }
        /* needed because
         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
         */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* i == 1, last reduction step, writing to global mem */
        /* Split the reduction between the first 3 line threads
           Threads with line id 0 will do the reduction for (float3).x components
           Threads with line id 1 will do the reduction for (float3).y components
           Threads with line id 2 will do the reduction for (float3).z components. */
        if (tidxj < 3)
        {
            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];

            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);

            if (bCalcFshift)
            {
                fshift_buf += f;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        /* Only threads with tidxj < 3 will update fshift.
           The threads performing the update must be the same as the threads
           storing the reduction result above. */
        if (tidxj < 3)
        {
            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
        }
    }
}
/*! Final i-force reduction
 */
gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf, float3* fci_buf,
                                                __global float* f, bool bCalcFshift,
                                                int tidxi, int tidxj, int sci, int shift,
                                                __global float* fshift)
{
#    if REDUCE_SHUFFLE
    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
#    else
    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
#    endif
}
#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_energy_shfl(float E_lj, float E_el,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          unsigned int             tidx)
{
    E_lj = sub_group_reduce_add(E_lj);
    E_el = sub_group_reduce_add(E_el);
    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd_g_f(e_lj, E_lj);
        atomicAdd_g_f(e_el, E_el);
    }
}
#    endif
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline void reduce_energy_pow2(volatile __local float* buf, volatile __global float* e_lj,
                                          volatile __global float* e_el, unsigned int tidx)
{
    int i = WARP_SIZE / 2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (int j = WARP_SIZE_LOG2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[tidx] += buf[tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        float e1 = buf[tidx] + buf[tidx + i];
        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
        atomicAdd_g_f(e_lj, e1);
        atomicAdd_g_f(e_el, e2);
    }
}
gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf,
                                     float E_lj, float E_el,
                                     volatile __global float* e_lj,
                                     volatile __global float* e_el,
                                     unsigned int tidx)
{
#    if REDUCE_SHUFFLE
    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
#    else
    /* flush the energies to shmem and reduce them */
    buf[tidx]               = E_lj;
    buf[FBUF_STRIDE + tidx] = E_el;
    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
#    endif
}
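/* Note on the indexing above: tidx & WARP_SIZE is 0 for the first warp and WARP_SIZE for
 * the second, so each warp reduces its own half of buf; tidx & ~WARP_SIZE converts the
 * work-item index back to a warp-local 0..WARP_SIZE-1 index for reduce_energy_pow2. */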
gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
{
    if (pred)
    {
        warp_any[widx] = 1;
    }

    bool ret = warp_any[widx];

    warp_any[widx] = 0;

    return ret;
}
//! Returns true if the predicate is true for any work item in the warp
gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
{
#    if USE_SUBGROUP_ANY
    return sub_group_any(pred);
#    else
    return gmx_sub_group_any_localmem(warp_any, widx, pred);
#    endif
}
#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */