/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
38 #include "gromacs/gpu_utils/device_utils.clh"
39 #include "gromacs/gpu_utils/vectype_ops.clh"
40 #include "gromacs/nbnxm/constants.h"
41 #include "gromacs/pbcutil/ishift.h"
43 #include "nbnxm_ocl_consts.h"

#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL c_nbnxnGpuNumClusterPerSupercluster

#define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported

#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
 * Note that this should precede the kernel_utils include.
 */
# define USE_CJ_PREFETCH 1
#else
# define USE_CJ_PREFETCH 0
#endif

#if defined cl_intel_subgroups || defined cl_khr_subgroups \
        || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
# define HAVE_SUBGROUP 1
#else
# define HAVE_SUBGROUP 0
#endif

#ifdef cl_intel_subgroups
# define HAVE_INTEL_SUBGROUP 1
#else
# define HAVE_INTEL_SUBGROUP 0
#endif

#if defined _INTEL_SOURCE_
# define SUBGROUP_SIZE 8
#elif defined _AMD_SOURCE_
# define SUBGROUP_SIZE 64
#else
# define SUBGROUP_SIZE 32
#endif

#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP

/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756f

#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
# define NBNXN_OPENCL_KERNEL_UTILS_CLH

# if CL_SIZE == 8
#  define WARP_SIZE_LOG2 (5)
#  define CL_SIZE_LOG2 (3)
# elif CL_SIZE == 4
#  define WARP_SIZE_LOG2 (3)
#  define CL_SIZE_LOG2 (2)
# else
#  error unsupported CL_SIZE
# endif

# define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
# define FBUF_STRIDE (CL_SIZE_SQ)

# define ONE_SIXTH_F 0.16666667f
# define ONE_TWELVETH_F 0.08333333f
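
/* Illustrative note (not from the original file): these kernels conventionally work with
 * LJ coefficients pre-scaled for the force, i.e. c6 = 6*C6 and c12 = 12*C12 of the plain
 * 12-6 potential, so ONE_SIXTH_F and ONE_TWELVETH_F convert them back when energies are
 * accumulated, roughly as
 *
 *     E_lj += c12 * inv_r6 * inv_r6 * ONE_TWELVETH_F - c6 * inv_r6 * ONE_SIXTH_F;
 *
 * where c6, c12 and inv_r6 are hypothetical local names used only in this sketch.
 */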

# ifdef __GNUC__
/* GCC, clang, and some ICC pretending to be GCC */
#  define gmx_unused __attribute__((unused))
# else
#  define gmx_unused
# endif

// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
typedef struct
{
    float c2;
    float c3;
    float cpot;
} shift_consts_t;

/* Used with potential switching:
 * rsw        = max(r - r_switch, 0)
 * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * force      = force*sw - potential*dsw
 * potential *= sw
 */
typedef struct
{
    float c3;
    float c4;
    float c5;
} switch_consts_t;

// Data structure shared between the OpenCL device code and OpenCL host code
// Must not contain OpenCL objects (buffers)
typedef struct cl_nbparam_params
{

    int eeltype; /**< type of electrostatics, takes values from #eelCu */
    int vdwtype; /**< type of VdW impl., takes values from #evdwCu */

    float epsfac;        /**< charge multiplication factor */
    float c_rf;          /**< Reaction-field/plain cutoff electrostatics const. */
    float two_k_rf;      /**< Reaction-field electrostatics constant */
    float ewald_beta;    /**< Ewald/PME parameter */
    float sh_ewald;      /**< Ewald/PME correction term subtracted from the direct-space potential */
    float sh_lj_ewald;   /**< LJ-Ewald/PME correction term added to the correction potential */
    float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */

    float rcoulomb_sq; /**< Coulomb cut-off squared */

    float rvdw_sq;       /**< VdW cut-off squared */
    float rvdw_switch;   /**< VdW switched cut-off */
    float rlistOuter_sq; /**< Full, outer pair-list cut-off squared */
    float rlistInner_sq; /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */

    shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants */
    shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants */
    switch_consts_t vdw_switch;       /**< VdW switch constants */

    /* Ewald Coulomb force table data - accessed through texture memory */
    float coulomb_tab_scale; /**< table scale/spacing */
} cl_nbparam_params_t;
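
/* Illustrative sketch (not from the original file): how a kernel would typically combine
 * some of these constants for one i-j atom pair with reaction-field electrostatics.
 * qi, qj, r2, inv_r, int_bit, F_invr and E_el are hypothetical local names; only the
 * nbparam members are taken from the struct above, and the real kernels may differ.
 *
 *     if (r2 < nbparam->rcoulomb_sq)
 *     {
 *         float qq = nbparam->epsfac * qi * qj;
 *         F_invr  += qq * (int_bit * inv_r * inv_r * inv_r - nbparam->two_k_rf);
 *         E_el    += qq * (int_bit * inv_r + 0.5f * nbparam->two_k_rf * r2 - nbparam->c_rf);
 *     }
 */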

typedef struct nbnxn_sci
{
    int sci;           /* i-super-cluster */
    int shift;         /* Shift vector index plus possible flags */
    int cj4_ind_start; /* Start index into cj4 */
    int cj4_ind_end;   /* End index into cj4 */
} nbnxn_sci_t;

typedef struct
{
    unsigned int imask;    /* The i-cluster interactions mask for 1 warp */
    int          excl_ind; /* Index into the exclusion array for 1 warp */
} nbnxn_im_ei_t;

typedef struct
{
    int           cj[4];   /* The 4 j-clusters */
    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */
} nbnxn_cj4_t;

typedef struct
{
    unsigned int pair[CL_SIZE * CL_SIZE / 2]; /* Topology exclusion interaction bits for one warp,
                                               * each unsigned has bits for 4*8 i clusters */
} nbnxn_excl_t;

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
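
/* Illustrative sketch (not from the original file): the per-warp mask imei[widx].imask
 * packs NCL_PER_SUPERCL bits for each of the c_nbnxnGpuJgroupSize j-clusters of a cj4,
 * so a kernel would typically test whether j-cluster jm interacts with any i-cluster as
 *
 *     if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
 *
 * where imask and jm are hypothetical local names used only in this sketch.
 */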

gmx_opencl_inline void preloadCj4Generic(__local int*        sm_cjPreload,
                                         const __global int* gm_cj,
                                         int                 tidxi,
                                         int                 tidxj,
                                         bool gmx_unused     iMaskCond)
{
    /* Pre-load cj into shared memory */
# if defined _AMD_SOURCE_ // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
    {
        sm_cjPreload[tidxi] = gm_cj[tidxi];
    }
# else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;
    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
    {
        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = gm_cj[tidxi];
    }
# endif
}

# if USE_SUBGROUP_PRELOAD
gmx_opencl_inline int preloadCj4Subgroup(const __global int* gm_cj)
{
    // loads subgroup-size # of elements (8) instead of the 4 required
    // equivalent to *cjs = *gm_cj
    return intel_sub_group_block_read((const __global uint*)gm_cj);
}
# endif // USE_SUBGROUP_PRELOAD

# if USE_SUBGROUP_PRELOAD
typedef size_t CjType;
# else
typedef __local int* CjType;
# endif

/*! \brief Preload cj4
 *
 * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 *
 * It is the caller's responsibility to make sure that data is consumed only when
 * it's ready. This function does not call a barrier.
 */
gmx_opencl_inline void preloadCj4(CjType gmx_unused*             cjs,
                                  const __global int gmx_unused* gm_cj,
                                  int gmx_unused                 tidxi,
                                  int gmx_unused                 tidxj,
                                  bool gmx_unused                iMaskCond)
{
# if USE_SUBGROUP_PRELOAD
    *cjs = preloadCj4Subgroup(gm_cj);
# elif USE_CJ_PREFETCH
    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
# endif
}

gmx_opencl_inline int loadCjPreload(__local int* sm_cjPreload, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
# if defined _AMD_SOURCE_
    int warpLoadOffset = 0; // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
# else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;
    int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize / c_splitClSize;
# endif
    return sm_cjPreload[jm + warpLoadOffset];
}

/*! \brief Load a cj given a jm index.
 *
 * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 */
gmx_opencl_inline int
loadCj(CjType cjs, const __global int gmx_unused* gm_cj, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
# if USE_SUBGROUP_PRELOAD
    return sub_group_broadcast(cjs, jm);
# elif USE_CJ_PREFETCH
    return loadCjPreload(cjs, jm, tidxi, tidxj);
# else
    return gm_cj[jm];
# endif
}

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
gmx_opencl_inline void convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon, float* c6, float* c12)
{
    float sigma2, sigma6;

    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
    *c12   = *c6 * sigma6;
}
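
/* Illustrative sketch, not used by the kernels: assuming the pre-scaled coefficient
 * convention noted above (c6 = 6*C6, c12 = 12*C12), this hypothetical helper shows how
 * the converted parameters feed the LJ force evaluation; the name and signature are
 * examples only.
 */
gmx_opencl_inline float lj_force_invr_example(const float sigma, const float epsilon, const float inv_r2)
{
    float c6, c12;
    convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);

    float inv_r6 = inv_r2 * inv_r2 * inv_r2;
    /* F/r of the 12-6 potential expressed with the scaled coefficients */
    return inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
}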

/*! Apply force switch, force-only version. */
gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbparam,
                                                float                      c6,
                                                float                      c12,
                                                float                      inv_r,
                                                float                      r2,
                                                float*                     F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}

/*! Apply force switch, force + energy version. */
gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                  float                      c6,
                                                  float                      c12,
                                                  float                      inv_r,
                                                  float                      r2,
                                                  float*                     F_invr,
                                                  float*                     E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
    float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
    float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
    float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
    *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
             - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
}

/*! Apply potential switch, force-only version. */
gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* nbparam,
                                                    float                      inv_r,
                                                    float                      r2,
                                                    float*                     F_invr,
                                                    const float*               E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3 * nbparam->vdw_switch.c3;
    float switch_F3 = 4 * nbparam->vdw_switch.c4;
    float switch_F4 = 5 * nbparam->vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r > nbparam->rvdw_switch)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    }
}
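
/* The constants above follow from differentiating the switching function: with
 * rsw = max(r - rvdw_switch, 0) and
 *     sw     = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 *     dsw/dr = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * the force coefficients are 3*c3, 4*c4 and 5*c5 while the potential uses the plain
 * c3, c4, c5. The switched pair force is F*sw - V*dsw/dr, applied in 1/r form as
 * F_invr = F_invr*sw - inv_r*E_lj*dsw above and in the F+E version below.
 */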

/*! Apply potential switch, force + energy version. */
gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                      float                      inv_r,
                                                      float                      r2,
                                                      float*                     F_invr,
                                                      float*                     E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3 * nbparam->vdw_switch.c3;
    float switch_F3 = 4 * nbparam->vdw_switch.c4;
    float switch_F4 = 5 * nbparam->vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    *E_lj *= sw;
}

/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb_climg2d,
                                                      int                     typei,
                                                      int                     typej,
                                                      float                   r2,
                                                      float                   inv_r2,
                                                      float                   lje_coeff2,
                                                      float                   lje_coeff6_6,
                                                      float*                  F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

    c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = exp(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
}

/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*    nbfp_comb_climg2d,
                                                        const cl_nbparam_params_t* nbparam,
                                                        int                        typei,
                                                        int                        typej,
                                                        float                      r2,
                                                        float                      inv_r2,
                                                        float                      lje_coeff2,
                                                        float                      lje_coeff6_6,
                                                        float                      int_bit,
                                                        float*                     F_invr,
                                                        float*                     E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

    c6grid = nbfp_comb_climg2d[2 * typei] * nbfp_comb_climg2d[2 * typej];

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = exp(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask = nbparam->sh_lj_ewald * int_bit;
    *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
}

/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*    nbfp_comb_climg2d,
                                                      const cl_nbparam_params_t* nbparam,
                                                      int                        typei,
                                                      int                        typej,
                                                      float                      r2,
                                                      float                      inv_r2,
                                                      float                      lje_coeff2,
                                                      float                      lje_coeff6_6,
                                                      float                      int_bit,
                                                      float*                     F_invr,
                                                      float*                     E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
    sigma   = nbfp_comb_climg2d[2 * typei] + nbfp_comb_climg2d[2 * typej];

    epsilon = nbfp_comb_climg2d[2 * typei + 1] * nbfp_comb_climg2d[2 * typej + 1];

    sigma2 = sigma * sigma;
    c6grid = epsilon * sigma2 * sigma2 * sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    cr2       = lje_coeff2 * r2;
    expmcr2   = exp(-cr2);
    poly      = 1.0f + cr2 + 0.5f * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    if (E_lj != 0)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam->sh_lj_ewald * int_bit;
        *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0f - expmcr2 * poly) + sh_mask);
    }
}

/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 *  Original idea: from the OpenMM project
 */
gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coulomb_tab_climg2d,
                                                    float                   r,
                                                    float                   scale)
{
    float normalized = scale * r;
    int   index      = (int)normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * coulomb_tab_climg2d[index] + fract2 * coulomb_tab_climg2d[index + 1];
}
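
/* Illustrative sketch, not used by the kernels: the scale argument of
 * interpolate_coulomb_force_r() is the coulomb_tab_scale member of cl_nbparam_params_t,
 * and the tabulated value is subtracted from the plain Coulomb term to form the Ewald
 * direct-space force. The function name and the qq/r2/inv_r arguments are hypothetical,
 * and the exact way the real kernels fold the correction into F_invr may differ.
 */
gmx_opencl_inline float ewald_tab_force_invr_example(__constant const float*    coulomb_tab_climg2d,
                                                     const cl_nbparam_params_t* nbparam,
                                                     float                      qq,
                                                     float                      r2,
                                                     float                      inv_r)
{
    float r = r2 * inv_r;
    return qq
           * (inv_r * inv_r - interpolate_coulomb_force_r(coulomb_tab_climg2d, r, nbparam->coulomb_tab_scale))
           * inv_r;
}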

/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = z2 * z2;

    polyFD0 = FD4 * z4 + FD2;
    polyFD1 = FD3 * z4 + FD1;
    polyFD0 = polyFD0 * z4 + FD0;
    polyFD0 = polyFD1 * z2 + polyFD0;

    polyFD0 = 1.0f / polyFD0;

    polyFN0 = FN6 * z4 + FN4;
    polyFN1 = FN5 * z4 + FN3;
    polyFN0 = polyFN0 * z4 + FN2;
    polyFN1 = polyFN1 * z4 + FN1;
    polyFN0 = polyFN0 * z4 + FN0;
    polyFN0 = polyFN1 * z2 + polyFN0;

    return polyFN0 * polyFD0;
}

# if REDUCE_SHUFFLE
gmx_opencl_inline void
reduce_force_j_shfl(float3 fin, __global float* fout, int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
    /* Only does reduction over 4 elements in cluster. Needs to be changed
     * for CL_SIZE>4. See CUDA code for required code */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((tidxi & 1) == 1)
    {
        fin.x = fin.y;
    }
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up(fin.z, fin.z, 2);
    if (tidxi == 2)
    {
        fin.x = fin.z;
    }
    if (tidxi < 3)
    {
        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
    }
}
# endif

gmx_opencl_inline void
reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
{
    int tidx = tidxi + tidxj * CL_SIZE;
    f_buf[tidx]                   = fcj_buf.x;
    f_buf[FBUF_STRIDE + tidx]     = fcj_buf.y;
    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;

    /* Split the reduction between the first 3 column threads
       Threads with column id 0 will do the reduction for (float3).x components
       Threads with column id 1 will do the reduction for (float3).y components
       Threads with column id 2 will do the reduction for (float3).z components.
       The reduction is performed for each line tidxj of f_buf. */
    if (tidxi < 3)
    {
        float f = 0.0f;
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f += f_buf[FBUF_STRIDE * tidxi + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
    }
}

/*! Final j-force reduction
 */
gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf,
                                      float3                    fcj_buf,
                                      __global float*           fout,
                                      int                       tidxi,
                                      int                       tidxj,
                                      int                       aidx)
{
# if REDUCE_SHUFFLE
    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
# else
    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
# endif
}

# if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_force_i_and_shift_shfl(float3*         fci_buf,
                                                     __global float* fout,
                                                     bool            bCalcFshift,
                                                     int             tidxi,
                                                     int             tidxj,
                                                     int             sci,
                                                     int             shift,
                                                     __global float* fshift)
{
    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
     * for CL_SIZE>4. */
    float2 fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
    {
        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
        float3 fin  = fci_buf[ci_offset];
        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
        fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
        if ((tidxj & 1) == 1)
        {
            fin.x = fin.y;
        }
        /* Threads 0,1 and 2,3 increment x,y for their warp */
        atomicAdd_g_f(&fout[3 * aidx + (tidxj & 1)], fin.x);
        if (bCalcFshift)
        {
            fshift_buf[0] += fin.x;
        }
        /* Threads 0 and 2 increment z for their warp */
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&fout[3 * aidx + 2], fin.z);
            if (bCalcFshift)
            {
                fshift_buf[1] += fin.z;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        // Threads 0,1 and 2,3 update x,y
        atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);
        // Threads 0 and 2 update z
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
        }
    }
}
# endif

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
                                                     float3*                 fci_buf,
                                                     __global float*         fout,
                                                     bool                    bCalcFshift,
                                                     int                     tidxi,
                                                     int                     tidxj,
                                                     int                     sci,
                                                     int                     shift,
                                                     __global float*         fshift)
{
    float fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
    {
        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
        int tidx = tidxi + tidxj * CL_SIZE;
        /* store i forces in shmem */
        f_buf[tidx]                   = fci_buf[ci_offset].x;
        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset].y;
        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
        barrier(CLK_LOCAL_MEM_FENCE);

        int i, j;
        /* Reduce the initial CL_SIZE values for each i atom to half
         * every step by using CL_SIZE * i threads.
         * Can't just use i as loop variable because then nvcc refuses to unroll.
         */
        i = CL_SIZE / 2;
        for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
        {
            if (tidxj < i)
            {
                f_buf[tidxj * CL_SIZE + tidxi] += f_buf[(tidxj + i) * CL_SIZE + tidxi];
                f_buf[FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            }
            i >>= 1;
        }
        /* needed because
         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
         */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* i == 1, last reduction step, writing to global mem */
        /* Split the reduction between the first 3 line threads
           Threads with line id 0 will do the reduction for (float3).x components
           Threads with line id 1 will do the reduction for (float3).y components
           Threads with line id 2 will do the reduction for (float3).z components. */
        if (tidxj < 3)
        {
            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];

            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);

            if (bCalcFshift)
            {
                fshift_buf += f;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        /* Only threads with tidxj < 3 will update fshift.
           The threads performing the update must be the same as the threads
           storing the reduction result above. */
        if (tidxj < 3)
        {
            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
        }
    }
}

/*! Final i-force reduction
 */
gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
                                                float3*                   fci_buf,
                                                __global float*           f,
                                                bool                      bCalcFshift,
                                                int                       tidxi,
                                                int                       tidxj,
                                                int                       sci,
                                                int                       shift,
                                                __global float*           fshift)
{
# if REDUCE_SHUFFLE
    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
# else
    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
# endif
}

# if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_energy_shfl(float                    E_lj,
                                          float                    E_el,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          unsigned int             tidx)
{
    E_lj = sub_group_reduce_add(E_lj);
    E_el = sub_group_reduce_add(E_el);
    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd_g_f(e_lj, E_lj);
        atomicAdd_g_f(e_el, E_el);
    }
}
# endif

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline void reduce_energy_pow2(volatile __local float*  buf,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          unsigned int             tidx)
{
    int j;

    unsigned int i = WARP_SIZE / 2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[tidx] += buf[tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        float e1 = buf[tidx] + buf[tidx + i];
        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd_g_f(e_lj, e1);
        atomicAdd_g_f(e_el, e2);
    }
}

gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf,
                                     float                              E_lj,
                                     float                              E_el,
                                     volatile __global float*           e_lj,
                                     volatile __global float*           e_el,
                                     unsigned int                       tidx)
{
# if REDUCE_SHUFFLE
    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
# else
    /* flush the energies to shmem and reduce them */
    buf[tidx]               = E_lj;
    buf[FBUF_STRIDE + tidx] = E_el;
    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
# endif
}

gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local uint* warp_any, int widx, bool pred)
{
    if (pred)
    {
        warp_any[widx] = 1;
    }

    bool ret = warp_any[widx];

    warp_any[widx] = 0;

    return ret;
}

//! Returns true if predicate is true for any work item in warp
gmx_opencl_inline bool gmx_sub_group_any(volatile __local uint gmx_unused* warp_any,
                                         int gmx_unused                    widx,
                                         bool                              pred)
{
# if USE_SUBGROUP_ANY
    return sub_group_any(pred);
# else
    return gmx_sub_group_any_localmem(warp_any, widx, pred);
# endif
}

#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */