src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #define GMX_DOUBLE 0
  37
  38 #include "gromacs/gpu_utils/device_utils.clh"
  39 #include "gromacs/gpu_utils/vectype_ops.clh"
  40 #include "gromacs/nbnxm/constants.h"
  41 #include "gromacs/pbcutil/ishift.h"
  42
  43 #include "nbnxm_ocl_consts.h"
  44
  45 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  46 #define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
  47
  48 #define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
  49
  50 #if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
  51 /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
  52  * Note that this should precede the kernel_utils include.
  53  */
  54 #define USE_CJ_PREFETCH 1
  55 #else
  56 #define USE_CJ_PREFETCH 0
  57 #endif
  58
  59 #if defined cl_intel_subgroups || defined cl_khr_subgroups || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
  60 #define HAVE_SUBGROUP 1
  61 #else
  62 #define HAVE_SUBGROUP 0
  63 #endif
  64
  65 #ifdef cl_intel_subgroups
  66 #define HAVE_INTEL_SUBGROUP 1
  67 #else
  68 #define HAVE_INTEL_SUBGROUP 0
  69 #endif
  70
  71 #if defined _INTEL_SOURCE_
  72 #define SUBGROUP_SIZE 8
  73 #elif defined _AMD_SOURCE_
  74 #define SUBGROUP_SIZE 64
  75 #else
  76 #define SUBGROUP_SIZE 32
  77 #endif
  78
  79 #define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
  80 #define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
  81 #define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
  82
  83 /* 1.0 / sqrt(M_PI) */
  84 #define M_FLOAT_1_SQRTPI 0.564189583547756f
  85
  86 //-------------------
  87
  88 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  89 #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  90
  91 #if CL_SIZE == 8
  92 #define WARP_SIZE_LOG2  (5)
  93 #define CL_SIZE_LOG2    (3)
  94 #elif CL_SIZE == 4
  95 #define WARP_SIZE_LOG2  (3)
  96 #define CL_SIZE_LOG2    (2)
  97 #else
  98 #error unsupported CL_SIZE
  99 #endif
 100
 101 #define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
 102 #define FBUF_STRIDE     (CL_SIZE_SQ)
 103
 104 #define ONE_SIXTH_F     0.16666667f
 105 #define ONE_TWELVETH_F  0.08333333f
 106
 107
 108 #ifdef __GNUC__
 109 /* GCC, clang, and some ICC pretending to be GCC */
 110 #  define gmx_unused __attribute__ ((unused))
 111 #else
 112 #  define gmx_unused
 113 #endif
 114
 115 // Data structures shared between OpenCL device code and OpenCL host code
 116 // TODO: review, improve
 117 // Replaced real by float for now, to avoid including any other header
 118 typedef struct {
 119     float c2;
 120     float c3;
 121     float cpot;
 122 } shift_consts_t;
 123
 124 /* Used with potential switching:
 125  * rsw        = max(r - r_switch, 0)
 126  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 127  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 128  * force      = force*dsw - potential*sw
 129  * potential *= sw
 130  */
 131 typedef struct {
 132     float c3;
 133     float c4;
 134     float c5;
 135 } switch_consts_t;
 136
 137 // Data structure shared between the OpenCL device code and OpenCL host code
 138 // Must not contain OpenCL objects (buffers)
 139 typedef struct cl_nbparam_params
 140 {
 141
 142     int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 143     int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 144
 145     float           epsfac;           /**< charge multiplication factor                      */
 146     float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 147     float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 148     float           ewald_beta;       /**< Ewald/PME parameter                               */
 149     float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 150     float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 151     float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 152
 153     float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 154
 155     float           rvdw_sq;          /**< VdW cut-off squared                               */
 156     float           rvdw_switch;      /**< VdW switched cut-off                              */
 157     float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 158     float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 159
 160     shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 161     shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 162     switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 163
 164     /* Ewald Coulomb force table data - accessed through texture memory */
 165     float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 166 }cl_nbparam_params_t;
 167
 168 typedef struct {
 169     int sci;            /* i-super-cluster       */
 170     int shift;          /* Shift vector index plus possible flags */
 171     int cj4_ind_start;  /* Start index into cj4  */
 172     int cj4_ind_end;    /* End index into cj4    */
 173 } nbnxn_sci_t;
 174
 175 typedef struct {
 176     unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 177     int          excl_ind; /* Index into the exclusion array for 1 warp   */
 178 } nbnxn_im_ei_t;
 179
 180 typedef struct {
 181     int           cj[4];   /* The 4 j-clusters                            */
 182     nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 183 } nbnxn_cj4_t;
 184
 185
 186 typedef struct {
 187     unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
 188                                            * each unsigned has bitS for 4*8 i clusters
 189                                            */
 190 } nbnxn_excl_t;
 191
 192 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 193 __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 194
 195 gmx_opencl_inline
 196 void preloadCj4Generic(__local int        *sm_cjPreload,
 197                        const __global int *gm_cj,
 198                        int                 tidxi,
 199                        int                 tidxj,
 200                        bool gmx_unused     iMaskCond)
 201 {
 202     /* Pre-load cj into shared memory */
 203 #if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 204     if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 205     {
 206         sm_cjPreload[tidxi] = gm_cj[tidxi];
 207     }
 208 #else
 209     const int c_clSize                   = CL_SIZE;
 210     const int c_nbnxnGpuClusterpairSplit = 2;
 211     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 212     if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
 213     {
 214         sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
 215     }
 216 #endif
 217 }
 218
 219
 220 #if USE_SUBGROUP_PRELOAD
 221 gmx_opencl_inline
 222 int  preloadCj4Subgroup(const __global int *gm_cj)
 223 {
 224     //loads subgroup-size # of elements (8) instead of the 4 required
 225     //equivalent to *cjs = *gm_cj
 226     return intel_sub_group_block_read((const __global uint *)gm_cj);
 227 }
 228 #endif //USE_SUBGROUP_PRELOAD
 229
 230 #if USE_SUBGROUP_PRELOAD
 231 typedef size_t CjType;
 232 #else
 233 typedef __local int* CjType;
 234 #endif
 235
 236 /*! \brief Preload cj4
 237  *
 238  * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 239  * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 240  * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 241  *
 242  * It is the caller's responsibility to make sure that data is consumed only when
 243  * it's ready. This function does not call a barrier.
 244  */
 245 gmx_opencl_inline
 246 void preloadCj4(CjType gmx_unused             *cjs,
 247                 const __global int gmx_unused *gm_cj,
 248                 int gmx_unused                 tidxi,
 249                 int gmx_unused                 tidxj,
 250                 bool gmx_unused                iMaskCond)
 251 {
 252 #if USE_SUBGROUP_PRELOAD
 253     *cjs = preloadCj4Subgroup(gm_cj);
 254 #elif USE_CJ_PREFETCH
 255     preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
 256 #else
 257     //nothing to do
 258 #endif
 259 }
 260
 261 gmx_opencl_inline
 262 int loadCjPreload(__local int           *        sm_cjPreload,
 263                   int                            jm,
 264                   int gmx_unused                 tidxi,
 265                   int gmx_unused                 tidxj)
 266 {
 267 #if defined _AMD_SOURCE_
 268     int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 269 #else
 270     const int c_clSize                   = CL_SIZE;
 271     const int c_nbnxnGpuClusterpairSplit = 2;
 272     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 273     int       warpLoadOffset             = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
 274 #endif
 275     return sm_cjPreload[jm + warpLoadOffset];
 276 }
 277
 278 /* \brief Load a cj given a jm index.
 279  *
 280  * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 281  */
 282 gmx_opencl_inline
 283 int loadCj(CjType cjs, const __global int gmx_unused* gm_cj,
 284            int jm, int gmx_unused tidxi, int gmx_unused tidxj)
 285 {
 286 #if USE_SUBGROUP_PRELOAD
 287     return sub_group_broadcast(cjs, jm);
 288 #elif USE_CJ_PREFETCH
 289     return loadCjPreload(cjs, jm, tidxi, tidxj);
 290 #else
 291     return gm_cj[jm];
 292 #endif
 293 }
 294
 295 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 296 gmx_opencl_inline
 297 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 298                                      const float  epsilon,
 299                                      float       *c6,
 300                                      float       *c12)
 301 {
 302     float sigma2, sigma6;
 303
 304     sigma2 = sigma * sigma;
 305     sigma6 = sigma2 *sigma2 * sigma2;
 306     *c6    = epsilon * sigma6;
 307     *c12   = *c6 * sigma6;
 308 }
 309
 310
 311 /*! Apply force switch,  force + energy version. */
 312 gmx_opencl_inline
 313 void calculate_force_switch_F(const cl_nbparam_params_t *nbparam,
 314                               float                      c6,
 315                               float                      c12,
 316                               float                      inv_r,
 317                               float                      r2,
 318                               float                     *F_invr)
 319 {
 320     float r, r_switch;
 321
 322     /* force switch constants */
 323     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 324     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 325     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 326     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 327
 328     r         = r2 * inv_r;
 329     r_switch  = r - nbparam->rvdw_switch;
 330     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 331
 332     *F_invr  +=
 333         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 334         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 335 }
 336
 337 /*! Apply force switch, force-only version. */
 338 gmx_opencl_inline
 339 void calculate_force_switch_F_E(const cl_nbparam_params_t *nbparam,
 340                                 float                      c6,
 341                                 float                      c12,
 342                                 float                      inv_r,
 343                                 float                      r2,
 344                                 float                     *F_invr,
 345                                 float                     *E_lj)
 346 {
 347     float r, r_switch;
 348
 349     /* force switch constants */
 350     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 351     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 352     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 353     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 354
 355     float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 356     float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 357     float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 358     float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 359
 360     r         = r2 * inv_r;
 361     r_switch  = r - nbparam->rvdw_switch;
 362     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 363
 364     *F_invr  +=
 365         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 366         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 367     *E_lj    +=
 368         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 369         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 370 }
 371
 372 /*! Apply potential switch, force-only version. */
 373 gmx_opencl_inline
 374 void calculate_potential_switch_F(const cl_nbparam_params_t *nbparam,
 375                                   float                      inv_r,
 376                                   float                      r2,
 377                                   float                     *F_invr,
 378                                   const float               *E_lj)
 379 {
 380     float r, r_switch;
 381     float sw, dsw;
 382
 383     /* potential switch constants */
 384     float switch_V3 = nbparam->vdw_switch.c3;
 385     float switch_V4 = nbparam->vdw_switch.c4;
 386     float switch_V5 = nbparam->vdw_switch.c5;
 387     float switch_F2 = nbparam->vdw_switch.c3;
 388     float switch_F3 = nbparam->vdw_switch.c4;
 389     float switch_F4 = nbparam->vdw_switch.c5;
 390
 391     r        = r2 * inv_r;
 392     r_switch = r - nbparam->rvdw_switch;
 393
 394     /* Unlike in the F+E kernel, conditional is faster here */
 395     if (r_switch > 0.0f)
 396     {
 397         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 398         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 399
 400         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 401     }
 402 }
 403
 404 /*! Apply potential switch, force + energy version. */
 405 gmx_opencl_inline
 406 void calculate_potential_switch_F_E(const cl_nbparam_params_t *nbparam,
 407                                     float                      inv_r,
 408                                     float                      r2,
 409                                     float                     *F_invr,
 410                                     float                     *E_lj)
 411 {
 412     float r, r_switch;
 413     float sw, dsw;
 414
 415     /* potential switch constants */
 416     float switch_V3 = nbparam->vdw_switch.c3;
 417     float switch_V4 = nbparam->vdw_switch.c4;
 418     float switch_V5 = nbparam->vdw_switch.c5;
 419     float switch_F2 = nbparam->vdw_switch.c3;
 420     float switch_F3 = nbparam->vdw_switch.c4;
 421     float switch_F4 = nbparam->vdw_switch.c5;
 422
 423     r        = r2 * inv_r;
 424     r_switch = r - nbparam->rvdw_switch;
 425     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 426
 427     /* Unlike in the F-only kernel, masking is faster here */
 428     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 429     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 430
 431     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 432     *E_lj   *= sw;
 433 }
 434
 435 /*! Calculate LJ-PME grid force contribution with
 436  *  geometric combination rule.
 437  */
 438 gmx_opencl_inline
 439 void calculate_lj_ewald_comb_geom_F(__constant const float *nbfp_comb_climg2d,
 440                                     int                     typei,
 441                                     int                     typej,
 442                                     float                   r2,
 443                                     float                   inv_r2,
 444                                     float                   lje_coeff2,
 445                                     float                   lje_coeff6_6,
 446                                     float                  *F_invr)
 447 {
 448     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 449
 450     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 451
 452     /* Recalculate inv_r6 without exclusion mask */
 453     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 454     cr2       = lje_coeff2*r2;
 455     expmcr2   = exp(-cr2);
 456     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 457
 458     /* Subtract the grid force from the total LJ force */
 459     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 460 }
 461
 462 /*! Calculate LJ-PME grid force + energy contribution with
 463  *  geometric combination rule.
 464  */
 465 gmx_opencl_inline
 466 void calculate_lj_ewald_comb_geom_F_E(__constant const float    *nbfp_comb_climg2d,
 467                                       const cl_nbparam_params_t *nbparam,
 468                                       int                        typei,
 469                                       int                        typej,
 470                                       float                      r2,
 471                                       float                      inv_r2,
 472                                       float                      lje_coeff2,
 473                                       float                      lje_coeff6_6,
 474                                       float                      int_bit,
 475                                       float                     *F_invr,
 476                                       float                     *E_lj)
 477 {
 478     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 479
 480     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 481
 482     /* Recalculate inv_r6 without exclusion mask */
 483     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 484     cr2       = lje_coeff2*r2;
 485     expmcr2   = exp(-cr2);
 486     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 487
 488     /* Subtract the grid force from the total LJ force */
 489     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 490
 491     /* Shift should be applied only to real LJ pairs */
 492     sh_mask   = nbparam->sh_lj_ewald*int_bit;
 493     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 494 }
 495
 496 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 497  *  Lorentz-Berthelot combination rule.
 498  *  We use a single F+E kernel with conditional because the performance impact
 499  *  of this is pretty small and LB on the CPU is anyway very slow.
 500  */
 501 gmx_opencl_inline
 502 void calculate_lj_ewald_comb_LB_F_E(__constant const float    *nbfp_comb_climg2d,
 503                                     const cl_nbparam_params_t *nbparam,
 504                                     int                        typei,
 505                                     int                        typej,
 506                                     float                      r2,
 507                                     float                      inv_r2,
 508                                     float                      lje_coeff2,
 509                                     float                      lje_coeff6_6,
 510                                     float                      int_bit,
 511                                     bool                       with_E_lj,
 512                                     float                     *F_invr,
 513                                     float                     *E_lj)
 514 {
 515     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 516     float sigma, sigma2, epsilon;
 517
 518     /* sigma and epsilon are scaled to give 6*C6 */
 519     sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 520
 521     epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 522
 523     sigma2  = sigma*sigma;
 524     c6grid  = epsilon*sigma2*sigma2*sigma2;
 525
 526     /* Recalculate inv_r6 without exclusion mask */
 527     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 528     cr2       = lje_coeff2*r2;
 529     expmcr2   = exp(-cr2);
 530     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 531
 532     /* Subtract the grid force from the total LJ force */
 533     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 534
 535     if (with_E_lj)
 536     {
 537         float sh_mask;
 538
 539         /* Shift should be applied only to real LJ pairs */
 540         sh_mask   = nbparam->sh_lj_ewald*int_bit;
 541         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 542     }
 543 }
 544
 545 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 546  *  Original idea: from the OpenMM project
 547  */
 548 gmx_opencl_inline float
 549 interpolate_coulomb_force_r(__constant const float *coulomb_tab_climg2d,
 550                             float                   r,
 551                             float                   scale)
 552 {
 553     float   normalized = scale * r;
 554     int     index      = (int) normalized;
 555     float   fract2     = normalized - index;
 556     float   fract1     = 1.0f - fract2;
 557
 558     return fract1*coulomb_tab_climg2d[index] +
 559            fract2*coulomb_tab_climg2d[index + 1];
 560 }
 561
 562 /*! Calculate analytical Ewald correction term. */
 563 gmx_opencl_inline
 564 float pmecorrF(float z2)
 565 {
 566     const float FN6 = -1.7357322914161492954e-8f;
 567     const float FN5 = 1.4703624142580877519e-6f;
 568     const float FN4 = -0.000053401640219807709149f;
 569     const float FN3 = 0.0010054721316683106153f;
 570     const float FN2 = -0.019278317264888380590f;
 571     const float FN1 = 0.069670166153766424023f;
 572     const float FN0 = -0.75225204789749321333f;
 573
 574     const float FD4 = 0.0011193462567257629232f;
 575     const float FD3 = 0.014866955030185295499f;
 576     const float FD2 = 0.11583842382862377919f;
 577     const float FD1 = 0.50736591960530292870f;
 578     const float FD0 = 1.0f;
 579
 580     float       z4;
 581     float       polyFN0, polyFN1, polyFD0, polyFD1;
 582
 583     z4          = z2*z2;
 584
 585     polyFD0     = FD4*z4 + FD2;
 586     polyFD1     = FD3*z4 + FD1;
 587     polyFD0     = polyFD0*z4 + FD0;
 588     polyFD0     = polyFD1*z2 + polyFD0;
 589
 590     polyFD0     = 1.0f/polyFD0;
 591
 592     polyFN0     = FN6*z4 + FN4;
 593     polyFN1     = FN5*z4 + FN3;
 594     polyFN0     = polyFN0*z4 + FN2;
 595     polyFN1     = polyFN1*z4 + FN1;
 596     polyFN0     = polyFN0*z4 + FN0;
 597     polyFN0     = polyFN1*z2 + polyFN0;
 598
 599     return polyFN0*polyFD0;
 600 }
 601
 602 #if REDUCE_SHUFFLE
 603 gmx_opencl_inline
 604 void reduce_force_j_shfl(float3 fin, __global float *fout,
 605                          int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
 606 {
 607     /* Only does reduction over 4 elements in cluster. Needs to be changed
 608      * for CL_SIZE>4. See CUDA code for required code */
 609     fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
 610     fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
 611     fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
 612     if ((tidxi & 1) == 1)
 613     {
 614         fin.x = fin.y;
 615     }
 616     fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
 617     fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
 618     if (tidxi == 2)
 619     {
 620         fin.x = fin.z;
 621     }
 622     if (tidxi < 3)
 623     {
 624         atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
 625     }
 626 }
 627 #endif
 628
 629 gmx_opencl_inline
 630 void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
 631                             int tidxi, int tidxj, int aidx)
 632 {
 633     int tidx = tidxi + tidxj*CL_SIZE;
 634     f_buf[                  tidx] = fcj_buf.x;
 635     f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
 636     f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
 637
 638     /* Split the reduction between the first 3 column threads
 639        Threads with column id 0 will do the reduction for (float3).x components
 640        Threads with column id 1 will do the reduction for (float3).y components
 641        Threads with column id 2 will do the reduction for (float3).z components.
 642        The reduction is performed for each line tidxj of f_buf. */
 643     if (tidxi < 3)
 644     {
 645         float f = 0.0f;
 646         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 647         {
 648             f += f_buf[FBUF_STRIDE * tidxi + j];
 649         }
 650
 651         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 652     }
 653 }
 654
 655 /*! Final j-force reduction
 656  */
 657 gmx_opencl_inline
 658 void reduce_force_j(__local float gmx_unused *f_buf, float3 fcj_buf, __global float *fout,
 659                     int tidxi, int tidxj, int aidx)
 660 {
 661 #if REDUCE_SHUFFLE
 662     reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
 663 #else
 664     reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
 665 #endif
 666 }
 667
 668 #if REDUCE_SHUFFLE
 669 gmx_opencl_inline
 670 void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
 671                                    bool bCalcFshift, int tidxi, int tidxj,
 672                                    int sci, int shift, __global float *fshift)
 673 {
 674     /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
 675      * for CL_SIZE>4.*/
 676     float2 fshift_buf = 0;
 677     for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 678     {
 679         int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 680         float3 fin  = fci_buf[ci_offset];
 681         fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
 682         fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
 683         fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
 684
 685         if (tidxj & 1)
 686         {
 687             fin.x = fin.y;
 688         }
 689         /* Threads 0,1 and 2,3 increment x,y for their warp */
 690         atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
 691         if (bCalcFshift)
 692         {
 693             fshift_buf[0] += fin.x;
 694         }
 695         /* Threads 0 and 2 increment z for their warp */
 696         if ((tidxj & 1) == 0)
 697         {
 698             atomicAdd_g_f(&fout[3*aidx+2], fin.z);
 699             if (bCalcFshift)
 700             {
 701                 fshift_buf[1] += fin.z;
 702             }
 703         }
 704     }
 705     /* add up local shift forces into global mem */
 706     if (bCalcFshift)
 707     {
 708         //Threads 0,1 and 2,3 update x,y
 709         atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
 710         //Threads 0 and 2 update z
 711         if ((tidxj & 1) == 0)
 712         {
 713             atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
 714         }
 715     }
 716 }
 717 #endif
 718
 719 /*! Final i-force reduction; this implementation works only with power of two
 720  *  array sizes.
 721  */
 722 gmx_opencl_inline
 723 void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
 724                                    __global float *fout,
 725                                    bool bCalcFshift,
 726                                    int tidxi, int tidxj,
 727                                    int sci, int shift, __global float *fshift)
 728 {
 729     float fshift_buf = 0;
 730     for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 731     {
 732         int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 733         int tidx = tidxi + tidxj*CL_SIZE;
 734         /* store i forces in shmem */
 735         f_buf[                  tidx] = fci_buf[ci_offset].x;
 736         f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
 737         f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
 738         barrier(CLK_LOCAL_MEM_FENCE);
 739
 740         int     i, j;
 741         /* Reduce the initial CL_SIZE values for each i atom to half
 742          * every step by using CL_SIZE * i threads.
 743          * Can't just use i as loop variable because than nvcc refuses to unroll.
 744          */
 745         i = CL_SIZE/2;
 746         for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 747         {
 748             if (tidxj < i)
 749             {
 750
 751                 f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 752                 f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 753                 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 754             }
 755             i >>= 1;
 756         }
 757         /* needed because
 758          * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 759          * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 760          * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
 761          */
 762         barrier(CLK_LOCAL_MEM_FENCE);
 763
 764         /* i == 1, last reduction step, writing to global mem */
 765         /* Split the reduction between the first 3 line threads
 766            Threads with line id 0 will do the reduction for (float3).x components
 767            Threads with line id 1 will do the reduction for (float3).y components
 768            Threads with line id 2 will do the reduction for (float3).z components. */
 769         if (tidxj < 3)
 770         {
 771             float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 772
 773             atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 774
 775             if (bCalcFshift)
 776             {
 777                 fshift_buf += f;
 778             }
 779         }
 780     }
 781     /* add up local shift forces into global mem */
 782     if (bCalcFshift)
 783     {
 784         /* Only threads with tidxj < 3 will update fshift.
 785            The threads performing the update, must be the same as the threads
 786            storing the reduction result above.
 787          */
 788         if (tidxj < 3)
 789         {
 790             atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
 791         }
 792     }
 793 }
 794
 795 /*! Final i-force reduction
 796  */
 797 gmx_opencl_inline
 798 void reduce_force_i_and_shift(__local float gmx_unused *f_buf, float3* fci_buf, __global float *f,
 799                               bool bCalcFshift, int tidxi, int tidxj, int sci,
 800                               int shift, __global float *fshift)
 801 {
 802 #if REDUCE_SHUFFLE
 803     reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
 804                                   sci, shift, fshift);
 805 #else
 806     reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
 807                                   sci, shift, fshift);
 808 #endif
 809 }
 810
 811
 812
 813 #if REDUCE_SHUFFLE
 814 gmx_opencl_inline
 815 void reduce_energy_shfl(float E_lj, float E_el,
 816                         volatile __global float *e_lj,
 817                         volatile __global float *e_el,
 818                         unsigned int    tidx)
 819 {
 820     E_lj = sub_group_reduce_add(E_lj);
 821     E_el = sub_group_reduce_add(E_el);
 822     /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
 823      * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
 824      * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
 825      * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
 826      * (by supporting c_nbnxnGpuClusterpairSplit=1). */
 827     if (tidx == 0 || tidx == WARP_SIZE)
 828     {
 829         atomicAdd_g_f(e_lj, E_lj);
 830         atomicAdd_g_f(e_el, E_el);
 831     }
 832 }
 833 #endif
 834
 835 /*! Energy reduction; this implementation works only with power of two
 836  *  array sizes.
 837  */
 838 gmx_opencl_inline
 839 void reduce_energy_pow2(volatile __local float  *buf,
 840                         volatile __global float *e_lj,
 841                         volatile __global float *e_el,
 842                         unsigned int             tidx)
 843 {
 844     int          j;
 845
 846     unsigned int i = WARP_SIZE/2;
 847
 848     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 849     for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 850     {
 851         if (tidx < i)
 852         {
 853             buf[              tidx] += buf[              tidx + i];
 854             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 855         }
 856         i >>= 1;
 857     }
 858
 859     /* last reduction step, writing to global mem */
 860     if (tidx == 0)
 861     {
 862         float e1 = buf[              tidx] + buf[              tidx + i];
 863         float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 864
 865         atomicAdd_g_f(e_lj, e1);
 866         atomicAdd_g_f(e_el, e2);
 867     }
 868 }
 869
 870 gmx_opencl_inline
 871 void reduce_energy(volatile __local float  gmx_unused *buf,
 872                    float E_lj, float E_el,
 873                    volatile __global float *e_lj,
 874                    volatile __global float *e_el,
 875                    unsigned int             tidx)
 876 {
 877 #if REDUCE_SHUFFLE
 878     reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
 879 #else
 880     /* flush the energies to shmem and reduce them */
 881     buf[              tidx] = E_lj;
 882     buf[FBUF_STRIDE + tidx] = E_el;
 883     reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
 884 #endif
 885 }
 886
 887 gmx_opencl_inline
 888 bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
 889 {
 890     if (pred)
 891     {
 892         warp_any[widx] = 1;
 893     }
 894
 895     bool ret = warp_any[widx];
 896
 897     warp_any[widx] = 0;
 898
 899     return ret;
 900 }
 901
 902 //! Returns a true if predicate is true for any work item in warp
 903 gmx_opencl_inline
 904 bool gmx_sub_group_any(volatile __local uint gmx_unused *warp_any, int gmx_unused widx, bool pred)
 905 {
 906 #if USE_SUBGROUP_ANY
 907     return sub_group_any(pred);
 908 #else
 909     return gmx_sub_group_any_localmem(warp_any, widx, pred);
 910 #endif
 911 }
 912
 913 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */