src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gromacs/gpu_utils/vectype_ops.clh"
  37 #include "gromacs/gpu_utils/device_utils.clh"
  38 #include "gromacs/mdlib/nbnxn_consts.h"
  39 #include "gromacs/pbcutil/ishift.h"
  40
  41 #include "nbnxn_ocl_consts.h"
  42
  43 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  44 #define NCL_PER_SUPERCL         c_nbnxnGpuNumClusterPerSupercluster
  45
  46 #define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
  47
  48 #if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
  49 /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
  50  * Note that this should precede the kernel_utils include.
  51  */
  52 #define USE_CJ_PREFETCH 1
  53 #else
  54 #define USE_CJ_PREFETCH 0
  55 #endif
  56
  57 #if (defined cl_intel_subgroups || defined cl_khr_subgroups || __OPENCL_VERSION__ >= 210)
  58 #define HAVE_SUBGROUP 1
  59 #else
  60 #define HAVE_SUBGROUP 0
  61 #endif
  62
  63 #ifdef cl_intel_subgroups
  64 #define HAVE_INTEL_SUBGROUP 1
  65 #else
  66 #define HAVE_INTEL_SUBGROUP 0
  67 #endif
  68
  69 #if _INTEL_SOURCE_
  70 #define SUBGROUP_SIZE 8
  71 #elif _AMD_SOURCE_
  72 #define SUBGROUP_SIZE 64
  73 #else
  74 #define SUBGROUP_SIZE 32
  75 #endif
  76
  77 #define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
  78 #define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
  79 #define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
  80
  81 /* 1.0 / sqrt(M_PI) */
  82 #define M_FLOAT_1_SQRTPI 0.564189583547756f
  83
  84 //-------------------
  85
  86 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  87 #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  88
  89 #if CL_SIZE == 8
  90 #define WARP_SIZE_LOG2  (5)
  91 #define CL_SIZE_LOG2    (3)
  92 #elif CL_SIZE == 4
  93 #define WARP_SIZE_LOG2  (3)
  94 #define CL_SIZE_LOG2    (2)
  95 #else
  96 #error unsupported CL_SIZE
  97 #endif
  98
  99 #define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
 100 #define FBUF_STRIDE     (CL_SIZE_SQ)
 101
 102 #define ONE_SIXTH_F     0.16666667f
 103 #define ONE_TWELVETH_F  0.08333333f
 104
 105
 106 // Data structures shared between OpenCL device code and OpenCL host code
 107 // TODO: review, improve
 108 // Replaced real by float for now, to avoid including any other header
 109 typedef struct {
 110     float c2;
 111     float c3;
 112     float cpot;
 113 } shift_consts_t;
 114
 115 /* Used with potential switching:
 116  * rsw        = max(r - r_switch, 0)
 117  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 118  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 119  * force      = force*dsw - potential*sw
 120  * potential *= sw
 121  */
 122 typedef struct {
 123     float c3;
 124     float c4;
 125     float c5;
 126 } switch_consts_t;
 127
 128 // Data structure shared between the OpenCL device code and OpenCL host code
 129 // Must not contain OpenCL objects (buffers)
 130 typedef struct cl_nbparam_params
 131 {
 132
 133     int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 134     int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 135
 136     float           epsfac;           /**< charge multiplication factor                      */
 137     float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 138     float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 139     float           ewald_beta;       /**< Ewald/PME parameter                               */
 140     float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 141     float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 142     float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 143
 144     float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 145
 146     float           rvdw_sq;          /**< VdW cut-off squared                               */
 147     float           rvdw_switch;      /**< VdW switched cut-off                              */
 148     float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 149     float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 150
 151     shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 152     shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 153     switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 154
 155     /* Ewald Coulomb force table data - accessed through texture memory */
 156     float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 157 }cl_nbparam_params_t;
 158
 159 typedef struct {
 160     int sci;            /* i-super-cluster       */
 161     int shift;          /* Shift vector index plus possible flags */
 162     int cj4_ind_start;  /* Start index into cj4  */
 163     int cj4_ind_end;    /* End index into cj4    */
 164 } nbnxn_sci_t;
 165
 166 typedef struct {
 167     unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 168     int          excl_ind; /* Index into the exclusion array for 1 warp   */
 169 } nbnxn_im_ei_t;
 170
 171 typedef struct {
 172     int           cj[4];   /* The 4 j-clusters                            */
 173     nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 174 } nbnxn_cj4_t;
 175
 176
 177 typedef struct {
 178     unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
 179                                            * each unsigned has bitS for 4*8 i clusters
 180                                            */
 181 } nbnxn_excl_t;
 182
 183 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 184 __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 185
 186 gmx_opencl_inline
 187 void preloadCj4Generic(__local int        *sm_cjPreload,
 188                        const __global int *gm_cj,
 189                        int                 tidxi,
 190                        int                 tidxj,
 191                        bool                iMaskCond)
 192
 193 {
 194     /* Pre-load cj into shared memory */
 195 #if defined _AMD_SOURCE_ //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 196     if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 197     {
 198         sm_cjPreload[tidxi] = gm_cj[tidxi];
 199     }
 200 #else
 201     const int c_clSize                   = CL_SIZE;
 202     const int c_nbnxnGpuClusterpairSplit = 2;
 203     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 204
 205     if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
 206     {
 207         sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
 208     }
 209 #endif
 210 }
 211
 212
 213 #if USE_SUBGROUP_PRELOAD
 214 gmx_opencl_inline
 215 int  preloadCj4Subgroup(const __global int *gm_cj)
 216 {
 217     //loads subgroup-size # of elements (8) instead of the 4 required
 218     //equivalent to *cjs = *gm_cj
 219     return intel_sub_group_block_read((const __global uint *)gm_cj);
 220 }
 221 #endif //USE_SUBGROUP_PRELOAD
 222
 223 #if USE_SUBGROUP_PRELOAD
 224 typedef int CjType;
 225 #else
 226 typedef __local int* CjType;
 227 #endif
 228
 229 /*! \brief Preload cj4
 230  *
 231  * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 232  * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 233  * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 234  *
 235  * It is the caller's responsibility to make sure that data is consumed only when
 236  * it's ready. This function does not call a barrier.
 237  */
 238 gmx_opencl_inline
 239 void preloadCj4(CjType             *cjs,
 240                 const __global int *gm_cj,
 241                 int                 tidxi,
 242                 int                 tidxj,
 243                 bool                iMaskCond)
 244 {
 245 #if USE_SUBGROUP_PRELOAD
 246     *cjs = preloadCj4Subgroup(gm_cj);
 247 #elif USE_CJ_PREFETCH
 248     preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
 249 #else
 250     //nothing to do
 251 #endif
 252 }
 253
 254 gmx_opencl_inline
 255 int loadCjPreload(__local int*        sm_cjPreload,
 256                   int                 jm,
 257                   int                 tidxi,
 258                   int                 tidxj)
 259 {
 260 #if defined _AMD_SOURCE_
 261     int       warpLoadOffset = 0; //TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
 262 #else
 263     const int c_clSize                   = CL_SIZE;
 264     const int c_nbnxnGpuClusterpairSplit = 2;
 265     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 266
 267     int       warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize/c_splitClSize;
 268 #endif
 269     return sm_cjPreload[jm + warpLoadOffset];
 270 }
 271
 272 /* \brief Load a cj given a jm index.
 273  *
 274  * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 275  */
 276 gmx_opencl_inline
 277 int loadCj(CjType cjs, const __global int *gm_cj,
 278            int jm, int tidxi, int tidxj)
 279 {
 280 #if USE_SUBGROUP_PRELOAD
 281     return sub_group_broadcast(cjs, jm);
 282 #elif USE_CJ_PREFETCH
 283     return loadCjPreload(cjs, jm, tidxi, tidxj);
 284 #else
 285     return gm_cj[jm];
 286 #endif
 287 }
 288
 289 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 290 gmx_opencl_inline
 291 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 292                                      const float  epsilon,
 293                                      float       *c6,
 294                                      float       *c12)
 295 {
 296     float sigma2, sigma6;
 297
 298     sigma2 = sigma * sigma;
 299     sigma6 = sigma2 *sigma2 * sigma2;
 300     *c6    = epsilon * sigma6;
 301     *c12   = *c6 * sigma6;
 302 }
 303
 304
 305 /*! Apply force switch,  force + energy version. */
 306 gmx_opencl_inline
 307 void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
 308                               float                c6,
 309                               float                c12,
 310                               float                inv_r,
 311                               float                r2,
 312                               float               *F_invr)
 313 {
 314     float r, r_switch;
 315
 316     /* force switch constants */
 317     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 318     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 319     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 320     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 321
 322     r         = r2 * inv_r;
 323     r_switch  = r - nbparam->rvdw_switch;
 324     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 325
 326     *F_invr  +=
 327         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 328         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 329 }
 330
 331 /*! Apply force switch, force-only version. */
 332 gmx_opencl_inline
 333 void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
 334                                 float                c6,
 335                                 float                c12,
 336                                 float                inv_r,
 337                                 float                r2,
 338                                 float               *F_invr,
 339                                 float               *E_lj)
 340 {
 341     float r, r_switch;
 342
 343     /* force switch constants */
 344     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 345     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 346     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 347     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 348
 349     float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 350     float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 351     float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 352     float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 353
 354     r         = r2 * inv_r;
 355     r_switch  = r - nbparam->rvdw_switch;
 356     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 357
 358     *F_invr  +=
 359         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 360         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 361     *E_lj    +=
 362         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 363         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 364 }
 365
 366 /*! Apply potential switch, force-only version. */
 367 gmx_opencl_inline
 368 void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
 369                                   float                inv_r,
 370                                   float                r2,
 371                                   float               *F_invr,
 372                                   float               *E_lj)
 373 {
 374     float r, r_switch;
 375     float sw, dsw;
 376
 377     /* potential switch constants */
 378     float switch_V3 = nbparam->vdw_switch.c3;
 379     float switch_V4 = nbparam->vdw_switch.c4;
 380     float switch_V5 = nbparam->vdw_switch.c5;
 381     float switch_F2 = nbparam->vdw_switch.c3;
 382     float switch_F3 = nbparam->vdw_switch.c4;
 383     float switch_F4 = nbparam->vdw_switch.c5;
 384
 385     r        = r2 * inv_r;
 386     r_switch = r - nbparam->rvdw_switch;
 387
 388     /* Unlike in the F+E kernel, conditional is faster here */
 389     if (r_switch > 0.0f)
 390     {
 391         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 392         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 393
 394         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 395     }
 396 }
 397
 398 /*! Apply potential switch, force + energy version. */
 399 gmx_opencl_inline
 400 void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
 401                                     float                inv_r,
 402                                     float                r2,
 403                                     float               *F_invr,
 404                                     float               *E_lj)
 405 {
 406     float r, r_switch;
 407     float sw, dsw;
 408
 409     /* potential switch constants */
 410     float switch_V3 = nbparam->vdw_switch.c3;
 411     float switch_V4 = nbparam->vdw_switch.c4;
 412     float switch_V5 = nbparam->vdw_switch.c5;
 413     float switch_F2 = nbparam->vdw_switch.c3;
 414     float switch_F3 = nbparam->vdw_switch.c4;
 415     float switch_F4 = nbparam->vdw_switch.c5;
 416
 417     r        = r2 * inv_r;
 418     r_switch = r - nbparam->rvdw_switch;
 419     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 420
 421     /* Unlike in the F-only kernel, masking is faster here */
 422     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 423     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 424
 425     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 426     *E_lj   *= sw;
 427 }
 428
 429 /*! Calculate LJ-PME grid force contribution with
 430  *  geometric combination rule.
 431  */
 432 gmx_opencl_inline
 433 void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
 434                                     int                typei,
 435                                     int                typej,
 436                                     float              r2,
 437                                     float              inv_r2,
 438                                     float              lje_coeff2,
 439                                     float              lje_coeff6_6,
 440                                     float             *F_invr)
 441 {
 442     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 443
 444     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 445
 446     /* Recalculate inv_r6 without exclusion mask */
 447     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 448     cr2       = lje_coeff2*r2;
 449     expmcr2   = exp(-cr2);
 450     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 451
 452     /* Subtract the grid force from the total LJ force */
 453     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 454 }
 455
 456 /*! Calculate LJ-PME grid force + energy contribution with
 457  *  geometric combination rule.
 458  */
 459 gmx_opencl_inline
 460 void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
 461                                       cl_nbparam_params_t *nbparam,
 462                                       int                  typei,
 463                                       int                  typej,
 464                                       float                r2,
 465                                       float                inv_r2,
 466                                       float                lje_coeff2,
 467                                       float                lje_coeff6_6,
 468                                       float                int_bit,
 469                                       float               *F_invr,
 470                                       float               *E_lj)
 471 {
 472     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 473
 474     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 475
 476     /* Recalculate inv_r6 without exclusion mask */
 477     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 478     cr2       = lje_coeff2*r2;
 479     expmcr2   = exp(-cr2);
 480     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 481
 482     /* Subtract the grid force from the total LJ force */
 483     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 484
 485     /* Shift should be applied only to real LJ pairs */
 486     sh_mask   = nbparam->sh_lj_ewald*int_bit;
 487     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 488 }
 489
 490 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 491  *  Lorentz-Berthelot combination rule.
 492  *  We use a single F+E kernel with conditional because the performance impact
 493  *  of this is pretty small and LB on the CPU is anyway very slow.
 494  */
 495 gmx_opencl_inline
 496 void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
 497                                     cl_nbparam_params_t *nbparam,
 498                                     int                  typei,
 499                                     int                  typej,
 500                                     float                r2,
 501                                     float                inv_r2,
 502                                     float                lje_coeff2,
 503                                     float                lje_coeff6_6,
 504                                     float                int_bit,
 505                                     bool                 with_E_lj,
 506                                     float               *F_invr,
 507                                     float               *E_lj)
 508 {
 509     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 510     float sigma, sigma2, epsilon;
 511
 512     /* sigma and epsilon are scaled to give 6*C6 */
 513     sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 514
 515     epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 516
 517     sigma2  = sigma*sigma;
 518     c6grid  = epsilon*sigma2*sigma2*sigma2;
 519
 520     /* Recalculate inv_r6 without exclusion mask */
 521     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 522     cr2       = lje_coeff2*r2;
 523     expmcr2   = exp(-cr2);
 524     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 525
 526     /* Subtract the grid force from the total LJ force */
 527     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 528
 529     if (with_E_lj)
 530     {
 531         float sh_mask;
 532
 533         /* Shift should be applied only to real LJ pairs */
 534         sh_mask   = nbparam->sh_lj_ewald*int_bit;
 535         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 536     }
 537 }
 538
 539 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 540  *  Original idea: from the OpenMM project
 541  */
 542 gmx_opencl_inline float
 543 interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
 544                             float             r,
 545                             float             scale)
 546 {
 547     float   normalized = scale * r;
 548     int     index      = (int) normalized;
 549     float   fract2     = normalized - index;
 550     float   fract1     = 1.0f - fract2;
 551
 552     return fract1*coulomb_tab_climg2d[index] +
 553            fract2*coulomb_tab_climg2d[index + 1];
 554 }
 555
 556 /*! Calculate analytical Ewald correction term. */
 557 gmx_opencl_inline
 558 float pmecorrF(float z2)
 559 {
 560     const float FN6 = -1.7357322914161492954e-8f;
 561     const float FN5 = 1.4703624142580877519e-6f;
 562     const float FN4 = -0.000053401640219807709149f;
 563     const float FN3 = 0.0010054721316683106153f;
 564     const float FN2 = -0.019278317264888380590f;
 565     const float FN1 = 0.069670166153766424023f;
 566     const float FN0 = -0.75225204789749321333f;
 567
 568     const float FD4 = 0.0011193462567257629232f;
 569     const float FD3 = 0.014866955030185295499f;
 570     const float FD2 = 0.11583842382862377919f;
 571     const float FD1 = 0.50736591960530292870f;
 572     const float FD0 = 1.0f;
 573
 574     float       z4;
 575     float       polyFN0, polyFN1, polyFD0, polyFD1;
 576
 577     z4          = z2*z2;
 578
 579     polyFD0     = FD4*z4 + FD2;
 580     polyFD1     = FD3*z4 + FD1;
 581     polyFD0     = polyFD0*z4 + FD0;
 582     polyFD0     = polyFD1*z2 + polyFD0;
 583
 584     polyFD0     = 1.0f/polyFD0;
 585
 586     polyFN0     = FN6*z4 + FN4;
 587     polyFN1     = FN5*z4 + FN3;
 588     polyFN0     = polyFN0*z4 + FN2;
 589     polyFN1     = polyFN1*z4 + FN1;
 590     polyFN0     = polyFN0*z4 + FN0;
 591     polyFN0     = polyFN1*z2 + polyFN0;
 592
 593     return polyFN0*polyFD0;
 594 }
 595
 596 #if REDUCE_SHUFFLE
 597 gmx_opencl_inline
 598 void reduce_force_j_shfl(float3 fin, __global float *fout,
 599                          int tidxi, int tidxj, int aidx)
 600 {
 601     /* Only does reduction over 4 elements in cluster. Needs to be changed
 602      * for CL_SIZE>4. See CUDA code for required code */
 603     fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
 604     fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
 605     fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
 606     if ((tidxi & 1) == 1)
 607     {
 608         fin.x = fin.y;
 609     }
 610     fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
 611     fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
 612     if (tidxi == 2)
 613     {
 614         fin.x = fin.z;
 615     }
 616     if (tidxi < 3)
 617     {
 618         atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
 619     }
 620 }
 621 #endif
 622
 623 gmx_opencl_inline
 624 void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
 625                             int tidxi, int tidxj, int aidx)
 626 {
 627     int tidx = tidxi + tidxj*CL_SIZE;
 628     f_buf[                  tidx] = fcj_buf.x;
 629     f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
 630     f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
 631
 632     /* Split the reduction between the first 3 column threads
 633        Threads with column id 0 will do the reduction for (float3).x components
 634        Threads with column id 1 will do the reduction for (float3).y components
 635        Threads with column id 2 will do the reduction for (float3).z components.
 636        The reduction is performed for each line tidxj of f_buf. */
 637     if (tidxi < 3)
 638     {
 639         float f = 0.0f;
 640         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 641         {
 642             f += f_buf[FBUF_STRIDE * tidxi + j];
 643         }
 644
 645         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 646     }
 647 }
 648
 649 /*! Final j-force reduction
 650  */
 651 gmx_opencl_inline
 652 void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout,
 653                     int tidxi, int tidxj, int aidx)
 654 {
 655 #if REDUCE_SHUFFLE
 656     reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
 657 #else
 658     reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
 659 #endif
 660 }
 661
 662 #if REDUCE_SHUFFLE
 663 gmx_opencl_inline
 664 void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
 665                                    bool bCalcFshift, int tidxi, int tidxj,
 666                                    int sci, int shift, __global float *fshift)
 667 {
 668     /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
 669      * for CL_SIZE>4.*/
 670     float2 fshift_buf = 0;
 671     for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 672     {
 673         int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 674         float3 fin  = fci_buf[ci_offset];
 675         fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
 676         fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
 677         fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
 678
 679         if (tidxj & 1)
 680         {
 681             fin.x = fin.y;
 682         }
 683         /* Threads 0,1 and 2,3 increment x,y for their warp */
 684         atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
 685         if (bCalcFshift)
 686         {
 687             fshift_buf[0] += fin.x;
 688         }
 689         /* Threads 0 and 2 increment z for their warp */
 690         if ((tidxj & 1) == 0)
 691         {
 692             atomicAdd_g_f(&fout[3*aidx+2], fin.z);
 693             if (bCalcFshift)
 694             {
 695                 fshift_buf[1] += fin.z;
 696             }
 697         }
 698     }
 699     /* add up local shift forces into global mem */
 700     if (bCalcFshift)
 701     {
 702         //Threads 0,1 and 2,3 update x,y
 703         atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
 704         //Threads 0 and 2 update z
 705         if ((tidxj & 1) == 0)
 706         {
 707             atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
 708         }
 709     }
 710 }
 711 #endif
 712
 713 /*! Final i-force reduction; this implementation works only with power of two
 714  *  array sizes.
 715  */
 716 gmx_opencl_inline
 717 void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
 718                                    __global float *fout,
 719                                    bool bCalcFshift,
 720                                    int tidxi, int tidxj,
 721                                    int sci, int shift, __global float *fshift)
 722 {
 723     float fshift_buf = 0;
 724     for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
 725     {
 726         int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
 727         int tidx = tidxi + tidxj*CL_SIZE;
 728         /* store i forces in shmem */
 729         f_buf[                  tidx] = fci_buf[ci_offset].x;
 730         f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
 731         f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
 732         barrier(CLK_LOCAL_MEM_FENCE);
 733
 734         int     i, j;
 735         /* Reduce the initial CL_SIZE values for each i atom to half
 736          * every step by using CL_SIZE * i threads.
 737          * Can't just use i as loop variable because than nvcc refuses to unroll.
 738          */
 739         i = CL_SIZE/2;
 740         for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 741         {
 742             if (tidxj < i)
 743             {
 744
 745                 f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 746                 f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 747                 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 748             }
 749             i >>= 1;
 750         }
 751         /* needed because
 752          * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 753          * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 754          * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
 755          */
 756         barrier(CLK_LOCAL_MEM_FENCE);
 757
 758         /* i == 1, last reduction step, writing to global mem */
 759         /* Split the reduction between the first 3 line threads
 760            Threads with line id 0 will do the reduction for (float3).x components
 761            Threads with line id 1 will do the reduction for (float3).y components
 762            Threads with line id 2 will do the reduction for (float3).z components. */
 763         if (tidxj < 3)
 764         {
 765             float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 766
 767             atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 768
 769             if (bCalcFshift)
 770             {
 771                 fshift_buf += f;
 772             }
 773         }
 774     }
 775     /* add up local shift forces into global mem */
 776     if (bCalcFshift)
 777     {
 778         /* Only threads with tidxj < 3 will update fshift.
 779            The threads performing the update, must be the same as the threads
 780            storing the reduction result above.
 781          */
 782         if (tidxj < 3)
 783         {
 784             atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
 785         }
 786     }
 787 }
 788
 789 /*! Final i-force reduction
 790  */
 791 gmx_opencl_inline
 792 void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f,
 793                               bool bCalcFshift, int tidxi, int tidxj, int sci,
 794                               int shift, __global float *fshift)
 795 {
 796 #if REDUCE_SHUFFLE
 797     reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
 798                                   sci, shift, fshift);
 799 #else
 800     reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
 801                                   sci, shift, fshift);
 802 #endif
 803 }
 804
 805
 806
 807 #if REDUCE_SHUFFLE
 808 gmx_opencl_inline
 809 void reduce_energy_shfl(float E_lj, float E_el,
 810                         volatile __global float *e_lj,
 811                         volatile __global float *e_el,
 812                         unsigned int    tidx)
 813 {
 814     E_lj = sub_group_reduce_add(E_lj);
 815     E_el = sub_group_reduce_add(E_el);
 816     /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
 817      * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
 818      * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
 819      * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
 820      * (by supporting c_nbnxnGpuClusterpairSplit=1). */
 821     if (tidx == 0 || tidx == WARP_SIZE)
 822     {
 823         atomicAdd_g_f(e_lj, E_lj);
 824         atomicAdd_g_f(e_el, E_el);
 825     }
 826 }
 827 #endif
 828
 829 /*! Energy reduction; this implementation works only with power of two
 830  *  array sizes.
 831  */
 832 gmx_opencl_inline
 833 void reduce_energy_pow2(volatile __local float  *buf,
 834                         volatile __global float *e_lj,
 835                         volatile __global float *e_el,
 836                         unsigned int             tidx)
 837 {
 838     int     i, j;
 839     float   e1, e2;
 840
 841     i = WARP_SIZE/2;
 842
 843     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 844     for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 845     {
 846         if (tidx < i)
 847         {
 848             buf[              tidx] += buf[              tidx + i];
 849             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 850         }
 851         i >>= 1;
 852     }
 853
 854     /* last reduction step, writing to global mem */
 855     if (tidx == 0)
 856     {
 857         e1 = buf[              tidx] + buf[              tidx + i];
 858         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 859
 860         atomicAdd_g_f(e_lj, e1);
 861         atomicAdd_g_f(e_el, e2);
 862     }
 863 }
 864
 865 gmx_opencl_inline
 866 void reduce_energy(volatile __local float  *buf,
 867                    float E_lj, float E_el,
 868                    volatile __global float *e_lj,
 869                    volatile __global float *e_el,
 870                    unsigned int             tidx)
 871 {
 872 #if REDUCE_SHUFFLE
 873     reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
 874 #else
 875     /* flush the energies to shmem and reduce them */
 876     buf[              tidx] = E_lj;
 877     buf[FBUF_STRIDE + tidx] = E_el;
 878     reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
 879 #endif
 880 }
 881
 882 bool gmx_sub_group_any_localmem(volatile __local uint *warp_any, int widx, bool pred)
 883 {
 884     if (pred)
 885     {
 886         warp_any[widx] = 1;
 887     }
 888
 889     bool ret = warp_any[widx];
 890
 891     warp_any[widx] = 0;
 892
 893     return ret;
 894 }
 895
 896 //! Returns a true if predicate is true for any work item in warp
 897 bool gmx_sub_group_any(volatile __local uint *warp_any, int widx, bool pred)
 898 {
 899 #if USE_SUBGROUP_ANY
 900     return sub_group_any(pred);
 901 #else
 902     return gmx_sub_group_any_localmem(warp_any, widx, pred);
 903 #endif
 904 }
 905
 906 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */