src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "vectype_ops.clh"
  37
  38 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  39 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  40
  41 #define WARP_SIZE  (CL_SIZE*CL_SIZE/2)
  42
  43 #undef KERNEL_UTILS_INLINE
  44 #ifdef KERNEL_UTILS_INLINE
  45 #define __INLINE__ inline
  46 #else
  47 #define __INLINE__
  48 #endif
  49
  50 /* 1.0 / sqrt(M_PI) */
  51 #define M_FLOAT_1_SQRTPI 0.564189583547756f
  52
  53 //-------------------
  54
  55 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  56 #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  57
  58 __constant sampler_t generic_sampler     = (CLK_NORMALIZED_COORDS_FALSE /* Natural coords */
  59                                             | CLK_ADDRESS_NONE          /* No clamp/repeat*/
  60                                             | CLK_FILTER_NEAREST);      /* No interpolation */
  61
  62 #define __device__
  63
  64 #if CL_SIZE == 8
  65 #define WARP_SIZE_LOG2  (5)
  66 #define CL_SIZE_LOG2    (3)
  67 #elif CL_SIZE == 4
  68 #define WARP_SIZE_LOG2  (3)
  69 #define CL_SIZE_LOG2    (2)
  70 #else
  71 #error unsupported CL_SIZE
  72 #endif
  73
  74 #define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
  75 #define FBUF_STRIDE     (CL_SIZE_SQ)
  76
  77 #define ONE_SIXTH_F     0.16666667f
  78 #define ONE_TWELVETH_F  0.08333333f
  79
  80
  81 // Data structures shared between OpenCL device code and OpenCL host code
  82 // TODO: review, improve
  83 // Replaced real by float for now, to avoid including any other header
  84 typedef struct {
  85     float c2;
  86     float c3;
  87     float cpot;
  88 } shift_consts_t;
  89
  90 /* Used with potential switching:
  91  * rsw        = max(r - r_switch, 0)
  92  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
  93  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
  94  * force      = force*dsw - potential*sw
  95  * potential *= sw
  96  */
  97 typedef struct {
  98     float c3;
  99     float c4;
 100     float c5;
 101 } switch_consts_t;
 102
 103 // Data structure shared between the OpenCL device code and OpenCL host code
 104 // Must not contain OpenCL objects (buffers)
 105 typedef struct cl_nbparam_params
 106 {
 107
 108     int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 109     int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 110
 111     float           epsfac;           /**< charge multiplication factor                      */
 112     float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 113     float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 114     float           ewald_beta;       /**< Ewald/PME parameter                               */
 115     float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 116     float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 117     float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 118
 119     float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 120
 121     float           rvdw_sq;          /**< VdW cut-off squared                               */
 122     float           rvdw_switch;      /**< VdW switched cut-off                              */
 123     float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 124     float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 125
 126     shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 127     shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 128     switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 129
 130     /* Ewald Coulomb force table data - accessed through texture memory */
 131     float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 132 }cl_nbparam_params_t;
 133
 134 typedef struct {
 135     int sci;            /* i-super-cluster       */
 136     int shift;          /* Shift vector index plus possible flags */
 137     int cj4_ind_start;  /* Start index into cj4  */
 138     int cj4_ind_end;    /* End index into cj4    */
 139 } nbnxn_sci_t;
 140
 141 typedef struct {
 142     unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 143     int          excl_ind; /* Index into the exclusion array for 1 warp   */
 144 } nbnxn_im_ei_t;
 145
 146 typedef struct {
 147     int           cj[4];   /* The 4 j-clusters                            */
 148     nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 149 } nbnxn_cj4_t;
 150
 151
 152 typedef struct {
 153     unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
 154                                            * each unsigned has bitS for 4*8 i clusters
 155                                            */
 156 } nbnxn_excl_t;
 157
 158 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 159 __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 160
 161
 162 /*! \brief Preload cj4 into local memory.
 163  *
 164  * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 165  * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 166  * - Same as AMD in the nowarp kernel; we do not assume execution width and therefore
 167  *   the caller needs to sync.
 168  *
 169  * It is the caller's responsibility to make sure that data is consumed only when
 170  * it's ready. This function does not call a barrier.
 171  */
 172 __INLINE__ __device__
 173 void preloadCj4(__local int        *sm_cjPreload,
 174                 const __global int *gm_cj,
 175                 int                 tidxi,
 176                 int                 tidxj,
 177                 bool                iMaskCond)
 178
 179 {
 180 #if !USE_CJ_PREFETCH
 181     return;
 182 #endif
 183
 184     const int c_clSize                   = CL_SIZE;
 185     const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
 186     const int c_nbnxnGpuClusterpairSplit = 2;
 187     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 188
 189     /* Pre-load cj into shared memory */
 190 #if defined _NVIDIA_SOURCE_
 191     /* on both warps separately for NVIDIA */
 192     if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
 193     {
 194         sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
 195     }
 196 #else // AMD or nowarp
 197       /* Note that with "nowarp" / on hardware with wavefronts <64 a barrier is needed after preload. */
 198     if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 199     {
 200         sm_cjPreload[tidxi] = gm_cj[tidxi];
 201     }
 202 #endif
 203 }
 204
 205 /* \brief Load a cj given a jm index.
 206  *
 207  * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 208  */
 209 __INLINE__ __device__
 210 int loadCj(__local int        *sm_cjPreload,
 211            const __global int *gm_cj,
 212            int                 jm,
 213            int                 tidxi,
 214            int                 tidxj)
 215 {
 216     const int c_clSize                   = CL_SIZE;
 217     const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
 218     const int c_nbnxnGpuClusterpairSplit = 2;
 219     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 220
 221 #if USE_CJ_PREFETCH
 222 #if defined _NVIDIA_SOURCE_
 223     int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
 224 #elif defined _AMD_SOURCE_
 225     int warpLoadOffset = 0;
 226 #else
 227 #error Not supported
 228 #endif
 229     return sm_cjPreload[jm + warpLoadOffset];
 230 #else
 231     return gm_cj[jm];
 232 #endif
 233 }
 234
 235
 236 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 237 __INLINE__ __device__
 238 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 239                                      const float  epsilon,
 240                                      float       *c6,
 241                                      float       *c12)
 242 {
 243     float sigma2, sigma6;
 244
 245     sigma2 = sigma * sigma;
 246     sigma6 = sigma2 *sigma2 * sigma2;
 247     *c6    = epsilon * sigma6;
 248     *c12   = *c6 * sigma6;
 249 }
 250
 251
 252 /*! Apply force switch,  force + energy version. */
 253 __INLINE__ __device__
 254 void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
 255                               float                c6,
 256                               float                c12,
 257                               float                inv_r,
 258                               float                r2,
 259                               float               *F_invr)
 260 {
 261     float r, r_switch;
 262
 263     /* force switch constants */
 264     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 265     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 266     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 267     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 268
 269     r         = r2 * inv_r;
 270     r_switch  = r - nbparam->rvdw_switch;
 271     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 272
 273     *F_invr  +=
 274         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 275         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 276 }
 277
 278 /*! Apply force switch, force-only version. */
 279 __INLINE__ __device__
 280 void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
 281                                 float                c6,
 282                                 float                c12,
 283                                 float                inv_r,
 284                                 float                r2,
 285                                 float               *F_invr,
 286                                 float               *E_lj)
 287 {
 288     float r, r_switch;
 289
 290     /* force switch constants */
 291     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 292     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 293     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 294     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 295
 296     float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 297     float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 298     float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 299     float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 300
 301     r         = r2 * inv_r;
 302     r_switch  = r - nbparam->rvdw_switch;
 303     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 304
 305     *F_invr  +=
 306         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 307         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 308     *E_lj    +=
 309         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 310         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 311 }
 312
 313 /*! Apply potential switch, force-only version. */
 314 __INLINE__ __device__
 315 void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
 316                                   float                inv_r,
 317                                   float                r2,
 318                                   float               *F_invr,
 319                                   float               *E_lj)
 320 {
 321     float r, r_switch;
 322     float sw, dsw;
 323
 324     /* potential switch constants */
 325     float switch_V3 = nbparam->vdw_switch.c3;
 326     float switch_V4 = nbparam->vdw_switch.c4;
 327     float switch_V5 = nbparam->vdw_switch.c5;
 328     float switch_F2 = nbparam->vdw_switch.c3;
 329     float switch_F3 = nbparam->vdw_switch.c4;
 330     float switch_F4 = nbparam->vdw_switch.c5;
 331
 332     r        = r2 * inv_r;
 333     r_switch = r - nbparam->rvdw_switch;
 334
 335     /* Unlike in the F+E kernel, conditional is faster here */
 336     if (r_switch > 0.0f)
 337     {
 338         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 339         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 340
 341         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 342     }
 343 }
 344
 345 /*! Apply potential switch, force + energy version. */
 346 __INLINE__ __device__
 347 void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
 348                                     float                inv_r,
 349                                     float                r2,
 350                                     float               *F_invr,
 351                                     float               *E_lj)
 352 {
 353     float r, r_switch;
 354     float sw, dsw;
 355
 356     /* potential switch constants */
 357     float switch_V3 = nbparam->vdw_switch.c3;
 358     float switch_V4 = nbparam->vdw_switch.c4;
 359     float switch_V5 = nbparam->vdw_switch.c5;
 360     float switch_F2 = nbparam->vdw_switch.c3;
 361     float switch_F3 = nbparam->vdw_switch.c4;
 362     float switch_F4 = nbparam->vdw_switch.c5;
 363
 364     r        = r2 * inv_r;
 365     r_switch = r - nbparam->rvdw_switch;
 366     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 367
 368     /* Unlike in the F-only kernel, masking is faster here */
 369     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 370     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 371
 372     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 373     *E_lj   *= sw;
 374 }
 375
 376 /*! Calculate LJ-PME grid force contribution with
 377  *  geometric combination rule.
 378  */
 379 __INLINE__ __device__
 380 void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
 381                                     int                typei,
 382                                     int                typej,
 383                                     float              r2,
 384                                     float              inv_r2,
 385                                     float              lje_coeff2,
 386                                     float              lje_coeff6_6,
 387                                     float             *F_invr)
 388 {
 389     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 390
 391     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 392
 393     /* Recalculate inv_r6 without exclusion mask */
 394     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 395     cr2       = lje_coeff2*r2;
 396     expmcr2   = exp(-cr2);
 397     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 398
 399     /* Subtract the grid force from the total LJ force */
 400     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 401 }
 402
 403 /*! Calculate LJ-PME grid force + energy contribution with
 404  *  geometric combination rule.
 405  */
 406 __INLINE__ __device__
 407 void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
 408                                       cl_nbparam_params_t *nbparam,
 409                                       int                  typei,
 410                                       int                  typej,
 411                                       float                r2,
 412                                       float                inv_r2,
 413                                       float                lje_coeff2,
 414                                       float                lje_coeff6_6,
 415                                       float                int_bit,
 416                                       float               *F_invr,
 417                                       float               *E_lj)
 418 {
 419     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 420
 421     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 422
 423     /* Recalculate inv_r6 without exclusion mask */
 424     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 425     cr2       = lje_coeff2*r2;
 426     expmcr2   = exp(-cr2);
 427     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 428
 429     /* Subtract the grid force from the total LJ force */
 430     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 431
 432     /* Shift should be applied only to real LJ pairs */
 433     sh_mask   = nbparam->sh_lj_ewald*int_bit;
 434     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 435 }
 436
 437 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 438  *  Lorentz-Berthelot combination rule.
 439  *  We use a single F+E kernel with conditional because the performance impact
 440  *  of this is pretty small and LB on the CPU is anyway very slow.
 441  */
 442 __INLINE__ __device__
 443 void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
 444                                     cl_nbparam_params_t *nbparam,
 445                                     int                  typei,
 446                                     int                  typej,
 447                                     float                r2,
 448                                     float                inv_r2,
 449                                     float                lje_coeff2,
 450                                     float                lje_coeff6_6,
 451                                     float                int_bit,
 452                                     bool                 with_E_lj,
 453                                     float               *F_invr,
 454                                     float               *E_lj)
 455 {
 456     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 457     float sigma, sigma2, epsilon;
 458
 459     /* sigma and epsilon are scaled to give 6*C6 */
 460     sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 461
 462     epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 463
 464     sigma2  = sigma*sigma;
 465     c6grid  = epsilon*sigma2*sigma2*sigma2;
 466
 467     /* Recalculate inv_r6 without exclusion mask */
 468     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 469     cr2       = lje_coeff2*r2;
 470     expmcr2   = exp(-cr2);
 471     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 472
 473     /* Subtract the grid force from the total LJ force */
 474     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 475
 476     if (with_E_lj == true)
 477     {
 478         float sh_mask;
 479
 480         /* Shift should be applied only to real LJ pairs */
 481         sh_mask   = nbparam->sh_lj_ewald*int_bit;
 482         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 483     }
 484 }
 485
 486 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 487  *  Original idea: from the OpenMM project
 488  */
 489 __INLINE__ __device__ float
 490 interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
 491                             float             r,
 492                             float             scale)
 493 {
 494     float   normalized = scale * r;
 495     int     index      = (int) normalized;
 496     float   fract2     = normalized - index;
 497     float   fract1     = 1.0f - fract2;
 498
 499     return fract1*coulomb_tab_climg2d[index] +
 500            fract2*coulomb_tab_climg2d[index + 1];
 501 }
 502
 503 /*! Calculate analytical Ewald correction term. */
 504 __INLINE__ __device__
 505 float pmecorrF(float z2)
 506 {
 507     const float FN6 = -1.7357322914161492954e-8f;
 508     const float FN5 = 1.4703624142580877519e-6f;
 509     const float FN4 = -0.000053401640219807709149f;
 510     const float FN3 = 0.0010054721316683106153f;
 511     const float FN2 = -0.019278317264888380590f;
 512     const float FN1 = 0.069670166153766424023f;
 513     const float FN0 = -0.75225204789749321333f;
 514
 515     const float FD4 = 0.0011193462567257629232f;
 516     const float FD3 = 0.014866955030185295499f;
 517     const float FD2 = 0.11583842382862377919f;
 518     const float FD1 = 0.50736591960530292870f;
 519     const float FD0 = 1.0f;
 520
 521     float       z4;
 522     float       polyFN0, polyFN1, polyFD0, polyFD1;
 523
 524     z4          = z2*z2;
 525
 526     polyFD0     = FD4*z4 + FD2;
 527     polyFD1     = FD3*z4 + FD1;
 528     polyFD0     = polyFD0*z4 + FD0;
 529     polyFD0     = polyFD1*z2 + polyFD0;
 530
 531     polyFD0     = 1.0f/polyFD0;
 532
 533     polyFN0     = FN6*z4 + FN4;
 534     polyFN1     = FN5*z4 + FN3;
 535     polyFN0     = polyFN0*z4 + FN2;
 536     polyFN1     = polyFN1*z4 + FN1;
 537     polyFN0     = polyFN0*z4 + FN0;
 538     polyFN0     = polyFN1*z2 + polyFN0;
 539
 540     return polyFN0*polyFD0;
 541 }
 542
 543 /*! Final j-force reduction; this generic implementation works with
 544  *  arbitrary array sizes.
 545  */
 546 /* AMD OpenCL compiler error "Undeclared function index 1024" if __INLINE__d */
 547 __INLINE__ __device__
 548 void reduce_force_j_generic(__local float *f_buf, __global float *fout,
 549                             int tidxi, int tidxj, int aidx)
 550 {
 551     /* Split the reduction between the first 3 column threads
 552        Threads with column id 0 will do the reduction for (float3).x components
 553        Threads with column id 1 will do the reduction for (float3).y components
 554        Threads with column id 2 will do the reduction for (float3).z components.
 555        The reduction is performed for each line tidxj of f_buf. */
 556     if (tidxi < 3)
 557     {
 558         float f = 0.0f;
 559         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 560         {
 561             f += f_buf[FBUF_STRIDE * tidxi + j];
 562         }
 563
 564         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 565     }
 566 }
 567
 568 /*! Final i-force reduction; this generic implementation works with
 569  *  arbitrary array sizes.
 570  */
 571 __INLINE__ __device__
 572 void reduce_force_i_generic(__local float *f_buf, __global float *fout,
 573                             float *fshift_buf, bool bCalcFshift,
 574                             int tidxi, int tidxj, int aidx)
 575 {
 576     /* Split the reduction between the first 3 line threads
 577        Threads with line id 0 will do the reduction for (float3).x components
 578        Threads with line id 1 will do the reduction for (float3).y components
 579        Threads with line id 2 will do the reduction for (float3).z components. */
 580     if (tidxj < 3)
 581     {
 582         float f = 0.0f;
 583         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 584         {
 585             f += f_buf[tidxj * FBUF_STRIDE + j];
 586         }
 587
 588         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 589
 590         if (bCalcFshift)
 591         {
 592             (*fshift_buf) += f;
 593         }
 594     }
 595     barrier(CLK_LOCAL_MEM_FENCE);
 596 }
 597
 598 /*! Final i-force reduction; this implementation works only with power of two
 599  *  array sizes.
 600  */
 601 __INLINE__ __device__
 602 void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
 603                          float *fshift_buf, bool bCalcFshift,
 604                          int tidxi, int tidxj, int aidx)
 605 {
 606     int     i, j;
 607     /* Reduce the initial CL_SIZE values for each i atom to half
 608      * every step by using CL_SIZE * i threads.
 609      * Can't just use i as loop variable because than nvcc refuses to unroll.
 610      */
 611     i = CL_SIZE/2;
 612     for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 613     {
 614         if (tidxj < i)
 615         {
 616
 617             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 618             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 619             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 620         }
 621         i >>= 1;
 622     }
 623     /* needed because
 624      * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 625      * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 626      */
 627     barrier(CLK_LOCAL_MEM_FENCE);
 628
 629     /* i == 1, last reduction step, writing to global mem */
 630     /* Split the reduction between the first 3 line threads
 631        Threads with line id 0 will do the reduction for (float3).x components
 632        Threads with line id 1 will do the reduction for (float3).y components
 633        Threads with line id 2 will do the reduction for (float3).z components. */
 634     if (tidxj < 3)
 635     {
 636         float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 637
 638         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 639
 640         if (bCalcFshift)
 641         {
 642             (*fshift_buf) += f;
 643         }
 644     }
 645 }
 646
 647 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 648  *  on whether the size of the array to be reduced is power of two or not.
 649  */
 650 __INLINE__ __device__
 651 void reduce_force_i(__local float *f_buf, __global float *f,
 652                     float *fshift_buf, bool bCalcFshift,
 653                     int tidxi, int tidxj, int ai)
 654 {
 655     if ((CL_SIZE & (CL_SIZE - 1)))
 656     {
 657         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 658     }
 659     else
 660     {
 661         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 662     }
 663 }
 664
 665 /*! Energy reduction; this implementation works only with power of two
 666  *  array sizes.
 667  */
 668 __INLINE__ __device__
 669 void reduce_energy_pow2(volatile __local float  *buf,
 670                         volatile __global float *e_lj,
 671                         volatile __global float *e_el,
 672                         unsigned int             tidx)
 673 {
 674     int     i, j;
 675     float   e1, e2;
 676
 677     i = WARP_SIZE/2;
 678
 679     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 680     for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 681     {
 682         if (tidx < i)
 683         {
 684             buf[              tidx] += buf[              tidx + i];
 685             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 686         }
 687         i >>= 1;
 688     }
 689
 690     /* last reduction step, writing to global mem */
 691     if (tidx == 0)
 692     {
 693         e1 = buf[              tidx] + buf[              tidx + i];
 694         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 695
 696         atomicAdd_g_f(e_lj, e1);
 697         atomicAdd_g_f(e_el, e2);
 698     }
 699 }
 700
 701 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */