src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "vectype_ops.clh"
  37
  38 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  39 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  40
  41 #define WARP_SIZE  32
  42
  43 #undef KERNEL_UTILS_INLINE
  44 #ifdef KERNEL_UTILS_INLINE
  45 #define __INLINE__ inline
  46 #else
  47 #define __INLINE__
  48 #endif
  49
  50 /* 1.0 / sqrt(M_PI) */
  51 #define M_FLOAT_1_SQRTPI 0.564189583547756f
  52
  53 //-------------------
  54
  55 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  56 #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  57
  58 __constant sampler_t generic_sampler     = CLK_NORMALIZED_COORDS_FALSE  /* Natural coords */
  59                                             | CLK_ADDRESS_NONE          /* No clamp/repeat*/
  60                                             | CLK_FILTER_NEAREST ;      /* No interpolation */
  61
  62 #define __device__
  63
  64 #define WARP_SIZE_POW2_EXPONENT     (5)
  65 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  66 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  67 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  68
  69 #define ONE_SIXTH_F     0.16666667f
  70 #define ONE_TWELVETH_F  0.08333333f
  71
  72
  73 // Data structures shared between OpenCL device code and OpenCL host code
  74 // TODO: review, improve
  75 // Replaced real by float for now, to avoid including any other header
  76 typedef struct {
  77     /*real*/float c2;
  78     /*real*/float c3;
  79     /*real*/float cpot;
  80 } shift_consts_t;
  81
  82 /* Used with potential switching:
  83  * rsw        = max(r - r_switch, 0)
  84  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
  85  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
  86  * force      = force*dsw - potential*sw
  87  * potential *= sw
  88  */
  89 typedef struct {
  90     /*real*/float c3;
  91     /*real*/float c4;
  92     /*real*/float c5;
  93 } switch_consts_t;
  94
  95 // Data structure shared between the OpenCL device code and OpenCL host code
  96 // Must not contain OpenCL objects (buffers)
  97 typedef struct cl_nbparam_params
  98 {
  99
 100     int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 101     int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 102
 103     float           epsfac;           /**< charge multiplication factor                      */
 104     float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 105     float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 106     float           ewald_beta;       /**< Ewald/PME parameter                               */
 107     float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 108     float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 109     float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 110
 111     float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 112
 113     float           rvdw_sq;          /**< VdW cut-off squared                               */
 114     float           rvdw_switch;      /**< VdW switched cut-off                              */
 115     float           rlist_sq;         /**< pair-list cut-off squared                         */
 116
 117     shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 118     shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 119     switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 120
 121     /* Ewald Coulomb force table data - accessed through texture memory */
 122     int                    coulomb_tab_size;   /**< table size (s.t. it fits in texture cache) */
 123     float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 124 }cl_nbparam_params_t;
 125
 126 typedef struct {
 127     int sci;            /* i-super-cluster       */
 128     int shift;          /* Shift vector index plus possible flags */
 129     int cj4_ind_start;  /* Start index into cj4  */
 130     int cj4_ind_end;    /* End index into cj4    */
 131 } nbnxn_sci_t;
 132
 133 typedef struct {
 134     unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 135     int          excl_ind; /* Index into the exclusion array for 1 warp   */
 136 } nbnxn_im_ei_t;
 137
 138 typedef struct {
 139     int           cj[4];   /* The 4 j-clusters                            */
 140     nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 141 } nbnxn_cj4_t;
 142
 143
 144 typedef struct {
 145     unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
 146                             * each unsigned has bitS for 4*8 i clusters
 147                             */
 148 } nbnxn_excl_t;
 149
 150 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 151 __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 152
 153 /*! Apply force switch,  force + energy version. */
 154  __INLINE__ __device__
 155 void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
 156                               float     c6,
 157                               float     c12,
 158                               float     inv_r,
 159                               float     r2,
 160                               float *   F_invr)
 161 {
 162     float r, r_switch;
 163
 164     /* force switch constants */
 165     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 166     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 167     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 168     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 169
 170     r         = r2 * inv_r;
 171     r_switch  = r - nbparam->rvdw_switch;
 172     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 173
 174     *F_invr  +=
 175         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 176         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 177 }
 178
 179 /*! Apply force switch, force-only version. */
 180 __INLINE__ __device__
 181 void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
 182                                 float               c6,
 183                                 float               c12,
 184                                 float               inv_r,
 185                                 float               r2,
 186                                 float      *F_invr,
 187                                 float      *E_lj)
 188 {
 189     float r, r_switch;
 190
 191     /* force switch constants */
 192     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 193     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 194     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 195     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 196
 197     float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 198     float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 199     float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 200     float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 201
 202     r         = r2 * inv_r;
 203     r_switch  = r - nbparam->rvdw_switch;
 204     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 205
 206     *F_invr  +=
 207         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 208         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 209     *E_lj    +=
 210         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 211         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 212 }
 213
 214 /*! Apply potential switch, force-only version. */
 215 __INLINE__ __device__
 216 void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
 217                                   float               c6,
 218                                   float               c12,
 219                                   float               inv_r,
 220                                   float               r2,
 221                                   float     *F_invr,
 222                                   float     *E_lj)
 223 {
 224     float r, r_switch;
 225     float sw, dsw;
 226
 227     /* potential switch constants */
 228     float switch_V3 = nbparam->vdw_switch.c3;
 229     float switch_V4 = nbparam->vdw_switch.c4;
 230     float switch_V5 = nbparam->vdw_switch.c5;
 231     float switch_F2 = nbparam->vdw_switch.c3;
 232     float switch_F3 = nbparam->vdw_switch.c4;
 233     float switch_F4 = nbparam->vdw_switch.c5;
 234
 235     r        = r2 * inv_r;
 236     r_switch = r - nbparam->rvdw_switch;
 237
 238     /* Unlike in the F+E kernel, conditional is faster here */
 239     if (r_switch > 0.0f)
 240     {
 241         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 242         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 243
 244         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 245     }
 246 }
 247
 248 /*! Apply potential switch, force + energy version. */
 249 __INLINE__ __device__
 250 void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
 251                                     float               c6,
 252                                     float               c12,
 253                                     float               inv_r,
 254                                     float               r2,
 255                                     float              *F_invr,
 256                                     float              *E_lj)
 257 {
 258     float r, r_switch;
 259     float sw, dsw;
 260
 261     /* potential switch constants */
 262     float switch_V3 = nbparam->vdw_switch.c3;
 263     float switch_V4 = nbparam->vdw_switch.c4;
 264     float switch_V5 = nbparam->vdw_switch.c5;
 265     float switch_F2 = nbparam->vdw_switch.c3;
 266     float switch_F3 = nbparam->vdw_switch.c4;
 267     float switch_F4 = nbparam->vdw_switch.c5;
 268
 269     r        = r2 * inv_r;
 270     r_switch = r - nbparam->rvdw_switch;
 271     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 272
 273     /* Unlike in the F-only kernel, masking is faster here */
 274     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 275     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 276
 277     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 278     *E_lj   *= sw;
 279 }
 280
 281 /*! Calculate LJ-PME grid force contribution with
 282  *  geometric combination rule.
 283  */
 284 __INLINE__ __device__
 285 void calculate_lj_ewald_comb_geom_F(__constant float *     nbfp_comb_climg2d,
 286                                     int                typei,
 287                                     int                typej,
 288                                     float              r2,
 289                                     float              inv_r2,
 290                                     float              lje_coeff2,
 291                                     float              lje_coeff6_6,
 292                                     float             *F_invr)
 293 {
 294     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 295
 296     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 297
 298     /* Recalculate inv_r6 without exclusion mask */
 299     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 300     cr2       = lje_coeff2*r2;
 301     expmcr2   = exp(-cr2);
 302     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 303
 304     /* Subtract the grid force from the total LJ force */
 305     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 306 }
 307
 308 /*! Calculate LJ-PME grid force + energy contribution with
 309  *  geometric combination rule.
 310  */
 311 __INLINE__ __device__
 312 void calculate_lj_ewald_comb_geom_F_E(__constant float *nbfp_comb_climg2d,
 313                                       cl_nbparam_params_t *nbparam,
 314                                       int                typei,
 315                                       int                typej,
 316                                       float              r2,
 317                                       float              inv_r2,
 318                                       float              lje_coeff2,
 319                                       float              lje_coeff6_6,
 320                                       float              int_bit,
 321                                       float             *F_invr,
 322                                       float             *E_lj)
 323 {
 324     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 325
 326     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 327
 328     /* Recalculate inv_r6 without exclusion mask */
 329     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 330     cr2       = lje_coeff2*r2;
 331     expmcr2   = exp(-cr2);
 332     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 333
 334     /* Subtract the grid force from the total LJ force */
 335     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 336
 337     /* Shift should be applied only to real LJ pairs */
 338     sh_mask   = nbparam->sh_lj_ewald*int_bit;
 339     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 340 }
 341
 342 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 343  *  Lorentz-Berthelot combination rule.
 344  *  We use a single F+E kernel with conditional because the performance impact
 345  *  of this is pretty small and LB on the CPU is anyway very slow.
 346  */
 347 __INLINE__ __device__
 348 void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d,
 349                                     cl_nbparam_params_t *nbparam,
 350                                     int                typei,
 351                                     int                typej,
 352                                     float              r2,
 353                                     float              inv_r2,
 354                                     float              lje_coeff2,
 355                                     float              lje_coeff6_6,
 356                                     float              int_bit,
 357                                     bool               with_E_lj,
 358                                     float             *F_invr,
 359                                     float             *E_lj)
 360 {
 361     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 362     float sigma, sigma2, epsilon;
 363
 364     /* sigma and epsilon are scaled to give 6*C6 */
 365     sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 366
 367     epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 368
 369     sigma2  = sigma*sigma;
 370     c6grid  = epsilon*sigma2*sigma2*sigma2;
 371
 372     /* Recalculate inv_r6 without exclusion mask */
 373     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 374     cr2       = lje_coeff2*r2;
 375     expmcr2   = exp(-cr2);
 376     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 377
 378     /* Subtract the grid force from the total LJ force */
 379     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 380
 381     if (with_E_lj==true)
 382     {
 383         float sh_mask;
 384
 385         /* Shift should be applied only to real LJ pairs */
 386         sh_mask   = nbparam->sh_lj_ewald*int_bit;
 387         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 388     }
 389 }
 390
 391 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 392  *  Original idea: from the OpenMM project
 393  */
 394 __INLINE__ __device__ float
 395 interpolate_coulomb_force_r(__constant float*     coulomb_tab_climg2d,
 396                             float r,
 397                             float scale)
 398 {
 399     float   normalized = scale * r;
 400     int     index      = (int) normalized;
 401     float   fract2     = normalized - index;
 402     float   fract1     = 1.0f - fract2;
 403
 404     /* sigma and epsilon are scaled to give 6*C6 */
 405     return coulomb_tab_climg2d[index]*coulomb_tab_climg2d[index];
 406 }
 407
 408 /*! Calculate analytical Ewald correction term. */
 409 __INLINE__ __device__
 410 float pmecorrF(float z2)
 411 {
 412     const float FN6 = -1.7357322914161492954e-8f;
 413     const float FN5 = 1.4703624142580877519e-6f;
 414     const float FN4 = -0.000053401640219807709149f;
 415     const float FN3 = 0.0010054721316683106153f;
 416     const float FN2 = -0.019278317264888380590f;
 417     const float FN1 = 0.069670166153766424023f;
 418     const float FN0 = -0.75225204789749321333f;
 419
 420     const float FD4 = 0.0011193462567257629232f;
 421     const float FD3 = 0.014866955030185295499f;
 422     const float FD2 = 0.11583842382862377919f;
 423     const float FD1 = 0.50736591960530292870f;
 424     const float FD0 = 1.0f;
 425
 426     float       z4;
 427     float       polyFN0, polyFN1, polyFD0, polyFD1;
 428
 429     z4          = z2*z2;
 430
 431     polyFD0     = FD4*z4 + FD2;
 432     polyFD1     = FD3*z4 + FD1;
 433     polyFD0     = polyFD0*z4 + FD0;
 434     polyFD0     = polyFD1*z2 + polyFD0;
 435
 436     polyFD0     = 1.0f/polyFD0;
 437
 438     polyFN0     = FN6*z4 + FN4;
 439     polyFN1     = FN5*z4 + FN3;
 440     polyFN0     = polyFN0*z4 + FN2;
 441     polyFN1     = polyFN1*z4 + FN1;
 442     polyFN0     = polyFN0*z4 + FN0;
 443     polyFN0     = polyFN1*z2 + polyFN0;
 444
 445     return polyFN0*polyFD0;
 446 }
 447
 448 /*! Final j-force reduction; this generic implementation works with
 449  *  arbitrary array sizes.
 450  */
 451 /* AMD OpenCL compiler error "Undeclared function index 1024" if __INLINE__d */
 452 //__INLINE__ __device__
 453 void reduce_force_j_generic(__local float *f_buf, __global float *fout,//__global float3 *fout,
 454                             int tidxi, int tidxj, int aidx)
 455 {
 456     /* Split the reduction between the first 3 column threads
 457        Threads with column id 0 will do the reduction for (float3).x components
 458        Threads with column id 1 will do the reduction for (float3).y components
 459        Threads with column id 2 will do the reduction for (float3).z components.
 460        The reduction is performed for each line tidxj of f_buf. */
 461     if (tidxi < 3)
 462     {
 463         float f = 0.0f;
 464         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 465         {
 466             f += f_buf[FBUF_STRIDE * tidxi + j];
 467         }
 468
 469         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 470     }
 471 }
 472
 473 /*! Final i-force reduction; this generic implementation works with
 474  *  arbitrary array sizes.
 475  */
 476 __INLINE__ __device__
 477 void reduce_force_i_generic(__local float *f_buf, __global float *fout,
 478                             float *fshift_buf, bool bCalcFshift,
 479                             int tidxi, int tidxj, int aidx)
 480 {
 481     /* Split the reduction between the first 3 line threads
 482        Threads with line id 0 will do the reduction for (float3).x components
 483        Threads with line id 1 will do the reduction for (float3).y components
 484        Threads with line id 2 will do the reduction for (float3).z components. */
 485     if (tidxj < 3)
 486     {
 487         float f = 0.0f;
 488         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 489         {
 490             f += f_buf[tidxj * FBUF_STRIDE + j];
 491         }
 492
 493         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 494
 495         if (bCalcFshift)
 496         {
 497             (*fshift_buf) += f;
 498         }
 499     }
 500 }
 501
 502 /*! Final i-force reduction; this implementation works only with power of two
 503  *  array sizes.
 504  */
 505 __INLINE__ __device__
 506 void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
 507                          float *fshift_buf, bool bCalcFshift,
 508                          int tidxi, int tidxj, int aidx)
 509 {
 510     int     i, j;
 511     /* Reduce the initial CL_SIZE values for each i atom to half
 512      * every step by using CL_SIZE * i threads.
 513      * Can't just use i as loop variable because than nvcc refuses to unroll.
 514      */
 515     i = CL_SIZE/2;
 516     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 517     {
 518         if (tidxj < i)
 519         {
 520
 521             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 522             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 523             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 524         }
 525         i >>= 1;
 526     }
 527
 528     /* i == 1, last reduction step, writing to global mem */
 529     /* Split the reduction between the first 3 line threads
 530        Threads with line id 0 will do the reduction for (float3).x components
 531        Threads with line id 1 will do the reduction for (float3).y components
 532        Threads with line id 2 will do the reduction for (float3).z components. */
 533     if (tidxj < 3)
 534     {
 535         float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 536
 537         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 538
 539         if (bCalcFshift)
 540         {
 541             (*fshift_buf) += f;
 542         }
 543     }
 544 }
 545
 546 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 547  *  on whether the size of the array to be reduced is power of two or not.
 548  */
 549 __INLINE__ __device__
 550 void reduce_force_i(__local float *f_buf, __global float *f,
 551                     float *fshift_buf, bool bCalcFshift,
 552                     int tidxi, int tidxj, int ai)
 553 {
 554     if ((CL_SIZE & (CL_SIZE - 1)))
 555     {
 556         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 557     }
 558     else
 559     {
 560         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 561     }
 562 }
 563
 564 /*! Energy reduction; this implementation works only with power of two
 565  *  array sizes.
 566  */
 567 __INLINE__ __device__
 568 void reduce_energy_pow2(volatile __local float *buf,
 569                         volatile __global float *e_lj,
 570                         volatile __global float *e_el,
 571                         unsigned int tidx)
 572 {
 573     int     i, j;
 574     float   e1, e2;
 575
 576     i = WARP_SIZE/2;
 577
 578     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 579     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 580     {
 581         if (tidx < i)
 582         {
 583             buf[              tidx] += buf[              tidx + i];
 584             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 585         }
 586         i >>= 1;
 587     }
 588
 589     /* last reduction step, writing to global mem */
 590     if (tidx == 0)
 591     {
 592         e1 = buf[              tidx] + buf[              tidx + i];
 593         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 594
 595         atomicAdd_g_f(e_lj, e1);
 596         atomicAdd_g_f(e_el, e2);
 597     }
 598 }
 599
 600 /*! Writes in debug_buffer the input value.
 601  *  Each thread has its own unique location in debug_buffer.
 602  *  Works for 2D global configurations.
 603  */
 604 void print_to_debug_buffer_f(__global float* debug_buffer, float value)
 605 {
 606     if (debug_buffer)
 607         debug_buffer[get_global_id(1) * get_global_size(0) + get_global_id(0)] = value;
 608 }
 609
 610 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */