src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "device_utils.clh"
  37 #include "vectype_ops.clh"
  38
  39 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  40 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  41
  42 #define WARP_SIZE  (CL_SIZE*CL_SIZE/2)
  43
  44 /* 1.0 / sqrt(M_PI) */
  45 #define M_FLOAT_1_SQRTPI 0.564189583547756f
  46
  47 //-------------------
  48
  49 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  50 #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  51
  52 __constant sampler_t generic_sampler     = (CLK_NORMALIZED_COORDS_FALSE /* Natural coords */
  53                                             | CLK_ADDRESS_NONE          /* No clamp/repeat*/
  54                                             | CLK_FILTER_NEAREST);      /* No interpolation */
  55
  56 #if CL_SIZE == 8
  57 #define WARP_SIZE_LOG2  (5)
  58 #define CL_SIZE_LOG2    (3)
  59 #elif CL_SIZE == 4
  60 #define WARP_SIZE_LOG2  (3)
  61 #define CL_SIZE_LOG2    (2)
  62 #else
  63 #error unsupported CL_SIZE
  64 #endif
  65
  66 #define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
  67 #define FBUF_STRIDE     (CL_SIZE_SQ)
  68
  69 #define ONE_SIXTH_F     0.16666667f
  70 #define ONE_TWELVETH_F  0.08333333f
  71
  72
  73 // Data structures shared between OpenCL device code and OpenCL host code
  74 // TODO: review, improve
  75 // Replaced real by float for now, to avoid including any other header
  76 typedef struct {
  77     float c2;
  78     float c3;
  79     float cpot;
  80 } shift_consts_t;
  81
  82 /* Used with potential switching:
  83  * rsw        = max(r - r_switch, 0)
  84  * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
  85  * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
  86  * force      = force*dsw - potential*sw
  87  * potential *= sw
  88  */
  89 typedef struct {
  90     float c3;
  91     float c4;
  92     float c5;
  93 } switch_consts_t;
  94
  95 // Data structure shared between the OpenCL device code and OpenCL host code
  96 // Must not contain OpenCL objects (buffers)
  97 typedef struct cl_nbparam_params
  98 {
  99
 100     int             eeltype;          /**< type of electrostatics, takes values from #eelCu */
 101     int             vdwtype;          /**< type of VdW impl., takes values from #evdwCu     */
 102
 103     float           epsfac;           /**< charge multiplication factor                      */
 104     float           c_rf;             /**< Reaction-field/plain cutoff electrostatics const. */
 105     float           two_k_rf;         /**< Reaction-field electrostatics constant            */
 106     float           ewald_beta;       /**< Ewald/PME parameter                               */
 107     float           sh_ewald;         /**< Ewald/PME correction term substracted from the direct-space potential */
 108     float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the correction potential        */
 109     float           ewaldcoeff_lj;    /**< LJ-Ewald/PME coefficient                          */
 110
 111     float           rcoulomb_sq;      /**< Coulomb cut-off squared                           */
 112
 113     float           rvdw_sq;          /**< VdW cut-off squared                               */
 114     float           rvdw_switch;      /**< VdW switched cut-off                              */
 115     float           rlistOuter_sq;    /**< Full, outer pair-list cut-off squared             */
 116     float           rlistInner_sq;    /**< Inner, dynamic pruned pair-list cut-off squared  XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds */
 117
 118     shift_consts_t  dispersion_shift; /**< VdW shift dispersion constants           */
 119     shift_consts_t  repulsion_shift;  /**< VdW shift repulsion constants            */
 120     switch_consts_t vdw_switch;       /**< VdW switch constants                     */
 121
 122     /* Ewald Coulomb force table data - accessed through texture memory */
 123     float                  coulomb_tab_scale;  /**< table scale/spacing                        */
 124 }cl_nbparam_params_t;
 125
 126 typedef struct {
 127     int sci;            /* i-super-cluster       */
 128     int shift;          /* Shift vector index plus possible flags */
 129     int cj4_ind_start;  /* Start index into cj4  */
 130     int cj4_ind_end;    /* End index into cj4    */
 131 } nbnxn_sci_t;
 132
 133 typedef struct {
 134     unsigned int imask;    /* The i-cluster interactions mask for 1 warp  */
 135     int          excl_ind; /* Index into the exclusion array for 1 warp   */
 136 } nbnxn_im_ei_t;
 137
 138 typedef struct {
 139     int           cj[4];   /* The 4 j-clusters                            */
 140     nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 141 } nbnxn_cj4_t;
 142
 143
 144 typedef struct {
 145     unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
 146                                            * each unsigned has bitS for 4*8 i clusters
 147                                            */
 148 } nbnxn_excl_t;
 149
 150 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 151 __constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 152
 153
 154 /*! \brief Preload cj4 into local memory.
 155  *
 156  * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 157  * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 158  * - Same as AMD in the nowarp kernel; we do not assume execution width and therefore
 159  *   the caller needs to sync.
 160  *
 161  * It is the caller's responsibility to make sure that data is consumed only when
 162  * it's ready. This function does not call a barrier.
 163  */
 164 gmx_opencl_inline
 165 void preloadCj4(__local int        *sm_cjPreload,
 166                 const __global int *gm_cj,
 167                 int                 tidxi,
 168                 int                 tidxj,
 169                 bool                iMaskCond)
 170
 171 {
 172 #if !USE_CJ_PREFETCH
 173     return;
 174 #endif
 175
 176     const int c_clSize                   = CL_SIZE;
 177     const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
 178     const int c_nbnxnGpuClusterpairSplit = 2;
 179     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 180
 181     /* Pre-load cj into shared memory */
 182 #if defined _NVIDIA_SOURCE_
 183     /* on both warps separately for NVIDIA */
 184     if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
 185     {
 186         sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
 187     }
 188 #else // AMD or nowarp
 189       /* Note that with "nowarp" / on hardware with wavefronts <64 a barrier is needed after preload. */
 190     if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
 191     {
 192         sm_cjPreload[tidxi] = gm_cj[tidxi];
 193     }
 194 #endif
 195 }
 196
 197 /* \brief Load a cj given a jm index.
 198  *
 199  * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 200  */
 201 gmx_opencl_inline
 202 int loadCj(__local int        *sm_cjPreload,
 203            const __global int *gm_cj,
 204            int                 jm,
 205            int                 tidxi,
 206            int                 tidxj)
 207 {
 208     const int c_clSize                   = CL_SIZE;
 209     const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
 210     const int c_nbnxnGpuClusterpairSplit = 2;
 211     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 212
 213 #if USE_CJ_PREFETCH
 214 #if defined _NVIDIA_SOURCE_
 215     int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
 216 #elif defined _AMD_SOURCE_
 217     int warpLoadOffset = 0;
 218 #else
 219 #error Not supported
 220 #endif
 221     return sm_cjPreload[jm + warpLoadOffset];
 222 #else
 223     return gm_cj[jm];
 224 #endif
 225 }
 226
 227
 228 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 229 gmx_opencl_inline
 230 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
 231                                      const float  epsilon,
 232                                      float       *c6,
 233                                      float       *c12)
 234 {
 235     float sigma2, sigma6;
 236
 237     sigma2 = sigma * sigma;
 238     sigma6 = sigma2 *sigma2 * sigma2;
 239     *c6    = epsilon * sigma6;
 240     *c12   = *c6 * sigma6;
 241 }
 242
 243
 244 /*! Apply force switch,  force + energy version. */
 245 gmx_opencl_inline
 246 void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
 247                               float                c6,
 248                               float                c12,
 249                               float                inv_r,
 250                               float                r2,
 251                               float               *F_invr)
 252 {
 253     float r, r_switch;
 254
 255     /* force switch constants */
 256     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 257     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 258     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 259     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 260
 261     r         = r2 * inv_r;
 262     r_switch  = r - nbparam->rvdw_switch;
 263     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 264
 265     *F_invr  +=
 266         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 267         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 268 }
 269
 270 /*! Apply force switch, force-only version. */
 271 gmx_opencl_inline
 272 void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
 273                                 float                c6,
 274                                 float                c12,
 275                                 float                inv_r,
 276                                 float                r2,
 277                                 float               *F_invr,
 278                                 float               *E_lj)
 279 {
 280     float r, r_switch;
 281
 282     /* force switch constants */
 283     float disp_shift_V2 = nbparam->dispersion_shift.c2;
 284     float disp_shift_V3 = nbparam->dispersion_shift.c3;
 285     float repu_shift_V2 = nbparam->repulsion_shift.c2;
 286     float repu_shift_V3 = nbparam->repulsion_shift.c3;
 287
 288     float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
 289     float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
 290     float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
 291     float repu_shift_F3 = nbparam->repulsion_shift.c3/4;
 292
 293     r         = r2 * inv_r;
 294     r_switch  = r - nbparam->rvdw_switch;
 295     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 296
 297     *F_invr  +=
 298         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 299         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 300     *E_lj    +=
 301         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 302         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 303 }
 304
 305 /*! Apply potential switch, force-only version. */
 306 gmx_opencl_inline
 307 void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
 308                                   float                inv_r,
 309                                   float                r2,
 310                                   float               *F_invr,
 311                                   float               *E_lj)
 312 {
 313     float r, r_switch;
 314     float sw, dsw;
 315
 316     /* potential switch constants */
 317     float switch_V3 = nbparam->vdw_switch.c3;
 318     float switch_V4 = nbparam->vdw_switch.c4;
 319     float switch_V5 = nbparam->vdw_switch.c5;
 320     float switch_F2 = nbparam->vdw_switch.c3;
 321     float switch_F3 = nbparam->vdw_switch.c4;
 322     float switch_F4 = nbparam->vdw_switch.c5;
 323
 324     r        = r2 * inv_r;
 325     r_switch = r - nbparam->rvdw_switch;
 326
 327     /* Unlike in the F+E kernel, conditional is faster here */
 328     if (r_switch > 0.0f)
 329     {
 330         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 331         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 332
 333         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 334     }
 335 }
 336
 337 /*! Apply potential switch, force + energy version. */
 338 gmx_opencl_inline
 339 void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
 340                                     float                inv_r,
 341                                     float                r2,
 342                                     float               *F_invr,
 343                                     float               *E_lj)
 344 {
 345     float r, r_switch;
 346     float sw, dsw;
 347
 348     /* potential switch constants */
 349     float switch_V3 = nbparam->vdw_switch.c3;
 350     float switch_V4 = nbparam->vdw_switch.c4;
 351     float switch_V5 = nbparam->vdw_switch.c5;
 352     float switch_F2 = nbparam->vdw_switch.c3;
 353     float switch_F3 = nbparam->vdw_switch.c4;
 354     float switch_F4 = nbparam->vdw_switch.c5;
 355
 356     r        = r2 * inv_r;
 357     r_switch = r - nbparam->rvdw_switch;
 358     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 359
 360     /* Unlike in the F-only kernel, masking is faster here */
 361     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 362     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 363
 364     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 365     *E_lj   *= sw;
 366 }
 367
 368 /*! Calculate LJ-PME grid force contribution with
 369  *  geometric combination rule.
 370  */
 371 gmx_opencl_inline
 372 void calculate_lj_ewald_comb_geom_F(__constant float * nbfp_comb_climg2d,
 373                                     int                typei,
 374                                     int                typej,
 375                                     float              r2,
 376                                     float              inv_r2,
 377                                     float              lje_coeff2,
 378                                     float              lje_coeff6_6,
 379                                     float             *F_invr)
 380 {
 381     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 382
 383     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 384
 385     /* Recalculate inv_r6 without exclusion mask */
 386     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 387     cr2       = lje_coeff2*r2;
 388     expmcr2   = exp(-cr2);
 389     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 390
 391     /* Subtract the grid force from the total LJ force */
 392     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 393 }
 394
 395 /*! Calculate LJ-PME grid force + energy contribution with
 396  *  geometric combination rule.
 397  */
 398 gmx_opencl_inline
 399 void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
 400                                       cl_nbparam_params_t *nbparam,
 401                                       int                  typei,
 402                                       int                  typej,
 403                                       float                r2,
 404                                       float                inv_r2,
 405                                       float                lje_coeff2,
 406                                       float                lje_coeff6_6,
 407                                       float                int_bit,
 408                                       float               *F_invr,
 409                                       float               *E_lj)
 410 {
 411     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 412
 413     c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];
 414
 415     /* Recalculate inv_r6 without exclusion mask */
 416     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 417     cr2       = lje_coeff2*r2;
 418     expmcr2   = exp(-cr2);
 419     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 420
 421     /* Subtract the grid force from the total LJ force */
 422     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 423
 424     /* Shift should be applied only to real LJ pairs */
 425     sh_mask   = nbparam->sh_lj_ewald*int_bit;
 426     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 427 }
 428
 429 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 430  *  Lorentz-Berthelot combination rule.
 431  *  We use a single F+E kernel with conditional because the performance impact
 432  *  of this is pretty small and LB on the CPU is anyway very slow.
 433  */
 434 gmx_opencl_inline
 435 void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
 436                                     cl_nbparam_params_t *nbparam,
 437                                     int                  typei,
 438                                     int                  typej,
 439                                     float                r2,
 440                                     float                inv_r2,
 441                                     float                lje_coeff2,
 442                                     float                lje_coeff6_6,
 443                                     float                int_bit,
 444                                     bool                 with_E_lj,
 445                                     float               *F_invr,
 446                                     float               *E_lj)
 447 {
 448     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 449     float sigma, sigma2, epsilon;
 450
 451     /* sigma and epsilon are scaled to give 6*C6 */
 452     sigma      = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
 453
 454     epsilon    = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];
 455
 456     sigma2  = sigma*sigma;
 457     c6grid  = epsilon*sigma2*sigma2*sigma2;
 458
 459     /* Recalculate inv_r6 without exclusion mask */
 460     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 461     cr2       = lje_coeff2*r2;
 462     expmcr2   = exp(-cr2);
 463     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 464
 465     /* Subtract the grid force from the total LJ force */
 466     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 467
 468     if (with_E_lj == true)
 469     {
 470         float sh_mask;
 471
 472         /* Shift should be applied only to real LJ pairs */
 473         sh_mask   = nbparam->sh_lj_ewald*int_bit;
 474         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 475     }
 476 }
 477
 478 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 479  *  Original idea: from the OpenMM project
 480  */
 481 gmx_opencl_inline float
 482 interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
 483                             float             r,
 484                             float             scale)
 485 {
 486     float   normalized = scale * r;
 487     int     index      = (int) normalized;
 488     float   fract2     = normalized - index;
 489     float   fract1     = 1.0f - fract2;
 490
 491     return fract1*coulomb_tab_climg2d[index] +
 492            fract2*coulomb_tab_climg2d[index + 1];
 493 }
 494
 495 /*! Calculate analytical Ewald correction term. */
 496 gmx_opencl_inline
 497 float pmecorrF(float z2)
 498 {
 499     const float FN6 = -1.7357322914161492954e-8f;
 500     const float FN5 = 1.4703624142580877519e-6f;
 501     const float FN4 = -0.000053401640219807709149f;
 502     const float FN3 = 0.0010054721316683106153f;
 503     const float FN2 = -0.019278317264888380590f;
 504     const float FN1 = 0.069670166153766424023f;
 505     const float FN0 = -0.75225204789749321333f;
 506
 507     const float FD4 = 0.0011193462567257629232f;
 508     const float FD3 = 0.014866955030185295499f;
 509     const float FD2 = 0.11583842382862377919f;
 510     const float FD1 = 0.50736591960530292870f;
 511     const float FD0 = 1.0f;
 512
 513     float       z4;
 514     float       polyFN0, polyFN1, polyFD0, polyFD1;
 515
 516     z4          = z2*z2;
 517
 518     polyFD0     = FD4*z4 + FD2;
 519     polyFD1     = FD3*z4 + FD1;
 520     polyFD0     = polyFD0*z4 + FD0;
 521     polyFD0     = polyFD1*z2 + polyFD0;
 522
 523     polyFD0     = 1.0f/polyFD0;
 524
 525     polyFN0     = FN6*z4 + FN4;
 526     polyFN1     = FN5*z4 + FN3;
 527     polyFN0     = polyFN0*z4 + FN2;
 528     polyFN1     = polyFN1*z4 + FN1;
 529     polyFN0     = polyFN0*z4 + FN0;
 530     polyFN0     = polyFN1*z2 + polyFN0;
 531
 532     return polyFN0*polyFD0;
 533 }
 534
 535 /*! Final j-force reduction; this generic implementation works with
 536  *  arbitrary array sizes.
 537  */
 538 gmx_opencl_inline
 539 void reduce_force_j_generic(__local float *f_buf, __global float *fout,
 540                             int tidxi, int tidxj, int aidx)
 541 {
 542     /* Split the reduction between the first 3 column threads
 543        Threads with column id 0 will do the reduction for (float3).x components
 544        Threads with column id 1 will do the reduction for (float3).y components
 545        Threads with column id 2 will do the reduction for (float3).z components.
 546        The reduction is performed for each line tidxj of f_buf. */
 547     if (tidxi < 3)
 548     {
 549         float f = 0.0f;
 550         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 551         {
 552             f += f_buf[FBUF_STRIDE * tidxi + j];
 553         }
 554
 555         atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
 556     }
 557 }
 558
 559 /*! Final i-force reduction; this generic implementation works with
 560  *  arbitrary array sizes.
 561  */
 562 gmx_opencl_inline
 563 void reduce_force_i_generic(__local float *f_buf, __global float *fout,
 564                             float *fshift_buf, bool bCalcFshift,
 565                             int tidxi, int tidxj, int aidx)
 566 {
 567     /* Split the reduction between the first 3 line threads
 568        Threads with line id 0 will do the reduction for (float3).x components
 569        Threads with line id 1 will do the reduction for (float3).y components
 570        Threads with line id 2 will do the reduction for (float3).z components. */
 571     if (tidxj < 3)
 572     {
 573         float f = 0.0f;
 574         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 575         {
 576             f += f_buf[tidxj * FBUF_STRIDE + j];
 577         }
 578
 579         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 580
 581         if (bCalcFshift)
 582         {
 583             (*fshift_buf) += f;
 584         }
 585     }
 586     barrier(CLK_LOCAL_MEM_FENCE);
 587 }
 588
 589 /*! Final i-force reduction; this implementation works only with power of two
 590  *  array sizes.
 591  */
 592 gmx_opencl_inline
 593 void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
 594                          float *fshift_buf, bool bCalcFshift,
 595                          int tidxi, int tidxj, int aidx)
 596 {
 597     int     i, j;
 598     /* Reduce the initial CL_SIZE values for each i atom to half
 599      * every step by using CL_SIZE * i threads.
 600      * Can't just use i as loop variable because than nvcc refuses to unroll.
 601      */
 602     i = CL_SIZE/2;
 603     for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
 604     {
 605         if (tidxj < i)
 606         {
 607
 608             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 609             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 610             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 611         }
 612         i >>= 1;
 613     }
 614     /* needed because
 615      * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
 616      * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
 617      */
 618     barrier(CLK_LOCAL_MEM_FENCE);
 619
 620     /* i == 1, last reduction step, writing to global mem */
 621     /* Split the reduction between the first 3 line threads
 622        Threads with line id 0 will do the reduction for (float3).x components
 623        Threads with line id 1 will do the reduction for (float3).y components
 624        Threads with line id 2 will do the reduction for (float3).z components. */
 625     if (tidxj < 3)
 626     {
 627         float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 628
 629         atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
 630
 631         if (bCalcFshift)
 632         {
 633             (*fshift_buf) += f;
 634         }
 635     }
 636 }
 637
 638 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 639  *  on whether the size of the array to be reduced is power of two or not.
 640  */
 641 gmx_opencl_inline
 642 void reduce_force_i(__local float *f_buf, __global float *f,
 643                     float *fshift_buf, bool bCalcFshift,
 644                     int tidxi, int tidxj, int ai)
 645 {
 646     if ((CL_SIZE & (CL_SIZE - 1)))
 647     {
 648         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 649     }
 650     else
 651     {
 652         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 653     }
 654 }
 655
 656 /*! Energy reduction; this implementation works only with power of two
 657  *  array sizes.
 658  */
 659 gmx_opencl_inline
 660 void reduce_energy_pow2(volatile __local float  *buf,
 661                         volatile __global float *e_lj,
 662                         volatile __global float *e_el,
 663                         unsigned int             tidx)
 664 {
 665     int     i, j;
 666     float   e1, e2;
 667
 668     i = WARP_SIZE/2;
 669
 670     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 671     for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
 672     {
 673         if (tidx < i)
 674         {
 675             buf[              tidx] += buf[              tidx + i];
 676             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 677         }
 678         i >>= 1;
 679     }
 680
 681     /* last reduction step, writing to global mem */
 682     if (tidx == 0)
 683     {
 684         e1 = buf[              tidx] + buf[              tidx + i];
 685         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 686
 687         atomicAdd_g_f(e_lj, e1);
 688         atomicAdd_g_f(e_el, e2);
 689     }
 690 }
 691
 692 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */