src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  Utility constant and function declaration for the CUDA non-bonded kernels.
  39  *  This header should be included once at the top level, just before the
  40  *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_mdlib
  44  */
  45 #include "config.h"
  46
  47 #include <assert.h>
  48
  49 /* Note that floating-point constants in CUDA code should be suffixed
  50  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  51  * code that is in double precision.
  52  */
  53 #include "config.h"
  54
  55 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  56 #include "gromacs/gpu_utils/vectype_ops.cuh"
  57
  58 #include "nbnxn_cuda_types.h"
  59
  60 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  61 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  62
  63 /* Use texture objects if supported by the target hardware. */
  64 #if GMX_PTX_ARCH >= 300
  65 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  66 #define USE_TEXOBJ
  67 #endif
  68
  69 /*! \brief Log of the i and j cluster size.
  70  *  change this together with c_clSize !*/
  71 static const int          c_clSizeLog2  = 3;
  72 /*! \brief Square of cluster size. */
  73 static const int          c_clSizeSq    = c_clSize*c_clSize;
  74 /*! \brief j-cluster size after split (4 in the current implementation). */
  75 static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
  76 /*! \brief Stride in the force accumualation buffer */
  77 static const int          c_fbufStride  = c_clSizeSq;
  78
  79 static const float        c_oneSixth    = 0.16666667f;
  80 static const float        c_oneTwelveth = 0.08333333f;
  81
  82 /* With multiple compilation units this ensures that texture refs are available
  83    in the the kernels' compilation units. */
  84 #if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
  85 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  86 extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
  87
  88 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
  89 extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  90
  91 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  92 extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  93 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  94
  95 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
  96 static __forceinline__ __device__
  97 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
  98                                      const float  epsilon,
  99                                      float       *c6,
 100                                      float       *c12)
 101 {
 102     float sigma2, sigma6;
 103
 104     sigma2 = sigma * sigma;
 105     sigma6 = sigma2 *sigma2 * sigma2;
 106     *c6    = epsilon * sigma6;
 107     *c12   = *c6 * sigma6;
 108 }
 109
 110 /*! Apply force switch,  force + energy version. */
 111 static __forceinline__ __device__
 112 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
 113                               float               c6,
 114                               float               c12,
 115                               float               inv_r,
 116                               float               r2,
 117                               float              *F_invr)
 118 {
 119     float r, r_switch;
 120
 121     /* force switch constants */
 122     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 123     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 124     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 125     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 126
 127     r         = r2 * inv_r;
 128     r_switch  = r - nbparam.rvdw_switch;
 129     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 130
 131     *F_invr  +=
 132         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 133         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 134 }
 135
 136 /*! Apply force switch, force-only version. */
 137 static __forceinline__ __device__
 138 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 139                                 float               c6,
 140                                 float               c12,
 141                                 float               inv_r,
 142                                 float               r2,
 143                                 float              *F_invr,
 144                                 float              *E_lj)
 145 {
 146     float r, r_switch;
 147
 148     /* force switch constants */
 149     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 150     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 151     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 152     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 153
 154     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 155     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 156     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 157     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 158
 159     r         = r2 * inv_r;
 160     r_switch  = r - nbparam.rvdw_switch;
 161     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 162
 163     *F_invr  +=
 164         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 165         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 166     *E_lj    +=
 167         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 168         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 169 }
 170
 171 /*! Apply potential switch, force-only version. */
 172 static __forceinline__ __device__
 173 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 174                                   float               c6,
 175                                   float               c12,
 176                                   float               inv_r,
 177                                   float               r2,
 178                                   float              *F_invr,
 179                                   float              *E_lj)
 180 {
 181     float r, r_switch;
 182     float sw, dsw;
 183
 184     /* potential switch constants */
 185     float switch_V3 = nbparam.vdw_switch.c3;
 186     float switch_V4 = nbparam.vdw_switch.c4;
 187     float switch_V5 = nbparam.vdw_switch.c5;
 188     float switch_F2 = 3*nbparam.vdw_switch.c3;
 189     float switch_F3 = 4*nbparam.vdw_switch.c4;
 190     float switch_F4 = 5*nbparam.vdw_switch.c5;
 191
 192     r        = r2 * inv_r;
 193     r_switch = r - nbparam.rvdw_switch;
 194
 195     /* Unlike in the F+E kernel, conditional is faster here */
 196     if (r_switch > 0.0f)
 197     {
 198         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 199         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 200
 201         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 202     }
 203 }
 204
 205 /*! Apply potential switch, force + energy version. */
 206 static __forceinline__ __device__
 207 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 208                                     float               c6,
 209                                     float               c12,
 210                                     float               inv_r,
 211                                     float               r2,
 212                                     float              *F_invr,
 213                                     float              *E_lj)
 214 {
 215     float r, r_switch;
 216     float sw, dsw;
 217
 218     /* potential switch constants */
 219     float switch_V3 = nbparam.vdw_switch.c3;
 220     float switch_V4 = nbparam.vdw_switch.c4;
 221     float switch_V5 = nbparam.vdw_switch.c5;
 222     float switch_F2 = 3*nbparam.vdw_switch.c3;
 223     float switch_F3 = 4*nbparam.vdw_switch.c4;
 224     float switch_F4 = 5*nbparam.vdw_switch.c5;
 225
 226     r        = r2 * inv_r;
 227     r_switch = r - nbparam.rvdw_switch;
 228     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 229
 230     /* Unlike in the F-only kernel, masking is faster here */
 231     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 232     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 233
 234     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 235     *E_lj   *= sw;
 236 }
 237
 238 /*! Calculate LJ-PME grid force contribution with
 239  *  geometric combination rule.
 240  */
 241 static __forceinline__ __device__
 242 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 243                                     int                typei,
 244                                     int                typej,
 245                                     float              r2,
 246                                     float              inv_r2,
 247                                     float              lje_coeff2,
 248                                     float              lje_coeff6_6,
 249                                     float             *F_invr)
 250 {
 251     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 252
 253 #ifdef USE_TEXOBJ
 254     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 255 #else
 256     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 257 #endif /* USE_TEXOBJ */
 258
 259     /* Recalculate inv_r6 without exclusion mask */
 260     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 261     cr2       = lje_coeff2*r2;
 262     expmcr2   = expf(-cr2);
 263     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 264
 265     /* Subtract the grid force from the total LJ force */
 266     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 267 }
 268
 269 /*! Calculate LJ-PME grid force + energy contribution with
 270  *  geometric combination rule.
 271  */
 272 static __forceinline__ __device__
 273 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 274                                       int                typei,
 275                                       int                typej,
 276                                       float              r2,
 277                                       float              inv_r2,
 278                                       float              lje_coeff2,
 279                                       float              lje_coeff6_6,
 280                                       float              int_bit,
 281                                       float             *F_invr,
 282                                       float             *E_lj)
 283 {
 284     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 285
 286 #ifdef USE_TEXOBJ
 287     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 288 #else
 289     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 290 #endif /* USE_TEXOBJ */
 291
 292     /* Recalculate inv_r6 without exclusion mask */
 293     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 294     cr2       = lje_coeff2*r2;
 295     expmcr2   = expf(-cr2);
 296     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 297
 298     /* Subtract the grid force from the total LJ force */
 299     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 300
 301     /* Shift should be applied only to real LJ pairs */
 302     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 303     *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 304 }
 305
 306 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 307  *  Lorentz-Berthelot combination rule.
 308  *  We use a single F+E kernel with conditional because the performance impact
 309  *  of this is pretty small and LB on the CPU is anyway very slow.
 310  */
 311 static __forceinline__ __device__
 312 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 313                                     int                typei,
 314                                     int                typej,
 315                                     float              r2,
 316                                     float              inv_r2,
 317                                     float              lje_coeff2,
 318                                     float              lje_coeff6_6,
 319                                     float              int_bit,
 320                                     float             *F_invr,
 321                                     float             *E_lj)
 322 {
 323     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 324     float sigma, sigma2, epsilon;
 325
 326     /* sigma and epsilon are scaled to give 6*C6 */
 327 #ifdef USE_TEXOBJ
 328     sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
 329     epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
 330 #else
 331     sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
 332     epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
 333 #endif /* USE_TEXOBJ */
 334     sigma2  = sigma*sigma;
 335     c6grid  = epsilon*sigma2*sigma2*sigma2;
 336
 337     /* Recalculate inv_r6 without exclusion mask */
 338     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 339     cr2       = lje_coeff2*r2;
 340     expmcr2   = expf(-cr2);
 341     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 342
 343     /* Subtract the grid force from the total LJ force */
 344     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 345
 346     if (E_lj != NULL)
 347     {
 348         float sh_mask;
 349
 350         /* Shift should be applied only to real LJ pairs */
 351         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 352         *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 353     }
 354 }
 355
 356 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 357  *  Original idea: from the OpenMM project
 358  */
 359 static __forceinline__ __device__
 360 float interpolate_coulomb_force_r(float r, float scale)
 361 {
 362     float   normalized = scale * r;
 363     int     index      = (int) normalized;
 364     float   fract2     = normalized - index;
 365     float   fract1     = 1.0f - fract2;
 366
 367     return fract1 * tex1Dfetch(coulomb_tab_texref, index)
 368            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 369 }
 370
 371 static __forceinline__ __device__
 372 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
 373                                   float r, float scale)
 374 {
 375     float   normalized = scale * r;
 376     int     index      = (int) normalized;
 377     float   fract2     = normalized - index;
 378     float   fract1     = 1.0f - fract2;
 379
 380     return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
 381            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
 382 }
 383
 384 /*! Calculate analytical Ewald correction term. */
 385 static __forceinline__ __device__
 386 float pmecorrF(float z2)
 387 {
 388     const float FN6 = -1.7357322914161492954e-8f;
 389     const float FN5 = 1.4703624142580877519e-6f;
 390     const float FN4 = -0.000053401640219807709149f;
 391     const float FN3 = 0.0010054721316683106153f;
 392     const float FN2 = -0.019278317264888380590f;
 393     const float FN1 = 0.069670166153766424023f;
 394     const float FN0 = -0.75225204789749321333f;
 395
 396     const float FD4 = 0.0011193462567257629232f;
 397     const float FD3 = 0.014866955030185295499f;
 398     const float FD2 = 0.11583842382862377919f;
 399     const float FD1 = 0.50736591960530292870f;
 400     const float FD0 = 1.0f;
 401
 402     float       z4;
 403     float       polyFN0, polyFN1, polyFD0, polyFD1;
 404
 405     z4          = z2*z2;
 406
 407     polyFD0     = FD4*z4 + FD2;
 408     polyFD1     = FD3*z4 + FD1;
 409     polyFD0     = polyFD0*z4 + FD0;
 410     polyFD0     = polyFD1*z2 + polyFD0;
 411
 412     polyFD0     = 1.0f/polyFD0;
 413
 414     polyFN0     = FN6*z4 + FN4;
 415     polyFN1     = FN5*z4 + FN3;
 416     polyFN0     = polyFN0*z4 + FN2;
 417     polyFN1     = polyFN1*z4 + FN1;
 418     polyFN0     = polyFN0*z4 + FN0;
 419     polyFN0     = polyFN1*z2 + polyFN0;
 420
 421     return polyFN0*polyFD0;
 422 }
 423
 424 /*! Final j-force reduction; this generic implementation works with
 425  *  arbitrary array sizes.
 426  */
 427 static __forceinline__ __device__
 428 void reduce_force_j_generic(float *f_buf, float3 *fout,
 429                             int tidxi, int tidxj, int aidx)
 430 {
 431     if (tidxi < 3)
 432     {
 433         float f = 0.0f;
 434         for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
 435         {
 436             f += f_buf[c_fbufStride * tidxi + j];
 437         }
 438
 439         atomicAdd((&fout[aidx].x)+tidxi, f);
 440     }
 441 }
 442
 443 /*! Final j-force reduction; this implementation only with power of two
 444  *  array sizes and with sm >= 3.0
 445  */
 446 #if GMX_PTX_ARCH >= 300
 447 static __forceinline__ __device__
 448 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 449                               int tidxi, int aidx,
 450                               const unsigned int activemask)
 451 {
 452     f.x += gmx_shfl_down_sync(activemask, f.x, 1);
 453     f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
 454     f.z += gmx_shfl_down_sync(activemask, f.z, 1);
 455
 456     if (tidxi & 1)
 457     {
 458         f.x = f.y;
 459     }
 460
 461     f.x += gmx_shfl_down_sync(activemask, f.x, 2);
 462     f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
 463
 464     if (tidxi & 2)
 465     {
 466         f.x = f.z;
 467     }
 468
 469     f.x += gmx_shfl_down_sync(activemask, f.x, 4);
 470
 471     if (tidxi < 3)
 472     {
 473         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 474     }
 475 }
 476 #endif
 477
 478 /*! Final i-force reduction; this generic implementation works with
 479  *  arbitrary array sizes.
 480  * TODO: add the tidxi < 3 trick
 481  */
 482 static __forceinline__ __device__
 483 void reduce_force_i_generic(float *f_buf, float3 *fout,
 484                             float *fshift_buf, bool bCalcFshift,
 485                             int tidxi, int tidxj, int aidx)
 486 {
 487     if (tidxj < 3)
 488     {
 489         float f = 0.0f;
 490         for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
 491         {
 492             f += f_buf[tidxj * c_fbufStride + j];
 493         }
 494
 495         atomicAdd(&fout[aidx].x + tidxj, f);
 496
 497         if (bCalcFshift)
 498         {
 499             *fshift_buf += f;
 500         }
 501     }
 502 }
 503
 504 /*! Final i-force reduction; this implementation works only with power of two
 505  *  array sizes.
 506  */
 507 static __forceinline__ __device__
 508 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 509                          float *fshift_buf, bool bCalcFshift,
 510                          int tidxi, int tidxj, int aidx)
 511 {
 512     int     i, j;
 513     float   f;
 514
 515     assert(c_clSize == 1 << c_clSizeLog2);
 516
 517     /* Reduce the initial c_clSize values for each i atom to half
 518      * every step by using c_clSize * i threads.
 519      * Can't just use i as loop variable because than nvcc refuses to unroll.
 520      */
 521     i = c_clSize/2;
 522 #pragma unroll 5
 523     for (j = c_clSizeLog2 - 1; j > 0; j--)
 524     {
 525         if (tidxj < i)
 526         {
 527
 528             f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
 529             f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 530             f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 531         }
 532         i >>= 1;
 533     }
 534
 535     /* i == 1, last reduction step, writing to global mem */
 536     if (tidxj < 3)
 537     {
 538         /* tidxj*c_fbufStride selects x, y or z */
 539         f = f_buf[tidxj * c_fbufStride               + tidxi] +
 540             f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
 541
 542         atomicAdd(&(fout[aidx].x) + tidxj, f);
 543
 544         if (bCalcFshift)
 545         {
 546             *fshift_buf += f;
 547         }
 548     }
 549
 550 }
 551
 552 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 553  *  on whether the size of the array to be reduced is power of two or not.
 554  */
 555 static __forceinline__ __device__
 556 void reduce_force_i(float *f_buf, float3 *f,
 557                     float *fshift_buf, bool bCalcFshift,
 558                     int tidxi, int tidxj, int ai)
 559 {
 560     if ((c_clSize & (c_clSize - 1)))
 561     {
 562         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 563     }
 564     else
 565     {
 566         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 567     }
 568 }
 569
 570 /*! Final i-force reduction; this implementation works only with power of two
 571  *  array sizes and with sm >= 3.0
 572  */
 573 #if GMX_PTX_ARCH >= 300
 574 static __forceinline__ __device__
 575 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 576                               float *fshift_buf, bool bCalcFshift,
 577                               int tidxj, int aidx,
 578                               const unsigned int activemask)
 579 {
 580     fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
 581     fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
 582     fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
 583
 584     if (tidxj & 1)
 585     {
 586         fin.x = fin.y;
 587     }
 588
 589     fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
 590     fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 591
 592     if (tidxj & 2)
 593     {
 594         fin.x = fin.z;
 595     }
 596
 597     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 598     if ((tidxj & 3) < 3)
 599     {
 600         atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
 601
 602         if (bCalcFshift)
 603         {
 604             *fshift_buf += fin.x;
 605         }
 606     }
 607 }
 608 #endif
 609
 610 /*! Energy reduction; this implementation works only with power of two
 611  *  array sizes.
 612  */
 613 static __forceinline__ __device__
 614 void reduce_energy_pow2(volatile float *buf,
 615                         float *e_lj, float *e_el,
 616                         unsigned int tidx)
 617 {
 618     int     i, j;
 619     float   e1, e2;
 620
 621     i = warp_size/2;
 622
 623     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 624 #pragma unroll 10
 625     for (j = warp_size_log2 - 1; j > 0; j--)
 626     {
 627         if (tidx < i)
 628         {
 629             buf[               tidx] += buf[               tidx + i];
 630             buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
 631         }
 632         i >>= 1;
 633     }
 634
 635     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 636     // memory locations to make sense
 637     /* last reduction step, writing to global mem */
 638     if (tidx == 0)
 639     {
 640         e1 = buf[               tidx] + buf[               tidx + i];
 641         e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
 642
 643         atomicAdd(e_lj, e1);
 644         atomicAdd(e_el, e2);
 645     }
 646 }
 647
 648 /*! Energy reduction; this implementation works only with power of two
 649  *  array sizes and with sm >= 3.0
 650  */
 651 #if GMX_PTX_ARCH >= 300
 652 static __forceinline__ __device__
 653 void reduce_energy_warp_shfl(float E_lj, float E_el,
 654                              float *e_lj, float *e_el,
 655                              int tidx,
 656                              const unsigned int activemask)
 657 {
 658     int i, sh;
 659
 660     sh = 1;
 661 #pragma unroll 5
 662     for (i = 0; i < 5; i++)
 663     {
 664         E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
 665         E_el += gmx_shfl_down_sync(activemask, E_el, sh);
 666         sh   += sh;
 667     }
 668
 669     /* The first thread in the warp writes the reduced energies */
 670     if (tidx == 0 || tidx == warp_size)
 671     {
 672         atomicAdd(e_lj, E_lj);
 673         atomicAdd(e_el, E_el);
 674     }
 675 }
 676 #endif /* GMX_PTX_ARCH */
 677
 678 #undef USE_TEXOBJ
 679
 680 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */