src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  Utility constant and function declaration for the CUDA non-bonded kernels.
  39  *  This header should be included once at the top level, just before the
  40  *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_nbnxm
  44  */
  45 #include <assert.h>
  46
  47 /* Note that floating-point constants in CUDA code should be suffixed
  48  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  49  * code that is in double precision.
  50  */
  51
  52 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  53 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  54 #include "gromacs/gpu_utils/vectype_ops.cuh"
  55
  56 #include "nbnxm_cuda_types.h"
  57
  58 #ifndef NBNXM_CUDA_KERNEL_UTILS_CUH
  59 #define NBNXM_CUDA_KERNEL_UTILS_CUH
  60
  61 /*! \brief Log of the i and j cluster size.
  62  *  change this together with c_clSize !*/
  63 static const int __device__          c_clSizeLog2  = 3;
  64 /*! \brief Square of cluster size. */
  65 static const int __device__          c_clSizeSq    = c_clSize*c_clSize;
  66 /*! \brief j-cluster size after split (4 in the current implementation). */
  67 static const int __device__          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
  68 /*! \brief Stride in the force accumualation buffer */
  69 static const int __device__          c_fbufStride  = c_clSizeSq;
  70 /*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
  71 static const unsigned __device__     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
  72
  73 static const float __device__        c_oneSixth    = 0.16666667f;
  74 static const float __device__        c_oneTwelveth = 0.08333333f;
  75
  76
  77 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
  78 static __forceinline__ __device__
  79 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
  80                                      const float  epsilon,
  81                                      float       *c6,
  82                                      float       *c12)
  83 {
  84     float sigma2, sigma6;
  85
  86     sigma2 = sigma * sigma;
  87     sigma6 = sigma2 *sigma2 * sigma2;
  88     *c6    = epsilon * sigma6;
  89     *c12   = *c6 * sigma6;
  90 }
  91
  92 /*! Apply force switch,  force + energy version. */
  93 static __forceinline__ __device__
  94 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  95                               float               c6,
  96                               float               c12,
  97                               float               inv_r,
  98                               float               r2,
  99                               float              *F_invr)
 100 {
 101     float r, r_switch;
 102
 103     /* force switch constants */
 104     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 105     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 106     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 107     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 108
 109     r         = r2 * inv_r;
 110     r_switch  = r - nbparam.rvdw_switch;
 111     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 112
 113     *F_invr  +=
 114         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 115         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 116 }
 117
 118 /*! Apply force switch, force-only version. */
 119 static __forceinline__ __device__
 120 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 121                                 float               c6,
 122                                 float               c12,
 123                                 float               inv_r,
 124                                 float               r2,
 125                                 float              *F_invr,
 126                                 float              *E_lj)
 127 {
 128     float r, r_switch;
 129
 130     /* force switch constants */
 131     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 132     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 133     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 134     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 135
 136     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 137     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 138     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 139     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 140
 141     r         = r2 * inv_r;
 142     r_switch  = r - nbparam.rvdw_switch;
 143     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 144
 145     *F_invr  +=
 146         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 147         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 148     *E_lj    +=
 149         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 150         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 151 }
 152
 153 /*! Apply potential switch, force-only version. */
 154 static __forceinline__ __device__
 155 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 156                                   float               inv_r,
 157                                   float               r2,
 158                                   float              *F_invr,
 159                                   float              *E_lj)
 160 {
 161     float r, r_switch;
 162     float sw, dsw;
 163
 164     /* potential switch constants */
 165     float switch_V3 = nbparam.vdw_switch.c3;
 166     float switch_V4 = nbparam.vdw_switch.c4;
 167     float switch_V5 = nbparam.vdw_switch.c5;
 168     float switch_F2 = 3*nbparam.vdw_switch.c3;
 169     float switch_F3 = 4*nbparam.vdw_switch.c4;
 170     float switch_F4 = 5*nbparam.vdw_switch.c5;
 171
 172     r        = r2 * inv_r;
 173     r_switch = r - nbparam.rvdw_switch;
 174
 175     /* Unlike in the F+E kernel, conditional is faster here */
 176     if (r_switch > 0.0f)
 177     {
 178         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 179         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 180
 181         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 182     }
 183 }
 184
 185 /*! Apply potential switch, force + energy version. */
 186 static __forceinline__ __device__
 187 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 188                                     float               inv_r,
 189                                     float               r2,
 190                                     float              *F_invr,
 191                                     float              *E_lj)
 192 {
 193     float r, r_switch;
 194     float sw, dsw;
 195
 196     /* potential switch constants */
 197     float switch_V3 = nbparam.vdw_switch.c3;
 198     float switch_V4 = nbparam.vdw_switch.c4;
 199     float switch_V5 = nbparam.vdw_switch.c5;
 200     float switch_F2 = 3*nbparam.vdw_switch.c3;
 201     float switch_F3 = 4*nbparam.vdw_switch.c4;
 202     float switch_F4 = 5*nbparam.vdw_switch.c5;
 203
 204     r        = r2 * inv_r;
 205     r_switch = r - nbparam.rvdw_switch;
 206     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 207
 208     /* Unlike in the F-only kernel, masking is faster here */
 209     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 210     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 211
 212     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 213     *E_lj   *= sw;
 214 }
 215
 216
 217 /*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 218  *
 219  *  Depending on what is supported, it fetches parameters either
 220  *  using direct load, texture objects, or texrefs.
 221  */
 222 static __forceinline__ __device__
 223 float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
 224                                 int                typei,
 225                                 int                typej)
 226 {
 227 #if DISABLE_CUDA_TEXTURES
 228     return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
 229 #else
 230     return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 231 #endif /* DISABLE_CUDA_TEXTURES */
 232 }
 233
 234
 235 /*! Calculate LJ-PME grid force contribution with
 236  *  geometric combination rule.
 237  */
 238 static __forceinline__ __device__
 239 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 240                                     int                typei,
 241                                     int                typej,
 242                                     float              r2,
 243                                     float              inv_r2,
 244                                     float              lje_coeff2,
 245                                     float              lje_coeff6_6,
 246                                     float             *F_invr)
 247 {
 248     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 249
 250     c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 251
 252     /* Recalculate inv_r6 without exclusion mask */
 253     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 254     cr2       = lje_coeff2*r2;
 255     expmcr2   = expf(-cr2);
 256     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 257
 258     /* Subtract the grid force from the total LJ force */
 259     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 260 }
 261
 262
 263 /*! Calculate LJ-PME grid force + energy contribution with
 264  *  geometric combination rule.
 265  */
 266 static __forceinline__ __device__
 267 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 268                                       int                typei,
 269                                       int                typej,
 270                                       float              r2,
 271                                       float              inv_r2,
 272                                       float              lje_coeff2,
 273                                       float              lje_coeff6_6,
 274                                       float              int_bit,
 275                                       float             *F_invr,
 276                                       float             *E_lj)
 277 {
 278     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 279
 280     c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 281
 282     /* Recalculate inv_r6 without exclusion mask */
 283     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 284     cr2       = lje_coeff2*r2;
 285     expmcr2   = expf(-cr2);
 286     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 287
 288     /* Subtract the grid force from the total LJ force */
 289     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 290
 291     /* Shift should be applied only to real LJ pairs */
 292     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 293     *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 294 }
 295
 296 /*! Fetch per-type LJ parameters.
 297  *
 298  *  Depending on what is supported, it fetches parameters either
 299  *  using direct load, texture objects, or texrefs.
 300  */
 301 static __forceinline__ __device__
 302 float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
 303                               int                type)
 304 {
 305     float2 c6c12;
 306 #if DISABLE_CUDA_TEXTURES
 307     /* Force an 8-byte fetch to save a memory instruction. */
 308     float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
 309     c6c12 = LDG(&nbfp_comb[type]);
 310 #else
 311     /* NOTE: as we always do 8-byte aligned loads, we could
 312        fetch float2 here too just as above. */
 313     c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
 314     c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
 315 #endif /* DISABLE_CUDA_TEXTURES */
 316
 317     return c6c12;
 318 }
 319
 320
 321 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != nullptr) with
 322  *  Lorentz-Berthelot combination rule.
 323  *  We use a single F+E kernel with conditional because the performance impact
 324  *  of this is pretty small and LB on the CPU is anyway very slow.
 325  */
 326 static __forceinline__ __device__
 327 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 328                                     int                typei,
 329                                     int                typej,
 330                                     float              r2,
 331                                     float              inv_r2,
 332                                     float              lje_coeff2,
 333                                     float              lje_coeff6_6,
 334                                     float              int_bit,
 335                                     float             *F_invr,
 336                                     float             *E_lj)
 337 {
 338     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 339     float sigma, sigma2, epsilon;
 340
 341     /* sigma and epsilon are scaled to give 6*C6 */
 342     float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
 343     float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
 344
 345     sigma   = c6c12_i.x + c6c12_j.x;
 346     epsilon = c6c12_i.y * c6c12_j.y;
 347
 348     sigma2  = sigma*sigma;
 349     c6grid  = epsilon*sigma2*sigma2*sigma2;
 350
 351     /* Recalculate inv_r6 without exclusion mask */
 352     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 353     cr2       = lje_coeff2*r2;
 354     expmcr2   = expf(-cr2);
 355     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 356
 357     /* Subtract the grid force from the total LJ force */
 358     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 359
 360     if (E_lj != nullptr)
 361     {
 362         float sh_mask;
 363
 364         /* Shift should be applied only to real LJ pairs */
 365         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 366         *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 367     }
 368 }
 369
 370
 371 /*! Fetch two consecutive values from the Ewald correction F*r table.
 372  *
 373  *  Depending on what is supported, it fetches parameters either
 374  *  using direct load, texture objects, or texrefs.
 375  */
 376 static __forceinline__ __device__
 377 float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
 378                              int                index)
 379 {
 380     float2 d;
 381
 382 #if DISABLE_CUDA_TEXTURES
 383     /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
 384     d.x = LDG(&nbparam.coulomb_tab[index]);
 385     d.y = LDG(&nbparam.coulomb_tab[index + 1]);
 386 #else
 387     d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
 388     d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
 389 #endif // DISABLE_CUDA_TEXTURES
 390
 391     return d;
 392 }
 393
 394 /*! Linear interpolation using exactly two FMA operations.
 395  *
 396  *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 397  *  Note that CUDA does not have fnms, otherwise we'd use
 398  *  fma(t, d1, fnms(t, d0, d0)
 399  *  but input modifiers are designed for this and are fast.
 400  */
 401 template <typename T>
 402 __forceinline__ __host__ __device__
 403 T lerp(T d0, T d1, T t)
 404 {
 405     return fma(t, d1, fma(-t, d0, d0));
 406 }
 407
 408 /*! Interpolate Ewald coulomb force correction using the F*r table.
 409  */
 410 static __forceinline__ __device__
 411 float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
 412                                   float              r)
 413 {
 414     float  normalized = nbparam.coulomb_tab_scale * r;
 415     int    index      = (int) normalized;
 416     float  fraction   = normalized - index;
 417
 418     float2 d01 = fetch_coulomb_force_r(nbparam, index);
 419
 420     return lerp(d01.x, d01.y, fraction);
 421 }
 422
 423 /*! Fetch C6 and C12 from the parameter table.
 424  *
 425  *  Depending on what is supported, it fetches parameters either
 426  *  using direct load, texture objects, or texrefs.
 427  */
 428 static __forceinline__ __device__
 429 void fetch_nbfp_c6_c12(float               &c6,
 430                        float               &c12,
 431                        const cu_nbparam_t   nbparam,
 432                        int                  baseIndex)
 433 {
 434 #if DISABLE_CUDA_TEXTURES
 435     /* Force an 8-byte fetch to save a memory instruction. */
 436     float2 *nbfp = (float2 *)nbparam.nbfp;
 437     float2  c6c12;
 438     c6c12 = LDG(&nbfp[baseIndex]);
 439     c6    = c6c12.x;
 440     c12   = c6c12.y;
 441 #else
 442     /* NOTE: as we always do 8-byte aligned loads, we could
 443        fetch float2 here too just as above. */
 444     c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
 445     c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
 446 #endif // DISABLE_CUDA_TEXTURES
 447 }
 448
 449
 450 /*! Calculate analytical Ewald correction term. */
 451 static __forceinline__ __device__
 452 float pmecorrF(float z2)
 453 {
 454     const float FN6 = -1.7357322914161492954e-8f;
 455     const float FN5 = 1.4703624142580877519e-6f;
 456     const float FN4 = -0.000053401640219807709149f;
 457     const float FN3 = 0.0010054721316683106153f;
 458     const float FN2 = -0.019278317264888380590f;
 459     const float FN1 = 0.069670166153766424023f;
 460     const float FN0 = -0.75225204789749321333f;
 461
 462     const float FD4 = 0.0011193462567257629232f;
 463     const float FD3 = 0.014866955030185295499f;
 464     const float FD2 = 0.11583842382862377919f;
 465     const float FD1 = 0.50736591960530292870f;
 466     const float FD0 = 1.0f;
 467
 468     float       z4;
 469     float       polyFN0, polyFN1, polyFD0, polyFD1;
 470
 471     z4          = z2*z2;
 472
 473     polyFD0     = FD4*z4 + FD2;
 474     polyFD1     = FD3*z4 + FD1;
 475     polyFD0     = polyFD0*z4 + FD0;
 476     polyFD0     = polyFD1*z2 + polyFD0;
 477
 478     polyFD0     = 1.0f/polyFD0;
 479
 480     polyFN0     = FN6*z4 + FN4;
 481     polyFN1     = FN5*z4 + FN3;
 482     polyFN0     = polyFN0*z4 + FN2;
 483     polyFN1     = polyFN1*z4 + FN1;
 484     polyFN0     = polyFN0*z4 + FN0;
 485     polyFN0     = polyFN1*z2 + polyFN0;
 486
 487     return polyFN0*polyFD0;
 488 }
 489
 490 /*! Final j-force reduction; this generic implementation works with
 491  *  arbitrary array sizes.
 492  */
 493 static __forceinline__ __device__
 494 void reduce_force_j_generic(float *f_buf, float3 *fout,
 495                             int tidxi, int tidxj, int aidx)
 496 {
 497     if (tidxi < 3)
 498     {
 499         float f = 0.0f;
 500         for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
 501         {
 502             f += f_buf[c_fbufStride * tidxi + j];
 503         }
 504
 505         atomicAdd((&fout[aidx].x)+tidxi, f);
 506     }
 507 }
 508
 509 /*! Final j-force reduction; this implementation only with power of two
 510  *  array sizes.
 511  */
 512 static __forceinline__ __device__
 513 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 514                               int tidxi, int aidx,
 515                               const unsigned int activemask)
 516 {
 517     f.x += gmx_shfl_down_sync(activemask, f.x, 1);
 518     f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
 519     f.z += gmx_shfl_down_sync(activemask, f.z, 1);
 520
 521     if (tidxi & 1)
 522     {
 523         f.x = f.y;
 524     }
 525
 526     f.x += gmx_shfl_down_sync(activemask, f.x, 2);
 527     f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
 528
 529     if (tidxi & 2)
 530     {
 531         f.x = f.z;
 532     }
 533
 534     f.x += gmx_shfl_down_sync(activemask, f.x, 4);
 535
 536     if (tidxi < 3)
 537     {
 538         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 539     }
 540 }
 541
 542 /*! Final i-force reduction; this generic implementation works with
 543  *  arbitrary array sizes.
 544  * TODO: add the tidxi < 3 trick
 545  */
 546 static __forceinline__ __device__
 547 void reduce_force_i_generic(float *f_buf, float3 *fout,
 548                             float *fshift_buf, bool bCalcFshift,
 549                             int tidxi, int tidxj, int aidx)
 550 {
 551     if (tidxj < 3)
 552     {
 553         float f = 0.0f;
 554         for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
 555         {
 556             f += f_buf[tidxj * c_fbufStride + j];
 557         }
 558
 559         atomicAdd(&fout[aidx].x + tidxj, f);
 560
 561         if (bCalcFshift)
 562         {
 563             *fshift_buf += f;
 564         }
 565     }
 566 }
 567
 568 /*! Final i-force reduction; this implementation works only with power of two
 569  *  array sizes.
 570  */
 571 static __forceinline__ __device__
 572 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 573                          float *fshift_buf, bool bCalcFshift,
 574                          int tidxi, int tidxj, int aidx)
 575 {
 576     int     i, j;
 577     float   f;
 578
 579     assert(c_clSize == 1 << c_clSizeLog2);
 580
 581     /* Reduce the initial c_clSize values for each i atom to half
 582      * every step by using c_clSize * i threads.
 583      * Can't just use i as loop variable because than nvcc refuses to unroll.
 584      */
 585     i = c_clSize/2;
 586 #pragma unroll 5
 587     for (j = c_clSizeLog2 - 1; j > 0; j--)
 588     {
 589         if (tidxj < i)
 590         {
 591
 592             f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
 593             f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 594             f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 595         }
 596         i >>= 1;
 597     }
 598
 599     /* i == 1, last reduction step, writing to global mem */
 600     if (tidxj < 3)
 601     {
 602         /* tidxj*c_fbufStride selects x, y or z */
 603         f = f_buf[tidxj * c_fbufStride               + tidxi] +
 604             f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
 605
 606         atomicAdd(&(fout[aidx].x) + tidxj, f);
 607
 608         if (bCalcFshift)
 609         {
 610             *fshift_buf += f;
 611         }
 612     }
 613
 614 }
 615
 616 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 617  *  on whether the size of the array to be reduced is power of two or not.
 618  */
 619 static __forceinline__ __device__
 620 void reduce_force_i(float *f_buf, float3 *f,
 621                     float *fshift_buf, bool bCalcFshift,
 622                     int tidxi, int tidxj, int ai)
 623 {
 624     if ((c_clSize & (c_clSize - 1)))
 625     {
 626         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 627     }
 628     else
 629     {
 630         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 631     }
 632 }
 633
 634 /*! Final i-force reduction; this implementation works only with power of two
 635  *  array sizes.
 636  */
 637 static __forceinline__ __device__
 638 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 639                               float *fshift_buf, bool bCalcFshift,
 640                               int tidxj, int aidx,
 641                               const unsigned int activemask)
 642 {
 643     fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
 644     fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
 645     fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
 646
 647     if (tidxj & 1)
 648     {
 649         fin.x = fin.y;
 650     }
 651
 652     fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
 653     fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 654
 655     if (tidxj & 2)
 656     {
 657         fin.x = fin.z;
 658     }
 659
 660     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 661     if ((tidxj & 3) < 3)
 662     {
 663         atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
 664
 665         if (bCalcFshift)
 666         {
 667             *fshift_buf += fin.x;
 668         }
 669     }
 670 }
 671
 672 /*! Energy reduction; this implementation works only with power of two
 673  *  array sizes.
 674  */
 675 static __forceinline__ __device__
 676 void reduce_energy_pow2(volatile float *buf,
 677                         float *e_lj, float *e_el,
 678                         unsigned int tidx)
 679 {
 680     float        e1, e2;
 681
 682     unsigned int i = warp_size/2;
 683
 684     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 685 #pragma unroll 10
 686     for (int j = warp_size_log2 - 1; j > 0; j--)
 687     {
 688         if (tidx < i)
 689         {
 690             buf[               tidx] += buf[               tidx + i];
 691             buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
 692         }
 693         i >>= 1;
 694     }
 695
 696     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 697     // memory locations to make sense
 698     /* last reduction step, writing to global mem */
 699     if (tidx == 0)
 700     {
 701         e1 = buf[               tidx] + buf[               tidx + i];
 702         e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
 703
 704         atomicAdd(e_lj, e1);
 705         atomicAdd(e_el, e2);
 706     }
 707 }
 708
 709 /*! Energy reduction; this implementation works only with power of two
 710  *  array sizes.
 711  */
 712 static __forceinline__ __device__
 713 void reduce_energy_warp_shfl(float E_lj, float E_el,
 714                              float *e_lj, float *e_el,
 715                              int tidx,
 716                              const unsigned int activemask)
 717 {
 718     int i, sh;
 719
 720     sh = 1;
 721 #pragma unroll 5
 722     for (i = 0; i < 5; i++)
 723     {
 724         E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
 725         E_el += gmx_shfl_down_sync(activemask, E_el, sh);
 726         sh   += sh;
 727     }
 728
 729     /* The first thread in the warp writes the reduced energies */
 730     if (tidx == 0 || tidx == warp_size)
 731     {
 732         atomicAdd(e_lj, E_lj);
 733         atomicAdd(e_el, E_el);
 734     }
 735 }
 736
 737 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */