src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  Utility constant and function declaration for the CUDA non-bonded kernels.
  39  *  This header should be included once at the top level, just before the
  40  *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_mdlib
  44  */
  45 #include <assert.h>
  46
  47 /* Note that floating-point constants in CUDA code should be suffixed
  48  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  49  * code that is in double precision.
  50  */
  51
  52 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  53 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  54 #include "gromacs/gpu_utils/vectype_ops.cuh"
  55
  56 #include "nbnxn_cuda_types.h"
  57
  58 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  59 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  60
  61 /* Use texture objects if supported by the target hardware (and in host pass). */
  62 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
  63 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  64 #define USE_TEXOBJ
  65 #endif
  66
  67 /*! \brief Log of the i and j cluster size.
  68  *  change this together with c_clSize !*/
  69 static const int          c_clSizeLog2  = 3;
  70 /*! \brief Square of cluster size. */
  71 static const int          c_clSizeSq    = c_clSize*c_clSize;
  72 /*! \brief j-cluster size after split (4 in the current implementation). */
  73 static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
  74 /*! \brief Stride in the force accumualation buffer */
  75 static const int          c_fbufStride  = c_clSizeSq;
  76 /*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
  77 static const unsigned     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
  78
  79 static const float        c_oneSixth    = 0.16666667f;
  80 static const float        c_oneTwelveth = 0.08333333f;
  81
  82
  83 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
  84 static __forceinline__ __device__
  85 void convert_sigma_epsilon_to_c6_c12(const float  sigma,
  86                                      const float  epsilon,
  87                                      float       *c6,
  88                                      float       *c12)
  89 {
  90     float sigma2, sigma6;
  91
  92     sigma2 = sigma * sigma;
  93     sigma6 = sigma2 *sigma2 * sigma2;
  94     *c6    = epsilon * sigma6;
  95     *c12   = *c6 * sigma6;
  96 }
  97
  98 /*! Apply force switch,  force + energy version. */
  99 static __forceinline__ __device__
 100 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
 101                               float               c6,
 102                               float               c12,
 103                               float               inv_r,
 104                               float               r2,
 105                               float              *F_invr)
 106 {
 107     float r, r_switch;
 108
 109     /* force switch constants */
 110     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 111     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 112     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 113     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 114
 115     r         = r2 * inv_r;
 116     r_switch  = r - nbparam.rvdw_switch;
 117     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 118
 119     *F_invr  +=
 120         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 121         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 122 }
 123
 124 /*! Apply force switch, force-only version. */
 125 static __forceinline__ __device__
 126 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 127                                 float               c6,
 128                                 float               c12,
 129                                 float               inv_r,
 130                                 float               r2,
 131                                 float              *F_invr,
 132                                 float              *E_lj)
 133 {
 134     float r, r_switch;
 135
 136     /* force switch constants */
 137     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 138     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 139     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 140     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 141
 142     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 143     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 144     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 145     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 146
 147     r         = r2 * inv_r;
 148     r_switch  = r - nbparam.rvdw_switch;
 149     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 150
 151     *F_invr  +=
 152         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 153         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 154     *E_lj    +=
 155         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 156         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 157 }
 158
 159 /*! Apply potential switch, force-only version. */
 160 static __forceinline__ __device__
 161 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 162                                   float               inv_r,
 163                                   float               r2,
 164                                   float              *F_invr,
 165                                   float              *E_lj)
 166 {
 167     float r, r_switch;
 168     float sw, dsw;
 169
 170     /* potential switch constants */
 171     float switch_V3 = nbparam.vdw_switch.c3;
 172     float switch_V4 = nbparam.vdw_switch.c4;
 173     float switch_V5 = nbparam.vdw_switch.c5;
 174     float switch_F2 = 3*nbparam.vdw_switch.c3;
 175     float switch_F3 = 4*nbparam.vdw_switch.c4;
 176     float switch_F4 = 5*nbparam.vdw_switch.c5;
 177
 178     r        = r2 * inv_r;
 179     r_switch = r - nbparam.rvdw_switch;
 180
 181     /* Unlike in the F+E kernel, conditional is faster here */
 182     if (r_switch > 0.0f)
 183     {
 184         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 185         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 186
 187         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 188     }
 189 }
 190
 191 /*! Apply potential switch, force + energy version. */
 192 static __forceinline__ __device__
 193 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 194                                     float               inv_r,
 195                                     float               r2,
 196                                     float              *F_invr,
 197                                     float              *E_lj)
 198 {
 199     float r, r_switch;
 200     float sw, dsw;
 201
 202     /* potential switch constants */
 203     float switch_V3 = nbparam.vdw_switch.c3;
 204     float switch_V4 = nbparam.vdw_switch.c4;
 205     float switch_V5 = nbparam.vdw_switch.c5;
 206     float switch_F2 = 3*nbparam.vdw_switch.c3;
 207     float switch_F3 = 4*nbparam.vdw_switch.c4;
 208     float switch_F4 = 5*nbparam.vdw_switch.c5;
 209
 210     r        = r2 * inv_r;
 211     r_switch = r - nbparam.rvdw_switch;
 212     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 213
 214     /* Unlike in the F-only kernel, masking is faster here */
 215     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 216     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 217
 218     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 219     *E_lj   *= sw;
 220 }
 221
 222
 223 /*! \brief Fetch C6 grid contribution coefficients and return the product of these.
 224  *
 225  *  Depending on what is supported, it fetches parameters either
 226  *  using direct load, texture objects, or texrefs.
 227  */
 228 static __forceinline__ __device__
 229 float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
 230                                 int                typei,
 231                                 int                typej)
 232 {
 233 #if DISABLE_CUDA_TEXTURES
 234     return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
 235 #else
 236 #ifdef USE_TEXOBJ
 237     return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 238 #else
 239     return tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 240 #endif /* USE_TEXOBJ */
 241 #endif /* DISABLE_CUDA_TEXTURES */
 242 }
 243
 244
 245 /*! Calculate LJ-PME grid force contribution with
 246  *  geometric combination rule.
 247  */
 248 static __forceinline__ __device__
 249 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 250                                     int                typei,
 251                                     int                typej,
 252                                     float              r2,
 253                                     float              inv_r2,
 254                                     float              lje_coeff2,
 255                                     float              lje_coeff6_6,
 256                                     float             *F_invr)
 257 {
 258     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 259
 260     c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 261
 262     /* Recalculate inv_r6 without exclusion mask */
 263     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 264     cr2       = lje_coeff2*r2;
 265     expmcr2   = expf(-cr2);
 266     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 267
 268     /* Subtract the grid force from the total LJ force */
 269     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 270 }
 271
 272
 273 /*! Calculate LJ-PME grid force + energy contribution with
 274  *  geometric combination rule.
 275  */
 276 static __forceinline__ __device__
 277 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 278                                       int                typei,
 279                                       int                typej,
 280                                       float              r2,
 281                                       float              inv_r2,
 282                                       float              lje_coeff2,
 283                                       float              lje_coeff6_6,
 284                                       float              int_bit,
 285                                       float             *F_invr,
 286                                       float             *E_lj)
 287 {
 288     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 289
 290     c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
 291
 292     /* Recalculate inv_r6 without exclusion mask */
 293     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 294     cr2       = lje_coeff2*r2;
 295     expmcr2   = expf(-cr2);
 296     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 297
 298     /* Subtract the grid force from the total LJ force */
 299     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 300
 301     /* Shift should be applied only to real LJ pairs */
 302     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 303     *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 304 }
 305
 306 /*! Fetch per-type LJ parameters.
 307  *
 308  *  Depending on what is supported, it fetches parameters either
 309  *  using direct load, texture objects, or texrefs.
 310  */
 311 static __forceinline__ __device__
 312 float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
 313                               int                type)
 314 {
 315     float2 c6c12;
 316 #if DISABLE_CUDA_TEXTURES
 317     /* Force an 8-byte fetch to save a memory instruction. */
 318     float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
 319     c6c12 = LDG(&nbfp_comb[type]);
 320 #else
 321     /* NOTE: as we always do 8-byte aligned loads, we could
 322        fetch float2 here too just as above. */
 323 #ifdef USE_TEXOBJ
 324     c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
 325     c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
 326 #else
 327     c6c12.x = tex1Dfetch(nbfp_comb_texref, 2*type);
 328     c6c12.y = tex1Dfetch(nbfp_comb_texref, 2*type + 1);
 329 #endif /* USE_TEXOBJ */
 330 #endif /* DISABLE_CUDA_TEXTURES */
 331
 332     return c6c12;
 333 }
 334
 335
 336 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 337  *  Lorentz-Berthelot combination rule.
 338  *  We use a single F+E kernel with conditional because the performance impact
 339  *  of this is pretty small and LB on the CPU is anyway very slow.
 340  */
 341 static __forceinline__ __device__
 342 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 343                                     int                typei,
 344                                     int                typej,
 345                                     float              r2,
 346                                     float              inv_r2,
 347                                     float              lje_coeff2,
 348                                     float              lje_coeff6_6,
 349                                     float              int_bit,
 350                                     float             *F_invr,
 351                                     float             *E_lj)
 352 {
 353     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 354     float sigma, sigma2, epsilon;
 355
 356     /* sigma and epsilon are scaled to give 6*C6 */
 357     float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
 358     float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
 359
 360     sigma   = c6c12_i.x + c6c12_j.x;
 361     epsilon = c6c12_i.y * c6c12_j.y;
 362
 363     sigma2  = sigma*sigma;
 364     c6grid  = epsilon*sigma2*sigma2*sigma2;
 365
 366     /* Recalculate inv_r6 without exclusion mask */
 367     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 368     cr2       = lje_coeff2*r2;
 369     expmcr2   = expf(-cr2);
 370     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 371
 372     /* Subtract the grid force from the total LJ force */
 373     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 374
 375     if (E_lj != NULL)
 376     {
 377         float sh_mask;
 378
 379         /* Shift should be applied only to real LJ pairs */
 380         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 381         *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 382     }
 383 }
 384
 385
 386 /*! Fetch two consecutive values from the Ewald correction F*r table.
 387  *
 388  *  Depending on what is supported, it fetches parameters either
 389  *  using direct load, texture objects, or texrefs.
 390  */
 391 static __forceinline__ __device__
 392 float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
 393                              int                index)
 394 {
 395     float2 d;
 396
 397 #if DISABLE_CUDA_TEXTURES
 398     /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
 399     d.x = LDG(&nbparam.coulomb_tab[index]);
 400     d.y = LDG(&nbparam.coulomb_tab[index + 1]);
 401 #else
 402 #ifdef USE_TEXOBJ
 403     d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
 404     d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
 405 #else
 406     d.x =  tex1Dfetch(coulomb_tab_texref, index);
 407     d.y =  tex1Dfetch(coulomb_tab_texref, index + 1);
 408 #endif // USE_TEXOBJ
 409 #endif // DISABLE_CUDA_TEXTURES
 410
 411     return d;
 412 }
 413
 414 /*! Linear interpolation using exactly two FMA operations.
 415  *
 416  *  Implements numeric equivalent of: (1-t)*d0 + t*d1
 417  *  Note that CUDA does not have fnms, otherwise we'd use
 418  *  fma(t, d1, fnms(t, d0, d0)
 419  *  but input modifiers are designed for this and are fast.
 420  */
 421 template <typename T>
 422 __forceinline__ __host__ __device__
 423 T lerp(T d0, T d1, T t)
 424 {
 425     return fma(t, d1, fma(-t, d0, d0));
 426 }
 427
 428 /*! Interpolate Ewald coulomb force correction using the F*r table.
 429  */
 430 static __forceinline__ __device__
 431 float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
 432                                   float              r)
 433 {
 434     float  normalized = nbparam.coulomb_tab_scale * r;
 435     int    index      = (int) normalized;
 436     float  fraction   = normalized - index;
 437
 438     float2 d01 = fetch_coulomb_force_r(nbparam, index);
 439
 440     return lerp(d01.x, d01.y, fraction);
 441 }
 442
 443 /*! Fetch C6 and C12 from the parameter table.
 444  *
 445  *  Depending on what is supported, it fetches parameters either
 446  *  using direct load, texture objects, or texrefs.
 447  */
 448 static __forceinline__ __device__
 449 void fetch_nbfp_c6_c12(float               &c6,
 450                        float               &c12,
 451                        const cu_nbparam_t   nbparam,
 452                        int                  baseIndex)
 453 {
 454 #if DISABLE_CUDA_TEXTURES
 455     /* Force an 8-byte fetch to save a memory instruction. */
 456     float2 *nbfp = (float2 *)nbparam.nbfp;
 457     float2  c6c12;
 458     c6c12 = LDG(&nbfp[baseIndex]);
 459     c6    = c6c12.x;
 460     c12   = c6c12.y;
 461 #else
 462     /* NOTE: as we always do 8-byte aligned loads, we could
 463        fetch float2 here too just as above. */
 464 #ifdef USE_TEXOBJ
 465     c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
 466     c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
 467 #else
 468     c6  = tex1Dfetch(nbfp_texref, 2*baseIndex);
 469     c12 = tex1Dfetch(nbfp_texref, 2*baseIndex + 1);
 470 #endif
 471 #endif // DISABLE_CUDA_TEXTURES
 472 }
 473
 474
 475 /*! Calculate analytical Ewald correction term. */
 476 static __forceinline__ __device__
 477 float pmecorrF(float z2)
 478 {
 479     const float FN6 = -1.7357322914161492954e-8f;
 480     const float FN5 = 1.4703624142580877519e-6f;
 481     const float FN4 = -0.000053401640219807709149f;
 482     const float FN3 = 0.0010054721316683106153f;
 483     const float FN2 = -0.019278317264888380590f;
 484     const float FN1 = 0.069670166153766424023f;
 485     const float FN0 = -0.75225204789749321333f;
 486
 487     const float FD4 = 0.0011193462567257629232f;
 488     const float FD3 = 0.014866955030185295499f;
 489     const float FD2 = 0.11583842382862377919f;
 490     const float FD1 = 0.50736591960530292870f;
 491     const float FD0 = 1.0f;
 492
 493     float       z4;
 494     float       polyFN0, polyFN1, polyFD0, polyFD1;
 495
 496     z4          = z2*z2;
 497
 498     polyFD0     = FD4*z4 + FD2;
 499     polyFD1     = FD3*z4 + FD1;
 500     polyFD0     = polyFD0*z4 + FD0;
 501     polyFD0     = polyFD1*z2 + polyFD0;
 502
 503     polyFD0     = 1.0f/polyFD0;
 504
 505     polyFN0     = FN6*z4 + FN4;
 506     polyFN1     = FN5*z4 + FN3;
 507     polyFN0     = polyFN0*z4 + FN2;
 508     polyFN1     = polyFN1*z4 + FN1;
 509     polyFN0     = polyFN0*z4 + FN0;
 510     polyFN0     = polyFN1*z2 + polyFN0;
 511
 512     return polyFN0*polyFD0;
 513 }
 514
 515 /*! Final j-force reduction; this generic implementation works with
 516  *  arbitrary array sizes.
 517  */
 518 static __forceinline__ __device__
 519 void reduce_force_j_generic(float *f_buf, float3 *fout,
 520                             int tidxi, int tidxj, int aidx)
 521 {
 522     if (tidxi < 3)
 523     {
 524         float f = 0.0f;
 525         for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
 526         {
 527             f += f_buf[c_fbufStride * tidxi + j];
 528         }
 529
 530         atomicAdd((&fout[aidx].x)+tidxi, f);
 531     }
 532 }
 533
 534 /*! Final j-force reduction; this implementation only with power of two
 535  *  array sizes and with sm >= 3.0
 536  */
 537 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 538 static __forceinline__ __device__
 539 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 540                               int tidxi, int aidx,
 541                               const unsigned int activemask)
 542 {
 543     f.x += gmx_shfl_down_sync(activemask, f.x, 1);
 544     f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
 545     f.z += gmx_shfl_down_sync(activemask, f.z, 1);
 546
 547     if (tidxi & 1)
 548     {
 549         f.x = f.y;
 550     }
 551
 552     f.x += gmx_shfl_down_sync(activemask, f.x, 2);
 553     f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
 554
 555     if (tidxi & 2)
 556     {
 557         f.x = f.z;
 558     }
 559
 560     f.x += gmx_shfl_down_sync(activemask, f.x, 4);
 561
 562     if (tidxi < 3)
 563     {
 564         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 565     }
 566 }
 567 #endif
 568
 569 /*! Final i-force reduction; this generic implementation works with
 570  *  arbitrary array sizes.
 571  * TODO: add the tidxi < 3 trick
 572  */
 573 static __forceinline__ __device__
 574 void reduce_force_i_generic(float *f_buf, float3 *fout,
 575                             float *fshift_buf, bool bCalcFshift,
 576                             int tidxi, int tidxj, int aidx)
 577 {
 578     if (tidxj < 3)
 579     {
 580         float f = 0.0f;
 581         for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
 582         {
 583             f += f_buf[tidxj * c_fbufStride + j];
 584         }
 585
 586         atomicAdd(&fout[aidx].x + tidxj, f);
 587
 588         if (bCalcFshift)
 589         {
 590             *fshift_buf += f;
 591         }
 592     }
 593 }
 594
 595 /*! Final i-force reduction; this implementation works only with power of two
 596  *  array sizes.
 597  */
 598 static __forceinline__ __device__
 599 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 600                          float *fshift_buf, bool bCalcFshift,
 601                          int tidxi, int tidxj, int aidx)
 602 {
 603     int     i, j;
 604     float   f;
 605
 606     assert(c_clSize == 1 << c_clSizeLog2);
 607
 608     /* Reduce the initial c_clSize values for each i atom to half
 609      * every step by using c_clSize * i threads.
 610      * Can't just use i as loop variable because than nvcc refuses to unroll.
 611      */
 612     i = c_clSize/2;
 613 #pragma unroll 5
 614     for (j = c_clSizeLog2 - 1; j > 0; j--)
 615     {
 616         if (tidxj < i)
 617         {
 618
 619             f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
 620             f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 621             f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
 622         }
 623         i >>= 1;
 624     }
 625
 626     /* i == 1, last reduction step, writing to global mem */
 627     if (tidxj < 3)
 628     {
 629         /* tidxj*c_fbufStride selects x, y or z */
 630         f = f_buf[tidxj * c_fbufStride               + tidxi] +
 631             f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
 632
 633         atomicAdd(&(fout[aidx].x) + tidxj, f);
 634
 635         if (bCalcFshift)
 636         {
 637             *fshift_buf += f;
 638         }
 639     }
 640
 641 }
 642
 643 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 644  *  on whether the size of the array to be reduced is power of two or not.
 645  */
 646 static __forceinline__ __device__
 647 void reduce_force_i(float *f_buf, float3 *f,
 648                     float *fshift_buf, bool bCalcFshift,
 649                     int tidxi, int tidxj, int ai)
 650 {
 651     if ((c_clSize & (c_clSize - 1)))
 652     {
 653         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 654     }
 655     else
 656     {
 657         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 658     }
 659 }
 660
 661 /*! Final i-force reduction; this implementation works only with power of two
 662  *  array sizes and with sm >= 3.0
 663  */
 664 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 665 static __forceinline__ __device__
 666 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 667                               float *fshift_buf, bool bCalcFshift,
 668                               int tidxj, int aidx,
 669                               const unsigned int activemask)
 670 {
 671     fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
 672     fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
 673     fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
 674
 675     if (tidxj & 1)
 676     {
 677         fin.x = fin.y;
 678     }
 679
 680     fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
 681     fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
 682
 683     if (tidxj & 2)
 684     {
 685         fin.x = fin.z;
 686     }
 687
 688     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 689     if ((tidxj & 3) < 3)
 690     {
 691         atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);
 692
 693         if (bCalcFshift)
 694         {
 695             *fshift_buf += fin.x;
 696         }
 697     }
 698 }
 699 #endif
 700
 701 /*! Energy reduction; this implementation works only with power of two
 702  *  array sizes.
 703  */
 704 static __forceinline__ __device__
 705 void reduce_energy_pow2(volatile float *buf,
 706                         float *e_lj, float *e_el,
 707                         unsigned int tidx)
 708 {
 709     int     i, j;
 710     float   e1, e2;
 711
 712     i = warp_size/2;
 713
 714     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 715 #pragma unroll 10
 716     for (j = warp_size_log2 - 1; j > 0; j--)
 717     {
 718         if (tidx < i)
 719         {
 720             buf[               tidx] += buf[               tidx + i];
 721             buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
 722         }
 723         i >>= 1;
 724     }
 725
 726     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 727     // memory locations to make sense
 728     /* last reduction step, writing to global mem */
 729     if (tidx == 0)
 730     {
 731         e1 = buf[               tidx] + buf[               tidx + i];
 732         e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
 733
 734         atomicAdd(e_lj, e1);
 735         atomicAdd(e_el, e2);
 736     }
 737 }
 738
 739 /*! Energy reduction; this implementation works only with power of two
 740  *  array sizes and with sm >= 3.0
 741  */
 742 #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 743 static __forceinline__ __device__
 744 void reduce_energy_warp_shfl(float E_lj, float E_el,
 745                              float *e_lj, float *e_el,
 746                              int tidx,
 747                              const unsigned int activemask)
 748 {
 749     int i, sh;
 750
 751     sh = 1;
 752 #pragma unroll 5
 753     for (i = 0; i < 5; i++)
 754     {
 755         E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
 756         E_el += gmx_shfl_down_sync(activemask, E_el, sh);
 757         sh   += sh;
 758     }
 759
 760     /* The first thread in the warp writes the reduced energies */
 761     if (tidx == 0 || tidx == warp_size)
 762     {
 763         atomicAdd(e_lj, E_lj);
 764         atomicAdd(e_el, E_el);
 765     }
 766 }
 767 #endif /* GMX_PTX_ARCH */
 768
 769 #undef USE_TEXOBJ
 770
 771 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */