src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,  by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* Note that floating-point constants in CUDA code should be suffixed
  37  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  38  * code that is in double precision.
  39  */
  40 #include "config.h"
  41
  42 #include "gromacs/gmxlib/cuda_tools/vectype_ops.cuh"
  43
  44 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  45 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  46
  47
  48 #if defined HAVE_CUDA_TEXOBJ_SUPPORT && __CUDA_ARCH__ >= 300
  49 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  50 #define USE_TEXOBJ
  51 #endif
  52
  53 #define WARP_SIZE_POW2_EXPONENT     (5)
  54 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  55 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  56 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  57
  58 #define ONE_SIXTH_F     0.16666667f
  59 #define ONE_TWELVETH_F  0.08333333f
  60
  61
  62 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  63 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  64
  65 /*! Apply force switch,  force + energy version. */
  66 static inline __device__
  67 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  68                               float               c6,
  69                               float               c12,
  70                               float               inv_r,
  71                               float               r2,
  72                               float              *F_invr)
  73 {
  74     float r, r_switch;
  75
  76     /* force switch constants */
  77     float disp_shift_V2 = nbparam.dispersion_shift.c2;
  78     float disp_shift_V3 = nbparam.dispersion_shift.c3;
  79     float repu_shift_V2 = nbparam.repulsion_shift.c2;
  80     float repu_shift_V3 = nbparam.repulsion_shift.c3;
  81
  82     r         = r2 * inv_r;
  83     r_switch  = r - nbparam.rvdw_switch;
  84     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
  85
  86     *F_invr  +=
  87         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
  88         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
  89 }
  90
  91 /*! Apply force switch, force-only version. */
  92 static inline __device__
  93 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
  94                                 float               c6,
  95                                 float               c12,
  96                                 float               inv_r,
  97                                 float               r2,
  98                                 float              *F_invr,
  99                                 float              *E_lj)
 100 {
 101     float r, r_switch;
 102
 103     /* force switch constants */
 104     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 105     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 106     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 107     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 108
 109     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 110     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 111     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 112     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 113
 114     r         = r2 * inv_r;
 115     r_switch  = r - nbparam.rvdw_switch;
 116     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 117
 118     *F_invr  +=
 119         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 120         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 121     *E_lj    +=
 122         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 123         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 124 }
 125
 126 /*! Apply potential switch, force-only version. */
 127 static inline __device__
 128 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 129                                   float               c6,
 130                                   float               c12,
 131                                   float               inv_r,
 132                                   float               r2,
 133                                   float              *F_invr,
 134                                   float              *E_lj)
 135 {
 136     float r, r_switch;
 137     float sw, dsw;
 138
 139     /* potential switch constants */
 140     float switch_V3 = nbparam.vdw_switch.c3;
 141     float switch_V4 = nbparam.vdw_switch.c4;
 142     float switch_V5 = nbparam.vdw_switch.c5;
 143     float switch_F2 = 3*nbparam.vdw_switch.c3;
 144     float switch_F3 = 4*nbparam.vdw_switch.c4;
 145     float switch_F4 = 5*nbparam.vdw_switch.c5;
 146
 147     r        = r2 * inv_r;
 148     r_switch = r - nbparam.rvdw_switch;
 149
 150     /* Unlike in the F+E kernel, conditional is faster here */
 151     if (r_switch > 0.0f)
 152     {
 153         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 154         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 155
 156         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 157     }
 158 }
 159
 160 /*! Apply potential switch, force + energy version. */
 161 static inline __device__
 162 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 163                                     float               c6,
 164                                     float               c12,
 165                                     float               inv_r,
 166                                     float               r2,
 167                                     float              *F_invr,
 168                                     float              *E_lj)
 169 {
 170     float r, r_switch;
 171     float sw, dsw;
 172
 173     /* potential switch constants */
 174     float switch_V3 = nbparam.vdw_switch.c3;
 175     float switch_V4 = nbparam.vdw_switch.c4;
 176     float switch_V5 = nbparam.vdw_switch.c5;
 177     float switch_F2 = 3*nbparam.vdw_switch.c3;
 178     float switch_F3 = 4*nbparam.vdw_switch.c4;
 179     float switch_F4 = 5*nbparam.vdw_switch.c5;
 180
 181     r        = r2 * inv_r;
 182     r_switch = r - nbparam.rvdw_switch;
 183     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 184
 185     /* Unlike in the F-only kernel, masking is faster here */
 186     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 187     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 188
 189     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 190     *E_lj   *= sw;
 191 }
 192
 193 /*! Calculate LJ-PME grid force contribution with
 194  *  geometric combination rule.
 195  */
 196 static inline __device__
 197 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 198                                     int                typei,
 199                                     int                typej,
 200                                     float              r2,
 201                                     float              inv_r2,
 202                                     float              lje_coeff2,
 203                                     float              lje_coeff6_6,
 204                                     float             *F_invr)
 205 {
 206     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 207
 208 #ifdef USE_TEXOBJ
 209     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 210 #else
 211     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 212 #endif /* USE_TEXOBJ */
 213
 214     /* Recalculate inv_r6 without exclusion mask */
 215     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 216     cr2       = lje_coeff2*r2;
 217     expmcr2   = expf(-cr2);
 218     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 219
 220     /* Subtract the grid force from the total LJ force */
 221     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 222 }
 223
 224 /*! Calculate LJ-PME grid force + energy contribution with
 225  *  geometric combination rule.
 226  */
 227 static inline __device__
 228 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 229                                       int                typei,
 230                                       int                typej,
 231                                       float              r2,
 232                                       float              inv_r2,
 233                                       float              lje_coeff2,
 234                                       float              lje_coeff6_6,
 235                                       float              int_bit,
 236                                       float             *F_invr,
 237                                       float             *E_lj)
 238 {
 239     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 240
 241 #ifdef USE_TEXOBJ
 242     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 243 #else
 244     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 245 #endif /* USE_TEXOBJ */
 246
 247     /* Recalculate inv_r6 without exclusion mask */
 248     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 249     cr2       = lje_coeff2*r2;
 250     expmcr2   = expf(-cr2);
 251     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 252
 253     /* Subtract the grid force from the total LJ force */
 254     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 255
 256     /* Shift should be applied only to real LJ pairs */
 257     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 258     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 259 }
 260
 261 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 262  *  Lorentz-Berthelot combination rule.
 263  *  We use a single F+E kernel with conditional because the performance impact
 264  *  of this is pretty small and LB on the CPU is anyway very slow.
 265  */
 266 static inline __device__
 267 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 268                                     int                typei,
 269                                     int                typej,
 270                                     float              r2,
 271                                     float              inv_r2,
 272                                     float              lje_coeff2,
 273                                     float              lje_coeff6_6,
 274                                     float              int_bit,
 275                                     float             *F_invr,
 276                                     float             *E_lj)
 277 {
 278     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 279     float sigma, sigma2, epsilon;
 280
 281     /* sigma and epsilon are scaled to give 6*C6 */
 282 #ifdef USE_TEXOBJ
 283     sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
 284     epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
 285 #else
 286     sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
 287     epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
 288 #endif /* USE_TEXOBJ */
 289     sigma2  = sigma*sigma;
 290     c6grid  = epsilon*sigma2*sigma2*sigma2;
 291
 292     /* Recalculate inv_r6 without exclusion mask */
 293     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 294     cr2       = lje_coeff2*r2;
 295     expmcr2   = expf(-cr2);
 296     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 297
 298     /* Subtract the grid force from the total LJ force */
 299     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 300
 301     if (E_lj != NULL)
 302     {
 303         float sh_mask;
 304
 305         /* Shift should be applied only to real LJ pairs */
 306         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 307         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 308     }
 309 }
 310
 311 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 312  *  Original idea: from the OpenMM project
 313  */
 314 static inline __device__
 315 float interpolate_coulomb_force_r(float r, float scale)
 316 {
 317     float   normalized = scale * r;
 318     int     index      = (int) normalized;
 319     float   fract2     = normalized - index;
 320     float   fract1     = 1.0f - fract2;
 321
 322     return fract1 * tex1Dfetch(coulomb_tab_texref, index)
 323            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 324 }
 325
 326 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 327 static inline __device__
 328 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
 329                                   float r, float scale)
 330 {
 331     float   normalized = scale * r;
 332     int     index      = (int) normalized;
 333     float   fract2     = normalized - index;
 334     float   fract1     = 1.0f - fract2;
 335
 336     return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
 337            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
 338 }
 339 #endif
 340
 341
 342 /*! Calculate analytical Ewald correction term. */
 343 static inline __device__
 344 float pmecorrF(float z2)
 345 {
 346     const float FN6 = -1.7357322914161492954e-8f;
 347     const float FN5 = 1.4703624142580877519e-6f;
 348     const float FN4 = -0.000053401640219807709149f;
 349     const float FN3 = 0.0010054721316683106153f;
 350     const float FN2 = -0.019278317264888380590f;
 351     const float FN1 = 0.069670166153766424023f;
 352     const float FN0 = -0.75225204789749321333f;
 353
 354     const float FD4 = 0.0011193462567257629232f;
 355     const float FD3 = 0.014866955030185295499f;
 356     const float FD2 = 0.11583842382862377919f;
 357     const float FD1 = 0.50736591960530292870f;
 358     const float FD0 = 1.0f;
 359
 360     float       z4;
 361     float       polyFN0, polyFN1, polyFD0, polyFD1;
 362
 363     z4          = z2*z2;
 364
 365     polyFD0     = FD4*z4 + FD2;
 366     polyFD1     = FD3*z4 + FD1;
 367     polyFD0     = polyFD0*z4 + FD0;
 368     polyFD0     = polyFD1*z2 + polyFD0;
 369
 370     polyFD0     = 1.0f/polyFD0;
 371
 372     polyFN0     = FN6*z4 + FN4;
 373     polyFN1     = FN5*z4 + FN3;
 374     polyFN0     = polyFN0*z4 + FN2;
 375     polyFN1     = polyFN1*z4 + FN1;
 376     polyFN0     = polyFN0*z4 + FN0;
 377     polyFN0     = polyFN1*z2 + polyFN0;
 378
 379     return polyFN0*polyFD0;
 380 }
 381
 382 /*! Final j-force reduction; this generic implementation works with
 383  *  arbitrary array sizes.
 384  */
 385 static inline __device__
 386 void reduce_force_j_generic(float *f_buf, float3 *fout,
 387                             int tidxi, int tidxj, int aidx)
 388 {
 389     if (tidxi < 3)
 390     {
 391         float f = 0.0f;
 392         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 393         {
 394             f += f_buf[FBUF_STRIDE * tidxi + j];
 395         }
 396
 397         atomicAdd((&fout[aidx].x)+tidxi, f);
 398     }
 399 }
 400
 401 /*! Final j-force reduction; this implementation only with power of two
 402  *  array sizes and with sm >= 3.0
 403  */
 404 #if __CUDA_ARCH__ >= 300
 405 static inline __device__
 406 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 407                               int tidxi, int aidx)
 408 {
 409     f.x += __shfl_down(f.x, 1);
 410     f.y += __shfl_up  (f.y, 1);
 411     f.z += __shfl_down(f.z, 1);
 412
 413     if (tidxi & 1)
 414     {
 415         f.x = f.y;
 416     }
 417
 418     f.x += __shfl_down(f.x, 2);
 419     f.z += __shfl_up  (f.z, 2);
 420
 421     if (tidxi & 2)
 422     {
 423         f.x = f.z;
 424     }
 425
 426     f.x += __shfl_down(f.x, 4);
 427
 428     if (tidxi < 3)
 429     {
 430         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 431     }
 432 }
 433 #endif
 434
 435 /*! Final i-force reduction; this generic implementation works with
 436  *  arbitrary array sizes.
 437  * TODO: add the tidxi < 3 trick
 438  */
 439 static inline __device__
 440 void reduce_force_i_generic(float *f_buf, float3 *fout,
 441                             float *fshift_buf, bool bCalcFshift,
 442                             int tidxi, int tidxj, int aidx)
 443 {
 444     if (tidxj < 3)
 445     {
 446         float f = 0.0f;
 447         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 448         {
 449             f += f_buf[tidxj * FBUF_STRIDE + j];
 450         }
 451
 452         atomicAdd(&fout[aidx].x + tidxj, f);
 453
 454         if (bCalcFshift)
 455         {
 456             *fshift_buf += f;
 457         }
 458     }
 459 }
 460
 461 /*! Final i-force reduction; this implementation works only with power of two
 462  *  array sizes.
 463  */
 464 static inline __device__
 465 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 466                          float *fshift_buf, bool bCalcFshift,
 467                          int tidxi, int tidxj, int aidx)
 468 {
 469     int     i, j;
 470     float   f;
 471
 472     /* Reduce the initial CL_SIZE values for each i atom to half
 473      * every step by using CL_SIZE * i threads.
 474      * Can't just use i as loop variable because than nvcc refuses to unroll.
 475      */
 476     i = CL_SIZE/2;
 477     # pragma unroll 5
 478     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 479     {
 480         if (tidxj < i)
 481         {
 482
 483             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 484             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 485             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 486         }
 487         i >>= 1;
 488     }
 489
 490     /* i == 1, last reduction step, writing to global mem */
 491     if (tidxj < 3)
 492     {
 493         /* tidxj*FBUF_STRIDE selects x, y or z */
 494         f = f_buf[tidxj * FBUF_STRIDE               + tidxi] +
 495             f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 496
 497         atomicAdd(&(fout[aidx].x) + tidxj, f);
 498
 499         if (bCalcFshift)
 500         {
 501             *fshift_buf += f;
 502         }
 503     }
 504
 505 }
 506
 507 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 508  *  on whether the size of the array to be reduced is power of two or not.
 509  */
 510 static inline __device__
 511 void reduce_force_i(float *f_buf, float3 *f,
 512                     float *fshift_buf, bool bCalcFshift,
 513                     int tidxi, int tidxj, int ai)
 514 {
 515     if ((CL_SIZE & (CL_SIZE - 1)))
 516     {
 517         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 518     }
 519     else
 520     {
 521         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 522     }
 523 }
 524
 525 /*! Final i-force reduction; this implementation works only with power of two
 526  *  array sizes and with sm >= 3.0
 527  */
 528 #if __CUDA_ARCH__ >= 300
 529 static inline __device__
 530 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 531                               float *fshift_buf, bool bCalcFshift,
 532                               int tidxj, int aidx)
 533 {
 534     fin.x += __shfl_down(fin.x, CL_SIZE);
 535     fin.y += __shfl_up  (fin.y, CL_SIZE);
 536     fin.z += __shfl_down(fin.z, CL_SIZE);
 537
 538     if (tidxj & 1)
 539     {
 540         fin.x = fin.y;
 541     }
 542
 543     fin.x += __shfl_down(fin.x, 2*CL_SIZE);
 544     fin.z += __shfl_up  (fin.z, 2*CL_SIZE);
 545
 546     if (tidxj & 2)
 547     {
 548         fin.x = fin.z;
 549     }
 550
 551     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 552     if ((tidxj & 3) < 3)
 553     {
 554         atomicAdd(&fout[aidx].x + (tidxj & ~4), fin.x);
 555
 556         if (bCalcFshift)
 557         {
 558             *fshift_buf += fin.x;
 559         }
 560     }
 561 }
 562 #endif
 563
 564 /*! Energy reduction; this implementation works only with power of two
 565  *  array sizes.
 566  */
 567 static inline __device__
 568 void reduce_energy_pow2(volatile float *buf,
 569                         float *e_lj, float *e_el,
 570                         unsigned int tidx)
 571 {
 572     int     i, j;
 573     float   e1, e2;
 574
 575     i = WARP_SIZE/2;
 576
 577     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 578 # pragma unroll 10
 579     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 580     {
 581         if (tidx < i)
 582         {
 583             buf[              tidx] += buf[              tidx + i];
 584             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 585         }
 586         i >>= 1;
 587     }
 588
 589     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 590     // memory locations to make sense
 591     /* last reduction step, writing to global mem */
 592     if (tidx == 0)
 593     {
 594         e1 = buf[              tidx] + buf[              tidx + i];
 595         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 596
 597         atomicAdd(e_lj, e1);
 598         atomicAdd(e_el, e2);
 599     }
 600 }
 601
 602 /*! Energy reduction; this implementation works only with power of two
 603  *  array sizes and with sm >= 3.0
 604  */
 605 #if __CUDA_ARCH__ >= 300
 606 static inline __device__
 607 void reduce_energy_warp_shfl(float E_lj, float E_el,
 608                              float *e_lj, float *e_el,
 609                              int tidx)
 610 {
 611     int i, sh;
 612
 613     sh = 1;
 614 #pragma unroll 5
 615     for (i = 0; i < 5; i++)
 616     {
 617         E_lj += __shfl_down(E_lj, sh);
 618         E_el += __shfl_down(E_el, sh);
 619         sh   += sh;
 620     }
 621
 622     /* The first thread in the warp writes the reduced energies */
 623     if (tidx == 0 || tidx == WARP_SIZE)
 624     {
 625         atomicAdd(e_lj, E_lj);
 626         atomicAdd(e_el, E_el);
 627     }
 628 }
 629 #endif /* __CUDA_ARCH__ */
 630
 631 #undef USE_TEXOBJ
 632
 633 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */