/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

#define WARP_SIZE_POW2_EXPONENT     (5)
#define CL_SIZE_POW2_EXPONENT       (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE                 (CL_SIZE_SQ)
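
/* 1/6 and 1/12 as single-precision constants; used below when scaling
 * C6/C12 force terms into the corresponding energy terms. */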
#define ONE_SIXTH_F     0.16666667f
#define ONE_TWELVETH_F  0.08333333f

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
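/* E.g. with the usual NCL_PER_SUPERCL of 8 (a 2x2x2 super-cluster) the mask is 0xff. */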

/*! Apply force switch, force-only version. */
static inline __device__
void calculate_force_switch_F(const cu_nbparam_t nbparam,
                              float c6, float c12,
                              float inv_r, float r2,
                              float *F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
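    /* Branch-free masking: clamping r_switch at zero makes the whole
     * correction below vanish for r < rvdw_switch. */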
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
}

/*! Apply force switch, force + energy version. */
static inline __device__
void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
                                float c6, float c12,
                                float inv_r, float r2,
                                float *F_invr, float *E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
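    /* The *_F2/*_F3 energy coefficients follow from integrating the force
     * polynomial over (r - r_switch): A*x^2 integrates to (A/3)*x^3 and
     * B*x^3 to (B/4)*x^4, hence the /3 and /4. */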

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
}

/*! Apply potential switch, force-only version. */
static inline __device__
void calculate_potential_switch_F(const cu_nbparam_t nbparam,
                                  float c6, float c12,
                                  float inv_r, float r2,
                                  float *F_invr, float *E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;
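    /* switch_F2..F4 are the r-derivative coefficients of the switch polynomial:
     * d/dx (c3*x^3 + c4*x^4 + c5*x^5) = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4. */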

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    }
}

/*! Apply potential switch, force + energy version. */
static inline __device__
void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
                                    float c6, float c12,
                                    float inv_r, float r2,
                                    float *F_invr, float *E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    *E_lj  *= sw;
}

/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
static inline __device__
void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float *F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

#ifdef USE_TEXOBJ
    c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
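    /* expmcr2*poly = exp(-x)*(1 + x + x^2/2) with x = cr2 = (lje_coeff*r)^2,
     * i.e. the factor that damps the 1/r^6 grid interaction in LJ-PME. */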

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
}

/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
static inline __device__
void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
                                      int typei, int typej,
                                      float r2, float inv_r2,
                                      float lje_coeff2, float lje_coeff6_6,
                                      float int_bit,
                                      float *F_invr, float *E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

#ifdef USE_TEXOBJ
    c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
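    /* (int_bit, set by the calling kernel, is 1.0f for interacting pairs
     * and 0.0f for excluded ones, so it masks out the shift.) */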
    sh_mask = nbparam.sh_lj_ewald*int_bit;
    *E_lj  += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
}

/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
static inline __device__
void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float int_bit,
                                    float *F_invr, float *E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
#ifdef USE_TEXOBJ
    sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
    epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
#else
    sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
    epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
#endif /* USE_TEXOBJ */
    sigma2 = sigma*sigma;
    c6grid = epsilon*sigma2*sigma2*sigma2;
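    /* The per-type values are stored pre-scaled, so the plain sum implements
     * the Lorentz rule for sigma and the plain product the Berthelot rule for
     * epsilon; epsilon*sigma^6 then comes out as 6*C6 for the pair. */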

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    if (E_lj != NULL)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald*int_bit;
        *E_lj  += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
    }
}

/*! Interpolate Ewald coulomb force using the table through the coulomb_tab texture.
 *  Original idea: from the OpenMM project
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch(coulomb_tab_texref, index)
           + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
}
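
/* I.e. plain linear interpolation: with x = scale*r, i = floor(x) and
 * f = x - i, the result is (1 - f)*tab[i] + f*tab[i + 1]. */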

#ifdef TEXOBJ_SUPPORTED
static inline __device__
float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                  float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
           fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
}
#endif /* TEXOBJ_SUPPORTED */

/*! Calculate analytical Ewald correction term. */
static inline __device__
float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;
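
    /* Rational (minimax) approximation in z2: a degree-6 numerator over a
     * degree-4 denominator, each evaluated as two interleaved Horner chains
     * in z4 = z2*z2 to expose instruction-level parallelism. */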
    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = z2*z2;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
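
/* In the analytical-Ewald kernels this is evaluated at z2 = (beta*r)^2 and
 * scaled by beta^3, roughly as
 *     F_invr += qi*qj*(int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
 * (a sketch of the call site, which lives in the kernel files, not here). */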

/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxi == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
    }
}

/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
    int i;

    for (i = 0; i < 3; i++)
    {
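        /* Tree reduction with shuffle offsets 1, 2, 4: sums f over the
         * CL_SIZE (8) consecutive lanes that share one j atom, leaving the
         * full sum in the lane with tidxi == 0. */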
        f.x += __shfl_down(f.x, 1<<i);
        f.y += __shfl_down(f.y, 1<<i);
        f.z += __shfl_down(f.z, 1<<i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif

/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxj == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int    i, j;
    float3 f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
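            /* CL_SIZE*i active threads fold the upper half of the remaining
             * rows onto the lower half, separately for x, y and z. */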
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
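    /* (n & (n - 1)) is non-zero exactly when n is not a power of two; since
     * CL_SIZE is a compile-time constant, the compiler resolves this branch. */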
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
    int j;

    for (j = 0; j < 2; j++)
    {
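        /* Shuffle offsets CL_SIZE and 2*CL_SIZE (8 and 16 with CL_SIZE == 8)
         * fold the j-rows that share a warp onto its first row of lanes. */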
        fin.x += __shfl_down(fin.x, CL_SIZE<<j);
        fin.y += __shfl_down(fin.y, CL_SIZE<<j);
        fin.z += __shfl_down(fin.z, CL_SIZE<<j);
    }

    /* The first thread in the warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int   i, j;
    float e1, e2;

    i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
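            /* LJ energies are accumulated at offset 0, electrostatic energies
             * at offset FBUF_STRIDE; the two halves are reduced in lock-step. */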
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}

/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int i, sh;

    sh = 1;
    for (i = 0; i < 5; i++)
    {
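        /* Classic warp tree reduction: offsets 1, 2, 4, 8, 16 leave the
         * warp-wide sums in the first lane of each warp. */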
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh   += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ */

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */