src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,  by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* Note that floating-point constants in CUDA code should be suffixed
  37  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  38  * code that is in double precision.
  39  */
  40 #include "config.h"
  41
  42 #include "gromacs/gmxlib/cuda_tools/vectype_ops.cuh"
  43
  44 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  45 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  46
  47 #define WARP_SIZE_POW2_EXPONENT     (5)
  48 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  49 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  50 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  51
  52 #define ONE_SIXTH_F     0.16666667f
  53 #define ONE_TWELVETH_F  0.08333333f
  54
  55
  56 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  57 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  58
  59 /*! Apply force switch,  force + energy version. */
  60 static inline __device__
  61 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  62                               float               c6,
  63                               float               c12,
  64                               float               inv_r,
  65                               float               r2,
  66                               float              *F_invr)
  67 {
  68     float r, r_switch;
  69
  70     /* force switch constants */
  71     float disp_shift_V2 = nbparam.dispersion_shift.c2;
  72     float disp_shift_V3 = nbparam.dispersion_shift.c3;
  73     float repu_shift_V2 = nbparam.repulsion_shift.c2;
  74     float repu_shift_V3 = nbparam.repulsion_shift.c3;
  75
  76     r         = r2 * inv_r;
  77     r_switch  = r - nbparam.rvdw_switch;
  78     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
  79
  80     *F_invr  +=
  81         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
  82         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
  83 }
  84
  85 /*! Apply force switch, force-only version. */
  86 static inline __device__
  87 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
  88                                 float               c6,
  89                                 float               c12,
  90                                 float               inv_r,
  91                                 float               r2,
  92                                 float              *F_invr,
  93                                 float              *E_lj)
  94 {
  95     float r, r_switch;
  96
  97     /* force switch constants */
  98     float disp_shift_V2 = nbparam.dispersion_shift.c2;
  99     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 100     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 101     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 102
 103     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 104     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 105     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 106     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 107
 108     r         = r2 * inv_r;
 109     r_switch  = r - nbparam.rvdw_switch;
 110     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 111
 112     *F_invr  +=
 113         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 114         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 115     *E_lj    +=
 116         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 117         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 118 }
 119
 120 /*! Apply potential switch, force-only version. */
 121 static inline __device__
 122 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 123                                   float               c6,
 124                                   float               c12,
 125                                   float               inv_r,
 126                                   float               r2,
 127                                   float              *F_invr,
 128                                   float              *E_lj)
 129 {
 130     float r, r_switch;
 131     float sw, dsw;
 132
 133     /* potential switch constants */
 134     float switch_V3 = nbparam.vdw_switch.c3;
 135     float switch_V4 = nbparam.vdw_switch.c4;
 136     float switch_V5 = nbparam.vdw_switch.c5;
 137     float switch_F2 = 3*nbparam.vdw_switch.c3;
 138     float switch_F3 = 4*nbparam.vdw_switch.c4;
 139     float switch_F4 = 5*nbparam.vdw_switch.c5;
 140
 141     r        = r2 * inv_r;
 142     r_switch = r - nbparam.rvdw_switch;
 143
 144     /* Unlike in the F+E kernel, conditional is faster here */
 145     if (r_switch > 0.0f)
 146     {
 147         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 148         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 149
 150         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 151     }
 152 }
 153
 154 /*! Apply potential switch, force + energy version. */
 155 static inline __device__
 156 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 157                                     float               c6,
 158                                     float               c12,
 159                                     float               inv_r,
 160                                     float               r2,
 161                                     float              *F_invr,
 162                                     float              *E_lj)
 163 {
 164     float r, r_switch;
 165     float sw, dsw;
 166
 167     /* potential switch constants */
 168     float switch_V3 = nbparam.vdw_switch.c3;
 169     float switch_V4 = nbparam.vdw_switch.c4;
 170     float switch_V5 = nbparam.vdw_switch.c5;
 171     float switch_F2 = 3*nbparam.vdw_switch.c3;
 172     float switch_F3 = 4*nbparam.vdw_switch.c4;
 173     float switch_F4 = 5*nbparam.vdw_switch.c5;
 174
 175     r        = r2 * inv_r;
 176     r_switch = r - nbparam.rvdw_switch;
 177     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 178
 179     /* Unlike in the F-only kernel, masking is faster here */
 180     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 181     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 182
 183     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 184     *E_lj   *= sw;
 185 }
 186
 187 /*! Calculate LJ-PME grid force contribution with
 188  *  geometric combination rule.
 189  */
 190 static inline __device__
 191 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 192                                     int                typei,
 193                                     int                typej,
 194                                     float              r2,
 195                                     float              inv_r2,
 196                                     float              lje_coeff2,
 197                                     float              lje_coeff6_6,
 198                                     float             *F_invr)
 199 {
 200     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 201
 202 #ifdef USE_TEXOBJ
 203     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 204 #else
 205     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 206 #endif /* USE_TEXOBJ */
 207
 208     /* Recalculate inv_r6 without exclusion mask */
 209     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 210     cr2       = lje_coeff2*r2;
 211     expmcr2   = expf(-cr2);
 212     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 213
 214     /* Subtract the grid force from the total LJ force */
 215     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 216 }
 217
 218 /*! Calculate LJ-PME grid force + energy contribution with
 219  *  geometric combination rule.
 220  */
 221 static inline __device__
 222 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 223                                       int                typei,
 224                                       int                typej,
 225                                       float              r2,
 226                                       float              inv_r2,
 227                                       float              lje_coeff2,
 228                                       float              lje_coeff6_6,
 229                                       float              int_bit,
 230                                       float             *F_invr,
 231                                       float             *E_lj)
 232 {
 233     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 234
 235 #ifdef USE_TEXOBJ
 236     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 237 #else
 238     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 239 #endif /* USE_TEXOBJ */
 240
 241     /* Recalculate inv_r6 without exclusion mask */
 242     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 243     cr2       = lje_coeff2*r2;
 244     expmcr2   = expf(-cr2);
 245     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 246
 247     /* Subtract the grid force from the total LJ force */
 248     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 249
 250     /* Shift should be applied only to real LJ pairs */
 251     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 252     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 253 }
 254
 255 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 256  *  Lorentz-Berthelot combination rule.
 257  *  We use a single F+E kernel with conditional because the performance impact
 258  *  of this is pretty small and LB on the CPU is anyway very slow.
 259  */
 260 static inline __device__
 261 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 262                                     int                typei,
 263                                     int                typej,
 264                                     float              r2,
 265                                     float              inv_r2,
 266                                     float              lje_coeff2,
 267                                     float              lje_coeff6_6,
 268                                     float              int_bit,
 269                                     float             *F_invr,
 270                                     float             *E_lj)
 271 {
 272     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 273     float sigma, sigma2, epsilon;
 274
 275     /* sigma and epsilon are scaled to give 6*C6 */
 276 #ifdef USE_TEXOBJ
 277     sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
 278     epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
 279 #else
 280     sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
 281     epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
 282 #endif /* USE_TEXOBJ */
 283     sigma2  = sigma*sigma;
 284     c6grid  = epsilon*sigma2*sigma2*sigma2;
 285
 286     /* Recalculate inv_r6 without exclusion mask */
 287     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 288     cr2       = lje_coeff2*r2;
 289     expmcr2   = expf(-cr2);
 290     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 291
 292     /* Subtract the grid force from the total LJ force */
 293     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 294
 295     if (E_lj != NULL)
 296     {
 297         float sh_mask;
 298
 299         /* Shift should be applied only to real LJ pairs */
 300         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 301         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 302     }
 303 }
 304
 305 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 306  *  Original idea: from the OpenMM project
 307  */
 308 static inline __device__
 309 float interpolate_coulomb_force_r(float r, float scale)
 310 {
 311     float   normalized = scale * r;
 312     int     index      = (int) normalized;
 313     float   fract2     = normalized - index;
 314     float   fract1     = 1.0f - fract2;
 315
 316     return fract1 * tex1Dfetch(coulomb_tab_texref, index)
 317            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 318 }
 319
 320 #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
 321 static inline __device__
 322 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
 323                                   float r, float scale)
 324 {
 325     float   normalized = scale * r;
 326     int     index      = (int) normalized;
 327     float   fract2     = normalized - index;
 328     float   fract1     = 1.0f - fract2;
 329
 330     return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
 331            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
 332 }
 333 #endif
 334
 335
 336 /*! Calculate analytical Ewald correction term. */
 337 static inline __device__
 338 float pmecorrF(float z2)
 339 {
 340     const float FN6 = -1.7357322914161492954e-8f;
 341     const float FN5 = 1.4703624142580877519e-6f;
 342     const float FN4 = -0.000053401640219807709149f;
 343     const float FN3 = 0.0010054721316683106153f;
 344     const float FN2 = -0.019278317264888380590f;
 345     const float FN1 = 0.069670166153766424023f;
 346     const float FN0 = -0.75225204789749321333f;
 347
 348     const float FD4 = 0.0011193462567257629232f;
 349     const float FD3 = 0.014866955030185295499f;
 350     const float FD2 = 0.11583842382862377919f;
 351     const float FD1 = 0.50736591960530292870f;
 352     const float FD0 = 1.0f;
 353
 354     float       z4;
 355     float       polyFN0, polyFN1, polyFD0, polyFD1;
 356
 357     z4          = z2*z2;
 358
 359     polyFD0     = FD4*z4 + FD2;
 360     polyFD1     = FD3*z4 + FD1;
 361     polyFD0     = polyFD0*z4 + FD0;
 362     polyFD0     = polyFD1*z2 + polyFD0;
 363
 364     polyFD0     = 1.0f/polyFD0;
 365
 366     polyFN0     = FN6*z4 + FN4;
 367     polyFN1     = FN5*z4 + FN3;
 368     polyFN0     = polyFN0*z4 + FN2;
 369     polyFN1     = polyFN1*z4 + FN1;
 370     polyFN0     = polyFN0*z4 + FN0;
 371     polyFN0     = polyFN1*z2 + polyFN0;
 372
 373     return polyFN0*polyFD0;
 374 }
 375
 376 /*! Final j-force reduction; this generic implementation works with
 377  *  arbitrary array sizes.
 378  */
 379 static inline __device__
 380 void reduce_force_j_generic(float *f_buf, float3 *fout,
 381                             int tidxi, int tidxj, int aidx)
 382 {
 383     if (tidxi < 3)
 384     {
 385         float f = 0.0f;
 386         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 387         {
 388             f += f_buf[FBUF_STRIDE * tidxi + j];
 389         }
 390
 391         atomicAdd((&fout[aidx].x)+tidxi, f);
 392     }
 393 }
 394
 395 /*! Final j-force reduction; this implementation only with power of two
 396  *  array sizes and with sm >= 3.0
 397  */
 398 #if __CUDA_ARCH__ >= 300
 399 static inline __device__
 400 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 401                               int tidxi, int aidx)
 402 {
 403     f.x += __shfl_down(f.x, 1);
 404     f.y += __shfl_up  (f.y, 1);
 405     f.z += __shfl_down(f.z, 1);
 406
 407     if (tidxi & 1)
 408     {
 409         f.x = f.y;
 410     }
 411
 412     f.x += __shfl_down(f.x, 2);
 413     f.z += __shfl_up  (f.z, 2);
 414
 415     if (tidxi & 2)
 416     {
 417         f.x = f.z;
 418     }
 419
 420     f.x += __shfl_down(f.x, 4);
 421
 422     if (tidxi < 3)
 423     {
 424         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 425     }
 426 }
 427 #endif
 428
 429 /*! Final i-force reduction; this generic implementation works with
 430  *  arbitrary array sizes.
 431  * TODO: add the tidxi < 3 trick
 432  */
 433 static inline __device__
 434 void reduce_force_i_generic(float *f_buf, float3 *fout,
 435                             float *fshift_buf, bool bCalcFshift,
 436                             int tidxi, int tidxj, int aidx)
 437 {
 438     if (tidxj < 3)
 439     {
 440         float f = 0.0f;
 441         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 442         {
 443             f += f_buf[tidxj * FBUF_STRIDE + j];
 444         }
 445
 446         atomicAdd(&fout[aidx].x + tidxj, f);
 447
 448         if (bCalcFshift)
 449         {
 450             *fshift_buf += f;
 451         }
 452     }
 453 }
 454
 455 /*! Final i-force reduction; this implementation works only with power of two
 456  *  array sizes.
 457  */
 458 static inline __device__
 459 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 460                          float *fshift_buf, bool bCalcFshift,
 461                          int tidxi, int tidxj, int aidx)
 462 {
 463     int     i, j;
 464     float   f;
 465
 466     /* Reduce the initial CL_SIZE values for each i atom to half
 467      * every step by using CL_SIZE * i threads.
 468      * Can't just use i as loop variable because than nvcc refuses to unroll.
 469      */
 470     i = CL_SIZE/2;
 471     # pragma unroll 5
 472     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 473     {
 474         if (tidxj < i)
 475         {
 476
 477             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 478             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 479             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 480         }
 481         i >>= 1;
 482     }
 483
 484     /* i == 1, last reduction step, writing to global mem */
 485     if (tidxj < 3)
 486     {
 487         /* tidxj*FBUF_STRIDE selects x, y or z */
 488         f = f_buf[tidxj * FBUF_STRIDE               + tidxi] +
 489             f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 490
 491         atomicAdd(&(fout[aidx].x) + tidxj, f);
 492
 493         if (bCalcFshift)
 494         {
 495             *fshift_buf += f;
 496         }
 497     }
 498
 499 }
 500
 501 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 502  *  on whether the size of the array to be reduced is power of two or not.
 503  */
 504 static inline __device__
 505 void reduce_force_i(float *f_buf, float3 *f,
 506                     float *fshift_buf, bool bCalcFshift,
 507                     int tidxi, int tidxj, int ai)
 508 {
 509     if ((CL_SIZE & (CL_SIZE - 1)))
 510     {
 511         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 512     }
 513     else
 514     {
 515         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 516     }
 517 }
 518
 519 /*! Final i-force reduction; this implementation works only with power of two
 520  *  array sizes and with sm >= 3.0
 521  */
 522 #if __CUDA_ARCH__ >= 300
 523 static inline __device__
 524 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 525                               float *fshift_buf, bool bCalcFshift,
 526                               int tidxj, int aidx)
 527 {
 528     fin.x += __shfl_down(fin.x, CL_SIZE);
 529     fin.y += __shfl_up  (fin.y, CL_SIZE);
 530     fin.z += __shfl_down(fin.z, CL_SIZE);
 531
 532     if (tidxj & 1)
 533     {
 534         fin.x = fin.y;
 535     }
 536
 537     fin.x += __shfl_down(fin.x, 2*CL_SIZE);
 538     fin.z += __shfl_up  (fin.z, 2*CL_SIZE);
 539
 540     if (tidxj & 2)
 541     {
 542         fin.x = fin.z;
 543     }
 544
 545     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 546     if ((tidxj & 3) < 3)
 547     {
 548         atomicAdd(&fout[aidx].x + (tidxj & ~4), fin.x);
 549
 550         if (bCalcFshift)
 551         {
 552             *fshift_buf += fin.x;
 553         }
 554     }
 555 }
 556 #endif
 557
 558 /*! Energy reduction; this implementation works only with power of two
 559  *  array sizes.
 560  */
 561 static inline __device__
 562 void reduce_energy_pow2(volatile float *buf,
 563                         float *e_lj, float *e_el,
 564                         unsigned int tidx)
 565 {
 566     int     i, j;
 567     float   e1, e2;
 568
 569     i = WARP_SIZE/2;
 570
 571     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 572 # pragma unroll 10
 573     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 574     {
 575         if (tidx < i)
 576         {
 577             buf[              tidx] += buf[              tidx + i];
 578             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 579         }
 580         i >>= 1;
 581     }
 582
 583     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 584     // memory locations to make sense
 585     /* last reduction step, writing to global mem */
 586     if (tidx == 0)
 587     {
 588         e1 = buf[              tidx] + buf[              tidx + i];
 589         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 590
 591         atomicAdd(e_lj, e1);
 592         atomicAdd(e_el, e2);
 593     }
 594 }
 595
 596 /*! Energy reduction; this implementation works only with power of two
 597  *  array sizes and with sm >= 3.0
 598  */
 599 #if __CUDA_ARCH__ >= 300
 600 static inline __device__
 601 void reduce_energy_warp_shfl(float E_lj, float E_el,
 602                              float *e_lj, float *e_el,
 603                              int tidx)
 604 {
 605     int i, sh;
 606
 607     sh = 1;
 608 #pragma unroll 5
 609     for (i = 0; i < 5; i++)
 610     {
 611         E_lj += __shfl_down(E_lj, sh);
 612         E_el += __shfl_down(E_el, sh);
 613         sh   += sh;
 614     }
 615
 616     /* The first thread in the warp writes the reduced energies */
 617     if (tidx == 0 || tidx == WARP_SIZE)
 618     {
 619         atomicAdd(e_lj, E_lj);
 620         atomicAdd(e_el, E_el);
 621     }
 622 }
 623 #endif /* __CUDA_ARCH__ */
 624
 625 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */