src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,  by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* Note that floating-point constants in CUDA code should be suffixed
  37  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  38  * code that is in double precision.
  39  */
  40
  41 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  42
  43 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  44 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  45
  46 #define WARP_SIZE_POW2_EXPONENT     (5)
  47 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  48 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  49 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  50
  51 #define ONE_SIXTH_F     0.16666667f
  52 #define ONE_TWELVETH_F  0.08333333f
  53
  54
  55 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  56 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  57
  58 /*! Apply force switch,  force + energy version. */
  59 static inline __device__
  60 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  61                               float               c6,
  62                               float               c12,
  63                               float               inv_r,
  64                               float               r2,
  65                               float              *F_invr)
  66 {
  67     float r, r_switch;
  68
  69     /* force switch constants */
  70     float disp_shift_V2 = nbparam.dispersion_shift.c2;
  71     float disp_shift_V3 = nbparam.dispersion_shift.c3;
  72     float repu_shift_V2 = nbparam.repulsion_shift.c2;
  73     float repu_shift_V3 = nbparam.repulsion_shift.c3;
  74
  75     r         = r2 * inv_r;
  76     r_switch  = r - nbparam.rvdw_switch;
  77     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
  78
  79     *F_invr  +=
  80         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
  81         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
  82 }
  83
  84 /*! Apply force switch, force-only version. */
  85 static inline __device__
  86 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
  87                                 float               c6,
  88                                 float               c12,
  89                                 float               inv_r,
  90                                 float               r2,
  91                                 float              *F_invr,
  92                                 float              *E_lj)
  93 {
  94     float r, r_switch;
  95
  96     /* force switch constants */
  97     float disp_shift_V2 = nbparam.dispersion_shift.c2;
  98     float disp_shift_V3 = nbparam.dispersion_shift.c3;
  99     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 100     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 101
 102     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 103     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 104     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 105     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 106
 107     r         = r2 * inv_r;
 108     r_switch  = r - nbparam.rvdw_switch;
 109     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 110
 111     *F_invr  +=
 112         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 113         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 114     *E_lj    +=
 115         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 116         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 117 }
 118
 119 /*! Apply potential switch, force-only version. */
 120 static inline __device__
 121 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 122                                   float               c6,
 123                                   float               c12,
 124                                   float               inv_r,
 125                                   float               r2,
 126                                   float              *F_invr,
 127                                   float              *E_lj)
 128 {
 129     float r, r_switch;
 130     float sw, dsw;
 131
 132     /* potential switch constants */
 133     float switch_V3 = nbparam.vdw_switch.c3;
 134     float switch_V4 = nbparam.vdw_switch.c4;
 135     float switch_V5 = nbparam.vdw_switch.c5;
 136     float switch_F2 = 3*nbparam.vdw_switch.c3;
 137     float switch_F3 = 4*nbparam.vdw_switch.c4;
 138     float switch_F4 = 5*nbparam.vdw_switch.c5;
 139
 140     r        = r2 * inv_r;
 141     r_switch = r - nbparam.rvdw_switch;
 142
 143     /* Unlike in the F+E kernel, conditional is faster here */
 144     if (r_switch > 0.0f)
 145     {
 146         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 147         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 148
 149         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 150     }
 151 }
 152
 153 /*! Apply potential switch, force + energy version. */
 154 static inline __device__
 155 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 156                                     float               c6,
 157                                     float               c12,
 158                                     float               inv_r,
 159                                     float               r2,
 160                                     float              *F_invr,
 161                                     float              *E_lj)
 162 {
 163     float r, r_switch;
 164     float sw, dsw;
 165
 166     /* potential switch constants */
 167     float switch_V3 = nbparam.vdw_switch.c3;
 168     float switch_V4 = nbparam.vdw_switch.c4;
 169     float switch_V5 = nbparam.vdw_switch.c5;
 170     float switch_F2 = 3*nbparam.vdw_switch.c3;
 171     float switch_F3 = 4*nbparam.vdw_switch.c4;
 172     float switch_F4 = 5*nbparam.vdw_switch.c5;
 173
 174     r        = r2 * inv_r;
 175     r_switch = r - nbparam.rvdw_switch;
 176     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 177
 178     /* Unlike in the F-only kernel, masking is faster here */
 179     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 180     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 181
 182     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 183     *E_lj   *= sw;
 184 }
 185
 186
 187 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 188  *  Original idea: from the OpenMM project
 189  */
 190 static inline __device__
 191 float interpolate_coulomb_force_r(float r, float scale)
 192 {
 193     float   normalized = scale * r;
 194     int     index      = (int) normalized;
 195     float   fract2     = normalized - index;
 196     float   fract1     = 1.0f - fract2;
 197
 198     return fract1 * tex1Dfetch(coulomb_tab_texref, index)
 199            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 200 }
 201
 202 #ifdef TEXOBJ_SUPPORTED
 203 static inline __device__
 204 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
 205                                   float r, float scale)
 206 {
 207     float   normalized = scale * r;
 208     int     index      = (int) normalized;
 209     float   fract2     = normalized - index;
 210     float   fract1     = 1.0f - fract2;
 211
 212     return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
 213            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
 214 }
 215 #endif
 216
 217
 218 /*! Calculate analytical Ewald correction term. */
 219 static inline __device__
 220 float pmecorrF(float z2)
 221 {
 222     const float FN6 = -1.7357322914161492954e-8f;
 223     const float FN5 = 1.4703624142580877519e-6f;
 224     const float FN4 = -0.000053401640219807709149f;
 225     const float FN3 = 0.0010054721316683106153f;
 226     const float FN2 = -0.019278317264888380590f;
 227     const float FN1 = 0.069670166153766424023f;
 228     const float FN0 = -0.75225204789749321333f;
 229
 230     const float FD4 = 0.0011193462567257629232f;
 231     const float FD3 = 0.014866955030185295499f;
 232     const float FD2 = 0.11583842382862377919f;
 233     const float FD1 = 0.50736591960530292870f;
 234     const float FD0 = 1.0f;
 235
 236     float       z4;
 237     float       polyFN0, polyFN1, polyFD0, polyFD1;
 238
 239     z4          = z2*z2;
 240
 241     polyFD0     = FD4*z4 + FD2;
 242     polyFD1     = FD3*z4 + FD1;
 243     polyFD0     = polyFD0*z4 + FD0;
 244     polyFD0     = polyFD1*z2 + polyFD0;
 245
 246     polyFD0     = 1.0f/polyFD0;
 247
 248     polyFN0     = FN6*z4 + FN4;
 249     polyFN1     = FN5*z4 + FN3;
 250     polyFN0     = polyFN0*z4 + FN2;
 251     polyFN1     = polyFN1*z4 + FN1;
 252     polyFN0     = polyFN0*z4 + FN0;
 253     polyFN0     = polyFN1*z2 + polyFN0;
 254
 255     return polyFN0*polyFD0;
 256 }
 257
 258 /*! Final j-force reduction; this generic implementation works with
 259  *  arbitrary array sizes.
 260  */
 261 static inline __device__
 262 void reduce_force_j_generic(float *f_buf, float3 *fout,
 263                             int tidxi, int tidxj, int aidx)
 264 {
 265     if (tidxi == 0)
 266     {
 267         float3 f = make_float3(0.0f);
 268         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 269         {
 270             f.x += f_buf[                  j];
 271             f.y += f_buf[    FBUF_STRIDE + j];
 272             f.z += f_buf[2 * FBUF_STRIDE + j];
 273         }
 274
 275         atomicAdd(&fout[aidx], f);
 276     }
 277 }
 278
 279 /*! Final j-force reduction; this implementation only with power of two
 280  *  array sizes and with sm >= 3.0
 281  */
 282 #if __CUDA_ARCH__ >= 300
 283 static inline __device__
 284 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 285                               int tidxi, int aidx)
 286 {
 287     int i;
 288
 289 #pragma unroll 3
 290     for (i = 0; i < 3; i++)
 291     {
 292         f.x += __shfl_down(f.x, 1<<i);
 293         f.y += __shfl_down(f.y, 1<<i);
 294         f.z += __shfl_down(f.z, 1<<i);
 295     }
 296
 297     /* Write the reduced j-force on one thread for each j */
 298     if (tidxi == 0)
 299     {
 300         atomicAdd(&fout[aidx], f);
 301     }
 302 }
 303 #endif
 304
 305 /*! Final i-force reduction; this generic implementation works with
 306  *  arbitrary array sizes.
 307  */
 308 static inline __device__
 309 void reduce_force_i_generic(float *f_buf, float3 *fout,
 310                             float3 *fshift_buf, bool bCalcFshift,
 311                             int tidxi, int tidxj, int aidx)
 312 {
 313     if (tidxj == 0)
 314     {
 315         float3 f = make_float3(0.0f);
 316         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 317         {
 318             f.x += f_buf[                  j];
 319             f.y += f_buf[    FBUF_STRIDE + j];
 320             f.z += f_buf[2 * FBUF_STRIDE + j];
 321         }
 322
 323         atomicAdd(&fout[aidx], f);
 324
 325         if (bCalcFshift)
 326         {
 327             *fshift_buf += f;
 328         }
 329     }
 330 }
 331
 332 /*! Final i-force reduction; this implementation works only with power of two
 333  *  array sizes.
 334  */
 335 static inline __device__
 336 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 337                          float3 *fshift_buf, bool bCalcFshift,
 338                          int tidxi, int tidxj, int aidx)
 339 {
 340     int     i, j;
 341     float3  f = make_float3(0.0f);
 342
 343     /* Reduce the initial CL_SIZE values for each i atom to half
 344      * every step by using CL_SIZE * i threads.
 345      * Can't just use i as loop variable because than nvcc refuses to unroll.
 346      */
 347     i = CL_SIZE/2;
 348     # pragma unroll 5
 349     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 350     {
 351         if (tidxj < i)
 352         {
 353
 354             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 355             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 356             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 357         }
 358         i >>= 1;
 359     }
 360
 361     /* i == 1, last reduction step, writing to global mem */
 362     if (tidxj == 0)
 363     {
 364         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 365         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 366         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 367
 368         atomicAdd(&fout[aidx], f);
 369
 370         if (bCalcFshift)
 371         {
 372             *fshift_buf += f;
 373         }
 374     }
 375 }
 376
 377 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 378  *  on whether the size of the array to be reduced is power of two or not.
 379  */
 380 static inline __device__
 381 void reduce_force_i(float *f_buf, float3 *f,
 382                     float3 *fshift_buf, bool bCalcFshift,
 383                     int tidxi, int tidxj, int ai)
 384 {
 385     if ((CL_SIZE & (CL_SIZE - 1)))
 386     {
 387         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 388     }
 389     else
 390     {
 391         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 392     }
 393 }
 394
 395 /*! Final i-force reduction; this implementation works only with power of two
 396  *  array sizes and with sm >= 3.0
 397  */
 398 #if __CUDA_ARCH__ >= 300
 399 static inline __device__
 400 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 401                               float3 *fshift_buf, bool bCalcFshift,
 402                               int tidxj, int aidx)
 403 {
 404     int j;
 405
 406 #pragma unroll 2
 407     for (j = 0; j < 2; j++)
 408     {
 409         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 410         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 411         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 412     }
 413
 414     /* The first thread in the warp writes the reduced force */
 415     if (tidxj == 0 || tidxj == 4)
 416     {
 417         atomicAdd(&fout[aidx], fin);
 418
 419         if (bCalcFshift)
 420         {
 421             fshift_buf->x += fin.x;
 422             fshift_buf->y += fin.y;
 423             fshift_buf->z += fin.z;
 424         }
 425     }
 426 }
 427 #endif
 428
 429 /*! Energy reduction; this implementation works only with power of two
 430  *  array sizes.
 431  */
 432 static inline __device__
 433 void reduce_energy_pow2(volatile float *buf,
 434                         float *e_lj, float *e_el,
 435                         unsigned int tidx)
 436 {
 437     int     i, j;
 438     float   e1, e2;
 439
 440     i = WARP_SIZE/2;
 441
 442     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 443 # pragma unroll 10
 444     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 445     {
 446         if (tidx < i)
 447         {
 448             buf[              tidx] += buf[              tidx + i];
 449             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 450         }
 451         i >>= 1;
 452     }
 453
 454     /* last reduction step, writing to global mem */
 455     if (tidx == 0)
 456     {
 457         e1 = buf[              tidx] + buf[              tidx + i];
 458         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 459
 460         atomicAdd(e_lj, e1);
 461         atomicAdd(e_el, e2);
 462     }
 463 }
 464
 465 /*! Energy reduction; this implementation works only with power of two
 466  *  array sizes and with sm >= 3.0
 467  */
 468 #if __CUDA_ARCH__ >= 300
 469 static inline __device__
 470 void reduce_energy_warp_shfl(float E_lj, float E_el,
 471                              float *e_lj, float *e_el,
 472                              int tidx)
 473 {
 474     int i, sh;
 475
 476     sh = 1;
 477 #pragma unroll 5
 478     for (i = 0; i < 5; i++)
 479     {
 480         E_lj += __shfl_down(E_lj, sh);
 481         E_el += __shfl_down(E_el, sh);
 482         sh   += sh;
 483     }
 484
 485     /* The first thread in the warp writes the reduced energies */
 486     if (tidx == 0 || tidx == WARP_SIZE)
 487     {
 488         atomicAdd(e_lj, E_lj);
 489         atomicAdd(e_el, E_el);
 490     }
 491 }
 492 #endif /* __CUDA_ARCH__ */
 493
 494 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */