/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *  \brief
 *  Utility constants and function declarations for the CUDA non-bonded kernels.
 *  This header should be included once at the top level, just before the
 *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
 *
 *  \author Szilárd Páll <pall.szilard@gmail.com>
 *  \ingroup module_mdlib
 */
/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */
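/* Illustrative example (not from the GROMACS sources): with the f suffix the
 * product below stays in single precision, whereas writing "x * 0.5" would
 * promote the intermediate to double because the literal 0.5 has type double.
 *
 *   static __forceinline__ __device__ float half_of(float x) // hypothetical helper
 *   {
 *       return x * 0.5f;
 *   }
 */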
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"

#include "nbnxn_cuda_types.h"
#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH
/* Use texture objects if supported by the target hardware. */
#if GMX_PTX_ARCH >= 300
/* Note: convenience macro, needs to be undef-ed at the end of the file. */
#define USE_TEXOBJ
#endif
/*! \brief Log of the i and j cluster size.
 *  change this together with c_clSize !*/
static const int c_clSizeLog2  = 3;
/*! \brief Square of cluster size. */
static const int c_clSizeSq    = c_clSize*c_clSize;
/*! \brief j-cluster size after split (4 in the current implementation). */
static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumulation buffer */
static const int c_fbufStride  = c_clSizeSq;

static const float c_oneSixth    = 0.16666667f;
static const float c_oneTwelveth = 0.08333333f;
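/* These constants let the kernels evaluate the standard LJ energy,
 *   E_lj = c12/12 * r^-12 - c6/6 * r^-6,
 * with multiplications (c_oneTwelveth*c12*inv_r6*inv_r6 - c_oneSixth*c6*inv_r6)
 * instead of divisions; illustrative form, see the kernels for the exact
 * expression. */
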
/* With multiple compilation units this ensures that texture refs are available
   in the kernels' compilation units. */
#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
extern texture<float, 1, cudaReadModeElementType> nbfp_texref;

/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;

/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
static __forceinline__ __device__
void convert_sigma_epsilon_to_c6_c12(const float  sigma,
                                     const float  epsilon,
                                     float       *c6,
                                     float       *c12)
{
    float sigma2, sigma6;

    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
    *c12   = *c6 * sigma6;
}
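/* Usage sketch (hypothetical values, not from the GROMACS sources): for
 * sigma = 0.34f and epsilon = 0.996f, the outputs follow
 * c6 = epsilon*sigma^6 and c12 = epsilon*sigma^12:
 *
 *   float c6, c12;
 *   convert_sigma_epsilon_to_c6_c12(0.34f, 0.996f, &c6, &c12);
 */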
/*! Apply force switch, force-only version. */
static __forceinline__ __device__
void calculate_force_switch_F(const cu_nbparam_t nbparam,
                              float              c6,
                              float              c12,
                              float              inv_r,
                              float              r2,
                              float             *F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
}
/*! Apply force switch, force + energy version. */
static __forceinline__ __device__
void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
                                float              c6,
                                float              c12,
                                float              inv_r,
                                float              r2,
                                float             *F_invr,
                                float             *E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;
    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
}
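/* For reference, the /3 and /4 factors in the *_F2/*_F3 constants above come
 * from integrating the switched force terms to obtain the energy terms:
 *   ∫ (r - r_s)^2 dr = (r - r_s)^3 / 3   and   ∫ (r - r_s)^3 dr = (r - r_s)^4 / 4. */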
/*! Apply potential switch, force-only version. */
static __forceinline__ __device__
void calculate_potential_switch_F(const cu_nbparam_t nbparam,
                                  float              inv_r,
                                  float              r2,
                                  float             *F_invr,
                                  float             *E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    }
}
/*! Apply potential switch, force + energy version. */
static __forceinline__ __device__
void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
                                    float              inv_r,
                                    float              r2,
                                    float             *F_invr,
                                    float             *E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    *E_lj   *= sw;
}
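/* For reference, the switch applied above is the quintic polynomial
 *   sw(x)  = 1 + c3*x^3 + c4*x^4 + c5*x^5,          x = r - rvdw_switch,
 *   dsw(x) = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4,
 * so the potential becomes E*sw and the scalar force F/r becomes
 * F*sw/r - E*dsw/r, matching the two updates in the function above. */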
/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__
void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float *F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

#ifdef USE_TEXOBJ
    c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
}
/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
static __forceinline__ __device__
void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
                                      int typei, int typej,
                                      float r2, float inv_r2,
                                      float lje_coeff2, float lje_coeff6_6,
                                      float int_bit,
                                      float *F_invr, float *E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

#ifdef USE_TEXOBJ
    c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask   = nbparam.sh_lj_ewald*int_bit;
    *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
}
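/* For reference: with cr2 = (beta*r)^2, poly above is the series factor of
 * the LJ-PME real-space correction, exp(-cr2)*(1 + cr2 + cr2^2/2), and the
 * caller is expected to pass lje_coeff6_6 = beta^6/6 (as the name suggests;
 * see how the kernels set up these arguments). */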
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
static __forceinline__ __device__
void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float int_bit,
                                    float *F_invr, float *E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
#ifdef USE_TEXOBJ
    sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
    epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
#else
    sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
    epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
#endif /* USE_TEXOBJ */
    sigma2  = sigma*sigma;
    c6grid  = epsilon*sigma2*sigma2*sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    if (E_lj != NULL)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald*int_bit;
        *E_lj  += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
    }
}
/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 *  Original idea: from the OpenMM project
 */
static __forceinline__ __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch(coulomb_tab_texref, index)
           + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
}
static __forceinline__ __device__
float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                  float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
           fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
}
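/* Both overloads above implement plain linear interpolation in a force table
 * indexed by scale*r. A minimal host-side sketch of the same arithmetic,
 * using a hypothetical plain-array table:
 *
 *   static float interpolate_host(const float *tab, float r, float scale)
 *   {
 *       float x = scale * r;
 *       int   i = (int) x;
 *       float t = x - i;
 *       return (1.0f - t)*tab[i] + t*tab[i + 1];
 *   }
 */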
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__
float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float       z4;
    float       polyFN0, polyFN1, polyFD0, polyFD1;

    z4      = z2*z2;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
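/* The function above evaluates a rational approximation: the FN* coefficients
 * form the numerator polynomial in z2 and the FD* coefficients the
 * denominator, each computed with a Horner-like scheme split into even/odd
 * chains (polyF*0 and polyF*1) for instruction-level parallelism; the result
 * is their quotient. */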
/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static __forceinline__ __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxi < 3)
    {
        float f = 0.0f;
        for (int j = tidxj * c_clSize; j < (tidxj + 1) * c_clSize; j++)
        {
            f += f_buf[c_fbufStride * tidxi + j];
        }
        atomicAdd((&fout[aidx].x) + tidxi, f);
    }
}
/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if GMX_PTX_ARCH >= 300
static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx,
                              const unsigned int activemask)
{
    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
    f.z += gmx_shfl_down_sync(activemask, f.z, 1);

    if (tidxi & 1) { f.x = f.y; }

    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);

    if (tidxi & 2) { f.x = f.z; }

    f.x += gmx_shfl_down_sync(activemask, f.x, 4);

    if (tidxi < 3) { atomicAdd((&fout[aidx].x) + tidxi, f.x); }
}
#endif
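/* Note on reduce_force_j_warp_shfl above: after the three shuffle rounds,
 * lanes 0, 1 and 2 of each group of eight threads hold the reduced x, y and
 * z sums, respectively, which is why only threads with tidxi < 3 issue the
 * atomicAdd. */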
/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 *  TODO: add the tidxi < 3 trick
 */
static __forceinline__ __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxj < 3)
    {
        float f = 0.0f;
        for (int j = tidxi; j < c_clSizeSq; j += c_clSize)
        {
            f += f_buf[tidxj * c_fbufStride + j];
        }
        atomicAdd(&fout[aidx].x + tidxj, f);

        if (bCalcFshift) { *fshift_buf += f; }
    }
}
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int   i, j;
    float f;

    assert(c_clSize == 1 << c_clSizeLog2);

    /* Reduce the initial c_clSize values for each i atom to half
     * every step by using c_clSize * i threads. Can't just use i as
     * loop variable because then nvcc refuses to unroll.
     */
    i = c_clSize/2;
#pragma unroll 5
    for (j = c_clSizeLog2 - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[                   tidxj * c_clSize + tidxi] += f_buf[                   (tidxj + i) * c_clSize + tidxi];
            f_buf[    c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[    c_fbufStride + (tidxj + i) * c_clSize + tidxi];
            f_buf[2 * c_fbufStride + tidxj * c_clSize + tidxi] += f_buf[2 * c_fbufStride + (tidxj + i) * c_clSize + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj < 3)
    {
        /* tidxj*c_fbufStride selects x, y or z */
        f = f_buf[tidxj * c_fbufStride                + tidxi] +
            f_buf[tidxj * c_fbufStride + i * c_clSize + tidxi];
        atomicAdd(&(fout[aidx].x) + tidxj, f);

        if (bCalcFshift) { *fshift_buf += f; }
    }
}
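/* Illustration for c_clSize = 8 (c_clSizeLog2 = 3): the loop above runs with
 * i = 4 and then i = 2, folding the upper half of the rows onto the lower
 * half each step; the final i = 1 fold is fused into the global-memory write
 * in the tidxj < 3 branch. */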
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static __forceinline__ __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    if ((c_clSize & (c_clSize - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}
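/* (c_clSize & (c_clSize - 1)) is the standard bit trick: it is non-zero
 * exactly when c_clSize is not a power of two, in which case the generic
 * reduction must be used. */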
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if GMX_PTX_ARCH >= 300
static __forceinline__ __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx,
                              const unsigned int activemask)
{
    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);

    if (tidxj & 1) { fin.x = fin.y; }

    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);

    if (tidxj & 2) { fin.x = fin.z; }

    /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
    if ((tidxj & 3) < 3)
    {
        atomicAdd(&fout[aidx].x + (tidxj & 3), fin.x);

        if (bCalcFshift) { *fshift_buf += fin.x; }
    }
}
#endif
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static __forceinline__ __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int   i, j;
    float e1, e2;

    i = warp_size/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
#pragma unroll 10
    for (j = warp_size_log2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[               tidx] += buf[               tidx + i];
            buf[c_fbufStride + tidx] += buf[c_fbufStride + tidx + i];
        }
        i >>= 1;
    }

    // TODO do this on two threads - requires e_lj and e_el to be stored on adjacent
    //      memory locations to make sense
    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[               tidx] + buf[               tidx + i];
        e2 = buf[c_fbufStride + tidx] + buf[c_fbufStride + tidx + i];
        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if GMX_PTX_ARCH >= 300
static __forceinline__ __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx,
                             const unsigned int activemask)
{
    int i, sh;

    sh = 1;
#pragma unroll 5
    for (i = 0; i < 5; i++)
    {
        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
        sh   += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == warp_size)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* GMX_PTX_ARCH */
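/* Note on reduce_energy_warp_shfl above: the five iterations use shuffle
 * strides 1, 2, 4, 8 and 16, so after the loop lane 0 of each 32-thread warp
 * holds the full warp sum; hence only tidx == 0 and tidx == warp_size (lane 0
 * of the second warp) perform the atomicAdd. */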
#undef USE_TEXOBJ

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */