/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

#define WARP_SIZE_POW2_EXPONENT     (5)
#define CL_SIZE_POW2_EXPONENT       (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE                 (CL_SIZE_SQ)
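
/* 1/6 and 1/12 as single-precision constants; used below when scaling
 * C6/C12 force terms into the corresponding energy terms. */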
#define ONE_SIXTH_F     0.16666667f
#define ONE_TWELVETH_F  0.08333333f

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
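/* E.g. with the usual NCL_PER_SUPERCL of 8 (a 2x2x2 super-cluster) the mask is 0xff. */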

/*! Apply force switch, force-only version. */
static inline __device__
void calculate_force_switch_F(const cu_nbparam_t nbparam,
                              float c6, float c12,
                              float inv_r, float r2,
                              float *F_invr)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
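    /* Branch-free masking: clamping r_switch at zero makes the whole
     * correction below vanish for r < rvdw_switch. */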
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
}

/*! Apply force switch, force + energy version. */
static inline __device__
void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
                                float c6, float c12,
                                float inv_r, float r2,
                                float *F_invr, float *E_lj)
{
    float r, r_switch;

    /* force switch constants */
    float disp_shift_V2 = nbparam.dispersion_shift.c2;
    float disp_shift_V3 = nbparam.dispersion_shift.c3;
    float repu_shift_V2 = nbparam.repulsion_shift.c2;
    float repu_shift_V3 = nbparam.repulsion_shift.c3;

    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
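    /* The *_F2/*_F3 energy coefficients follow from integrating the force
     * polynomial over (r - r_switch): A*x^2 integrates to (A/3)*x^3 and
     * B*x^3 to (B/4)*x^4, hence the /3 and /4. */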

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
}

/*! Apply potential switch, force-only version. */
static inline __device__
void calculate_potential_switch_F(const cu_nbparam_t nbparam,
                                  float c6, float c12,
                                  float inv_r, float r2,
                                  float *F_invr, float *E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;
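    /* switch_F2..F4 are the r-derivative coefficients of the switch polynomial:
     * d/dx (c3*x^3 + c4*x^4 + c5*x^5) = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4. */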

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    }
}

/*! Apply potential switch, force + energy version. */
static inline __device__
void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
                                    float c6, float c12,
                                    float inv_r, float r2,
                                    float *F_invr, float *E_lj)
{
    float r, r_switch;
    float sw, dsw;

    /* potential switch constants */
    float switch_V3 = nbparam.vdw_switch.c3;
    float switch_V4 = nbparam.vdw_switch.c4;
    float switch_V5 = nbparam.vdw_switch.c5;
    float switch_F2 = 3*nbparam.vdw_switch.c3;
    float switch_F3 = 4*nbparam.vdw_switch.c4;
    float switch_F4 = 5*nbparam.vdw_switch.c5;

    r        = r2 * inv_r;
    r_switch = r - nbparam.rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    *E_lj  *= sw;
}

/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
static inline __device__
void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float *F_invr)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

#ifdef USE_TEXOBJ
    c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
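    /* expmcr2*poly = exp(-x)*(1 + x + x^2/2) with x = cr2 = (lje_coeff*r)^2,
     * i.e. the factor that damps the 1/r^6 grid interaction in LJ-PME. */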

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
}

/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
static inline __device__
void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
                                      int typei, int typej,
                                      float r2, float inv_r2,
                                      float lje_coeff2, float lje_coeff6_6,
                                      float int_bit,
                                      float *F_invr, float *E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

#ifdef USE_TEXOBJ
    c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
#else
    c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
#endif /* USE_TEXOBJ */

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
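    /* (int_bit, set by the calling kernel, is 1.0f for interacting pairs
     * and 0.0f for excluded ones, so it masks out the shift.) */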
    sh_mask = nbparam.sh_lj_ewald*int_bit;
    *E_lj  += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
}

/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
static inline __device__
void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
                                    int typei, int typej,
                                    float r2, float inv_r2,
                                    float lje_coeff2, float lje_coeff6_6,
                                    float int_bit,
                                    float *F_invr, float *E_lj)
{
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
#ifdef USE_TEXOBJ
    sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
    epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
#else
    sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
    epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
#endif /* USE_TEXOBJ */
    sigma2 = sigma*sigma;
    c6grid = epsilon*sigma2*sigma2*sigma2;
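    /* The per-type values are stored pre-scaled, so the plain sum implements
     * the Lorentz rule for sigma and the plain product the Berthelot rule for
     * epsilon; epsilon*sigma^6 then comes out as 6*C6 for the pair. */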

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;
    cr2       = lje_coeff2*r2;
    expmcr2   = expf(-cr2);
    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    if (E_lj != NULL)
    {
        float sh_mask;

        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam.sh_lj_ewald*int_bit;
        *E_lj  += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
    }
}

/*! Interpolate Ewald coulomb force using the table through the coulomb_tab texture.
 *  Original idea: from the OpenMM project
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch(coulomb_tab_texref, index)
           + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
}
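
/* I.e. plain linear interpolation: with x = scale*r, i = floor(x) and
 * f = x - i, the result is (1 - f)*tab[i] + f*tab[i + 1]. */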

#ifdef TEXOBJ_SUPPORTED
static inline __device__
float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                  float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
           fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
}
#endif /* TEXOBJ_SUPPORTED */

/*! Calculate analytical Ewald correction term. */
static inline __device__
float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;
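
    /* Rational (minimax) approximation in z2: a degree-6 numerator over a
     * degree-4 denominator, each evaluated as two interleaved Horner chains
     * in z4 = z2*z2 to expose instruction-level parallelism. */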
    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4 = z2*z2;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
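
/* In the analytical-Ewald kernels this is evaluated at z2 = (beta*r)^2 and
 * scaled by beta^3, roughly as
 *     F_invr += qi*qj*(int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
 * (a sketch of the call site, which lives in the kernel files, not here). */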

/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxi == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
    }
}

/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
    int i;

    for (i = 0; i < 3; i++)
    {
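        /* Tree reduction with shuffle offsets 1, 2, 4: sums f over the
         * CL_SIZE (8) consecutive lanes that share one j atom, leaving the
         * full sum in the lane with tidxi == 0. */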
        f.x += __shfl_down(f.x, 1<<i);
        f.y += __shfl_down(f.y, 1<<i);
        f.z += __shfl_down(f.z, 1<<i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif

/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxj == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int    i, j;
    float3 f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
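            /* CL_SIZE*i active threads fold the upper half of the remaining
             * rows onto the lower half, separately for x, y and z. */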
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
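    /* (n & (n - 1)) is non-zero exactly when n is not a power of two; since
     * CL_SIZE is a compile-time constant, the compiler resolves this branch. */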
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
    int j;

    for (j = 0; j < 2; j++)
    {
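        /* Shuffle offsets CL_SIZE and 2*CL_SIZE (8 and 16 with CL_SIZE == 8)
         * fold the j-rows that share a warp onto its first row of lanes. */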
        fin.x += __shfl_down(fin.x, CL_SIZE<<j);
        fin.y += __shfl_down(fin.y, CL_SIZE<<j);
        fin.z += __shfl_down(fin.z, CL_SIZE<<j);
    }

    /* The first thread in the warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int   i, j;
    float e1, e2;

    i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
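            /* LJ energies are accumulated at offset 0, electrostatic energies
             * at offset FBUF_STRIDE; the two halves are reduced in lock-step. */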
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}

/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int i, sh;

    sh = 1;
    for (i = 0; i < 5; i++)
    {
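        /* Classic warp tree reduction: offsets 1, 2, 4, 8, 16 leave the
         * warp-wide sums in the first lane of each warp. */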
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh   += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ */

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */