2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
39 /* Note that floating-point constants in CUDA code should be suffixed
40 * with f (e.g. 0.5f), to stop the compiler producing intermediate
41 * code that is in double precision.
44 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
46 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
47 #define NBNXN_CUDA_KERNEL_UTILS_CUH
49 #define WARP_SIZE_POW2_EXPONENT (5)
50 #define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
51 #define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
52 #define FBUF_STRIDE (CL_SIZE_SQ)
54 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
55 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
57 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
58 * Original idea: OpenMM
60 static inline __device__
61 float interpolate_coulomb_force_r(float r, float scale)
63 float normalized = scale * r;
64 int index = (int) normalized;
65 float fract2 = normalized - index;
66 float fract1 = 1.0f - fract2;
68 return fract1 * tex1Dfetch(tex_coulomb_tab, index)
69 + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
72 /*! Calculate analytical Ewald correction term. */
73 static inline __device__
74 float pmecorrF(float z2)
76 const float FN6 = -1.7357322914161492954e-8f;
77 const float FN5 = 1.4703624142580877519e-6f;
78 const float FN4 = -0.000053401640219807709149f;
79 const float FN3 = 0.0010054721316683106153f;
80 const float FN2 = -0.019278317264888380590f;
81 const float FN1 = 0.069670166153766424023f;
82 const float FN0 = -0.75225204789749321333f;
84 const float FD4 = 0.0011193462567257629232f;
85 const float FD3 = 0.014866955030185295499f;
86 const float FD2 = 0.11583842382862377919f;
87 const float FD1 = 0.50736591960530292870f;
88 const float FD0 = 1.0f;
91 float polyFN0,polyFN1,polyFD0,polyFD1;
95 polyFD0 = FD4*z4 + FD2;
96 polyFD1 = FD3*z4 + FD1;
97 polyFD0 = polyFD0*z4 + FD0;
98 polyFD0 = polyFD1*z2 + polyFD0;
100 polyFD0 = 1.0f/polyFD0;
102 polyFN0 = FN6*z4 + FN4;
103 polyFN1 = FN5*z4 + FN3;
104 polyFN0 = polyFN0*z4 + FN2;
105 polyFN1 = polyFN1*z4 + FN1;
106 polyFN0 = polyFN0*z4 + FN0;
107 polyFN0 = polyFN1*z2 + polyFN0;
109 return polyFN0*polyFD0;
112 /*! Final j-force reduction; this generic implementation works with
113 * arbitrary array sizes.
115 static inline __device__
116 void reduce_force_j_generic(float *f_buf, float3 *fout,
117 int tidxi, int tidxj, int aidx)
121 float3 f = make_float3(0.0f);
122 for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
125 f.y += f_buf[ FBUF_STRIDE + j];
126 f.z += f_buf[2 * FBUF_STRIDE + j];
129 atomicAdd(&fout[aidx], f);
133 /*! Final j-force reduction; this implementation only with power of two
134 * array sizes and with sm >= 3.0
136 #if __CUDA_ARCH__ >= 300
137 static inline __device__
138 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
144 for (i = 0; i < 3; i++)
146 f.x += __shfl_down(f.x, 1<<i);
147 f.y += __shfl_down(f.y, 1<<i);
148 f.z += __shfl_down(f.z, 1<<i);
151 /* Write the reduced j-force on one thread for each j */
154 atomicAdd(&fout[aidx], f);
159 /*! Final i-force reduction; this generic implementation works with
160 * arbitrary array sizes.
162 static inline __device__
163 void reduce_force_i_generic(float *f_buf, float3 *fout,
164 float3 *fshift_buf, bool bCalcFshift,
165 int tidxi, int tidxj, int aidx)
169 float3 f = make_float3(0.0f);
170 for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
173 f.y += f_buf[ FBUF_STRIDE + j];
174 f.z += f_buf[2 * FBUF_STRIDE + j];
177 atomicAdd(&fout[aidx], f);
186 /*! Final i-force reduction; this implementation works only with power of two
189 static inline __device__
190 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
191 float3 *fshift_buf, bool bCalcFshift,
192 int tidxi, int tidxj, int aidx)
195 float3 f = make_float3(0.0f);
197 /* Reduce the initial CL_SIZE values for each i atom to half
198 * every step by using CL_SIZE * i threads.
199 * Can't just use i as loop variable because than nvcc refuses to unroll.
203 for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
208 f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
209 f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
210 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
215 /* i == 1, last reduction step, writing to global mem */
218 f.x = f_buf[ tidxj * CL_SIZE + tidxi] + f_buf[ (tidxj + i) * CL_SIZE + tidxi];
219 f.y = f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
220 f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
222 atomicAdd(&fout[aidx], f);
231 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
232 * on whether the size of the array to be reduced is power of two or not.
234 static inline __device__
235 void reduce_force_i(float *f_buf, float3 *f,
236 float3 *fshift_buf, bool bCalcFshift,
237 int tidxi, int tidxj, int ai)
239 if ((CL_SIZE & (CL_SIZE - 1)))
241 reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
245 reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
249 /*! Final i-force reduction; this implementation works only with power of two
250 * array sizes and with sm >= 3.0
252 #if __CUDA_ARCH__ >= 300
253 static inline __device__
254 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
255 float3 *fshift_buf, bool bCalcFshift,
261 for (j = 0; j < 2; j++)
263 fin.x += __shfl_down(fin.x, CL_SIZE<<j);
264 fin.y += __shfl_down(fin.y, CL_SIZE<<j);
265 fin.z += __shfl_down(fin.z, CL_SIZE<<j);
268 /* The first thread in the warp writes the reduced force */
269 if (tidxj == 0 || tidxj == 4)
271 atomicAdd(&fout[aidx], fin);
275 fshift_buf->x += fin.x;
276 fshift_buf->y += fin.y;
277 fshift_buf->z += fin.z;
283 /*! Energy reduction; this implementation works only with power of two
286 static inline __device__
287 void reduce_energy_pow2(volatile float *buf,
288 float *e_lj, float *e_el,
296 /* Can't just use i as loop variable because than nvcc refuses to unroll. */
298 for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
302 buf[ tidx] += buf[ tidx + i];
303 buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
308 /* last reduction step, writing to global mem */
311 e1 = buf[ tidx] + buf[ tidx + i];
312 e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
319 /*! Energy reduction; this implementation works only with power of two
320 * array sizes and with sm >= 3.0
322 #if __CUDA_ARCH__ >= 300
323 static inline __device__
324 void reduce_energy_warp_shfl(float E_lj, float E_el,
325 float *e_lj, float *e_el,
332 for (i = 0; i < 5; i++)
334 E_lj += __shfl_down(E_lj,sh);
335 E_el += __shfl_down(E_el,sh);
339 /* The first thread in the warp writes the reduced energies */
340 if (tidx == 0 || tidx == WARP_SIZE)
342 atomicAdd(e_lj,E_lj);
343 atomicAdd(e_el,E_el);
346 #endif /* __CUDA_ARCH__ */
348 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */