2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2012, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

/* 1 << 5 == 32 threads per warp. */
#define WARP_SIZE_POW2_EXPONENT     (5)
#define CL_SIZE_POW2_EXPONENT       (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
/* Stride between the x, y and z component planes of the force buffers. */
#define FBUF_STRIDE                 (CL_SIZE_SQ)

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 *  Original idea: OpenMM
 *
 *  Linearly interpolates the tabulated force between the two table points
 *  bracketing the scaled distance scale * r.
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index; /* weight of table point index + 1 */
    float fract1     = 1.0f - fract2;      /* weight of table point index     */

    return fract1 * tex1Dfetch(coulomb_tab_texref, index)
           + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
}
#ifdef TEXOBJ_SUPPORTED
/*! Interpolate Ewald coulomb force using the table passed in as a texture
 *  object instead of the module-scope texture reference.
 *  Texture objects require CUDA 5.0 and compute capability >= 3.0.
 */
static inline __device__
float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
                                  float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index; /* weight of table point index + 1 */
    float fract1     = 1.0f - fract2;      /* weight of table point index     */

    return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
           fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
}
#endif /* TEXOBJ_SUPPORTED */
/*! Calculate analytical Ewald correction term.
 *
 *  Evaluates a rational approximation in z2: a degree-6 numerator polynomial
 *  divided by a degree-4 denominator polynomial (coefficients FN*/
/*  and FD*).
 *  The even and odd terms are accumulated in separate chains (polyF*0/polyF*1)
 *  before being combined, which shortens the dependency chains.
 */
static inline __device__
float pmecorrF(float z2)
{
    /* Numerator coefficients */
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    /* Denominator coefficients (FD0 == 1 normalizes the denominator) */
    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float       z4;
    float       polyFN0, polyFN1, polyFD0, polyFD1;

    z4      = z2 * z2;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 *  One thread per j atom (tidxi == 0) sums the CL_SIZE partial forces of its
 *  row in f_buf and accumulates the result into global memory with an atomic.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxi == 0)
    {
        float3 f = make_float3(0.0f);
        /* x, y and z components live FBUF_STRIDE apart in f_buf */
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
    }
}
/*! Final j-force reduction; this implementation only with power of two
 *  array sizes and with sm >= 3.0 (uses warp shuffles).
 *  Three __shfl_down steps (offsets 1, 2, 4) sum the force over 8 lanes;
 *  the result lands on the lane with tidxi == 0.
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
    int i;

#pragma unroll 3
    for (i = 0; i < 3; i++)
    {
        f.x += __shfl_down(f.x, 1 << i);
        f.y += __shfl_down(f.y, 1 << i);
        f.z += __shfl_down(f.z, 1 << i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif
/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 *  One thread per i atom (tidxj == 0) strides through f_buf with step CL_SIZE,
 *  sums the partial forces and accumulates into global memory; when
 *  bCalcFshift is set the same force is also added to the shift-force
 *  accumulator fshift_buf.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxj == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 *  Tree reduction in the (volatile) shared buffer f_buf: each step halves the
 *  number of partial sums per i atom, then thread tidxj == 0 performs the last
 *  addition and writes to global memory (plus the shift force if requested).
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int     i, j;
    float3  f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
#pragma unroll 5
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 *  (CL_SIZE & (CL_SIZE - 1)) is non-zero exactly when CL_SIZE is not a power
 *  of two; the compiler resolves the branch at compile time.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0 (uses warp shuffles).
 *  Two __shfl_down steps with offsets CL_SIZE and 2*CL_SIZE reduce over the
 *  j rows; the threads with tidxj == 0 or tidxj == 4 hold the result and
 *  write it out (plus the shift force if requested).
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
    int j;

#pragma unroll 2
    for (j = 0; j < 2; j++)
    {
        fin.x += __shfl_down(fin.x, CL_SIZE<<j);
        fin.y += __shfl_down(fin.y, CL_SIZE<<j);
        fin.z += __shfl_down(fin.z, CL_SIZE<<j);
    }

    /* The first thread in the warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 *  Tree reduction of the LJ (buf[tidx]) and electrostatic
 *  (buf[FBUF_STRIDE + tidx]) partial energies in the volatile shared buffer;
 *  thread 0 performs the last addition and accumulates into e_lj/e_el with
 *  atomics.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int     i, j;
    float   e1, e2;

    i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
#pragma unroll 10
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0 (uses warp shuffles).
 *  Five __shfl_down steps with doubling offsets (1, 2, 4, 8, 16) sum the
 *  energies over a full 32-lane warp; the first lane of each warp accumulates
 *  into e_lj/e_el with atomics.
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int i, sh;

    sh = 1;
#pragma unroll 5
    for (i = 0; i < 5; i++)
    {
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh   += sh; /* double the shuffle offset each step */
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ */
#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */