2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 /* Note that floating-point constants in CUDA code should be suffixed
37 * with f (e.g. 0.5f), to stop the compiler producing intermediate
38 * code that is in double precision.
41 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
43 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
44 #define NBNXN_CUDA_KERNEL_UTILS_CUH
46 #define WARP_SIZE_POW2_EXPONENT (5)
47 #define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
48 #define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
49 #define FBUF_STRIDE (CL_SIZE_SQ)
51 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
52 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
54 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
55 * Original idea: from the OpenMM project
57 static inline __device__
58 float interpolate_coulomb_force_r(float r, float scale)
60 float normalized = scale * r;
61 int index = (int) normalized;
62 float fract2 = normalized - index;
63 float fract1 = 1.0f - fract2;
65 return fract1 * tex1Dfetch(coulomb_tab_texref, index)
66 + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
69 #ifdef TEXOBJ_SUPPORTED
70 static inline __device__
71 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
74 float normalized = scale * r;
75 int index = (int) normalized;
76 float fract2 = normalized - index;
77 float fract1 = 1.0f - fract2;
79 return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
80 fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
85 /*! Calculate analytical Ewald correction term. */
86 static inline __device__
87 float pmecorrF(float z2)
89 const float FN6 = -1.7357322914161492954e-8f;
90 const float FN5 = 1.4703624142580877519e-6f;
91 const float FN4 = -0.000053401640219807709149f;
92 const float FN3 = 0.0010054721316683106153f;
93 const float FN2 = -0.019278317264888380590f;
94 const float FN1 = 0.069670166153766424023f;
95 const float FN0 = -0.75225204789749321333f;
97 const float FD4 = 0.0011193462567257629232f;
98 const float FD3 = 0.014866955030185295499f;
99 const float FD2 = 0.11583842382862377919f;
100 const float FD1 = 0.50736591960530292870f;
101 const float FD0 = 1.0f;
104 float polyFN0, polyFN1, polyFD0, polyFD1;
108 polyFD0 = FD4*z4 + FD2;
109 polyFD1 = FD3*z4 + FD1;
110 polyFD0 = polyFD0*z4 + FD0;
111 polyFD0 = polyFD1*z2 + polyFD0;
113 polyFD0 = 1.0f/polyFD0;
115 polyFN0 = FN6*z4 + FN4;
116 polyFN1 = FN5*z4 + FN3;
117 polyFN0 = polyFN0*z4 + FN2;
118 polyFN1 = polyFN1*z4 + FN1;
119 polyFN0 = polyFN0*z4 + FN0;
120 polyFN0 = polyFN1*z2 + polyFN0;
122 return polyFN0*polyFD0;
125 /*! Final j-force reduction; this generic implementation works with
126 * arbitrary array sizes.
128 static inline __device__
129 void reduce_force_j_generic(float *f_buf, float3 *fout,
130 int tidxi, int tidxj, int aidx)
134 float3 f = make_float3(0.0f);
135 for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
138 f.y += f_buf[ FBUF_STRIDE + j];
139 f.z += f_buf[2 * FBUF_STRIDE + j];
142 atomicAdd(&fout[aidx], f);
146 /*! Final j-force reduction; this implementation only with power of two
147 * array sizes and with sm >= 3.0
149 #if __CUDA_ARCH__ >= 300
150 static inline __device__
151 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
157 for (i = 0; i < 3; i++)
159 f.x += __shfl_down(f.x, 1<<i);
160 f.y += __shfl_down(f.y, 1<<i);
161 f.z += __shfl_down(f.z, 1<<i);
164 /* Write the reduced j-force on one thread for each j */
167 atomicAdd(&fout[aidx], f);
172 /*! Final i-force reduction; this generic implementation works with
173 * arbitrary array sizes.
175 static inline __device__
176 void reduce_force_i_generic(float *f_buf, float3 *fout,
177 float3 *fshift_buf, bool bCalcFshift,
178 int tidxi, int tidxj, int aidx)
182 float3 f = make_float3(0.0f);
183 for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
186 f.y += f_buf[ FBUF_STRIDE + j];
187 f.z += f_buf[2 * FBUF_STRIDE + j];
190 atomicAdd(&fout[aidx], f);
199 /*! Final i-force reduction; this implementation works only with power of two
202 static inline __device__
203 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
204 float3 *fshift_buf, bool bCalcFshift,
205 int tidxi, int tidxj, int aidx)
208 float3 f = make_float3(0.0f);
210 /* Reduce the initial CL_SIZE values for each i atom to half
211 * every step by using CL_SIZE * i threads.
212 * Can't just use i as loop variable because than nvcc refuses to unroll.
216 for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
221 f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi];
222 f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
223 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
228 /* i == 1, last reduction step, writing to global mem */
231 f.x = f_buf[ tidxj * CL_SIZE + tidxi] + f_buf[ (tidxj + i) * CL_SIZE + tidxi];
232 f.y = f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
233 f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
235 atomicAdd(&fout[aidx], f);
244 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
245 * on whether the size of the array to be reduced is power of two or not.
247 static inline __device__
248 void reduce_force_i(float *f_buf, float3 *f,
249 float3 *fshift_buf, bool bCalcFshift,
250 int tidxi, int tidxj, int ai)
252 if ((CL_SIZE & (CL_SIZE - 1)))
254 reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
258 reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
262 /*! Final i-force reduction; this implementation works only with power of two
263 * array sizes and with sm >= 3.0
265 #if __CUDA_ARCH__ >= 300
266 static inline __device__
267 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
268 float3 *fshift_buf, bool bCalcFshift,
274 for (j = 0; j < 2; j++)
276 fin.x += __shfl_down(fin.x, CL_SIZE<<j);
277 fin.y += __shfl_down(fin.y, CL_SIZE<<j);
278 fin.z += __shfl_down(fin.z, CL_SIZE<<j);
281 /* The first thread in the warp writes the reduced force */
282 if (tidxj == 0 || tidxj == 4)
284 atomicAdd(&fout[aidx], fin);
288 fshift_buf->x += fin.x;
289 fshift_buf->y += fin.y;
290 fshift_buf->z += fin.z;
296 /*! Energy reduction; this implementation works only with power of two
299 static inline __device__
300 void reduce_energy_pow2(volatile float *buf,
301 float *e_lj, float *e_el,
309 /* Can't just use i as loop variable because than nvcc refuses to unroll. */
311 for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
315 buf[ tidx] += buf[ tidx + i];
316 buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
321 /* last reduction step, writing to global mem */
324 e1 = buf[ tidx] + buf[ tidx + i];
325 e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
332 /*! Energy reduction; this implementation works only with power of two
333 * array sizes and with sm >= 3.0
335 #if __CUDA_ARCH__ >= 300
336 static inline __device__
337 void reduce_energy_warp_shfl(float E_lj, float E_el,
338 float *e_lj, float *e_el,
345 for (i = 0; i < 5; i++)
347 E_lj += __shfl_down(E_lj, sh);
348 E_el += __shfl_down(E_el, sh);
352 /* The first thread in the warp writes the reduced energies */
353 if (tidx == 0 || tidx == WARP_SIZE)
355 atomicAdd(e_lj, E_lj);
356 atomicAdd(e_el, E_el);
359 #endif /* __CUDA_ARCH__ */
361 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */