/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

#define WARP_SIZE_POW2_EXPONENT     (5)
#define CL_SIZE_POW2_EXPONENT       (3) /* change this together with GPU_NS_CLUSTER_SIZE! */
#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE                 (CL_SIZE_SQ)

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);

/*! Interpolate the Ewald coulomb force using the table read through the
 *  tex_coulomb_tab texture.
 *  Original idea: OpenMM
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float   normalized = scale * r;
    int     index      = (int) normalized;
    float   fract2     = normalized - index;
    float   fract1     = 1.0f - fract2;

    return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
            + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
}
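
/* A minimal reference sketch (illustrative only, not used by the kernels):
 * the same linear interpolation on a plain array, with coulomb_tab standing
 * in as a hypothetical pointer to the data behind tex_coulomb_tab. With
 * t = scale * r and i = (int) t, the result is
 * (1 - (t - i)) * tab[i] + (t - i) * tab[i + 1].
 */
static inline __device__
float interpolate_coulomb_force_r_ref(const float *coulomb_tab,
                                      float r, float scale)
{
    float t = scale * r;
    int   i = (int) t;   /* table index */
    float w = t - i;     /* fractional weight of the upper table entry */

    return (1.0f - w) * coulomb_tab[i] + w * coulomb_tab[i + 1];
}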

/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    /* One thread per j-atom accumulates the CL_SIZE partial forces. */
    if (tidxi == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
    }
}
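
/* Layout assumed for the shared reduction buffer f_buf (a sketch inferred
 * from the indexing above): three planes of FBUF_STRIDE = CL_SIZE_SQ floats,
 * one per force component, with thread (tidxi, tidxj) storing its partial
 * force at offset tidxj * CL_SIZE + tidxi within each plane:
 *
 *   f_buf[               0 ..     FBUF_STRIDE)   x components
 *   f_buf[     FBUF_STRIDE .. 2 * FBUF_STRIDE)   y components
 *   f_buf[ 2 * FBUF_STRIDE .. 3 * FBUF_STRIDE)   z components
 */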

/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
    int i;

#pragma unroll 3
    for (i = 0; i < 3; i++)
    {
        f.x += __shfl_down(f.x, 1 << i);
        f.y += __shfl_down(f.y, 1 << i);
        f.z += __shfl_down(f.z, 1 << i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif
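
/* Sketch of the shuffle tree above, assuming CL_SIZE == 8: the offsets
 * 1, 2 and 4 fold each run of 8 consecutive lanes onto its first lane,
 * so after three steps the lane with tidxi == 0 holds the full j-force
 * without any shared memory traffic or synchronization. */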

/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    /* One thread per i-atom accumulates the CL_SIZE partial forces. */
    if (tidxj == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int     i, j;
    float3  f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
#pragma unroll 5
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}
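
/* Step-by-step sketch of the tree above, assuming CL_SIZE == 8 (so
 * CL_SIZE_POW2_EXPONENT == 3): the loop runs with i = 4 and then i = 2,
 * each step adding row tidxj + i onto row tidxj; the final step outside
 * the loop pairs rows 0 and 1 with i == 1 and writes the result directly,
 * saving one loop iteration on the last level. */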

/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is a power of two or not.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    /* CL_SIZE & (CL_SIZE - 1) is non-zero iff CL_SIZE is not a power of two. */
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
    int j;

#pragma unroll 2
    for (j = 0; j < 2; j++)
    {
        fin.x += __shfl_down(fin.x, CL_SIZE << j);
        fin.y += __shfl_down(fin.y, CL_SIZE << j);
        fin.z += __shfl_down(fin.z, CL_SIZE << j);
    }

    /* The first thread in the warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif
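
/* A note on the tidxj == 0 || tidxj == 4 test above, assuming an 8x8
 * thread block (CL_SIZE == 8): the 64 threads form two warps, covering
 * rows tidxj = 0..3 and tidxj = 4..7 respectively, so the shuffles with
 * offsets CL_SIZE and 2 * CL_SIZE fold four rows onto the first row of
 * each warp, and exactly those two rows hold complete sums. */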

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int     i, j;
    float   e1, e2;

    i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
#pragma unroll 10
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
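
/* Buffer layout assumed here (inferred from the indexing above): one strip
 * of per-thread Lennard-Jones energies at buf[0 .. FBUF_STRIDE) and one
 * strip of electrostatic energies at buf[FBUF_STRIDE .. 2 * FBUF_STRIDE),
 * reduced pairwise in lockstep so both totals finish together. */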

/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int i, sh;

    sh = 1;
#pragma unroll 5
    for (i = 0; i < 5; i++)
    {
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ */
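
/* A minimal usage sketch (hypothetical, not a kernel from this package):
 * a kernel epilogue would fold each thread's partial energies across the
 * warp and let the leading lanes commit the totals, e.g.
 *
 *   reduce_energy_warp_shfl(E_lj_thread, E_el_thread,
 *                           e_lj_global, e_el_global, tidx);
 *
 * where E_lj_thread/E_el_thread are thread-local accumulators and
 * e_lj_global/e_el_global point to single global-memory totals. */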

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */