/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

#define WARP_SIZE_POW2_EXPONENT     (5)
#define CL_SIZE_POW2_EXPONENT       (3) /* change this together with GPU_NS_CLUSTER_SIZE! */
#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE                 (CL_SIZE_SQ)
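
/* Note: CL_SIZE and WARP_SIZE are defined outside this header. Assuming the
 * usual values, CL_SIZE = 8 (i.e. 2^CL_SIZE_POW2_EXPONENT) and WARP_SIZE = 32
 * (2^WARP_SIZE_POW2_EXPONENT), both CL_SIZE_SQ and FBUF_STRIDE evaluate to 64:
 * the force reduction buffer stores the x, y and z components of 64 partial
 * forces in three consecutive slabs of FBUF_STRIDE floats each.
 */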
/*! Interpolate Ewald coulomb force using the table through the tex_coulomb_tab texture.
 *  Original idea: OpenMM
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float   normalized = scale * r;
    int     index      = (int) normalized;
    float   fract2     = normalized - index;
    float   fract1     = 1.0f - fract2;

    return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
            + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
}
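
/* Illustrative sketch (an assumption for this header, not part of the original
 * kernels): how the tabulated Ewald correction above is typically combined with
 * the bare 1/r^2 Coulomb term into the scalar prefactor F/r that later multiplies
 * the i-j distance vector. All parameter names here are hypothetical.
 */
static inline __device__
float example_ewald_fscal(float qi, float qj, float r2,
                          float inv_r, float inv_r2,
                          float coulomb_tab_scale)
{
    /* r = r2 * inv_r; the table lookup returns the Ewald correction to the force */
    return qi * qj * (inv_r2 - interpolate_coulomb_force_r(r2 * inv_r,
                                                           coulomb_tab_scale)) * inv_r;
}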
/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxi == 0) /* one thread per j atom does the final sum */
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
    }
}
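
/* Illustrative usage sketch (an assumption, not part of this header): before the
 * reduction above, each thread stages its accumulated partial j-force into the
 * shared-memory buffer in structure-of-arrays layout, x in [0, CL_SIZE_SQ),
 * y starting at FBUF_STRIDE and z at 2*FBUF_STRIDE. The kernel-local names
 * fcj_buf, f and aj are hypothetical.
 */
#if 0
    f_buf[                  tidxj * CL_SIZE + tidxi] = fcj_buf.x;
    f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] = fcj_buf.y;
    f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] = fcj_buf.z;
    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
#endif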
/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
    int i;

    for (i = 0; i < 3; i++)
    {
        f.x += __shfl_down(f.x, 1<<i);
        f.y += __shfl_down(f.y, 1<<i);
        f.z += __shfl_down(f.z, 1<<i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif
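
/* Illustrative usage sketch (an assumption, not part of this header): with the
 * shuffle-based reduction no shared-memory staging is needed, the per-thread
 * partial j-force is passed in registers. fcj_buf, f and aj are hypothetical
 * kernel-local names.
 */
#if 0
    reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj);
#endif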
/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxj == 0) /* one thread per i atom does the final sum */
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
        if (bCalcFshift)
        {
            *fshift_buf += f;
        }
    }
}
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int     i, j;
    float3  f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);
        if (bCalcFshift)
        {
            *fshift_buf += f;
        }
    }
}
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    if ((CL_SIZE & (CL_SIZE - 1))) /* non-zero only if CL_SIZE is not a power of two */
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}
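
/* Illustrative usage sketch (an assumption, not part of this header): a kernel
 * would stage the accumulated i-force of each i cluster into f_buf and call the
 * wrapper once per cluster. nCluster, fci_buf, f, fshift_buf and ai are
 * hypothetical names for this sketch.
 */
#if 0
    for (int ci = 0; ci < nCluster; ci++)   /* loop over i clusters */
    {
        /* ai: global index of the i atom this thread handles in cluster ci */
        f_buf[                  tidxj * CL_SIZE + tidxi] = fci_buf[ci].x;
        f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] = fci_buf[ci].y;
        f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] = fci_buf[ci].z;
        reduce_force_i(f_buf, f, &fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
#endif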
/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
    int j;

    for (j = 0; j < 2; j++)
    {
        fin.x += __shfl_down(fin.x, CL_SIZE<<j);
        fin.y += __shfl_down(fin.y, CL_SIZE<<j);
        fin.z += __shfl_down(fin.z, CL_SIZE<<j);
    }

    /* The first row of threads in each warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif
/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int   i, j;
    float e1, e2;

    i = WARP_SIZE/2;
    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
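
/* Worked example for the reduction above: with WARP_SIZE = 32 the loop runs
 * four iterations with i = 16, 8, 4 and 2, halving the number of live partial
 * sums each time. After the loop i == 1, so the final step adds elements 0 and 1
 * of each slab and a single thread accumulates the two totals into global memory
 * with atomicAdd.
 */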
/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int i, sh;

    sh = 1;
    for (i = 0; i < 5; i++)
    {
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh   += sh;  /* offsets 1, 2, 4, 8, 16 fold the 32 lanes into lane 0 */
    }

    /* The first thread in each warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ */
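
/* Illustrative sketch (an assumption, not part of this header): a kernel can
 * select the shuffle-based or shared-memory energy reduction at compile time
 * based on the target architecture. E_lj_th, E_el_th, buf, e_lj and e_el are
 * hypothetical names for this sketch.
 */
#if 0
#if __CUDA_ARCH__ >= 300
    reduce_energy_warp_shfl(E_lj_th, E_el_th, e_lj, e_el, tidx);
#else
    buf[              tidx] = E_lj_th;
    buf[FBUF_STRIDE + tidx] = E_el_th;
    reduce_energy_pow2(buf, e_lj, e_el, tidx);
#endif
#endif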
#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */