/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "vectype_ops.clh"

#define CL_SIZE             (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL     (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)

#define WARP_SIZE           (CL_SIZE*CL_SIZE/2)
#undef KERNEL_UTILS_INLINE
#ifdef KERNEL_UTILS_INLINE
#define __INLINE__ inline
#else
#define __INLINE__
#endif
/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756f
#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
#define NBNXN_OPENCL_KERNEL_UTILS_CLH

__constant sampler_t generic_sampler = (CLK_NORMALIZED_COORDS_FALSE  /* Natural coords */
                                        | CLK_ADDRESS_NONE           /* No clamp/repeat */
                                        | CLK_FILTER_NEAREST);       /* No interpolation */
#if CL_SIZE == 8
#define WARP_SIZE_LOG2  (5)
#define CL_SIZE_LOG2    (3)
#elif CL_SIZE == 4
#define WARP_SIZE_LOG2  (3)
#define CL_SIZE_LOG2    (2)
#else
#error unsupported CL_SIZE
#endif
#define CL_SIZE_SQ          (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE         (CL_SIZE_SQ)

#define ONE_SIXTH_F         0.16666667f
#define ONE_TWELVETH_F      0.08333333f
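/* Presumably the 1/6 and 1/12 prefactors of the C6 and C12 Lennard-Jones terms, precomputed
 * in single precision; used e.g. as the ONE_SIXTH_F*c6grid factor in the LJ-PME energy terms
 * further below. */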
// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
/* Used with potential switching:
 * rsw        = max(r - r_switch, 0)
 * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * force      = force*sw - potential*dsw
 * potential *= sw
 */
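/* Derivation sketch: with V_sw(r) = V(r)*sw the switched potential's force is
 * F_sw = -dV_sw/dr = F*sw - V*dsw, which is exactly the update applied to F_invr in the
 * potential-switch functions below (with an extra 1/r factor, since F_invr stores force/r):
 *     *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 */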
// Data structure shared between the OpenCL device code and OpenCL host code
// Must not contain OpenCL objects (buffers)
typedef struct cl_nbparam_params
{
    int             eeltype;           /**< type of electrostatics, takes values from #eelCu */
    int             vdwtype;           /**< type of VdW impl., takes values from #evdwCu */

    float           epsfac;            /**< charge multiplication factor */
    float           c_rf;              /**< Reaction-field/plain cutoff electrostatics const. */
    float           two_k_rf;          /**< Reaction-field electrostatics constant */
    float           ewald_beta;        /**< Ewald/PME parameter */
    float           sh_ewald;          /**< Ewald/PME correction term subtracted from the direct-space potential */
    float           sh_lj_ewald;       /**< LJ-Ewald/PME correction term added to the correction potential */
    float           ewaldcoeff_lj;     /**< LJ-Ewald/PME coefficient */

    float           rcoulomb_sq;       /**< Coulomb cut-off squared */

    float           rvdw_sq;           /**< VdW cut-off squared */
    float           rvdw_switch;       /**< VdW switched cut-off */
    float           rlistOuter_sq;     /**< Full, outer pair-list cut-off squared */
    float           rlistInner_sq;     /**< Inner, dynamically pruned pair-list cut-off squared.
                                            XXX: this is only needed in the pruning kernels, but
                                            for now we also pass it to the nonbonded kernels. */

    shift_consts_t  dispersion_shift;  /**< VdW shift dispersion constants */
    shift_consts_t  repulsion_shift;   /**< VdW shift repulsion constants */
    switch_consts_t vdw_switch;        /**< VdW switch constants */

    /* Ewald Coulomb force table data - accessed through texture memory */
    float           coulomb_tab_scale; /**< table scale/spacing */
} cl_nbparam_params_t;
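/* This struct is shared with the host (see above); presumably the host fills an identical
 * plain-float copy and passes it to the kernels by value (e.g. with clSetKernelArg), which
 * is why it must not contain cl_mem buffers or other OpenCL objects. */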
    int sci;            /* i-super-cluster                        */
    int shift;          /* Shift vector index plus possible flags */
    int cj4_ind_start;  /* Start index into cj4                   */
    int cj4_ind_end;    /* End index into cj4                     */

    unsigned int imask;    /* The i-cluster interactions mask for 1 warp */
    int          excl_ind; /* Index into the exclusion array for 1 warp  */

    int           cj[4];   /* The 4 j-clusters                    */
    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data for 2 warps */

    unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
                                           * each unsigned int has bits for 4*8 i-clusters    */
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
__constant unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
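/* For example, with NCL_PER_SUPERCL == 8 this evaluates to 0xff, i.e. one bit per i-cluster
 * in the super-cluster. */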
/*! \brief Preload cj4 into local memory.
 *
 * - For AMD, we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z),
 * - for NVIDIA, once per warp (on 2x4 threads * NTHREAD_Z),
 * - and the same as AMD in the nowarp kernel; there we do not assume the execution width
 *   and therefore the caller needs to synchronize.
 *
 * It is the caller's responsibility to make sure that data is consumed only when
 * it is ready. This function does not call a barrier.
 */
__INLINE__ __device__
void preloadCj4(__local int        *sm_cjPreload,
                const __global int *gm_cj,
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
    /* Pre-load cj into shared memory */
#if defined _NVIDIA_SOURCE_
    /* on both warps separately for NVIDIA */
    if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
    {
        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = gm_cj[tidxi];
    }
#else // AMD or nowarp
    /* Note that with "nowarp", or on hardware with wavefronts < 64, a barrier is needed after the preload. */
    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
    {
        sm_cjPreload[tidxi] = gm_cj[tidxi];
    }
#endif
/*! \brief Load a cj given a jm index.
 *
 * If cj4 preloading is enabled, it loads from local memory, otherwise from global memory.
 */
__INLINE__ __device__
int loadCj(__local int        *sm_cjPreload,
           const __global int *gm_cj,
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuJgroupSize       = NBNXN_GPU_JGROUP_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
#if defined _NVIDIA_SOURCE_
    int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
#elif defined _AMD_SOURCE_
    int warpLoadOffset = 0;
#endif
    return sm_cjPreload[jm + warpLoadOffset];
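/* Typical usage (sketch; the exact argument lists may differ): a kernel first stages the four
 * j-cluster indices of a cj4 entry into local memory with preloadCj4(), synchronizes if the
 * execution width requires it (see preloadCj4), and then reads individual j-clusters in its
 * inner loop, e.g.
 *     int cj = loadCj(sm_cjPreload, gm_cj, jm, tidxi, tidxj);
 * If preloading is disabled, loadCj() reads gm_cj directly instead (see the comment above). */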
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
__INLINE__ __device__
void convert_sigma_epsilon_to_c6_c12(const float sigma,
    float sigma2, sigma6;

    sigma2 = sigma * sigma;
    sigma6 = sigma2 * sigma2 * sigma2;
    *c6    = epsilon * sigma6;
    *c12   = *c6 * sigma6;
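/* With the convention used here (*c6 = epsilon*sigma^6), c12 follows as c6*sigma^6, i.e.
 * epsilon*sigma^12; any constant factors (such as the usual 4*epsilon) are assumed to be
 * folded into the stored epsilon already. */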
/*! Apply force switch, force-only version. */
__INLINE__ __device__
void calculate_force_switch_F(cl_nbparam_params_t *nbparam,
    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
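/* Note on the convention: F_invr accumulates the scalar force divided by r (the force vector
 * is then F_invr times the distance vector), which is why each term above carries an extra
 * inv_r factor. */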
/*! Apply force switch, force + energy version. */
__INLINE__ __device__
void calculate_force_switch_F_E(cl_nbparam_params_t *nbparam,
    /* force switch constants */
    float disp_shift_V2 = nbparam->dispersion_shift.c2;
    float disp_shift_V3 = nbparam->dispersion_shift.c3;
    float repu_shift_V2 = nbparam->repulsion_shift.c2;
    float repu_shift_V3 = nbparam->repulsion_shift.c3;

    float disp_shift_F2 = nbparam->dispersion_shift.c2/3;
    float disp_shift_F3 = nbparam->dispersion_shift.c3/4;
    float repu_shift_F2 = nbparam->repulsion_shift.c2/3;
    float repu_shift_F3 = nbparam->repulsion_shift.c3/4;

    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    *F_invr +=
        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
/*! Apply potential switch, force-only version. */
__INLINE__ __device__
void calculate_potential_switch_F(cl_nbparam_params_t *nbparam,
    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3*nbparam->vdw_switch.c3;
    float switch_F3 = 4*nbparam->vdw_switch.c4;
    float switch_F4 = 5*nbparam->vdw_switch.c5;
    r_switch = r - nbparam->rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0f)
    {
        sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
        dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    }
/*! Apply potential switch, force + energy version. */
__INLINE__ __device__
void calculate_potential_switch_F_E(cl_nbparam_params_t *nbparam,
    /* potential switch constants */
    float switch_V3 = nbparam->vdw_switch.c3;
    float switch_V4 = nbparam->vdw_switch.c4;
    float switch_V5 = nbparam->vdw_switch.c5;
    float switch_F2 = 3*nbparam->vdw_switch.c3;
    float switch_F3 = 4*nbparam->vdw_switch.c4;
    float switch_F4 = 5*nbparam->vdw_switch.c5;
    r_switch = r - nbparam->rvdw_switch;
    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;

    /* Unlike in the F-only kernel, masking is faster here */
    sw  = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
    dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;

    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
    *E_lj  *= sw;
/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
__INLINE__ __device__
void calculate_lj_ewald_comb_geom_F(__constant float *nbfp_comb_climg2d,
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;

    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;

    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
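/* Here cr2 is presumably lje_coeff2*r2 (the squared LJ-Ewald coefficient times r^2) and
 * expmcr2 = exp(-cr2); poly is the second-order Taylor expansion of exp(cr2), so the product
 * exp(-cr2)*poly smoothly switches the grid (reciprocal-space) part of the r^-6 interaction
 * that is subtracted from the total LJ force above. */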
/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
__INLINE__ __device__
void calculate_lj_ewald_comb_geom_F_E(__constant float    *nbfp_comb_climg2d,
                                      cl_nbparam_params_t *nbparam,
    float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;

    c6grid    = nbfp_comb_climg2d[2*typei]*nbfp_comb_climg2d[2*typej];

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;

    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    /* Shift should be applied only to real LJ pairs */
    sh_mask   = nbparam->sh_lj_ewald*int_bit;
    *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
/*! Calculate LJ-PME grid force + energy contribution (if with_E_lj is true) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
__INLINE__ __device__
void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
                                    cl_nbparam_params_t *nbparam,
    float c6grid, inv_r6_nm, cr2, expmcr2, poly;
    float sigma, sigma2, epsilon;

    /* sigma and epsilon are scaled to give 6*C6 */
    sigma   = nbfp_comb_climg2d[2*typei] + nbfp_comb_climg2d[2*typej];
    epsilon = nbfp_comb_climg2d[2*typei+1]*nbfp_comb_climg2d[2*typej+1];

    sigma2  = sigma*sigma;
    c6grid  = epsilon*sigma2*sigma2*sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    inv_r6_nm = inv_r2*inv_r2*inv_r2;

    poly      = 1.0f + cr2 + 0.5f*cr2*cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;

    if (with_E_lj)
    {
        /* Shift should be applied only to real LJ pairs */
        sh_mask = nbparam->sh_lj_ewald*int_bit;
        *E_lj  += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
    }
/*! Interpolate the Ewald Coulomb force from the tabulated force values.
 *  Original idea: from the OpenMM project.
 */
__INLINE__ __device__ float
interpolate_coulomb_force_r(__constant float *coulomb_tab_climg2d,
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1*coulomb_tab_climg2d[index] +
           fract2*coulomb_tab_climg2d[index + 1];
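/* Worked example: with scale = 100 (table points per unit length) and r = 0.473 we get
 * normalized = 47.3, index = 47, fract2 ~= 0.3 and fract1 ~= 0.7, so the result is
 * 0.7*table[47] + 0.3*table[48], i.e. plain linear interpolation between neighbouring
 * table entries. */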
/*! Calculate analytical Ewald correction term. */
__INLINE__ __device__
float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float       z4;
    float       polyFN0, polyFN1, polyFD0, polyFD1;

    z4      = z2*z2;

    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
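/* The correction factor is computed as a ratio of two polynomials in z2 (numerator
 * coefficients FN6..FN0, denominator FD4..FD0), each evaluated with an even/odd-split
 * Horner scheme in z4 to shorten the dependency chains; the result is polyFN0*polyFD0,
 * the denominator having already been inverted above. */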
/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
/* Note: the AMD OpenCL compiler gives an "Undeclared function index 1024" error if this is inlined */
__INLINE__ __device__
void reduce_force_j_generic(__local float *f_buf, __global float *fout,
                            int tidxi, int tidxj, int aidx)
    /* Split the reduction between the first 3 column threads:
       threads with column id 0 will do the reduction for the (float3).x components,
       threads with column id 1 will do the reduction for the (float3).y components,
       threads with column id 2 will do the reduction for the (float3).z components.
       The reduction is performed for each line tidxj of f_buf. */
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f += f_buf[FBUF_STRIDE * tidxi + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
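/* Layout assumed above: f_buf consists of three consecutive FBUF_STRIDE-sized sections
 * holding the x, y and z force components, respectively, which is why thread column
 * tidxi (0..2) reads at offset FBUF_STRIDE * tidxi. */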
/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
__INLINE__ __device__
void reduce_force_i_generic(__local float *f_buf, __global float *fout,
                            float *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
    /* Split the reduction between the first 3 line threads:
       threads with line id 0 will do the reduction for the (float3).x components,
       threads with line id 1 will do the reduction for the (float3).y components,
       threads with line id 2 will do the reduction for the (float3).z components. */
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f += f_buf[tidxj * FBUF_STRIDE + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);

    barrier(CLK_LOCAL_MEM_FENCE);
/*! Final i-force reduction; this implementation works only with power-of-two
 *  array sizes.
 */
__INLINE__ __device__
void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
                         float *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as the loop variable because then nvcc refuses to unroll.
     */
    for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
    /* A barrier is needed here because:
     * a) for CL_SIZE < 8: id 2 (doing z in the next block) is in the 2nd warp,
     * b) for all CL_SIZE a barrier is needed before f_buf is reused by the next reduce_force_i call.
     */
    barrier(CLK_LOCAL_MEM_FENCE);
    /* i == 1, last reduction step, writing to global mem */
    /* Split the reduction between the first 3 line threads:
       threads with line id 0 will do the reduction for the (float3).x components,
       threads with line id 1 will do the reduction for the (float3).y components,
       threads with line id 2 will do the reduction for the (float3).z components. */
        float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];

        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is a power of two or not.
 */
__INLINE__ __device__
void reduce_force_i(__local float *f_buf, __global float *f,
                    float *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}
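/* (CL_SIZE & (CL_SIZE - 1)) is zero exactly when CL_SIZE is a power of two (e.g.
 * (8 & 7) == 0), so the pow2 path is taken for the supported cluster sizes and the
 * generic path otherwise. */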
/*! Energy reduction; this implementation works only with power-of-two
 *  array sizes.
 */
__INLINE__ __device__
void reduce_energy_pow2(volatile __local float  *buf,
                        volatile __global float *e_lj,
                        volatile __global float *e_el,
    /* Can't just use i as the loop variable because then nvcc refuses to unroll. */
    for (j = WARP_SIZE_LOG2 - 1; j > 0; j--)
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];

    /* last reduction step, writing to global mem */
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd_g_f(e_lj, e1);
        atomicAdd_g_f(e_el, e2);
#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */