2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
5 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
38 * Utility constants and function declarations for the OpenCL non-bonded kernels.
39 * This header should be included once at the top level, just before the
40 * kernels are included (has to be preceded by nbnxn_ocl_types.h).
42 * \author Szilárd Páll <pall.szilard@gmail.com>
43 * \ingroup module_nbnxm
48 #include "gromacs/gpu_utils/device_utils.clh"
49 #include "gromacs/gpu_utils/vectype_ops.clh"
51 #include "nbnxm_ocl_consts.h"
53 #define CL_SIZE (c_nbnxnGpuClusterSize)
55 #define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
57 #if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
58 /* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors.
59 * Note that this should precede the kernel_utils include.
61 # define USE_CJ_PREFETCH 1
63 # define USE_CJ_PREFETCH 0
66 #if defined cl_intel_subgroups || defined cl_khr_subgroups \
67 || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
68 # define HAVE_SUBGROUP 1
70 # define HAVE_SUBGROUP 0
73 #ifdef cl_intel_subgroups
74 # define HAVE_INTEL_SUBGROUP 1
76 # define HAVE_INTEL_SUBGROUP 0
79 #if defined _INTEL_SOURCE_
80 # define SUBGROUP_SIZE 8
81 #elif defined _AMD_SOURCE_
82 # define SUBGROUP_SIZE 64
84 # define SUBGROUP_SIZE 32
87 #define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
88 #define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
89 #define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP
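/* Editor's note (illustrative, not from the original header): as a worked example of how
 * the macros above combine, assume a build with CL_SIZE == 4 on an Intel device that
 * exposes cl_intel_subgroups (so _INTEL_SOURCE_ and HAVE_INTEL_SUBGROUP are set):
 *
 *     WARP_SIZE            = 4*4/2              = 8
 *     SUBGROUP_SIZE        = 8
 *     REDUCE_SHUFFLE       = (1 && 4==4 && 8==8) = 1
 *     USE_SUBGROUP_ANY     = (1 && 8==8)         = 1
 *     USE_SUBGROUP_PRELOAD = 1
 *
 * so the subgroup/shuffle-based code paths below are compiled in; on other vendors the
 * generic local-memory paths are used instead.
 */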
91 /* 1.0 / sqrt(M_PI) */
92 #define M_FLOAT_1_SQRTPI 0.564189583547756F
96 #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
97 # define NBNXN_OPENCL_KERNEL_UTILS_CLH
100 # define WARP_SIZE_LOG2 (5)
101 # define CL_SIZE_LOG2 (3)
103 # define WARP_SIZE_LOG2 (3)
104 # define CL_SIZE_LOG2 (2)
106 # error unsupported CL_SIZE
109 # define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
110 # define FBUF_STRIDE (CL_SIZE_SQ)
112 # define ONE_SIXTH_F 0.16666667F
113 # define ONE_TWELVETH_F 0.08333333F
119 # define gmx_unused __attribute__((unused))
124 /*! \brief Single precision floating point short vector type (as rvec in the CPU codebase).
125 * Currently only used to avoid float3 register arrays.
127 typedef float fvec[3];
129 // Data structures shared between OpenCL device code and OpenCL host code
130 // TODO: review, improve
131 // Replaced real by float for now, to avoid including any other header
139 /* Used with potential switching:
140 * rsw = max(r - r_switch, 0)
141 * sw = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
142 * dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
143 * force = force*sw - potential*dsw
153 // Data structure shared between the OpenCL device code and OpenCL host code
154 // Must not contain OpenCL objects (buffers)
155 typedef struct cl_nbparam_params
158 //! type of electrostatics, takes values from #ElecType
160 //! type of VdW impl., takes values from #VdwType
163 //! charge multiplication factor
165 //! Reaction-field/plain cutoff electrostatics const.
167 //! Reaction-field electrostatics constant
169 //! Ewald/PME parameter
171 * Ewald/PME correction term subtracted from the direct-space potential
173 //! LJ-Ewald/PME correction term added to the correction potential
175 //! LJ-Ewald/PME coefficient
178 //! Coulomb cut-off squared
181 //! VdW cut-off squared
183 //! VdW switched cut-off
185 //! Full, outer pair-list cut-off squared
187 * Inner, dynamically pruned pair-list cut-off squared. XXX: this is only needed in the pruning kernels, but for now we also pass it to the non-bonded kernels
190 //! VdW shift dispersion constants
191 shift_consts_t dispersion_shift;
192 //! VdW shift repulsion constants
193 shift_consts_t repulsion_shift;
194 //! VdW switch constants
195 switch_consts_t vdw_switch;
197 /* Ewald Coulomb force table data - accessed through texture memory */
198 //! table scale/spacing
199 float coulomb_tab_scale;
200 } cl_nbparam_params_t;
206 //! Shift vector index plus possible flags
208 //! Start index into cj4
210 //! End index into cj4
216 //! The i-cluster interactions mask for 1 warp
218 //! Index into the exclusion array for 1 warp
226 //! The i-cluster mask data for 2 warps
227 nbnxn_im_ei_t imei[2];
233 unsigned int pair[CL_SIZE * CL_SIZE / 2]; /* Topology exclusion interaction bits for one warp,
234 * each unsigned int holds the bits for 4*8 i-clusters
238 /*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
239 __constant const unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
241 /*! Minimum single precision threshold for r^2 to avoid r^-12 overflow. */
242 __constant const float c_nbnxnMinDistanceSquared = NBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT;
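/* Editor's note (illustrative sketch, not from the original header): the kernels are
 * expected to clamp the squared pair distance with this constant, e.g.
 *
 *     r2 = max(r2, c_nbnxnMinDistanceSquared);
 *
 * so that excluded or self pairs at r ~ 0 cannot overflow when r^-6/r^-12 terms are
 * formed in single precision.
 */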
244 gmx_opencl_inline void preloadCj4Generic(__local int* sm_cjPreload,
245 const __global int* gm_cj,
248 bool gmx_unused iMaskCond)
250 /* Pre-load cj into shared memory */
251 # if defined _AMD_SOURCE_ // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
252 if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
254 sm_cjPreload[tidxi] = gm_cj[tidxi];
257 const int c_clSize = CL_SIZE;
258 const int c_nbnxnGpuClusterpairSplit = 2;
259 const int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
260 if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
262 sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = gm_cj[tidxi];
268 # if USE_SUBGROUP_PRELOAD
269 gmx_opencl_inline int preloadCj4Subgroup(const __global int* gm_cj)
271 // loads subgroup-size # of elements (8) instead of the 4 required
272 // equivalent to *cjs = *gm_cj
273 return intel_sub_group_block_read((const __global uint*)gm_cj);
275 # endif // USE_SUBGROUP_PRELOAD
277 # if USE_SUBGROUP_PRELOAD
278 typedef size_t CjType;
280 typedef __local int* CjType;
283 /*! \brief Preload cj4
285 * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
286 * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
287 * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
289 * It is the caller's responsibility to make sure that data is consumed only when
290 * it's ready. This function does not call a barrier.
292 gmx_opencl_inline void preloadCj4(CjType gmx_unused* cjs,
293 const __global int gmx_unused* gm_cj,
294 int gmx_unused tidxi,
295 int gmx_unused tidxj,
296 bool gmx_unused iMaskCond)
298 # if USE_SUBGROUP_PRELOAD
299 *cjs = preloadCj4Subgroup(gm_cj);
300 # elif USE_CJ_PREFETCH
301 preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
307 gmx_opencl_inline int loadCjPreload(__local int* sm_cjPreload, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
309 # if defined _AMD_SOURCE_
310 int warpLoadOffset = 0; // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
312 const int c_clSize = CL_SIZE;
313 const int c_nbnxnGpuClusterpairSplit = 2;
314 const int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
315 int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize / c_splitClSize;
317 return sm_cjPreload[jm + warpLoadOffset];
320 /*! \brief Load a cj given a jm index.
322 * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
324 gmx_opencl_inline int
325 loadCj(CjType cjs, const __global int gmx_unused* gm_cj, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
327 # if USE_SUBGROUP_PRELOAD
328 return sub_group_broadcast(cjs, jm);
329 # elif USE_CJ_PREFETCH
330 return loadCjPreload(cjs, jm, tidxi, tidxj);
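/* Editor's usage sketch (assumption: the names pl_cj4, j4 and imaskCond are hypothetical;
 * the real call sites are in the nbnxm OpenCL kernels). When USE_CJ_PREFETCH is enabled,
 * cjs must first be pointed at a __local staging buffer; with USE_SUBGROUP_PRELOAD it is
 * a plain register value:
 *
 *     CjType cjs = 0;
 *     // with USE_CJ_PREFETCH: cjs = pointer into the kernel's __local scratch space
 *     preloadCj4(&cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCond);
 *     // synchronize before use if the data went through local memory
 *     for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
 *     {
 *         const int cj = loadCj(cjs, pl_cj4[j4].cj, jm, tidxi, tidxj);
 *         // ... process j-cluster cj ...
 *     }
 */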
336 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
337 gmx_opencl_inline float2 convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon)
339 const float sigma2 = sigma * sigma;
340 const float sigma6 = sigma2 * sigma2 * sigma2;
341 const float c6 = epsilon * sigma6;
342 const float2 c6c12 = (float2)(c6, /* c6 */
343 c6 * sigma6); /* c12 */
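/* Editor's note: written out, the conversion above is
 *
 *     c6  = epsilon * sigma^6
 *     c12 = c6 * sigma^6 = epsilon * sigma^12
 *
 * i.e. any prefactor of the standard Lennard-Jones form (such as 4*epsilon) is assumed
 * to be folded into the sigma/epsilon values stored in the parameter table.
 */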
348 /*! Apply force switch, force-only version. */
349 gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbparam,
356 /* force switch constants */
357 const float disp_shift_V2 = nbparam->dispersion_shift.c2;
358 const float disp_shift_V3 = nbparam->dispersion_shift.c3;
359 const float repu_shift_V2 = nbparam->repulsion_shift.c2;
360 const float repu_shift_V3 = nbparam->repulsion_shift.c3;
362 const float r = r2 * inv_r;
363 float r_switch = r - nbparam->rvdw_switch;
364 r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
366 *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
367 + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
370 /*! Apply force switch, force + energy version. */
371 gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbparam,
379 /* force switch constants */
380 const float disp_shift_V2 = nbparam->dispersion_shift.c2;
381 const float disp_shift_V3 = nbparam->dispersion_shift.c3;
382 const float repu_shift_V2 = nbparam->repulsion_shift.c2;
383 const float repu_shift_V3 = nbparam->repulsion_shift.c3;
385 const float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
386 const float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
387 const float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
388 const float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;
390 const float r = r2 * inv_r;
391 float r_switch = r - nbparam->rvdw_switch;
392 r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
394 *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
395 + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
396 *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
397 - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
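/* Editor's note on the /3 and /4 factors above: for the dispersion part the energy term
 * accumulated into E_lj is
 *
 *     c6 * ( c2/3 * rsw^3 + c3/4 * rsw^4 ),   rsw = max(r - rvdw_switch, 0)
 *
 * and the term added to F_invr is its negative r-derivative divided by r,
 * -c6 * (c2 + c3*rsw) * rsw^2 * inv_r; the repulsion part follows the same pattern with
 * the repulsion_shift constants and opposite sign.
 */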
400 /*! Apply potential switch, force-only version. */
401 gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* nbparam,
407 /* potential switch constants */
408 const float switch_V3 = nbparam->vdw_switch.c3;
409 const float switch_V4 = nbparam->vdw_switch.c4;
410 const float switch_V5 = nbparam->vdw_switch.c5;
411 const float switch_F2 = 3 * nbparam->vdw_switch.c3;
412 const float switch_F3 = 4 * nbparam->vdw_switch.c4;
413 const float switch_F4 = 5 * nbparam->vdw_switch.c5;
415 const float r = r2 * inv_r;
416 const float r_switch = r - nbparam->rvdw_switch;
418 /* Unlike in the F+E kernel, conditional is faster here */
421 const float sw = 1.0F
422 + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
423 * r_switch * r_switch;
424 const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
426 *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
430 /*! Apply potential switch, force + energy version. */
431 gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* nbparam,
437 /* potential switch constants */
438 const float switch_V3 = nbparam->vdw_switch.c3;
439 const float switch_V4 = nbparam->vdw_switch.c4;
440 const float switch_V5 = nbparam->vdw_switch.c5;
441 const float switch_F2 = 3 * nbparam->vdw_switch.c3;
442 const float switch_F3 = 4 * nbparam->vdw_switch.c4;
443 const float switch_F4 = 5 * nbparam->vdw_switch.c5;
445 const float r = r2 * inv_r;
446 float r_switch = r - nbparam->rvdw_switch;
447 r_switch = r_switch >= 0.0F ? r_switch : 0.0F;
449 /* Unlike in the F-only kernel, masking is faster here */
451 1.0F + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch * r_switch * r_switch;
452 const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;
454 *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
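/* Editor's note: with the 3/4/5 prefactors on switch_F2..switch_F4, dsw above equals
 * d(sw)/dr = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4 from the switch_consts_t
 * documentation, so F_invr*sw - inv_r*E_lj*dsw is the analytic update
 * F' = F*sw - V*dsw, stored as F/r (hence the inv_r).
 */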
458 /*! \brief Fetch C6 grid contribution coefficients and return the product of these.
460 gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej)
462 const float c6_i = nbfp_comb[typei].x;
463 const float c6_j = nbfp_comb[typej].x;
467 /*! Calculate LJ-PME grid force contribution with
468 * geometric combination rule.
470 gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb,
479 const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
481 /* Recalculate inv_r6 without exclusion mask */
482 const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
483 const float cr2 = lje_coeff2 * r2;
484 const float expmcr2 = exp(-cr2);
485 const float poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
487 /* Subtract the grid force from the total LJ force */
488 *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
491 /*! Calculate LJ-PME grid force + energy contribution with
492 * geometric combination rule.
494 gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2* nbfp_comb,
495 const cl_nbparam_params_t* nbparam,
506 const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
508 /* Recalculate inv_r6 without exclusion mask */
509 const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
510 const float cr2 = lje_coeff2 * r2;
511 const float expmcr2 = exp(-cr2);
512 const float poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
514 /* Subtract the grid force from the total LJ force */
515 *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
517 /* Shift should be applied only to real LJ pairs */
518 const float sh_mask = nbparam->sh_lj_ewald * int_bit;
519 *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
522 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
523 * Lorentz-Berthelot combination rule.
524 * We use a single F+E kernel with conditional because the performance impact
525 * of this is pretty small and LB on the CPU is anyway very slow.
527 gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2* nbfp_comb,
528 const cl_nbparam_params_t* nbparam,
540 /* sigma and epsilon are scaled to give 6*C6 */
541 const float2 c6c12_i = nbfp_comb[typei];
542 const float2 c6c12_j = nbfp_comb[typej];
543 const float sigma = c6c12_i.x + c6c12_j.x;
544 const float epsilon = c6c12_i.y * c6c12_j.y;
546 const float sigma2 = sigma * sigma;
547 const float c6grid = epsilon * sigma2 * sigma2 * sigma2;
549 /* Recalculate inv_r6 without exclusion mask */
550 const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
551 const float cr2 = lje_coeff2 * r2;
552 const float expmcr2 = exp(-cr2);
553 const float poly = 1.0F + cr2 + HALF_F * cr2 * cr2;
555 /* Subtract the grid force from the total LJ force */
556 *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
561 /* Shift should be applied only to real LJ pairs */
562 const float sh_mask = nbparam->sh_lj_ewald * int_bit;
563 *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
567 /*! Interpolate the Ewald Coulomb force from the tabulated values in coulomb_tab.
568 * Original idea: from the OpenMM project
570 gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coulomb_tab, float r, float scale)
572 float normalized = scale * r;
573 int index = (int)normalized;
574 float fract2 = normalized - (float)index;
575 float fract1 = 1.0F - fract2;
577 return fract1 * coulomb_tab[index] + fract2 * coulomb_tab[index + 1];
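/* Editor's example (illustrative numbers only): with scale = 100 (table spacing 0.01)
 * and r = 0.2345, normalized = 23.45, index = 23 and fract2 = 0.45, so the function
 * returns 0.55*coulomb_tab[23] + 0.45*coulomb_tab[24], i.e. plain linear interpolation
 * between the two neighbouring table entries.
 */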
580 /*! Calculate analytical Ewald correction term. */
581 gmx_opencl_inline float pmecorrF(float z2)
583 const float FN6 = -1.7357322914161492954e-8F;
584 const float FN5 = 1.4703624142580877519e-6F;
585 const float FN4 = -0.000053401640219807709149F;
586 const float FN3 = 0.0010054721316683106153F;
587 const float FN2 = -0.019278317264888380590F;
588 const float FN1 = 0.069670166153766424023F;
589 const float FN0 = -0.75225204789749321333F;
591 const float FD4 = 0.0011193462567257629232F;
592 const float FD3 = 0.014866955030185295499F;
593 const float FD2 = 0.11583842382862377919F;
594 const float FD1 = 0.50736591960530292870F;
595 const float FD0 = 1.0F;
597 const float z4 = z2 * z2;
599 float polyFD0 = FD4 * z4 + FD2;
600 float polyFD1 = FD3 * z4 + FD1;
601 polyFD0 = polyFD0 * z4 + FD0;
602 polyFD0 = polyFD1 * z2 + polyFD0;
604 polyFD0 = 1.0F / polyFD0;
606 float polyFN0 = FN6 * z4 + FN4;
607 float polyFN1 = FN5 * z4 + FN3;
608 polyFN0 = polyFN0 * z4 + FN2;
609 polyFN1 = polyFN1 * z4 + FN1;
610 polyFN0 = polyFN0 * z4 + FN0;
611 polyFN0 = polyFN1 * z2 + polyFN0;
613 return polyFN0 * polyFD0;
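/* Editor's note: the code above evaluates the rational approximation N(z2)/D(z2) with
 *
 *     N(x) = FN6*x^6 + FN5*x^5 + FN4*x^4 + FN3*x^3 + FN2*x^2 + FN1*x + FN0
 *     D(x) = FD4*x^4 + FD3*x^3 + FD2*x^2 + FD1*x + FD0
 *
 * where the even and odd powers are accumulated separately in steps of z4 = z2^2 to
 * expose instruction-level parallelism before the final combination.
 */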
617 gmx_opencl_inline void
618 reduce_force_j_shfl(float3 fin, __global float* fout, int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
620 /* Only does the reduction over the 4 elements of a cluster. Needs to be changed
621  * for CL_SIZE > 4; see the CUDA code for what is required. */
622 fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
623 fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, 1);
624 fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
625 if ((tidxi & 1) == 1)
629 fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
630 fin.z += intel_sub_group_shuffle_up(fin.z, fin.z, 2);
637 atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
642 gmx_opencl_inline void
643 reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
645 int tidx = tidxi + tidxj * CL_SIZE;
646 f_buf[tidx] = fcj_buf.x;
647 f_buf[FBUF_STRIDE + tidx] = fcj_buf.y;
648 f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
650 /* Split the reduction between the first 3 column threads
651 Threads with column id 0 will do the reduction for (float3).x components
652 Threads with column id 1 will do the reduction for (float3).y components
653 Threads with column id 2 will do the reduction for (float3).z components.
654 The reduction is performed for each line tidxj of f_buf. */
658 for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
660 f += f_buf[FBUF_STRIDE * tidxi + j];
663 atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
667 /*! Final j-force reduction
669 gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf,
671 __global float* fout,
677 reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
679 reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
684 gmx_opencl_inline void reduce_force_i_and_shift_shfl(__private fvec fci_buf[],
685 __global float* fout,
691 __global float* fshift)
693 /* Only does the reduction over the 4 elements of a cluster (2 per warp). Needs to be changed
695 float2 fshift_buf = 0;
696 for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
698 int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
699 float3 fin = (float3)(fci_buf[ci_offset][0], fci_buf[ci_offset][1], fci_buf[ci_offset][2]);
700 fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
701 fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
702 fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
708 /* Threads 0,1 and 2,3 increment x,y for their warp */
709 atomicAdd_g_f(&fout[3 * aidx + (tidxj & 1)], fin.x);
712 fshift_buf[0] += fin.x;
714 /* Threads 0 and 2 increment z for their warp */
715 if ((tidxj & 1) == 0)
717 atomicAdd_g_f(&fout[3 * aidx + 2], fin.z);
720 fshift_buf[1] += fin.z;
724 /* add up local shift forces into global mem */
727 // Threads 0,1 and 2,3 update x,y
728 atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);
729 // Threads 0 and 2 update z
730 if ((tidxj & 1) == 0)
732 atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
738 /*! Final i-force reduction; this implementation works only with power of two
741 gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
742 __private fvec fci_buf[],
743 __global float* fout,
749 __global float* fshift)
751 float fshift_buf = 0;
752 for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
754 int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
755 int tidx = tidxi + tidxj * CL_SIZE;
756 /* store i forces in shmem */
757 f_buf[tidx] = fci_buf[ci_offset][0];
758 f_buf[FBUF_STRIDE + tidx] = fci_buf[ci_offset][1];
759 f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset][2];
760 barrier(CLK_LOCAL_MEM_FENCE);
762 /* Reduce the initial CL_SIZE values for each i atom to half
763 * every step by using CL_SIZE * i threads.
764 * Can't just use i as loop variable because then nvcc refuses to unroll.
767 for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
772 f_buf[tidxj * CL_SIZE + tidxi] += f_buf[(tidxj + i) * CL_SIZE + tidxi];
773 f_buf[FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
774 f_buf[FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
775 f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
776 f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
781 * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
782 * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
783 * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
785 barrier(CLK_LOCAL_MEM_FENCE);
787 /* i == 1, last reduction step, writing to global mem */
788 /* Split the reduction between the first 3 line threads
789 Threads with line id 0 will do the reduction for (float3).x components
790 Threads with line id 1 will do the reduction for (float3).y components
791 Threads with line id 2 will do the reduction for (float3).z components. */
794 float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
796 atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
804 /* add up local shift forces into global mem */
807 /* Only threads with tidxj < 3 will update fshift.
808 The threads performing the update, must be the same as the threads
809 storing the reduction result above.
813 atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
818 /*! Final i-force reduction
820 gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
821 __private fvec fci_buf[],
828 __global float* fshift)
831 reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
833 reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
839 gmx_opencl_inline void reduce_energy_shfl(float E_lj,
841 volatile __global float* e_lj,
842 volatile __global float* e_el,
845 E_lj = sub_group_reduce_add(E_lj);
846 E_el = sub_group_reduce_add(E_el);
847 /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
848 * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
849 * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
850 * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
851 * (by supporting c_nbnxnGpuClusterpairSplit=1). */
852 if (tidx == 0 || tidx == WARP_SIZE)
854 atomicAdd_g_f(e_lj, E_lj);
855 atomicAdd_g_f(e_el, E_el);
860 /*! Energy reduction; this implementation works only with power of two
863 gmx_opencl_inline void reduce_energy_pow2(volatile __local float* buf,
864 volatile __global float* e_lj,
865 volatile __global float* e_el,
868 int i = WARP_SIZE / 2;
870 /* Can't just use i as loop variable because then nvcc refuses to unroll. */
871 for (int j = WARP_SIZE_LOG2 - 1; j > 0; j--)
875 buf[tidx] += buf[tidx + i];
876 buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
881 /* last reduction step, writing to global mem */
884 float e1 = buf[tidx] + buf[tidx + i];
885 float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
887 atomicAdd_g_f(e_lj, e1);
888 atomicAdd_g_f(e_el, e2);
892 gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf,
895 volatile __global float* e_lj,
896 volatile __global float* e_el,
900 reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
902 /* flush the energies to shmem and reduce them */
904 buf[FBUF_STRIDE + tidx] = E_el;
905 reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
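/* Editor's note: since WARP_SIZE is a power of two and tidx here runs over the
 * CL_SIZE*CL_SIZE (= 2*WARP_SIZE) work items of the pair tile, (tidx & WARP_SIZE)
 * selects the offset of this thread's warp-half of buf (0 or WARP_SIZE) and
 * (tidx & ~WARP_SIZE) is the thread index within its warp, so each warp reduces its
 * own WARP_SIZE-sized slice of the buffer independently.
 */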
909 gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
916 bool ret = warp_any[widx];
923 //! Returns true if the predicate is true for any work item in the warp
924 gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
926 # if USE_SUBGROUP_ANY
927 return sub_group_any(pred);
929 return gmx_sub_group_any_localmem(warp_any, widx, pred);
933 #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */