/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief
 * Utility constants and function declarations for the OpenCL non-bonded kernels.
 * This header should be included once at the top level, just before the
 * kernels are included (has to be preceded by nbnxn_ocl_types.h).
 *
 * \author Szilárd Páll
 * \ingroup module_nbnxm
 */

#define GMX_DOUBLE 0

#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"

#include "nbnxm_ocl_consts.h"

#define CL_SIZE (c_nbnxnGpuClusterSize)
#define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
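/* Illustrative note (not part of the original header): with the current
 * c_nbnxnGpuClusterpairSplit == 2 layout, the CL_SIZE x CL_SIZE work items of a
 * cluster pair are split over two execution "warps" of WARP_SIZE work items each,
 * i.e. CL_SIZE == 8 gives WARP_SIZE == 32 and CL_SIZE == 4 gives WARP_SIZE == 8.
 */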
#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
 * Note that this should precede the kernel_utils include.
 */
#    define USE_CJ_PREFETCH 1
#else
#    define USE_CJ_PREFETCH 0
#endif

#if defined cl_intel_subgroups || defined cl_khr_subgroups \
        || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
#    define HAVE_SUBGROUP 1
#else
#    define HAVE_SUBGROUP 0
#endif

#ifdef cl_intel_subgroups
#    define HAVE_INTEL_SUBGROUP 1
#else
#    define HAVE_INTEL_SUBGROUP 0
#endif

#if defined _INTEL_SOURCE_
#    define SUBGROUP_SIZE 8
#elif defined _AMD_SOURCE_
#    define SUBGROUP_SIZE 64
#else
#    define SUBGROUP_SIZE 32
#endif

#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP

/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756F

//-------------------

#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
#    define NBNXN_OPENCL_KERNEL_UTILS_CLH

#    if CL_SIZE == 8
#        define WARP_SIZE_LOG2 (5)
#        define CL_SIZE_LOG2 (3)
#    elif CL_SIZE == 4
#        define WARP_SIZE_LOG2 (3)
#        define CL_SIZE_LOG2 (2)
#    else
#        error unsupported CL_SIZE
#    endif

#    define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#    define FBUF_STRIDE (CL_SIZE_SQ)

#    define ONE_SIXTH_F 0.16666667F
#    define ONE_TWELVETH_F 0.08333333F
#    define HALF_F 0.5F

#    ifdef __GNUC__ /* GCC, clang */
#        define gmx_unused __attribute__((unused))
#    else
#        define gmx_unused
#    endif

/*! \brief Single precision floating point short vector type (as rvec in the CPU codebase).
 *   Currently only used to avoid float3 register arrays.
 */
typedef float fvec[3];

// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
typedef struct
{
    float c2;
    float c3;
    float cpot;
} shift_consts_t;

/* Used with potential switching:
 * rsw        = max(r - r_switch, 0)
 * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * force      = force*dsw - potential*sw
 * potential *= sw
 */
typedef struct
{
    float c3;
    float c4;
    float c5;
} switch_consts_t;

// Data structure shared between the OpenCL device code and OpenCL host code
// Must not contain OpenCL objects (buffers)
typedef struct cl_nbparam_params
{
    //! type of electrostatics, takes values from #ElecType
    int elecType;
    //! type of VdW impl., takes values from #VdwType
    int vdwType;

    //! charge multiplication factor
    float epsfac;
    //! Reaction-field/plain cutoff electrostatics const.
    float c_rf;
    //! Reaction-field electrostatics constant
    float two_k_rf;
    //! Ewald/PME parameter
    float ewald_beta;
    //! Ewald/PME correction term subtracted from the direct-space potential
    float sh_ewald;
    //! LJ-Ewald/PME correction term added to the correction potential
    float sh_lj_ewald;
    //! LJ-Ewald/PME coefficient
    float ewaldcoeff_lj;

    //! Coulomb cut-off squared
    float rcoulomb_sq;

    //! VdW cut-off squared
    float rvdw_sq;
    //! VdW switched cut-off
    float rvdw_switch;
    //! Full, outer pair-list cut-off squared
    float rlistOuter_sq;
    //! Inner, dynamic pruned pair-list cut-off squared XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds
    float rlistInner_sq;

    //! VdW shift dispersion constants
    shift_consts_t dispersion_shift;
    //! VdW shift repulsion constants
    shift_consts_t repulsion_shift;
    //! VdW switch constants
    switch_consts_t vdw_switch;

    /* Ewald Coulomb force table data - accessed through texture memory */
    //! table scale/spacing
    float coulomb_tab_scale;
} cl_nbparam_params_t;
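/* Illustrative helper, not used by the kernels: spells out how the switch_consts_t
 * coefficients enter the switching polynomials documented above that struct
 * (sw scales the potential, dsw enters the force). The function name and the
 * float2 packing are choices made for this sketch only.
 */
gmx_opencl_inline float2 exampleEvalVdwSwitch(const switch_consts_t sc, const float rsw)
{
    const float rsw2 = rsw * rsw;
    const float rsw3 = rsw2 * rsw;
    /* sw  = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5 */
    const float sw = 1.0F + (sc.c3 + (sc.c4 + sc.c5 * rsw) * rsw) * rsw3;
    /* dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4 (derivative of sw w.r.t. r) */
    const float dsw = (3.0F * sc.c3 + (4.0F * sc.c4 + 5.0F * sc.c5 * rsw) * rsw) * rsw2;
    return (float2)(sw, dsw);
}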
typedef struct
{
    //! i-super-cluster
    int sci;
    //! Shift vector index plus possible flags
    int shift;
    //! Start index into cj4
    int cj4_ind_start;
    //! End index into cj4
    int cj4_ind_end;
} nbnxn_sci_t;

typedef struct
{
    //! The i-cluster interactions mask for 1 warp
    unsigned int imask;
    //! Index into the exclusion array for 1 warp
    int excl_ind;
} nbnxn_im_ei_t;

typedef struct
{
    //! The 4 j-clusters
    int cj[4];
    //! The i-cluster mask data for 2 warps
    nbnxn_im_ei_t imei[2];
} nbnxn_cj4_t;

typedef struct
{
    unsigned int pair[CL_SIZE * CL_SIZE / 2];
    /* Topology exclusion interaction bits for one warp,
     * each unsigned has bits for 4*8 i clusters */
} nbnxn_excl_t;

/*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
__constant const unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);

/*! Minimum single precision threshold for r^2 to avoid r^-12 overflow. */
__constant const float c_nbnxnMinDistanceSquared = NBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT;

gmx_opencl_inline void preloadCj4Generic(__local int*        sm_cjPreload,
                                         const __global int* gm_cj,
                                         int                 tidxi,
                                         int                 tidxj,
                                         bool gmx_unused     iMaskCond)
{
    /* Pre-load cj into shared memory */
#    if defined _AMD_SOURCE_
    // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
    {
        sm_cjPreload[tidxi] = gm_cj[tidxi];
    }
#    else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;

    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
    {
        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = gm_cj[tidxi];
    }
#    endif
}

#    if USE_SUBGROUP_PRELOAD
gmx_opencl_inline int preloadCj4Subgroup(const __global int* gm_cj)
{
    // loads subgroup-size # of elements (8) instead of the 4 required
    // equivalent to *cjs = *gm_cj
    return intel_sub_group_block_read((const __global uint*)gm_cj);
}
#    endif // USE_SUBGROUP_PRELOAD

#    if USE_SUBGROUP_PRELOAD
typedef size_t CjType;
#    else
typedef __local int* CjType;
#    endif

/*! \brief Preload cj4
 *
 * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 *
 * It is the caller's responsibility to make sure that data is consumed only when
 * it's ready. This function does not call a barrier.
 */
gmx_opencl_inline void preloadCj4(CjType gmx_unused*             cjs,
                                  const __global int gmx_unused* gm_cj,
                                  int gmx_unused                 tidxi,
                                  int gmx_unused                 tidxj,
                                  bool gmx_unused                iMaskCond)
{
#    if USE_SUBGROUP_PRELOAD
    *cjs = preloadCj4Subgroup(gm_cj);
#    elif USE_CJ_PREFETCH
    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
#    else
    // nothing to do
#    endif
}

gmx_opencl_inline int loadCjPreload(__local int* sm_cjPreload, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#    if defined _AMD_SOURCE_
    int warpLoadOffset = 0; // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
#    else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;

    int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize / c_splitClSize;
#    endif
    return sm_cjPreload[jm + warpLoadOffset];
}
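/* Illustrative helpers, not used by the kernels: a sketch of how the per-warp
 * interaction mask in nbnxn_im_ei_t (see the structures above) is typically
 * interpreted, assuming the pair-list bit layout in which each j-cluster jm of a
 * cj4 group owns a contiguous field of c_nbnxnGpuNumClusterPerSupercluster bits,
 * one bit per i-cluster of the super-cluster. The helper names are specific to
 * this sketch.
 */
gmx_opencl_inline bool exampleAnyPairForJCluster(const unsigned int imask, const int jm)
{
    /* True when j-cluster jm interacts with at least one i-cluster. */
    return (imask & (supercl_interaction_mask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) != 0U;
}

gmx_opencl_inline bool examplePairBitSet(const unsigned int imask, const int jm, const int ci_offset)
{
    /* True when the (i-cluster ci_offset, j-cluster jm) pair is flagged as interacting. */
    return (imask & (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster + ci_offset))) != 0U;
}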
/*! \brief Load a cj given a jm index.
 *
 * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 */
gmx_opencl_inline int
loadCj(CjType cjs, const __global int gmx_unused* gm_cj, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#    if USE_SUBGROUP_PRELOAD
    return sub_group_broadcast(cjs, jm);
#    elif USE_CJ_PREFETCH
    return loadCjPreload(cjs, jm, tidxi, tidxj);
#    else
    return gm_cj[jm];
#    endif
}

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
gmx_opencl_inline float2 convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon)
{
    const float  sigma2 = sigma * sigma;
    const float  sigma6 = sigma2 * sigma2 * sigma2;
    const float  c6     = epsilon * sigma6;
    const float2 c6c12  = (float2)(c6,           /* c6 */
                                   c6 * sigma6); /* c12 */
    return c6c12;
}

/*! Apply force switch, force-only version. */
gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbparam,
                                                float                      c6,
                                                float                      c12,
                                                float                      inv_r,
                                                float                      r2,
                                                float*                     F_invr)
{
    /* force switch constants */
    const float disp_shift_V2 = nbparam->dispersion_shift.c2;
    const float disp_shift_V3 = nbparam->dispersion_shift.c3;
    const float repu_shift_V2 = nbparam->repulsion_shift.c2;
    const float repu_shift_V3 = nbparam->repulsion_shift.c3;

    const float r        = r2 * inv_r;
    float       r_switch = r - nbparam->rvdw_switch;
    r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}

/*! Apply force switch, force + energy version. */
gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                  float                      c6,
                                                  float                      c12,
                                                  float                      inv_r,
                                                  float                      r2,
                                                  float*                     F_invr,
                                                  float*                     E_lj)
{
    /* force switch constants */
    const float disp_shift_V2 = nbparam->dispersion_shift.c2;
    const float disp_shift_V3 = nbparam->dispersion_shift.c3;
    const float repu_shift_V2 = nbparam->repulsion_shift.c2;
    const float repu_shift_V3 = nbparam->repulsion_shift.c3;

    const float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
    const float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
    const float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
    const float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;

    const float r        = r2 * inv_r;
    float       r_switch = r - nbparam->rvdw_switch;
    r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
    *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
             - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
}

/*! Apply potential switch, force-only version. */
gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* nbparam,
                                                    float                      inv_r,
                                                    float                      r2,
                                                    float*                     F_invr,
                                                    const float*               E_lj)
{
    /* potential switch constants */
    const float switch_V3 = nbparam->vdw_switch.c3;
    const float switch_V4 = nbparam->vdw_switch.c4;
    const float switch_V5 = nbparam->vdw_switch.c5;
    /* derivative coefficients, see the dsw formula documented above switch_consts_t */
    const float switch_F2 = 3 * nbparam->vdw_switch.c3;
    const float switch_F3 = 4 * nbparam->vdw_switch.c4;
    const float switch_F4 = 5 * nbparam->vdw_switch.c5;

    const float r        = r2 * inv_r;
    const float r_switch = r - nbparam->rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0F)
    {
        const float sw = 1.0F
                         + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
                                   * r_switch * r_switch;
        const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    }
}
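/* Illustrative usage, not part of the original header: a minimal sketch of the
 * preload/consume pattern for cj data. As documented above, preloadCj4() does
 * not issue a barrier; a work-group barrier is the simplest (if conservative)
 * way to satisfy that contract when the generic local-memory prefetch is used.
 * The production kernels may rely on finer-grained, warp-level synchronization.
 * The function name is specific to this sketch.
 */
gmx_opencl_inline int exampleLoadAllCj(CjType cjs, const __global int* gm_cj, int tidxi, int tidxj)
{
    preloadCj4(&cjs, gm_cj, tidxi, tidxj, true);
#    if USE_CJ_PREFETCH
    /* Make the preloaded j-cluster indices visible to the whole work-group
     * before they are read back via loadCj(). */
    barrier(CLK_LOCAL_MEM_FENCE);
#    endif
    int cjSum = 0;
    for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
    {
        /* A real kernel would process j-cluster loadCj(...) here; we just sum the indices. */
        cjSum += loadCj(cjs, gm_cj, jm, tidxi, tidxj);
    }
    return cjSum;
}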
/*! Apply potential switch, force + energy version. */
gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                      float                      inv_r,
                                                      float                      r2,
                                                      float*                     F_invr,
                                                      float*                     E_lj)
{
    /* potential switch constants */
    const float switch_V3 = nbparam->vdw_switch.c3;
    const float switch_V4 = nbparam->vdw_switch.c4;
    const float switch_V5 = nbparam->vdw_switch.c5;
    /* derivative coefficients, see the dsw formula documented above switch_consts_t */
    const float switch_F2 = 3 * nbparam->vdw_switch.c3;
    const float switch_F3 = 4 * nbparam->vdw_switch.c4;
    const float switch_F4 = 5 * nbparam->vdw_switch.c5;

    const float r        = r2 * inv_r;
    float       r_switch = r - nbparam->rvdw_switch;
    r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;

    /* Unlike in the F-only kernel, masking is faster here */
    const float sw = 1.0F
                     + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
                               * r_switch * r_switch;
    const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    *E_lj *= sw;
}

/*! \brief Fetch C6 grid contribution coefficients and return the product of these. */
gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej)
{
    const float c6_i = nbfp_comb[typei].x;
    const float c6_j = nbfp_comb[typej].x;
    return c6_i * c6_j;
}

/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb,
                                                      int                      typei,
                                                      int                      typej,
                                                      float                    r2,
                                                      float                    inv_r2,
                                                      float                    lje_coeff2,
                                                      float                    lje_coeff6_6,
                                                      float*                   F_invr)
{
    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    const float cr2       = lje_coeff2 * r2;
    const float expmcr2   = exp(-cr2);
    const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
}

/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2*   nbfp_comb,
                                                        const cl_nbparam_params_t* nbparam,
                                                        int                        typei,
                                                        int                        typej,
                                                        float                      r2,
                                                        float                      inv_r2,
                                                        float                      lje_coeff2,
                                                        float                      lje_coeff6_6,
                                                        float                      int_bit,
                                                        float*                     F_invr,
                                                        float*                     E_lj)
{
    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    const float cr2       = lje_coeff2 * r2;
    const float expmcr2   = exp(-cr2);
    const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    const float sh_mask = nbparam->sh_lj_ewald * int_bit;
    *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
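/* Illustrative helper, not used by the kernels: a hedged sketch of how the
 * lje_coeff2 and lje_coeff6_6 arguments of the LJ-PME routines above can be
 * derived from the LJ-Ewald coefficient stored in cl_nbparam_params_t, assuming
 * the usual convention implied by the names (lje_coeff2 = ewaldcoeff_lj^2 and
 * lje_coeff6_6 = ewaldcoeff_lj^6/6, pairing with ONE_SIXTH_F in the energy term).
 * The function name and float2 packing are choices made for this sketch only.
 */
gmx_opencl_inline float2 exampleLjEwaldCoeffs(const cl_nbparam_params_t* nbparam)
{
    const float lje_coeff2   = nbparam->ewaldcoeff_lj * nbparam->ewaldcoeff_lj;
    const float lje_coeff6_6 = lje_coeff2 * lje_coeff2 * lje_coeff2 * ONE_SIXTH_F;
    return (float2)(lje_coeff2, lje_coeff6_6);
}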
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2*   nbfp_comb,
                                                      const cl_nbparam_params_t* nbparam,
                                                      int                        typei,
                                                      int                        typej,
                                                      float                      r2,
                                                      float                      inv_r2,
                                                      float                      lje_coeff2,
                                                      float                      lje_coeff6_6,
                                                      float                      int_bit,
                                                      bool                       with_E_lj,
                                                      float*                     F_invr,
                                                      float*                     E_lj)
{
    /* sigma and epsilon are scaled to give 6*C6 */
    const float2 c6c12_i = nbfp_comb[typei];
    const float2 c6c12_j = nbfp_comb[typej];

    const float sigma   = c6c12_i.x + c6c12_j.x;
    const float epsilon = c6c12_i.y * c6c12_j.y;

    const float sigma2 = sigma * sigma;
    const float c6grid = epsilon * sigma2 * sigma2 * sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    const float cr2       = lje_coeff2 * r2;
    const float expmcr2   = exp(-cr2);
    const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    if (with_E_lj)
    {
        /* Shift should be applied only to real LJ pairs */
        const float sh_mask = nbparam->sh_lj_ewald * int_bit;
        *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
    }
}

/*! Interpolate Ewald coulomb force using the tabulated force values (coulomb_tab).
 *  Original idea: from the OpenMM project
 */
gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coulomb_tab, float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int)normalized;
    float fract2     = normalized - (float)index;
    float fract1     = 1.0F - fract2;

    return fract1 * coulomb_tab[index] + fract2 * coulomb_tab[index + 1];
}

/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8F;
    const float FN5 = 1.4703624142580877519e-6F;
    const float FN4 = -0.000053401640219807709149F;
    const float FN3 = 0.0010054721316683106153F;
    const float FN2 = -0.019278317264888380590F;
    const float FN1 = 0.069670166153766424023F;
    const float FN0 = -0.75225204789749321333F;

    const float FD4 = 0.0011193462567257629232F;
    const float FD3 = 0.014866955030185295499F;
    const float FD2 = 0.11583842382862377919F;
    const float FD1 = 0.50736591960530292870F;
    const float FD0 = 1.0F;

    const float z4 = z2 * z2;

    float polyFD0 = FD4 * z4 + FD2;
    float polyFD1 = FD3 * z4 + FD1;
    polyFD0       = polyFD0 * z4 + FD0;
    polyFD0       = polyFD1 * z2 + polyFD0;

    polyFD0 = 1.0F / polyFD0;

    float polyFN0 = FN6 * z4 + FN4;
    float polyFN1 = FN5 * z4 + FN3;
    polyFN0       = polyFN0 * z4 + FN2;
    polyFN1       = polyFN1 * z4 + FN1;
    polyFN0       = polyFN0 * z4 + FN0;
    polyFN0       = polyFN1 * z2 + polyFN0;

    return polyFN0 * polyFD0;
}

#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_force_j_shfl(float3 fin, __global float* fout, int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
    /* Only does reduction over 4 elements in cluster. Needs to be changed
     * for CL_SIZE>4.
     * See the CUDA implementation for the required changes. */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((tidxi & 1) == 1)
    {
        fin.x = fin.y;
    }
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up(fin.z, fin.z, 2);
    if (tidxi == 2)
    {
        fin.x = fin.z;
    }
    if (tidxi < 3)
    {
        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
    }
}
#    endif

gmx_opencl_inline void reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
{
    int tidx                      = tidxi + tidxj * CL_SIZE;
    f_buf[tidx]                   = fcj_buf.x;
    f_buf[FBUF_STRIDE + tidx]     = fcj_buf.y;
    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;

    /* Split the reduction between the first 3 column threads
       Threads with column id 0 will do the reduction for (float3).x components
       Threads with column id 1 will do the reduction for (float3).y components
       Threads with column id 2 will do the reduction for (float3).z components.
       The reduction is performed for each line tidxj of f_buf. */
    if (tidxi < 3)
    {
        float f = 0.0F;
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f += f_buf[FBUF_STRIDE * tidxi + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
    }
}

/*! Final j-force reduction */
gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
{
#    if REDUCE_SHUFFLE
    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
#    else
    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
#    endif
}

#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_force_i_and_shift_shfl(__private fvec  fci_buf[],
                                                     __global float* fout,
                                                     bool            bCalcFshift,
                                                     int             tidxi,
                                                     int             tidxj,
                                                     int             sci,
                                                     int             shift,
                                                     __global float* fshift)
{
    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
     * for CL_SIZE>4.*/
    float2 fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
    {
        int    aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
        float3 fin  = (float3)(fci_buf[ci_offset][0], fci_buf[ci_offset][1], fci_buf[ci_offset][2]);
        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
        fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);

        if (tidxj & 1)
        {
            fin.x = fin.y;
        }

        /* Threads 0,1 and 2,3 increment x,y for their warp */
        atomicAdd_g_f(&fout[3 * aidx + (tidxj & 1)], fin.x);
        if (bCalcFshift)
        {
            fshift_buf[0] += fin.x;
        }

        /* Threads 0 and 2 increment z for their warp */
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&fout[3 * aidx + 2], fin.z);
            if (bCalcFshift)
            {
                fshift_buf[1] += fin.z;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        // Threads 0,1 and 2,3 update x,y
        atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);

        // Threads 0 and 2 update z
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
        }
    }
}
#    endif

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
                                                     __private fvec          fci_buf[],
                                                     __global float*         fout,
                                                     bool                    bCalcFshift,
                                                     int                     tidxi,
                                                     int                     tidxj,
                                                     int                     sci,
                                                     int                     shift,
                                                     __global float*         fshift)
{
    float fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
    {
        int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
        int tidx = tidxi + tidxj * CL_SIZE;
        /* store i forces in shmem */
        f_buf[tidx]                   = fci_buf[ci_offset][0];
        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset][1];
        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset][2];
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Reduce the initial CL_SIZE values for each i atom to half
         * every step by using CL_SIZE * i threads.
         * Can't just use i as loop variable because then nvcc refuses to unroll.
         */
        int i = CL_SIZE / 2;
        for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
        {
            if (tidxj < i)
            {
                f_buf[tidxj * CL_SIZE + tidxi] += f_buf[(tidxj + i) * CL_SIZE + tidxi];
                f_buf[FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            }
            i >>= 1;
        }
        /* needed because
         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
         */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* i == 1, last reduction step, writing to global mem */
        /* Split the reduction between the first 3 line threads
           Threads with line id 0 will do the reduction for (float3).x components
           Threads with line id 1 will do the reduction for (float3).y components
           Threads with line id 2 will do the reduction for (float3).z components. */
        if (tidxj < 3)
        {
            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);

            if (bCalcFshift)
            {
                fshift_buf += f;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        /* Only threads with tidxj < 3 will update fshift.
           The threads performing the update must be the same as the threads
           storing the reduction result above. */
        if (tidxj < 3)
        {
            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
        }
    }
}

/*! Final i-force reduction */
gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
                                                __private fvec            fci_buf[],
                                                __global float*           f,
                                                bool                      bCalcFshift,
                                                int                       tidxi,
                                                int                       tidxj,
                                                int                       sci,
                                                int                       shift,
                                                __global float*           fshift)
{
#    if REDUCE_SHUFFLE
    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
#    else
    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
#    endif
}

#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_energy_shfl(float                    E_lj,
                                          float                    E_el,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          int                      tidx)
{
    E_lj = sub_group_reduce_add(E_lj);
    E_el = sub_group_reduce_add(E_el);
    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd_g_f(e_lj, E_lj);
        atomicAdd_g_f(e_el, E_el);
    }
}
#    endif

/*!
 * Energy reduction; this implementation works only with power of two
 * array sizes.
 */
gmx_opencl_inline void reduce_energy_pow2(volatile __local float*  buf,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          int                      tidx)
{
    int i = WARP_SIZE / 2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (int j = WARP_SIZE_LOG2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[tidx] += buf[tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        float e1 = buf[tidx] + buf[tidx + i];
        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd_g_f(e_lj, e1);
        atomicAdd_g_f(e_el, e2);
    }
}

gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf,
                                     float                              E_lj,
                                     float                              E_el,
                                     volatile __global float*           e_lj,
                                     volatile __global float*           e_el,
                                     int                                tidx)
{
#    if REDUCE_SHUFFLE
    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
#    else
    /* flush the energies to shmem and reduce them */
    buf[tidx]               = E_lj;
    buf[FBUF_STRIDE + tidx] = E_el;
    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
#    endif
}

gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
{
    if (pred)
    {
        warp_any[widx] = 1;
    }

    bool ret = warp_any[widx];

    warp_any[widx] = 0;

    return ret;
}

//! Returns true if the predicate is true for any work item in the warp
gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
{
#    if USE_SUBGROUP_ANY
    return sub_group_any(pred);
#    else
    return gmx_sub_group_any_localmem(warp_any, widx, pred);
#    endif
}

#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
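/* Usage sketch (illustrative, not part of the original header): a per-thread
 * predicate such as "r2 < rlistOuter_sq" is typically fed to gmx_sub_group_any()
 * so that the whole warp agrees on whether, e.g., a j-cluster can be skipped:
 *
 *     if (!gmx_sub_group_any(warp_any, widx, r2 < rlistOuter_sq))
 *     {
 *         // no work item of this warp has a pair in range; drop the j-cluster
 *     }
 *
 * The local-memory fallback assumes warp_any[widx] is zero on entry and that all
 * work items of the warp call the function; the flag is reset before returning,
 * so the slot can be reused for the next vote.
 */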