/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief
 * Utility constants and function declarations for the OpenCL non-bonded kernels.
 * This header should be included once at the top level, just before the
 * kernels are included (has to be preceded by nbnxn_ocl_types.h).
 *
 * \author Szilárd Páll
 * \ingroup module_nbnxm
 */

#define GMX_DOUBLE 0

#include "gromacs/gpu_utils/device_utils.clh"
#include "gromacs/gpu_utils/vectype_ops.clh"

#include "nbnxm_ocl_consts.h"

#define CL_SIZE (c_nbnxnGpuClusterSize)
#define WARP_SIZE (CL_SIZE * CL_SIZE / 2) // Currently only c_nbnxnGpuClusterpairSplit=2 supported
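/* Illustrative note (not part of the original header): with the current
 * c_nbnxnGpuClusterpairSplit == 2 layout, the CL_SIZE x CL_SIZE work items of a
 * cluster pair are split over two execution "warps" of WARP_SIZE work items each,
 * i.e. CL_SIZE == 8 gives WARP_SIZE == 32 and CL_SIZE == 4 gives WARP_SIZE == 8.
 */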
#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
 * Note that this should precede the kernel_utils include.
 */
#    define USE_CJ_PREFETCH 1
#else
#    define USE_CJ_PREFETCH 0
#endif

#if defined cl_intel_subgroups || defined cl_khr_subgroups \
        || (defined __OPENCL_VERSION__ && __OPENCL_VERSION__ >= 210)
#    define HAVE_SUBGROUP 1
#else
#    define HAVE_SUBGROUP 0
#endif

#ifdef cl_intel_subgroups
#    define HAVE_INTEL_SUBGROUP 1
#else
#    define HAVE_INTEL_SUBGROUP 0
#endif

#if defined _INTEL_SOURCE_
#    define SUBGROUP_SIZE 8
#elif defined _AMD_SOURCE_
#    define SUBGROUP_SIZE 64
#else
#    define SUBGROUP_SIZE 32
#endif

#define REDUCE_SHUFFLE (HAVE_INTEL_SUBGROUP && CL_SIZE == 4 && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_ANY (HAVE_SUBGROUP && SUBGROUP_SIZE == WARP_SIZE)
#define USE_SUBGROUP_PRELOAD HAVE_INTEL_SUBGROUP

/* 1.0 / sqrt(M_PI) */
#define M_FLOAT_1_SQRTPI 0.564189583547756F

//-------------------

#ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
#    define NBNXN_OPENCL_KERNEL_UTILS_CLH

#    if CL_SIZE == 8
#        define WARP_SIZE_LOG2 (5)
#        define CL_SIZE_LOG2 (3)
#    elif CL_SIZE == 4
#        define WARP_SIZE_LOG2 (3)
#        define CL_SIZE_LOG2 (2)
#    else
#        error unsupported CL_SIZE
#    endif

#    define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#    define FBUF_STRIDE (CL_SIZE_SQ)

#    define ONE_SIXTH_F 0.16666667F
#    define ONE_TWELVETH_F 0.08333333F
#    define HALF_F 0.5F

#    ifdef __GNUC__ /* GCC, clang */
#        define gmx_unused __attribute__((unused))
#    else
#        define gmx_unused
#    endif

/*! \brief Single precision floating point short vector type (as rvec in the CPU codebase).
 *   Currently only used to avoid float3 register arrays.
 */
typedef float fvec[3];

// Data structures shared between OpenCL device code and OpenCL host code
// TODO: review, improve
// Replaced real by float for now, to avoid including any other header
typedef struct
{
    float c2;
    float c3;
    float cpot;
} shift_consts_t;

/* Used with potential switching:
 * rsw        = max(r - r_switch, 0)
 * sw         = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5
 * dsw        = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4
 * force      = force*dsw - potential*sw
 * potential *= sw
 */
typedef struct
{
    float c3;
    float c4;
    float c5;
} switch_consts_t;

// Data structure shared between the OpenCL device code and OpenCL host code
// Must not contain OpenCL objects (buffers)
typedef struct cl_nbparam_params
{
    //! type of electrostatics, takes values from #ElecType
    int elecType;
    //! type of VdW impl., takes values from #VdwType
    int vdwType;

    //! charge multiplication factor
    float epsfac;
    //! Reaction-field/plain cutoff electrostatics const.
    float c_rf;
    //! Reaction-field electrostatics constant
    float two_k_rf;
    //! Ewald/PME parameter
    float ewald_beta;
    //! Ewald/PME correction term subtracted from the direct-space potential
    float sh_ewald;
    //! LJ-Ewald/PME correction term added to the correction potential
    float sh_lj_ewald;
    //! LJ-Ewald/PME coefficient
    float ewaldcoeff_lj;

    //! Coulomb cut-off squared
    float rcoulomb_sq;

    //! VdW cut-off squared
    float rvdw_sq;
    //! VdW switched cut-off
    float rvdw_switch;
    //! Full, outer pair-list cut-off squared
    float rlistOuter_sq;
    //! Inner, dynamic pruned pair-list cut-off squared XXX: this is only needed in the pruning kernels, but for now we also pass it to the nonbondeds
    float rlistInner_sq;

    //! VdW shift dispersion constants
    shift_consts_t dispersion_shift;
    //! VdW shift repulsion constants
    shift_consts_t repulsion_shift;
    //! VdW switch constants
    switch_consts_t vdw_switch;

    /* Ewald Coulomb force table data - accessed through texture memory */
    //! table scale/spacing
    float coulomb_tab_scale;
} cl_nbparam_params_t;
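/* Illustrative helper, not used by the kernels: spells out how the switch_consts_t
 * coefficients enter the switching polynomials documented above that struct
 * (sw scales the potential, dsw enters the force). The function name and the
 * float2 packing are choices made for this sketch only.
 */
gmx_opencl_inline float2 exampleEvalVdwSwitch(const switch_consts_t sc, const float rsw)
{
    const float rsw2 = rsw * rsw;
    const float rsw3 = rsw2 * rsw;
    /* sw  = 1 + c3*rsw^3 + c4*rsw^4 + c5*rsw^5 */
    const float sw = 1.0F + (sc.c3 + (sc.c4 + sc.c5 * rsw) * rsw) * rsw3;
    /* dsw = 3*c3*rsw^2 + 4*c4*rsw^3 + 5*c5*rsw^4 (derivative of sw w.r.t. r) */
    const float dsw = (3.0F * sc.c3 + (4.0F * sc.c4 + 5.0F * sc.c5 * rsw) * rsw) * rsw2;
    return (float2)(sw, dsw);
}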
typedef struct
{
    //! i-super-cluster
    int sci;
    //! Shift vector index plus possible flags
    int shift;
    //! Start index into cj4
    int cj4_ind_start;
    //! End index into cj4
    int cj4_ind_end;
} nbnxn_sci_t;

typedef struct
{
    //! The i-cluster interactions mask for 1 warp
    unsigned int imask;
    //! Index into the exclusion array for 1 warp
    int excl_ind;
} nbnxn_im_ei_t;

typedef struct
{
    //! The 4 j-clusters
    int cj[4];
    //! The i-cluster mask data for 2 warps
    nbnxn_im_ei_t imei[2];
} nbnxn_cj4_t;

typedef struct
{
    unsigned int pair[CL_SIZE * CL_SIZE / 2];
    /* Topology exclusion interaction bits for one warp,
     * each unsigned has bits for 4*8 i clusters */
} nbnxn_excl_t;

/*! i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster bits set */
__constant const unsigned supercl_interaction_mask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);

/*! Minimum single precision threshold for r^2 to avoid r^-12 overflow. */
__constant const float c_nbnxnMinDistanceSquared = NBNXM_MIN_DISTANCE_SQUARED_VALUE_FLOAT;

gmx_opencl_inline void preloadCj4Generic(__local int*        sm_cjPreload,
                                         const __global int* gm_cj,
                                         int                 tidxi,
                                         int                 tidxj,
                                         bool gmx_unused     iMaskCond)
{
    /* Pre-load cj into shared memory */
#    if defined _AMD_SOURCE_
    // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
    if (tidxj == 0 & tidxi < c_nbnxnGpuJgroupSize)
    {
        sm_cjPreload[tidxi] = gm_cj[tidxi];
    }
#    else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;

    if ((tidxj == 0 | tidxj == c_splitClSize) & (tidxi < c_nbnxnGpuJgroupSize))
    {
        sm_cjPreload[tidxi + tidxj * c_nbnxnGpuJgroupSize / c_splitClSize] = gm_cj[tidxi];
    }
#    endif
}

#    if USE_SUBGROUP_PRELOAD
gmx_opencl_inline int preloadCj4Subgroup(const __global int* gm_cj)
{
    // loads subgroup-size # of elements (8) instead of the 4 required
    // equivalent to *cjs = *gm_cj
    return intel_sub_group_block_read((const __global uint*)gm_cj);
}
#    endif // USE_SUBGROUP_PRELOAD

#    if USE_SUBGROUP_PRELOAD
typedef size_t CjType;
#    else
typedef __local int* CjType;
#    endif

/*! \brief Preload cj4
 *
 * - For AMD we load once for a wavefront of 64 threads (on 4 threads * NTHREAD_Z)
 * - For NVIDIA once per warp (on 2x4 threads * NTHREAD_Z)
 * - For Intel(/USE_SUBGROUP_PRELOAD) loads into private memory(/register) instead of local memory
 *
 * It is the caller's responsibility to make sure that data is consumed only when
 * it's ready. This function does not call a barrier.
 */
gmx_opencl_inline void preloadCj4(CjType gmx_unused*             cjs,
                                  const __global int gmx_unused* gm_cj,
                                  int gmx_unused                 tidxi,
                                  int gmx_unused                 tidxj,
                                  bool gmx_unused                iMaskCond)
{
#    if USE_SUBGROUP_PRELOAD
    *cjs = preloadCj4Subgroup(gm_cj);
#    elif USE_CJ_PREFETCH
    preloadCj4Generic(*cjs, gm_cj, tidxi, tidxj, iMaskCond);
#    else
    // nothing to do
#    endif
}

gmx_opencl_inline int loadCjPreload(__local int* sm_cjPreload, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#    if defined _AMD_SOURCE_
    int warpLoadOffset = 0; // TODO: fix by setting c_nbnxnGpuClusterpairSplit properly
#    else
    const int c_clSize                   = CL_SIZE;
    const int c_nbnxnGpuClusterpairSplit = 2;
    const int c_splitClSize              = c_clSize / c_nbnxnGpuClusterpairSplit;

    int warpLoadOffset = (tidxj & c_splitClSize) * c_nbnxnGpuJgroupSize / c_splitClSize;
#    endif
    return sm_cjPreload[jm + warpLoadOffset];
}
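/* Illustrative helpers, not used by the kernels: a sketch of how the per-warp
 * interaction mask in nbnxn_im_ei_t (see the structures above) is typically
 * interpreted, assuming the pair-list bit layout in which each j-cluster jm of a
 * cj4 group owns a contiguous field of c_nbnxnGpuNumClusterPerSupercluster bits,
 * one bit per i-cluster of the super-cluster. The helper names are specific to
 * this sketch.
 */
gmx_opencl_inline bool exampleAnyPairForJCluster(const unsigned int imask, const int jm)
{
    /* True when j-cluster jm interacts with at least one i-cluster. */
    return (imask & (supercl_interaction_mask << (jm * c_nbnxnGpuNumClusterPerSupercluster))) != 0U;
}

gmx_opencl_inline bool examplePairBitSet(const unsigned int imask, const int jm, const int ci_offset)
{
    /* True when the (i-cluster ci_offset, j-cluster jm) pair is flagged as interacting. */
    return (imask & (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster + ci_offset))) != 0U;
}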
/*! \brief Load a cj given a jm index.
 *
 * If cj4 preloading is enabled, it loads from the local memory, otherwise from global.
 */
gmx_opencl_inline int
loadCj(CjType cjs, const __global int gmx_unused* gm_cj, int jm, int gmx_unused tidxi, int gmx_unused tidxj)
{
#    if USE_SUBGROUP_PRELOAD
    return sub_group_broadcast(cjs, jm);
#    elif USE_CJ_PREFETCH
    return loadCjPreload(cjs, jm, tidxi, tidxj);
#    else
    return gm_cj[jm];
#    endif
}

/*! Convert LJ sigma,epsilon parameters to C6,C12. */
gmx_opencl_inline float2 convert_sigma_epsilon_to_c6_c12(const float sigma, const float epsilon)
{
    const float  sigma2 = sigma * sigma;
    const float  sigma6 = sigma2 * sigma2 * sigma2;
    const float  c6     = epsilon * sigma6;
    const float2 c6c12  = (float2)(c6,           /* c6 */
                                   c6 * sigma6); /* c12 */
    return c6c12;
}

/*! Apply force switch, force-only version. */
gmx_opencl_inline void calculate_force_switch_F(const cl_nbparam_params_t* nbparam,
                                                float                      c6,
                                                float                      c12,
                                                float                      inv_r,
                                                float                      r2,
                                                float*                     F_invr)
{
    /* force switch constants */
    const float disp_shift_V2 = nbparam->dispersion_shift.c2;
    const float disp_shift_V3 = nbparam->dispersion_shift.c3;
    const float repu_shift_V2 = nbparam->repulsion_shift.c2;
    const float repu_shift_V3 = nbparam->repulsion_shift.c3;

    const float r        = r2 * inv_r;
    float       r_switch = r - nbparam->rvdw_switch;
    r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
}

/*! Apply force switch, force + energy version. */
gmx_opencl_inline void calculate_force_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                  float                      c6,
                                                  float                      c12,
                                                  float                      inv_r,
                                                  float                      r2,
                                                  float*                     F_invr,
                                                  float*                     E_lj)
{
    /* force switch constants */
    const float disp_shift_V2 = nbparam->dispersion_shift.c2;
    const float disp_shift_V3 = nbparam->dispersion_shift.c3;
    const float repu_shift_V2 = nbparam->repulsion_shift.c2;
    const float repu_shift_V3 = nbparam->repulsion_shift.c3;

    const float disp_shift_F2 = nbparam->dispersion_shift.c2 / 3;
    const float disp_shift_F3 = nbparam->dispersion_shift.c3 / 4;
    const float repu_shift_F2 = nbparam->repulsion_shift.c2 / 3;
    const float repu_shift_F3 = nbparam->repulsion_shift.c3 / 4;

    const float r        = r2 * inv_r;
    float       r_switch = r - nbparam->rvdw_switch;
    r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;

    *F_invr += -c6 * (disp_shift_V2 + disp_shift_V3 * r_switch) * r_switch * r_switch * inv_r
               + c12 * (repu_shift_V2 + repu_shift_V3 * r_switch) * r_switch * r_switch * inv_r;
    *E_lj += c6 * (disp_shift_F2 + disp_shift_F3 * r_switch) * r_switch * r_switch * r_switch
             - c12 * (repu_shift_F2 + repu_shift_F3 * r_switch) * r_switch * r_switch * r_switch;
}

/*! Apply potential switch, force-only version. */
gmx_opencl_inline void calculate_potential_switch_F(const cl_nbparam_params_t* nbparam,
                                                    float                      inv_r,
                                                    float                      r2,
                                                    float*                     F_invr,
                                                    const float*               E_lj)
{
    /* potential switch constants */
    const float switch_V3 = nbparam->vdw_switch.c3;
    const float switch_V4 = nbparam->vdw_switch.c4;
    const float switch_V5 = nbparam->vdw_switch.c5;
    /* derivative coefficients, see the dsw formula documented above switch_consts_t */
    const float switch_F2 = 3 * nbparam->vdw_switch.c3;
    const float switch_F3 = 4 * nbparam->vdw_switch.c4;
    const float switch_F4 = 5 * nbparam->vdw_switch.c5;

    const float r        = r2 * inv_r;
    const float r_switch = r - nbparam->rvdw_switch;

    /* Unlike in the F+E kernel, conditional is faster here */
    if (r_switch > 0.0F)
    {
        const float sw = 1.0F
                         + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
                                   * r_switch * r_switch;
        const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

        *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    }
}
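/* Illustrative usage, not part of the original header: a minimal sketch of the
 * preload/consume pattern for cj data. As documented above, preloadCj4() does
 * not issue a barrier; a work-group barrier is the simplest (if conservative)
 * way to satisfy that contract when the generic local-memory prefetch is used.
 * The production kernels may rely on finer-grained, warp-level synchronization.
 * The function name is specific to this sketch.
 */
gmx_opencl_inline int exampleLoadAllCj(CjType cjs, const __global int* gm_cj, int tidxi, int tidxj)
{
    preloadCj4(&cjs, gm_cj, tidxi, tidxj, true);
#    if USE_CJ_PREFETCH
    /* Make the preloaded j-cluster indices visible to the whole work-group
     * before they are read back via loadCj(). */
    barrier(CLK_LOCAL_MEM_FENCE);
#    endif
    int cjSum = 0;
    for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
    {
        /* A real kernel would process j-cluster loadCj(...) here; we just sum the indices. */
        cjSum += loadCj(cjs, gm_cj, jm, tidxi, tidxj);
    }
    return cjSum;
}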
/*! Apply potential switch, force + energy version. */
gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* nbparam,
                                                      float                      inv_r,
                                                      float                      r2,
                                                      float*                     F_invr,
                                                      float*                     E_lj)
{
    /* potential switch constants */
    const float switch_V3 = nbparam->vdw_switch.c3;
    const float switch_V4 = nbparam->vdw_switch.c4;
    const float switch_V5 = nbparam->vdw_switch.c5;
    /* derivative coefficients, see the dsw formula documented above switch_consts_t */
    const float switch_F2 = 3 * nbparam->vdw_switch.c3;
    const float switch_F3 = 4 * nbparam->vdw_switch.c4;
    const float switch_F4 = 5 * nbparam->vdw_switch.c5;

    const float r        = r2 * inv_r;
    float       r_switch = r - nbparam->rvdw_switch;
    r_switch             = r_switch >= 0.0F ? r_switch : 0.0F;

    /* Unlike in the F-only kernel, masking is faster here */
    const float sw = 1.0F
                     + (switch_V3 + (switch_V4 + switch_V5 * r_switch) * r_switch) * r_switch
                               * r_switch * r_switch;
    const float dsw = (switch_F2 + (switch_F3 + switch_F4 * r_switch) * r_switch) * r_switch * r_switch;

    *F_invr = (*F_invr) * sw - inv_r * (*E_lj) * dsw;
    *E_lj *= sw;
}

/*! \brief Fetch C6 grid contribution coefficients and return the product of these. */
gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej)
{
    const float c6_i = nbfp_comb[typei].x;
    const float c6_j = nbfp_comb[typej].x;
    return c6_i * c6_j;
}

/*! Calculate LJ-PME grid force contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb,
                                                      int                      typei,
                                                      int                      typej,
                                                      float                    r2,
                                                      float                    inv_r2,
                                                      float                    lje_coeff2,
                                                      float                    lje_coeff6_6,
                                                      float*                   F_invr)
{
    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    const float cr2       = lje_coeff2 * r2;
    const float expmcr2   = exp(-cr2);
    const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;
}

/*! Calculate LJ-PME grid force + energy contribution with
 *  geometric combination rule.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2*   nbfp_comb,
                                                        const cl_nbparam_params_t* nbparam,
                                                        int                        typei,
                                                        int                        typej,
                                                        float                      r2,
                                                        float                      inv_r2,
                                                        float                      lje_coeff2,
                                                        float                      lje_coeff6_6,
                                                        float                      int_bit,
                                                        float*                     F_invr,
                                                        float*                     E_lj)
{
    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);

    /* Recalculate inv_r6 without exclusion mask */
    const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    const float cr2       = lje_coeff2 * r2;
    const float expmcr2   = exp(-cr2);
    const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    /* Shift should be applied only to real LJ pairs */
    const float sh_mask = nbparam->sh_lj_ewald * int_bit;
    *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
}
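/* Illustrative helper, not used by the kernels: a hedged sketch of how the
 * lje_coeff2 and lje_coeff6_6 arguments of the LJ-PME routines above can be
 * derived from the LJ-Ewald coefficient stored in cl_nbparam_params_t, assuming
 * the usual convention implied by the names (lje_coeff2 = ewaldcoeff_lj^2 and
 * lje_coeff6_6 = ewaldcoeff_lj^6/6, pairing with ONE_SIXTH_F in the energy term).
 * The function name and float2 packing are choices made for this sketch only.
 */
gmx_opencl_inline float2 exampleLjEwaldCoeffs(const cl_nbparam_params_t* nbparam)
{
    const float lje_coeff2   = nbparam->ewaldcoeff_lj * nbparam->ewaldcoeff_lj;
    const float lje_coeff6_6 = lje_coeff2 * lje_coeff2 * lje_coeff2 * ONE_SIXTH_F;
    return (float2)(lje_coeff2, lje_coeff6_6);
}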
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 *  Lorentz-Berthelot combination rule.
 *  We use a single F+E kernel with conditional because the performance impact
 *  of this is pretty small and LB on the CPU is anyway very slow.
 */
gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2*   nbfp_comb,
                                                      const cl_nbparam_params_t* nbparam,
                                                      int                        typei,
                                                      int                        typej,
                                                      float                      r2,
                                                      float                      inv_r2,
                                                      float                      lje_coeff2,
                                                      float                      lje_coeff6_6,
                                                      float                      int_bit,
                                                      bool                       with_E_lj,
                                                      float*                     F_invr,
                                                      float*                     E_lj)
{
    /* sigma and epsilon are scaled to give 6*C6 */
    const float2 c6c12_i = nbfp_comb[typei];
    const float2 c6c12_j = nbfp_comb[typej];

    const float sigma   = c6c12_i.x + c6c12_j.x;
    const float epsilon = c6c12_i.y * c6c12_j.y;

    const float sigma2 = sigma * sigma;
    const float c6grid = epsilon * sigma2 * sigma2 * sigma2;

    /* Recalculate inv_r6 without exclusion mask */
    const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
    const float cr2       = lje_coeff2 * r2;
    const float expmcr2   = exp(-cr2);
    const float poly      = 1.0F + cr2 + HALF_F * cr2 * cr2;

    /* Subtract the grid force from the total LJ force */
    *F_invr += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * inv_r2;

    if (with_E_lj)
    {
        /* Shift should be applied only to real LJ pairs */
        const float sh_mask = nbparam->sh_lj_ewald * int_bit;
        *E_lj += ONE_SIXTH_F * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
    }
}

/*! Interpolate Ewald coulomb force using the tabulated force values (coulomb_tab).
 *  Original idea: from the OpenMM project
 */
gmx_opencl_inline float interpolate_coulomb_force_r(__constant const float* coulomb_tab, float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int)normalized;
    float fract2     = normalized - (float)index;
    float fract1     = 1.0F - fract2;

    return fract1 * coulomb_tab[index] + fract2 * coulomb_tab[index + 1];
}

/*! Calculate analytical Ewald correction term. */
gmx_opencl_inline float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8F;
    const float FN5 = 1.4703624142580877519e-6F;
    const float FN4 = -0.000053401640219807709149F;
    const float FN3 = 0.0010054721316683106153F;
    const float FN2 = -0.019278317264888380590F;
    const float FN1 = 0.069670166153766424023F;
    const float FN0 = -0.75225204789749321333F;

    const float FD4 = 0.0011193462567257629232F;
    const float FD3 = 0.014866955030185295499F;
    const float FD2 = 0.11583842382862377919F;
    const float FD1 = 0.50736591960530292870F;
    const float FD0 = 1.0F;

    const float z4 = z2 * z2;

    float polyFD0 = FD4 * z4 + FD2;
    float polyFD1 = FD3 * z4 + FD1;
    polyFD0       = polyFD0 * z4 + FD0;
    polyFD0       = polyFD1 * z2 + polyFD0;

    polyFD0 = 1.0F / polyFD0;

    float polyFN0 = FN6 * z4 + FN4;
    float polyFN1 = FN5 * z4 + FN3;
    polyFN0       = polyFN0 * z4 + FN2;
    polyFN1       = polyFN1 * z4 + FN1;
    polyFN0       = polyFN0 * z4 + FN0;
    polyFN0       = polyFN1 * z2 + polyFN0;

    return polyFN0 * polyFD0;
}

#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_force_j_shfl(float3 fin, __global float* fout, int gmx_unused tidxi, int gmx_unused tidxj, int aidx)
{
    /* Only does reduction over 4 elements in cluster. Needs to be changed
     * for CL_SIZE>4.
     * See the CUDA implementation for the required changes. */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((tidxi & 1) == 1)
    {
        fin.x = fin.y;
    }
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up(fin.z, fin.z, 2);
    if (tidxi == 2)
    {
        fin.x = fin.z;
    }
    if (tidxi < 3)
    {
        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
    }
}
#    endif

gmx_opencl_inline void reduce_force_j_generic(__local float* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
{
    int tidx                      = tidxi + tidxj * CL_SIZE;
    f_buf[tidx]                   = fcj_buf.x;
    f_buf[FBUF_STRIDE + tidx]     = fcj_buf.y;
    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;

    /* Split the reduction between the first 3 column threads
       Threads with column id 0 will do the reduction for (float3).x components
       Threads with column id 1 will do the reduction for (float3).y components
       Threads with column id 2 will do the reduction for (float3).z components.
       The reduction is performed for each line tidxj of f_buf. */
    if (tidxi < 3)
    {
        float f = 0.0F;
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f += f_buf[FBUF_STRIDE * tidxi + j];
        }

        atomicAdd_g_f(&fout[3 * aidx + tidxi], f);
    }
}

/*! Final j-force reduction */
gmx_opencl_inline void reduce_force_j(__local float gmx_unused* f_buf, float3 fcj_buf, __global float* fout, int tidxi, int tidxj, int aidx)
{
#    if REDUCE_SHUFFLE
    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
#    else
    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
#    endif
}

#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_force_i_and_shift_shfl(__private fvec  fci_buf[],
                                                     __global float* fout,
                                                     bool            bCalcFshift,
                                                     int             tidxi,
                                                     int             tidxj,
                                                     int             sci,
                                                     int             shift,
                                                     __global float* fshift)
{
    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
     * for CL_SIZE>4.*/
    float2 fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
    {
        int    aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
        float3 fin  = (float3)(fci_buf[ci_offset][0], fci_buf[ci_offset][1], fci_buf[ci_offset][2]);
        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
        fin.y += intel_sub_group_shuffle_up(fin.y, fin.y, CL_SIZE);
        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);

        if (tidxj & 1)
        {
            fin.x = fin.y;
        }

        /* Threads 0,1 and 2,3 increment x,y for their warp */
        atomicAdd_g_f(&fout[3 * aidx + (tidxj & 1)], fin.x);
        if (bCalcFshift)
        {
            fshift_buf[0] += fin.x;
        }

        /* Threads 0 and 2 increment z for their warp */
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&fout[3 * aidx + 2], fin.z);
            if (bCalcFshift)
            {
                fshift_buf[1] += fin.z;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        // Threads 0,1 and 2,3 update x,y
        atomicAdd_g_f(&(fshift[3 * shift + (tidxj & 1)]), fshift_buf[0]);

        // Threads 0 and 2 update z
        if ((tidxj & 1) == 0)
        {
            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
        }
    }
}
#    endif

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
gmx_opencl_inline void reduce_force_i_and_shift_pow2(volatile __local float* f_buf,
                                                     __private fvec          fci_buf[],
                                                     __global float*         fout,
                                                     bool                    bCalcFshift,
                                                     int                     tidxi,
                                                     int                     tidxj,
                                                     int                     sci,
                                                     int                     shift,
                                                     __global float*         fshift)
{
    float fshift_buf = 0;
    for (int ci_offset = 0; ci_offset < c_nbnxnGpuNumClusterPerSupercluster; ci_offset++)
    {
        int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ci_offset) * CL_SIZE + tidxi;
        int tidx = tidxi + tidxj * CL_SIZE;
        /* store i forces in shmem */
        f_buf[tidx]                   = fci_buf[ci_offset][0];
        f_buf[FBUF_STRIDE + tidx]     = fci_buf[ci_offset][1];
        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset][2];
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Reduce the initial CL_SIZE values for each i atom to half
         * every step by using CL_SIZE * i threads.
         * Can't just use i as loop variable because then nvcc refuses to unroll.
         */
        int i = CL_SIZE / 2;
        for (int j = CL_SIZE_LOG2 - 1; j > 0; j--)
        {
            if (tidxj < i)
            {
                f_buf[tidxj * CL_SIZE + tidxi] += f_buf[(tidxj + i) * CL_SIZE + tidxi];
                f_buf[FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] +=
                        f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            }
            i >>= 1;
        }
        /* needed because
         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
         */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* i == 1, last reduction step, writing to global mem */
        /* Split the reduction between the first 3 line threads
           Threads with line id 0 will do the reduction for (float3).x components
           Threads with line id 1 will do the reduction for (float3).y components
           Threads with line id 2 will do the reduction for (float3).z components. */
        if (tidxj < 3)
        {
            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);

            if (bCalcFshift)
            {
                fshift_buf += f;
            }
        }
    }
    /* add up local shift forces into global mem */
    if (bCalcFshift)
    {
        /* Only threads with tidxj < 3 will update fshift.
           The threads performing the update must be the same as the threads
           storing the reduction result above. */
        if (tidxj < 3)
        {
            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
        }
    }
}

/*! Final i-force reduction */
gmx_opencl_inline void reduce_force_i_and_shift(__local float gmx_unused* f_buf,
                                                __private fvec            fci_buf[],
                                                __global float*           f,
                                                bool                      bCalcFshift,
                                                int                       tidxi,
                                                int                       tidxj,
                                                int                       sci,
                                                int                       shift,
                                                __global float*           fshift)
{
#    if REDUCE_SHUFFLE
    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
#    else
    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, sci, shift, fshift);
#    endif
}

#    if REDUCE_SHUFFLE
gmx_opencl_inline void reduce_energy_shfl(float                    E_lj,
                                          float                    E_el,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          int                      tidx)
{
    E_lj = sub_group_reduce_add(E_lj);
    E_el = sub_group_reduce_add(E_el);
    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd_g_f(e_lj, E_lj);
        atomicAdd_g_f(e_el, E_el);
    }
}
#    endif

/*!
 * Energy reduction; this implementation works only with power of two
 * array sizes.
 */
gmx_opencl_inline void reduce_energy_pow2(volatile __local float*  buf,
                                          volatile __global float* e_lj,
                                          volatile __global float* e_el,
                                          int                      tidx)
{
    int i = WARP_SIZE / 2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
    for (int j = WARP_SIZE_LOG2 - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[tidx] += buf[tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        float e1 = buf[tidx] + buf[tidx + i];
        float e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd_g_f(e_lj, e1);
        atomicAdd_g_f(e_el, e2);
    }
}

gmx_opencl_inline void reduce_energy(volatile __local float gmx_unused* buf,
                                     float                              E_lj,
                                     float                              E_el,
                                     volatile __global float*           e_lj,
                                     volatile __global float*           e_el,
                                     int                                tidx)
{
#    if REDUCE_SHUFFLE
    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
#    else
    /* flush the energies to shmem and reduce them */
    buf[tidx]               = E_lj;
    buf[FBUF_STRIDE + tidx] = E_el;
    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
#    endif
}

gmx_opencl_inline bool gmx_sub_group_any_localmem(volatile __local int* warp_any, int widx, bool pred)
{
    if (pred)
    {
        warp_any[widx] = 1;
    }

    bool ret = warp_any[widx];

    warp_any[widx] = 0;

    return ret;
}

//! Returns true if the predicate is true for any work item in the warp
gmx_opencl_inline bool gmx_sub_group_any(volatile __local int gmx_unused* warp_any, int gmx_unused widx, bool pred)
{
#    if USE_SUBGROUP_ANY
    return sub_group_any(pred);
#    else
    return gmx_sub_group_any_localmem(warp_any, widx, pred);
#    endif
}

#endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
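/* Usage sketch (illustrative, not part of the original header): a per-thread
 * predicate such as "r2 < rlistOuter_sq" is typically fed to gmx_sub_group_any()
 * so that the whole warp agrees on whether, e.g., a j-cluster can be skipped:
 *
 *     if (!gmx_sub_group_any(warp_any, widx, r2 < rlistOuter_sq))
 *     {
 *         // no work item of this warp has a pair in range; drop the j-cluster
 *     }
 *
 * The local-memory fallback assumes warp_any[widx] is zero on entry and that all
 * work items of the warp call the function; the flag is reset before returning,
 * so the slot can be reused for the next vote.
 */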