/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief Implements the PME OpenCL force gathering kernel.
 * When including this and other PME OpenCL kernel files, plenty of common
 * constants/macros are expected to be defined (such as "order", which is the PME
 * interpolation order). For details, please see how pme_program.cl is compiled
 * in pme_gpu_program_impl_ocl.cpp.
 *
 * This file's kernels specifically expect the following definitions:
 *
 * - atomsPerBlock, which expresses how many atoms are processed by a single work group
 * - order, which is the PME interpolation order
 * - wrapX and wrapY, which must evaluate to either true or false to specify whether
 *   the grid overlap in dimension X/Y is to be used
 *
 * \author Aleksei Iupinov
 */

#include "pme_gpu_calculate_splines.clh"
#include "pme_gpu_types.h"
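
// Illustrative sketch only: this header does not define the constants listed above
// itself. In the real build, pme_gpu_program_impl_ocl.cpp composes them at runtime
// and passes them to clBuildProgram when compiling pme_program.cl. The values below
// are made-up examples of the kind of option string involved, not the actual ones:
//
//   -Dorder=4 -DatomsPerBlock=8 -DwrapX=true -DwrapY=true -Dwarp_size=32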

#ifndef COMPILE_GATHER_HELPERS_ONCE
#    define COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
        default: assert(false); break;
    }
    assert(false);
    return 0.0F;
}

/*! \brief Reduce the partial force contributions.
 *
 * FIXME: this reduction should be simplified and improved; it does a 3x16 force component
 *        reduction per 16 threads, so no extra shared memory should be needed for
 *        intermediates or for passing results back.
 *
 * \param[out]    sm_forces         Local memory array with the output forces (rvec).
 * \param[in]     atomIndexLocal    Local atom index
 * \param[in]     splineIndex       Spline index
 * \param[in]     lineIndex         Line index (same as threadLocalId)
 * \param[in]     realGridSizeFP    Local grid size constant
 * \param[in]     fx                Input force partial component X
 * \param[in]     fy                Input force partial component Y
 * \param[in]     fz                Input force partial component Z
 * \param[in,out] sm_forceReduction Reduction working buffer
 * \param[in]     sm_forceTemp      Convenience pointers into \p sm_forceReduction
 */
inline void reduce_atom_forces(__local float* __restrict__ sm_forces,
                               const int    atomIndexLocal,
                               const int    splineIndex,
                               const int    lineIndex,
                               const float* realGridSizeFP,
                               float        fx,
                               float        fy,
                               float        fz,
                               __local float* __restrict__ sm_forceReduction,
                               __local float** __restrict__ sm_forceTemp)
{
    // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514

    /* Number of data components and threads for a single atom */
#    define atomDataSize threadsPerAtom
    /* We use blockSize local memory elements to read fx, or fy, or fz, and then reduce
     * them to fit into smemPerDim elements. All those sizes are defines and not consts,
     * because they go into the local memory array size.
     */
#    define blockSize (atomsPerBlock * atomDataSize)
#    define smemPerDim warp_size
#    define smemReserved (DIM * smemPerDim)

    const int numWarps  = blockSize / smemPerDim;
    const int minStride = max(1, atomDataSize / numWarps);

#    pragma unroll DIM
    for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
    {
        int elementIndex = smemReserved + lineIndex;
        // Store input force contributions
        sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
#    if (warp_size < 48)
        // sync here when exec width is smaller than the size of the sm_forceReduction
        // buffer flushed to local mem above (size 3*16) as different warps will consume
        // the data below.
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif

        // Reduce to fit into smemPerDim (warp size)
#    pragma unroll
        for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
        {
            if (splineIndex < redStride)
            {
                sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Last iteration - packing everything to be nearby, storing convenience pointer
        sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
        int redStride          = minStride;
        if (splineIndex < redStride)
        {
            const int packedIndex = atomIndexLocal * redStride + splineIndex;
            sm_forceTemp[dimIndex][packedIndex] =
                    sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
        }

        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
#    if (warp_size < 64)
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif
    }

#    if (warp_size >= 64)
    barrier(CLK_LOCAL_MEM_FENCE);
#    endif

    assert((blockSize / warp_size) >= DIM);

    const int warpIndex = lineIndex / warp_size;
    const int dimIndex  = warpIndex;

    // First 3 warps can now process 1 dimension each
    if (dimIndex < DIM)
    {
        const int sourceIndex = lineIndex % warp_size;
#    pragma unroll
        for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
        {
            if (!(splineIndex & redStride))
            {
                sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
            }
        }

        const float n = read_grid_size(realGridSizeFP, dimIndex);

        const int atomIndex = sourceIndex / minStride;
        if (sourceIndex == minStride * atomIndex)
        {
            sm_forces[atomIndex * DIM + dimIndex] =
                    (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
        }
    }
}
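
// A worked example of the reduction above, under assumed (not authoritative) sizes:
// order = 4, threadsPerAtom = 16, atomsPerBlock = 8, warp_size = 32. Then:
//   blockSize  = 8 * 16 = 128 threads,
//   smemPerDim = 32, smemReserved = 3 * 32 = 96 floats,
//   numWarps   = 128 / 32 = 4, minStride = max(1, 16 / 4) = 4.
// Per dimension, each atom's 16 partial contributions are reduced 16 -> 8 in the
// strided loop, then 8 -> 4 in the packing step (leaving warp_size contiguous floats
// per dimension), and the first DIM warps finish 4 -> 2 -> 1, scaling the result by
// the grid size before writing one float per atom per dimension into sm_forces.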

/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \param[out] fx              The force partial component in the X dimension.
 * \param[out] fy              The force partial component in the Y dimension.
 * \param[out] fz              The force partial component in the Z dimension.
 * \param[in]  ixBase          The grid line index base value in the X dimension.
 * \param[in]  nx              The grid real size in the X dimension.
 * \param[in]  pny             The padded grid real size in the Y dimension.
 * \param[in]  pnz             The padded grid real size in the Z dimension.
 * \param[in]  constOffset     The offset to calculate the global grid index.
 * \param[in]  splineIndexBase The base value of the spline parameter index.
 * \param[in]  tdy             The theta and dtheta in the Y dimension.
 * \param[in]  tdz             The theta and dtheta in the Z dimension.
 * \param[in]  sm_splineParams Shared memory array of spline parameters.
 * \param[in]  gm_grid         Global memory array of the grid to use.
 */
inline void sumForceComponents(float*       fx,
                               float*       fy,
                               float*       fz,
                               const int    ixBase,
                               const int    nx,
                               const int    pny,
                               const int    pnz,
                               const int    constOffset,
                               const int    splineIndexBase,
                               const float2 tdy,
                               const float2 tdz,
                               __local const float2* __restrict__ sm_splineParams,
                               __global const float* __restrict__ gm_grid)
{
#    pragma unroll order
    for (int ithx = 0; (ithx < order); ithx++)
    {
        int ix = ixBase + ithx;
        if (wrapX & (ix >= nx))
        {
            ix -= nx;
        }
        const int gridIndexGlobal = ix * pny * pnz + constOffset;
        assert(gridIndexGlobal >= 0);
        const float gridValue = gm_grid[gridIndexGlobal];
        assert(isfinite(gridValue));

        const int    splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
        const float2 tdx          = sm_splineParams[splineIndexX];
        const float  fxy1         = tdz.x * gridValue;
        const float  fz1          = tdz.y * gridValue;
        *fx += tdx.y * tdy.x * fxy1;
        *fy += tdx.x * tdy.y * fxy1;
        *fz += tdx.x * tdy.x * fz1;
    }
}
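
// What the loop above computes, in formula form: with Q the grid value and the
// (theta, dtheta) spline value/derivative pairs stored as (.x, .y), each iteration
// accumulates, for one grid point of the order^3 stencil,
//   fx += dtheta_x * theta_y  * theta_z  * Q
//   fy += theta_x  * dtheta_y * theta_z  * Q
//   fz += theta_x  * theta_y  * dtheta_z * Q
// i.e. the derivative of the interpolation weight along each dimension times the
// grid value. Only X is looped here; the Y/Z factors (tdy, tdz) are fixed per
// calling thread.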

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces        Shared memory array with the output forces.
 * \param[in]     forceIndexLocal  The local (per thread) index in the sm_forces array.
 * \param[in]     forceIndexGlobal The index of the thread in the gm_coefficients array.
 * \param[in]     recipBox         The reciprocal box.
 * \param[in]     scale            The scale to use when calculating the forces (only
 *                                 relevant when using two grids).
 * \param[in]     gm_coefficients  Global memory array of the coefficients to use.
 */
inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces,
                                        const int   forceIndexLocal,
                                        const int   forceIndexGlobal,
                                        const float recipBox[DIM][DIM],
                                        const float scale,
                                        __global const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    vstore3(result, forceIndexLocal, sm_forces);
}
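
// Note on the transform above: recipBox is upper triangular, so converting the
// grid-space gradient (fx, fy, fz) to Cartesian forces needs only the terms kept
// above. Written out, with q the (scaled) coefficient and R the reciprocal box:
//   F_x = -q *  R[XX][XX] * fx
//   F_y = -q * (R[XX][YY] * fx + R[YY][YY] * fy)
//   F_z = -q * (R[XX][ZZ] * fx + R[YY][ZZ] * fy + R[ZZ][ZZ] * fz)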

#endif // COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * An OpenCL kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 * Please see the file description for additional defines which this kernel expects.
 *
 * \param[in]     kernelParams       All the PME GPU data.
 * \param[in]     gm_coefficientsA   Atom charges/coefficients in the unperturbed state,
 *                                   or FEP state A.
 * \param[in]     gm_coefficientsB   Atom charges/coefficients in FEP state B. Only used
 *                                   when two sets of coefficients are used, one on each
 *                                   of two separate grids.
 * \param[in]     gm_gridA           Global 3D grid for the unperturbed state, FEP state A,
 *                                   or the single grid used for interpolated coefficients
 *                                   on one grid in FEP A/B.
 * \param[in]     gm_gridB           Global 3D grid for FEP state B when using dual grids
 *                                   (when calculating energy and virials).
 * \param[in]     gm_theta           Atom spline parameter values
 * \param[in]     gm_dtheta          Atom spline parameter derivatives
 * \param[in]     gm_gridlineIndices Atom gridline indices (ivec)
 * \param[in,out] gm_forces          Atom forces (rvec)
 */
__attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME(
        pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
                           __global const float* __restrict__ gm_coefficientsA,
                           __global const float* __restrict__ gm_coefficientsB,
                           __global const float* __restrict__ gm_gridA,
                           __global const float* __restrict__ gm_gridB,
                           __global const float* __restrict__ gm_theta,
                           __global const float* __restrict__ gm_dtheta,
                           __global const int* __restrict__ gm_gridlineIndices,
                           __global float* __restrict__ gm_forces)
{
    assert(numGrids == 1 || numGrids == 2);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = get_local_id(ZZ);
    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Some sizes which are defines and not consts because they go into the array size */
#define blockSize (atomsPerBlock * atomDataSize)
    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
#define smemPerDim warp_size
#define smemReserved (DIM * smemPerDim)
#define totalSharedMemory (smemReserved + blockSize)
#define gridlineIndicesSize (atomsPerBlock * DIM)
#define splineParamsSize (atomsPerBlock * DIM * order)

    __local int    sm_gridlineIndices[gridlineIndicesSize];
    __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */

    /* Spline Y/Z coordinates */
    const int ithy = get_local_id(YY);
    const int ithz = get_local_id(XX);

    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                   + get_local_id(0)
           <= MAX_INT);
    const int threadLocalId =
            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                  + get_local_id(0));

    /* These are the spline contribution indices in shared memory */
    assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
    const int splineIndex = (int)(get_local_id(1) * get_local_size(0)
                                  + get_local_id(0)); /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex   = threadLocalId;            /* And to all the block's particles */
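
    // Index layout illustration, assuming order = 4 (so 16 threads per atom; other
    // orders follow the same pattern): the work group is (order, order, atomsPerBlock),
    // hence ithz = get_local_id(XX) and ithy = get_local_id(YY) each run 0..3,
    // splineIndex = ithy * 4 + ithz runs 0..15 within one atom, and lineIndex
    // additionally offsets by 16 * atomIndexLocal across the block.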

    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex = threadLocalId;
    const int globalGridlineIndicesIndex =
            (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
    if (localGridlineIndicesIndex < gridlineIndicesSize)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex = threadLocalId;
    const int globalSplineParamsIndex =
            (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
    if (localSplineParamsIndex < splineParamsSize)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float fx = 0.0F;
    float fy = 0.0F;
    float fz = 0.0F;

    int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
    const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
    const float2 tdy             = sm_splineParams[splineIndexY];
    const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
    const float2 tdz             = sm_splineParams[splineIndexZ];

    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
    if (wrapY & (iy >= ny))
    {
        iy -= ny;
    }

    int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    if (iz >= nz)
    {
        iz -= nz;
    }
    const int constOffset = iy * pnz + iz;

    if (chargeCheck)
    {
        sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy,
                           tdz, sm_splineParams, gm_gridA);
    }

    // Reduction of partial force contributions
    __local float  sm_forces[atomsPerBlock * DIM];
    __local float  sm_forceReduction[totalSharedMemory];
    __local float* sm_forceTemp[DIM];
    reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex,
                       kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp);
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
                                    kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
     * __syncwarp() in CUDA. #2519
     */
    barrier(CLK_LOCAL_MEM_FENCE);
#endif

    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            const int   outputIndexLocal     = i * iterThreads + threadLocalId;
            const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
            const float outputForceComponent = sm_forces[outputIndexLocal];
            gm_forces[outputIndexGlobal]     = outputForceComponent;
        }
    }

    if (numGrids == 2)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        fx          = 0.0F;
        fy          = 0.0F;
        fz          = 0.0F;
        chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase,
                               tdy, tdz, sm_splineParams, gm_gridB);
        }
        reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex,
                           kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp);
        barrier(CLK_LOCAL_MEM_FENCE);
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
                                        kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB);
        }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
         * __syncwarp() in CUDA. #2519
         */
        barrier(CLK_LOCAL_MEM_FENCE);
#endif

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                const int   outputIndexLocal     = i * iterThreads + threadLocalId;
                const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
                const float outputForceComponent = sm_forces[outputIndexLocal];
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}
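
// A rough host-side sketch (hypothetical variable names; the real launch logic lives
// in the PME GPU host code, with the compilation side in pme_gpu_program_impl_ocl.cpp):
// the kernel requires a (order, order, atomsPerBlock) work group, with one group per
// block of atoms along dimension X, e.g.
//
//   const size_t local[3]  = { order, order, atomsPerBlock };
//   const size_t global[3] = { order * numAtomBlocks, order, atomsPerBlock };
//   clEnqueueNDRangeKernel(queue, gatherKernel, 3, NULL, global, local, 0, NULL, NULL);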