/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief Implements PME OpenCL solving kernel.
 * When including this and other PME OpenCL kernel files, plenty of common
 * constants/macros are expected to be defined.
 * For details, please see how pme_program.cl is compiled in pme_gpu_program_impl_ocl.cpp.
 *
 * This file's solving kernel specifically expects the following definitions for its flavors:
 *
 * - gridOrdering must be defined to either XYZ or YZX
 *   and corresponds to the dimension order of the grid (GridOrdering enum in CUDA kernels);
 * - computeEnergyAndVirial must evaluate to true or false, and expresses
 *   whether the reduction is performed.
 *
 * \author Aleksei Iupinov
 */

#include "gromacs/gpu_utils/vectype_ops.clh"

#include "pme_gpu_types.h"
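
/* Illustration only: a minimal sketch of how the flavor macros described above might be
 * supplied when this file is compiled. The exact build options live in
 * pme_gpu_program_impl_ocl.cpp; the option strings below are assumptions, not the
 * authoritative values:
 *
 *   -DgridOrdering=YZX -DcomputeEnergyAndVirial=true
 *   -DgridOrdering=XYZ -DcomputeEnergyAndVirial=false
 *
 * With these defined, the gridOrdering branches in the kernel below are resolved at
 * compile time, and the energy/virial reduction is compiled in or out.
 */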

/*! \brief
 * PME complex grid solver kernel function.
 * Please see the file description for additional defines which this kernel expects.
 *
 * \param[in]     kernelParams        Input PME GPU data in constant memory.
 * \param[in]     gm_splineModuli     B-Spline moduli.
 * \param[out]    gm_virialAndEnergy  Reduced virial and energy (only with computeEnergyAndVirial == true).
 * \param[in,out] gm_grid             Fourier grid to transform.
 */
__attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1))) __kernel void
CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams,
                                         __global const float* __restrict__ gm_splineModuli,
                                         __global float* __restrict__ gm_virialAndEnergy,
                                         __global float2* __restrict__ gm_grid)
{
    /* This kernel supports 2 different grid dimension orderings: YZX and XYZ */
    int majorDim  = 0;
    int middleDim = 0;
    int minorDim  = 0;
    if (gridOrdering == YZX)
    {
        majorDim  = YY;
        middleDim = ZZ;
        minorDim  = XX;
    }
    if (gridOrdering == XYZ)
    {
        majorDim  = XX;
        middleDim = YY;
        minorDim  = ZZ;
    }

    __global const float* __restrict__ gm_splineValueMajor =
            gm_splineModuli + kernelParams.grid.splineValuesOffset[majorDim];
    __global const float* __restrict__ gm_splineValueMiddle =
            gm_splineModuli + kernelParams.grid.splineValuesOffset[middleDim];
    __global const float* __restrict__ gm_splineValueMinor =
            gm_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];

    /* Various grid sizes and indices */
    const int localOffsetMinor  = 0; // unused
    const int localOffsetMajor  = 0; // unused
    const int localOffsetMiddle = 0; // unused
    const int localSizeMinor    = kernelParams.grid.complexGridSizePadded[minorDim];
    const int localSizeMiddle   = kernelParams.grid.complexGridSizePadded[middleDim];
    const int localCountMiddle  = kernelParams.grid.complexGridSize[middleDim];
    const int localCountMinor   = kernelParams.grid.complexGridSize[minorDim];
    const int nMajor            = kernelParams.grid.realGridSize[majorDim];
    const int nMiddle           = kernelParams.grid.realGridSize[middleDim];
    const int nMinor            = kernelParams.grid.realGridSize[minorDim];
    const int maxkMajor         = (nMajor + 1) / 2;  // X or Y
    const int maxkMiddle        = (nMiddle + 1) / 2; // Y or Z => only check for !YZX
    const int maxkMinor         = (nMinor + 1) / 2;  // Z or X => only check for YZX

    /* Each thread works on one cell of the Fourier space complex 3D grid (gm_grid).
     * Each block handles up to c_solveMaxWorkGroupSize cells -
     * depending on the grid contiguous dimension size,
     * that can range from a part of a single gridline to several complete gridlines.
     */
    const int threadLocalId     = get_local_id(XX);
    const int gridLineSize      = localCountMinor;
    const int gridLineIndex     = threadLocalId / gridLineSize;
    const int gridLineCellIndex = threadLocalId - gridLineSize * gridLineIndex;
    const int gridLinesPerBlock = max((int)(get_local_size(XX)) / gridLineSize, 1);
    const int activeWarps       = ((int)get_local_size(XX) / warp_size);
    assert((get_group_id(XX) * get_local_size(XX)) < MAX_INT);
    const int indexMinor  = (int)get_group_id(XX) * (int)get_local_size(XX) + gridLineCellIndex;
    const int indexMiddle = (int)get_group_id(YY) * gridLinesPerBlock + gridLineIndex;
    const int indexMajor  = (int)get_group_id(ZZ);

    /* Optional outputs */
    float energy = 0.0F;
    float virxx  = 0.0F;
    float virxy  = 0.0F;
    float virxz  = 0.0F;
    float viryy  = 0.0F;
    float viryz  = 0.0F;
    float virzz  = 0.0F;

    assert(indexMajor < kernelParams.grid.complexGridSize[majorDim]);
    if ((indexMiddle < localCountMiddle) & (indexMinor < localCountMinor)
        & (gridLineIndex < gridLinesPerBlock))
    {
        /* The offset should be equal to the global thread index for coalesced access */
        const int gridThreadIndex =
                (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
        __global float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex;

        const int kMajor = indexMajor + localOffsetMajor;
        /* Checking either X in XYZ, or Y in YZX cases */
        const float mMajor = (float)((kMajor < maxkMajor) ? kMajor : (kMajor - nMajor));

        const int kMiddle = indexMiddle + localOffsetMiddle;
        float     mMiddle = (float)kMiddle;
        /* Checking Y in XYZ case */
        if (gridOrdering == XYZ)
        {
            mMiddle = (float)((kMiddle < maxkMiddle) ? kMiddle : (kMiddle - nMiddle));
        }
        const int kMinor = localOffsetMinor + indexMinor;
        float     mMinor = (float)kMinor;
        /* Checking X in YZX case */
        if (gridOrdering == YZX)
        {
            mMinor = (float)((kMinor < maxkMinor) ? kMinor : (kMinor - nMinor));
        }
        /* We should skip the k-space point (0,0,0) */
        const bool notZeroPoint = (kMinor > 0) | (kMajor > 0) | (kMiddle > 0);

        float mX = 0.0F;
        float mY = 0.0F;
        float mZ = 0.0F;
        if (gridOrdering == YZX)
        {
            mX = mMinor;
            mY = mMajor;
            mZ = mMiddle;
        }
        if (gridOrdering == XYZ)
        {
            mX = mMajor;
            mY = mMiddle;
            mZ = mMinor;
        }

        /* 0.5 correction factor for the first and last components of a Z dimension */
        float       corner_fac   = 1.0F;
        const float z_corner_fac = 0.5F;
        if (gridOrdering == YZX)
        {
            if ((kMiddle == 0) | (kMiddle == maxkMiddle))
            {
                corner_fac = z_corner_fac;
            }
        }
        if (gridOrdering == XYZ)
        {
            if ((kMinor == 0) | (kMinor == maxkMinor))
            {
                corner_fac = z_corner_fac;
            }
        }
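
        /* A restatement of the arithmetic below as one formula (symbols follow the code,
         * not an external convention): with (mhxk, mhyk, mhzk) = m transformed by recipBox
         * and m2k = mhxk^2 + mhyk^2 + mhzk^2, the scaling factor applied to this grid cell is
         *
         *   etermk = elFactor * exp(-ewaldFactor * m2k) / (pi * boxVolume * B(m) * m2k)
         *
         * where B(m) is the product of the three B-spline moduli for this k-vector.
         * The energy/virial contributions are then accumulated from the scaled and
         * unscaled grid values.
         */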
        if (notZeroPoint)
        {
            const float mhxk = mX * kernelParams.current.recipBox[XX][XX];
            const float mhyk = mX * kernelParams.current.recipBox[XX][YY]
                               + mY * kernelParams.current.recipBox[YY][YY];
            const float mhzk = mX * kernelParams.current.recipBox[XX][ZZ]
                               + mY * kernelParams.current.recipBox[YY][ZZ]
                               + mZ * kernelParams.current.recipBox[ZZ][ZZ];

            const float m2k = mhxk * mhxk + mhyk * mhyk + mhzk * mhzk;
            assert(m2k != 0.0F);
            const float denom = m2k * M_PI_F * kernelParams.current.boxVolume
                                * gm_splineValueMajor[kMajor] * gm_splineValueMiddle[kMiddle]
                                * gm_splineValueMinor[kMinor];
            assert(isfinite(denom));
            assert(denom != 0.0F);

            const float tmp1   = exp(-kernelParams.grid.ewaldFactor * m2k);
            const float etermk = kernelParams.constants.elFactor * tmp1 / denom;

            float2       gridValue    = *gm_gridCell;
            const float2 oldGridValue = gridValue;

            gridValue.x *= etermk;
            gridValue.y *= etermk;
            *gm_gridCell = gridValue;

            if (computeEnergyAndVirial)
            {
                const float tmp1k =
                        2.0F * (gridValue.x * oldGridValue.x + gridValue.y * oldGridValue.y);

                const float vfactor = (kernelParams.grid.ewaldFactor + 1.0F / m2k) * 2.0F;
                const float ets2    = corner_fac * tmp1k;
                energy              = ets2;

                const float ets2vf = ets2 * vfactor;

                virxx = ets2vf * mhxk * mhxk - ets2;
                virxy = ets2vf * mhxk * mhyk;
                virxz = ets2vf * mhxk * mhzk;
                viryy = ets2vf * mhyk * mhyk - ets2;
                viryz = ets2vf * mhyk * mhzk;
                virzz = ets2vf * mhzk * mhzk - ets2;
            }
        }
    }

    // This is only for the reduction below. OpenCL 1.2: all local memory must be declared at kernel scope.
    __local float sm_virialAndEnergy[c_virialAndEnergyCount * warp_size];

    /* Optional energy/virial reduction */
    if (computeEnergyAndVirial)
    {
        // TODO: implement AMD intrinsics reduction, like with shuffles in the CUDA version. #2514

        /* Shared memory reduction with atomics.
         * Each component is first reduced into warp_size positions in the shared memory;
         * then the first few warps reduce everything further and add to the global memory.
         * This can likely be improved, but is anyway faster than the previous straightforward
         * reduction, which used too much shared memory (for storing all 7 floats on each thread).
         */
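
        /* A worked example of the layout, assuming warp_size = 32 and 4 active warps (both
         * numbers illustrative): each of the c_virialAndEnergyCount = 7 components (virxx,
         * viryy, virzz, virxy, virxz, viryz, energy) occupies 32 consecutive floats of
         * sm_virialAndEnergy. Warp 0 writes its values directly; warps 1-3 accumulate into
         * the same slots with local atomics. Then numIter = ceil(7/4) = 2: warps 0-3
         * tree-reduce components 0-3 in the first iteration, warps 0-2 reduce components
         * 4-6 in the second, and lane 0 of each warp adds its component's total into
         * gm_virialAndEnergy with a global atomic.
         */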
        const int  lane      = threadLocalId & (warp_size - 1);
        const int  warpIndex = threadLocalId / warp_size;
        const bool firstWarp = (warpIndex == 0);
        if (firstWarp)
        {
            sm_virialAndEnergy[0 * warp_size + lane] = virxx;
            sm_virialAndEnergy[1 * warp_size + lane] = viryy;
            sm_virialAndEnergy[2 * warp_size + lane] = virzz;
            sm_virialAndEnergy[3 * warp_size + lane] = virxy;
            sm_virialAndEnergy[4 * warp_size + lane] = virxz;
            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
            sm_virialAndEnergy[5 * warp_size + lane] = viryz;
            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
            sm_virialAndEnergy[6 * warp_size + lane] = energy;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (!firstWarp)
        {
            atomicAdd_l_f(sm_virialAndEnergy + 0 * warp_size + lane, virxx);
            atomicAdd_l_f(sm_virialAndEnergy + 1 * warp_size + lane, viryy);
            atomicAdd_l_f(sm_virialAndEnergy + 2 * warp_size + lane, virzz);
            atomicAdd_l_f(sm_virialAndEnergy + 3 * warp_size + lane, virxy);
            atomicAdd_l_f(sm_virialAndEnergy + 4 * warp_size + lane, virxz);
            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
            atomicAdd_l_f(sm_virialAndEnergy + 5 * warp_size + lane, viryz);
            // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers, readability-magic-numbers)
            atomicAdd_l_f(sm_virialAndEnergy + 6 * warp_size + lane, energy);
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        const int numIter = (c_virialAndEnergyCount + activeWarps - 1) / activeWarps;
        for (int i = 0; i < numIter; i++)
        {
            const int componentIndex = i * activeWarps + warpIndex;
            if (componentIndex < c_virialAndEnergyCount)
            {
                const int targetIndex = componentIndex * warp_size + lane;
#pragma unroll
                for (int reductionStride = warp_size >> 1; reductionStride >= 1; reductionStride >>= 1)
                {
                    if (lane < reductionStride)
                    {
                        sm_virialAndEnergy[targetIndex] +=
                                sm_virialAndEnergy[targetIndex + reductionStride];
                    }
#ifdef _NVIDIA_SOURCE_
                    /* FIXME: this loop executes within the execution width (warp), but somehow
                     * NVIDIA OpenCL of all things fails without the memory barrier here. #2519
                     */
                    barrier(CLK_LOCAL_MEM_FENCE);
#endif
                }
                if (lane == 0)
                {
                    atomicAdd_g_f(gm_virialAndEnergy + componentIndex, sm_virialAndEnergy[targetIndex]);
                }
            }
        }
    }
}
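
/* Illustration only: a hypothetical host-side launch of this kernel. The decomposition
 * mirrors how the kernel derives its indices (minor dimension along X in
 * c_solveMaxWorkGroupSize-sized groups, middle-dimension grid lines along Y, major
 * dimension along Z); names and the actual launch site are assumptions, not GROMACS code:
 *
 *   size_t localSize[3]  = { c_solveMaxWorkGroupSize, 1, 1 };
 *   size_t globalSize[3] = { numBlocksMinor * localSize[0], numBlocksMiddle, numBlocksMajor };
 *   clSetKernelArg(kernel, 0, sizeof(kernelParams), &kernelParams);
 *   clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_splineModuli);
 *   clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_virialAndEnergy);
 *   clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_grid);
 *   clEnqueueNDRangeKernel(queue, kernel, 3, NULL, globalSize, localSize, 0, NULL, NULL);
 *
 * On output (with computeEnergyAndVirial), gm_virialAndEnergy holds the reduced components
 * in the order written above: virxx, viryy, virzz, virxy, virxz, viryz, energy.
 */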