/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/*! \internal \file
 * \brief Implements the PME OpenCL force gathering kernel.
 * When including this and other PME OpenCL kernel files, plenty of common
 * constants/macros are expected to be defined (such as "order", which is the PME
 * interpolation order). For details, please see how pme_program.cl is compiled
 * in pme_gpu_program_impl_ocl.cpp.
 *
 * This file's kernels specifically expect the following definitions:
 *
 * - atomsPerBlock, which expresses how many atoms are processed by a single work group
 * - order, which is the PME interpolation order
 * - wrapX and wrapY, which must evaluate to either true or false to specify whether
 *   the grid overlap in dimension X/Y is to be used
 *
 * \author Aleksei Iupinov
 */

#include "pme_gpu_calculate_splines.clh"
#include "pme_gpu_types.h"
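
// Illustrative sketch only: this header does not define the constants listed above
// itself. In the real build, pme_gpu_program_impl_ocl.cpp composes them at runtime
// and passes them to clBuildProgram when compiling pme_program.cl. The values below
// are made-up examples of the kind of option string involved, not the actual ones:
//
//   -Dorder=4 -DatomsPerBlock=8 -DwrapX=true -DwrapY=true -Dwarp_size=32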

#ifndef COMPILE_GATHER_HELPERS_ONCE
#    define COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * Unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
 */
inline float read_grid_size(const float* realGridSizeFP, const int dimIndex)
{
    switch (dimIndex)
    {
        case XX: return realGridSizeFP[XX];
        case YY: return realGridSizeFP[YY];
        case ZZ: return realGridSizeFP[ZZ];
        default: assert(false); break;
    }
    assert(false);
    return 0.0F;
}

/*! \brief Reduce the partial force contributions.
 *
 * FIXME: this reduction should be simplified and improved; it does a 3x16 force component
 *        reduction per 16 threads, so no extra shared memory should be needed for
 *        intermediates or for passing results back.
 *
 * \param[out]    sm_forces         Local memory array with the output forces (rvec).
 * \param[in]     atomIndexLocal    Local atom index
 * \param[in]     splineIndex       Spline index
 * \param[in]     lineIndex         Line index (same as threadLocalId)
 * \param[in]     realGridSizeFP    Local grid size constant
 * \param[in]     fx                Input force partial component X
 * \param[in]     fy                Input force partial component Y
 * \param[in]     fz                Input force partial component Z
 * \param[in,out] sm_forceReduction Reduction working buffer
 * \param[in]     sm_forceTemp      Convenience pointers into \p sm_forceReduction
 */
inline void reduce_atom_forces(__local float* __restrict__ sm_forces,
                               const int    atomIndexLocal,
                               const int    splineIndex,
                               const int    lineIndex,
                               const float* realGridSizeFP,
                               float        fx,
                               float        fy,
                               float        fz,
                               __local float* __restrict__ sm_forceReduction,
                               __local float** __restrict__ sm_forceTemp)
{
    // TODO: implement AMD intrinsics reduction, like with shuffles in CUDA version. #2514

    /* Number of data components and threads for a single atom */
#    define atomDataSize threadsPerAtom
    /* We use blockSize local memory elements to read fx, or fy, or fz, and then reduce
     * them to fit into smemPerDim elements. All those sizes are defines and not consts,
     * because they go into the local memory array size.
     */
#    define blockSize (atomsPerBlock * atomDataSize)
#    define smemPerDim warp_size
#    define smemReserved (DIM * smemPerDim)

    const int numWarps  = blockSize / smemPerDim;
    const int minStride = max(1, atomDataSize / numWarps);

#    pragma unroll DIM
    for (int dimIndex = 0; dimIndex < DIM; dimIndex++)
    {
        int elementIndex = smemReserved + lineIndex;
        // Store input force contributions
        sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
#    if (warp_size < 48)
        // sync here when exec width is smaller than the size of the sm_forceReduction
        // buffer flushed to local mem above (size 3*16) as different warps will consume
        // the data below.
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif

        // Reduce to fit into smemPerDim (warp size)
#    pragma unroll
        for (int redStride = atomDataSize >> 1; redStride > minStride; redStride >>= 1)
        {
            if (splineIndex < redStride)
            {
                sm_forceReduction[elementIndex] += sm_forceReduction[elementIndex + redStride];
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        // Last iteration - packing everything to be nearby, storing convenience pointer
        sm_forceTemp[dimIndex] = sm_forceReduction + dimIndex * smemPerDim;
        int redStride          = minStride;
        if (splineIndex < redStride)
        {
            const int packedIndex = atomIndexLocal * redStride + splineIndex;
            sm_forceTemp[dimIndex][packedIndex] =
                    sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
        }

        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
#    if (warp_size < 64)
        barrier(CLK_LOCAL_MEM_FENCE);
#    endif
    }

#    if (warp_size >= 64)
    barrier(CLK_LOCAL_MEM_FENCE);
#    endif

    assert((blockSize / warp_size) >= DIM);

    const int warpIndex = lineIndex / warp_size;
    const int dimIndex  = warpIndex;

    // First 3 warps can now process 1 dimension each
    if (dimIndex < DIM)
    {
        const int sourceIndex = lineIndex % warp_size;
#    pragma unroll
        for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
        {
            if (!(splineIndex & redStride))
            {
                sm_forceTemp[dimIndex][sourceIndex] += sm_forceTemp[dimIndex][sourceIndex + redStride];
            }
        }

        const float n = read_grid_size(realGridSizeFP, dimIndex);

        const int atomIndex = sourceIndex / minStride;
        if (sourceIndex == minStride * atomIndex)
        {
            sm_forces[atomIndex * DIM + dimIndex] =
                    (sm_forceTemp[dimIndex][sourceIndex] + sm_forceTemp[dimIndex][sourceIndex + 1]) * n;
        }
    }
}
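
// A worked example of the reduction above, under assumed (not authoritative) sizes:
// order = 4, threadsPerAtom = 16, atomsPerBlock = 8, warp_size = 32. Then:
//   blockSize  = 8 * 16 = 128 threads,
//   smemPerDim = 32, smemReserved = 3 * 32 = 96 floats,
//   numWarps   = 128 / 32 = 4, minStride = max(1, 16 / 4) = 4.
// Per dimension, each atom's 16 partial contributions are reduced 16 -> 8 in the
// strided loop, then 8 -> 4 in the packing step (leaving warp_size contiguous floats
// per dimension), and the first DIM warps finish 4 -> 2 -> 1, scaling the result by
// the grid size before writing one float per atom per dimension into sm_forces.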

/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
 *
 * \param[out] fx              The force partial component in the X dimension.
 * \param[out] fy              The force partial component in the Y dimension.
 * \param[out] fz              The force partial component in the Z dimension.
 * \param[in]  ixBase          The grid line index base value in the X dimension.
 * \param[in]  nx              The grid real size in the X dimension.
 * \param[in]  pny             The padded grid real size in the Y dimension.
 * \param[in]  pnz             The padded grid real size in the Z dimension.
 * \param[in]  constOffset     The offset to calculate the global grid index.
 * \param[in]  splineIndexBase The base value of the spline parameter index.
 * \param[in]  tdy             The theta and dtheta in the Y dimension.
 * \param[in]  tdz             The theta and dtheta in the Z dimension.
 * \param[in]  sm_splineParams Shared memory array of spline parameters.
 * \param[in]  gm_grid         Global memory array of the grid to use.
 */
inline void sumForceComponents(float*       fx,
                               float*       fy,
                               float*       fz,
                               const int    ixBase,
                               const int    nx,
                               const int    pny,
                               const int    pnz,
                               const int    constOffset,
                               const int    splineIndexBase,
                               const float2 tdy,
                               const float2 tdz,
                               __local const float2* __restrict__ sm_splineParams,
                               __global const float* __restrict__ gm_grid)
{
#    pragma unroll order
    for (int ithx = 0; (ithx < order); ithx++)
    {
        int ix = ixBase + ithx;
        if (wrapX & (ix >= nx))
        {
            ix -= nx;
        }
        const int gridIndexGlobal = ix * pny * pnz + constOffset;
        assert(gridIndexGlobal >= 0);
        const float gridValue = gm_grid[gridIndexGlobal];
        assert(isfinite(gridValue));

        const int    splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
        const float2 tdx          = sm_splineParams[splineIndexX];
        const float  fxy1         = tdz.x * gridValue;
        const float  fz1          = tdz.y * gridValue;
        *fx += tdx.y * tdy.x * fxy1;
        *fy += tdx.x * tdy.y * fxy1;
        *fz += tdx.x * tdy.x * fz1;
    }
}
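
// What the loop above computes, in formula form: with Q the grid value and the
// (theta, dtheta) spline value/derivative pairs stored as (.x, .y), each iteration
// accumulates, for one grid point of the order^3 stencil,
//   fx += dtheta_x * theta_y  * theta_z  * Q
//   fy += theta_x  * dtheta_y * theta_z  * Q
//   fz += theta_x  * theta_y  * dtheta_z * Q
// i.e. the derivative of the interpolation weight along each dimension times the
// grid value. Only X is looped here; the Y/Z factors (tdy, tdz) are fixed per
// calling thread.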

/*! \brief Calculate the grid forces and store them in shared memory.
 *
 * \param[in,out] sm_forces        Shared memory array with the output forces.
 * \param[in]     forceIndexLocal  The local (per thread) index in the sm_forces array.
 * \param[in]     forceIndexGlobal The index of the thread in the gm_coefficients array.
 * \param[in]     recipBox         The reciprocal box.
 * \param[in]     scale            The scale to use when calculating the forces (only
 *                                 relevant when using two grids).
 * \param[in]     gm_coefficients  Global memory array of the coefficients to use.
 */
inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces,
                                        const int   forceIndexLocal,
                                        const int   forceIndexGlobal,
                                        const float recipBox[DIM][DIM],
                                        const float scale,
                                        __global const float* __restrict__ gm_coefficients)
{
    const float3 atomForces     = vload3(forceIndexLocal, sm_forces);
    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
    float3       result;
    result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
    result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
    result.z = negCoefficient
               * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
                  + recipBox[ZZ][ZZ] * atomForces.z);
    vstore3(result, forceIndexLocal, sm_forces);
}
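
// Note on the transform above: recipBox is upper triangular, so converting the
// grid-space gradient (fx, fy, fz) to Cartesian forces needs only the terms kept
// above. Written out, with q the (scaled) coefficient and R the reciprocal box:
//   F_x = -q *  R[XX][XX] * fx
//   F_y = -q * (R[XX][YY] * fx + R[YY][YY] * fy)
//   F_z = -q * (R[XX][ZZ] * fx + R[YY][ZZ] * fy + R[ZZ][ZZ] * fz)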

#endif // COMPILE_GATHER_HELPERS_ONCE

/*! \brief
 * An OpenCL kernel which gathers the atom forces from the grid.
 * The grid is assumed to be wrapped in dimension Z.
 * Please see the file description for additional defines which this kernel expects.
 *
 * \param[in]     kernelParams       All the PME GPU data.
 * \param[in]     gm_coefficientsA   Atom charges/coefficients in the unperturbed state,
 *                                   or FEP state A.
 * \param[in]     gm_coefficientsB   Atom charges/coefficients in FEP state B. Only used
 *                                   when two sets of coefficients are used, one on each
 *                                   of two separate grids.
 * \param[in]     gm_gridA           Global 3D grid for the unperturbed state, FEP state A,
 *                                   or the single grid used for interpolated coefficients
 *                                   on one grid in FEP A/B.
 * \param[in]     gm_gridB           Global 3D grid for FEP state B when using dual grids
 *                                   (when calculating energy and virials).
 * \param[in]     gm_theta           Atom spline parameter values
 * \param[in]     gm_dtheta          Atom spline parameter derivatives
 * \param[in]     gm_gridlineIndices Atom gridline indices (ivec)
 * \param[in,out] gm_forces          Atom forces (rvec)
 */
__attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME(
        pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
                           __global const float* __restrict__ gm_coefficientsA,
                           __global const float* __restrict__ gm_coefficientsB,
                           __global const float* __restrict__ gm_gridA,
                           __global const float* __restrict__ gm_gridB,
                           __global const float* __restrict__ gm_theta,
                           __global const float* __restrict__ gm_dtheta,
                           __global const int* __restrict__ gm_gridlineIndices,
                           __global float* __restrict__ gm_forces)
{
    assert(numGrids == 1 || numGrids == 2);

    /* These are the atom indices - for the shared and global memory */
    const int atomIndexLocal  = get_local_id(ZZ);
    const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
    const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;

    /* Some sizes which are defines and not consts because they go into the array size */
#define blockSize (atomsPerBlock * atomDataSize)
    assert(blockSize == (get_local_size(0) * get_local_size(1) * get_local_size(2)));
#define smemPerDim warp_size
#define smemReserved (DIM * smemPerDim)
#define totalSharedMemory (smemReserved + blockSize)
#define gridlineIndicesSize (atomsPerBlock * DIM)
#define splineParamsSize (atomsPerBlock * DIM * order)

    __local int    sm_gridlineIndices[gridlineIndicesSize];
    __local float2 sm_splineParams[splineParamsSize]; /* Theta/dtheta pairs as .x/.y */

    /* Spline Y/Z coordinates */
    const int ithy = get_local_id(YY);
    const int ithz = get_local_id(XX);

    assert((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                   + get_local_id(0)
           <= MAX_INT);
    const int threadLocalId =
            (int)((get_local_id(2) * get_local_size(1) + get_local_id(1)) * get_local_size(0)
                  + get_local_id(0));

    /* These are the spline contribution indices in shared memory */
    assert((get_local_id(1) * get_local_size(0) + get_local_id(0)) <= MAX_INT);
    const int splineIndex = (int)(get_local_id(1) * get_local_size(0)
                                  + get_local_id(0)); /* Relative to the current particle, 0..15 for order 4 */
    const int lineIndex   = threadLocalId;            /* And to all the block's particles */
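
    // Index layout illustration, assuming order = 4 (so 16 threads per atom; other
    // orders follow the same pattern): the work group is (order, order, atomsPerBlock),
    // hence ithz = get_local_id(XX) and ithy = get_local_id(YY) each run 0..3,
    // splineIndex = ithy * 4 + ithz runs 0..15 within one atom, and lineIndex
    // additionally offsets by 16 * atomIndexLocal across the block.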

    /* Staging the atom gridline indices, DIM * atomsPerBlock threads */
    const int localGridlineIndicesIndex = threadLocalId;
    const int globalGridlineIndicesIndex =
            (int)get_group_id(XX) * gridlineIndicesSize + localGridlineIndicesIndex;
    if (localGridlineIndicesIndex < gridlineIndicesSize)
    {
        sm_gridlineIndices[localGridlineIndicesIndex] = gm_gridlineIndices[globalGridlineIndicesIndex];
        assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
    }
    /* Staging the spline parameters, DIM * order * atomsPerBlock threads */
    const int localSplineParamsIndex = threadLocalId;
    const int globalSplineParamsIndex =
            (int)get_group_id(XX) * splineParamsSize + localSplineParamsIndex;
    if (localSplineParamsIndex < splineParamsSize)
    {
        sm_splineParams[localSplineParamsIndex].x = gm_theta[globalSplineParamsIndex];
        sm_splineParams[localSplineParamsIndex].y = gm_dtheta[globalSplineParamsIndex];
        assert(isfinite(sm_splineParams[localSplineParamsIndex].x));
        assert(isfinite(sm_splineParams[localSplineParamsIndex].y));
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float fx = 0.0F;
    float fy = 0.0F;
    float fz = 0.0F;

    int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);

    const int nx  = kernelParams.grid.realGridSize[XX];
    const int ny  = kernelParams.grid.realGridSize[YY];
    const int nz  = kernelParams.grid.realGridSize[ZZ];
    const int pny = kernelParams.grid.realGridSizePadded[YY];
    const int pnz = kernelParams.grid.realGridSizePadded[ZZ];

    const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
    const int warpIndex     = atomIndexLocal / atomsPerWarp;

    const int    splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
    const int    splineIndexY    = getSplineParamIndex(splineIndexBase, YY, ithy);
    const float2 tdy             = sm_splineParams[splineIndexY];
    const int    splineIndexZ    = getSplineParamIndex(splineIndexBase, ZZ, ithz);
    const float2 tdz             = sm_splineParams[splineIndexZ];

    const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];

    int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
    if (wrapY & (iy >= ny))
    {
        iy -= ny;
    }

    int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
    if (iz >= nz)
    {
        iz -= nz;
    }
    const int constOffset = iy * pnz + iz;

    if (chargeCheck)
    {
        sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy,
                           tdz, sm_splineParams, gm_gridA);
    }

    // Reduction of partial force contributions
    __local float  sm_forces[atomsPerBlock * DIM];
    __local float  sm_forceReduction[totalSharedMemory];
    __local float* sm_forceTemp[DIM];
    reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex,
                       kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp);
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Calculating the final forces with no component branching, atomsPerBlock threads */
    const int   forceIndexLocal  = threadLocalId;
    const int   forceIndexGlobal = atomIndexOffset + forceIndexLocal;
    const float scale            = kernelParams.current.scale;
    if (forceIndexLocal < atomsPerBlock)
    {
        calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
                                    kernelParams.current.recipBox, scale, gm_coefficientsA);
    }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
    /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
     * __syncwarp() in CUDA. #2519
     */
    barrier(CLK_LOCAL_MEM_FENCE);
#endif

    assert(atomsPerBlock <= warp_size);

    /* Writing or adding the final forces component-wise, single warp */
    const int blockForcesSize = atomsPerBlock * DIM;
    const int numIter         = (blockForcesSize + warp_size - 1) / warp_size;
    const int iterThreads     = blockForcesSize / numIter;
    if (threadLocalId < iterThreads)
    {
#pragma unroll
        for (int i = 0; i < numIter; i++)
        {
            const int   outputIndexLocal     = i * iterThreads + threadLocalId;
            const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
            const float outputForceComponent = sm_forces[outputIndexLocal];
            gm_forces[outputIndexGlobal]     = outputForceComponent;
        }
    }

    if (numGrids == 2)
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        fx          = 0.0F;
        fy          = 0.0F;
        fz          = 0.0F;
        chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
        if (chargeCheck)
        {
            sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase,
                               tdy, tdz, sm_splineParams, gm_gridB);
        }
        reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex,
                           kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp);
        barrier(CLK_LOCAL_MEM_FENCE);
        if (forceIndexLocal < atomsPerBlock)
        {
            calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
                                        kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB);
        }

#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
        /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
         * __syncwarp() in CUDA. #2519
         */
        barrier(CLK_LOCAL_MEM_FENCE);
#endif

        /* Writing or adding the final forces component-wise, single warp */
        if (threadLocalId < iterThreads)
        {
#pragma unroll
            for (int i = 0; i < numIter; i++)
            {
                const int   outputIndexLocal     = i * iterThreads + threadLocalId;
                const int   outputIndexGlobal    = (int)get_group_id(XX) * blockForcesSize + outputIndexLocal;
                const float outputForceComponent = sm_forces[outputIndexLocal];
                gm_forces[outputIndexGlobal] += outputForceComponent;
            }
        }
    }
}
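
// A rough host-side sketch (hypothetical variable names; the real launch logic lives
// in the PME GPU host code, with the compilation side in pme_gpu_program_impl_ocl.cpp):
// the kernel requires a (order, order, atomsPerBlock) work group, with one group per
// block of atoms along dimension X, e.g.
//
//   const size_t local[3]  = { order, order, atomsPerBlock };
//   const size_t global[3] = { order * numAtomBlocks, order, atomsPerBlock };
//   clEnqueueNDRangeKernel(queue, gatherKernel, 3, NULL, global, local, 0, NULL, NULL);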