From f466950153163ea6d36fa01b315fa52045038a72 Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko
Date: Sat, 25 Sep 2021 09:55:00 +0000
Subject: [PATCH] SYCL PME Gather kernel

---
 src/gromacs/ewald/CMakeLists.txt          |   3 +-
 src/gromacs/ewald/pme_gather.cu           |   2 -
 src/gromacs/ewald/pme_gather_sycl.cpp     | 721 ++++++++++++++++++
 src/gromacs/ewald/pme_gather_sycl.h       |  66 ++
 .../ewald/pme_gpu_program_impl_sycl.cpp   |  51 +-
 src/gromacs/ewald/tests/pmegathertest.cpp |   9 +-
 6 files changed, 837 insertions(+), 15 deletions(-)
 create mode 100644 src/gromacs/ewald/pme_gather_sycl.cpp
 create mode 100644 src/gromacs/ewald/pme_gather_sycl.h

diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt
index 27dc116a28..78d2a4ed02 100644
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -86,9 +86,9 @@ elseif (GMX_GPU_OPENCL)
         pme_gpu_timings.cpp
         )
 elseif (GMX_GPU_SYCL)
-    # SYCL-TODO: proper implementation
     gmx_add_libgromacs_sources(
         # GPU-specific sources
+        pme_gather_sycl.cpp
         pme_gpu.cpp
         pme_gpu_internal.cpp
         pme_gpu_program_impl_sycl.cpp
@@ -96,6 +96,7 @@ elseif (GMX_GPU_SYCL)
         pme_gpu_timings.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
+        pme_gather_sycl.cpp
         pme_gpu_internal.cpp
         pme_gpu_program.cpp
         pme_gpu_program_impl_sycl.cpp
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 28de3817e5..3964b3e7c9 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -310,8 +310,6 @@ __device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
  * \param[in]  gm_coefficients  Global memory array of the coefficients to use for an unperturbed
  *                              or FEP in state A if a single grid is used (\p multiCoefficientsSingleGrid == true).If two
  *                              separate grids are used this should be the coefficients of the grid in question.
- * \param[in]  gm_coefficientsB Global memory array of the coefficients to use for FEP in state B.
- *                              Should be nullptr if two separate grids are used.
  */
 __device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                             const int forceIndexLocal,
diff --git a/src/gromacs/ewald/pme_gather_sycl.cpp b/src/gromacs/ewald/pme_gather_sycl.cpp
new file mode 100644
index 0000000000..b03a7804b3
--- /dev/null
+++ b/src/gromacs/ewald/pme_gather_sycl.cpp
@@ -0,0 +1,721 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME force gathering in SYCL.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gmxpre.h"
+
+#include "pme_gather_sycl.h"
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
+#include "gromacs/gpu_utils/syclutils.h"
+#include "gromacs/math/functions.h"
+
+#include "pme_gpu_calculate_splines_sycl.h"
+#include "pme_grid.h"
+#include "pme_gpu_constants.h"
+#include "pme_gpu_types_host.h"
+
+
+/*! \brief Reduce the partial force contributions.
+ *
+ * \tparam order         The PME order (must be 4).
+ * \tparam atomDataSize  The number of partial force contributions for each atom (currently
+ *                       order^2 == 16).
+ * \tparam workGroupSize The size of a work-group.
+ * \tparam subGroupSize  The size of a sub-group.
+ *
+ * \param[in]  itemIdx        SYCL thread ID.
+ * \param[out] sm_forces      Shared memory array with the output forces (number of elements
+ *                            is number of atoms per block).
+ * \param[in]  atomIndexLocal Local atom index.
+ * \param[in]  splineIndex    Spline index.
+ * \param[in]  lineIndex      Line index (same as threadLocalId).
+ * \param[in]  realGridSizeFP Local grid size constant.
+ * \param[in]  fx             Input force partial component X.
+ * \param[in]  fy             Input force partial component Y.
+ * \param[in]  fz             Input force partial component Z.
+ */
+template<int order, int atomDataSize, int workGroupSize, int subGroupSize>
+inline void reduceAtomForces(cl::sycl::nd_item<3>        itemIdx,
+                             cl::sycl::local_ptr<Float3> sm_forces,
+                             const int                   atomIndexLocal,
+                             const int                   splineIndex,
+                             const int gmx_unused        lineIndex,
+                             const float                 realGridSizeFP[3],
+                             float&                      fx, // NOLINT(google-runtime-references)
+                             float&                      fy, // NOLINT(google-runtime-references)
+                             float&                      fz) // NOLINT(google-runtime-references)
+{
+    static_assert(gmx::isPowerOfTwo(order));
+    // TODO: find out if this is the best in terms of transactions count
+    static_assert(order == 4, "Only order of 4 is implemented");
+
+    sycl_2020::sub_group sg = itemIdx.get_sub_group();
+
+    static_assert(atomDataSize <= subGroupSize,
+                  "TODO: rework for atomDataSize > subGroupSize (order 8 or larger)");
+
+    fx += sycl_2020::shift_left(sg, fx, 1);
+    fy += sycl_2020::shift_right(sg, fy, 1);
+    fz += sycl_2020::shift_left(sg, fz, 1);
+    if (splineIndex & 1)
+    {
+        fx = fy;
+    }
+    fx += sycl_2020::shift_left(sg, fx, 2);
+    fz += sycl_2020::shift_right(sg, fz, 2);
+    if (splineIndex & 2)
+    {
+        fx = fz;
+    }
+    // We just have to further reduce those groups of 4
+    for (int delta = 4; delta < atomDataSize; delta *= 2)
+    {
+        fx += sycl_2020::shift_left(sg, fx, delta);
+    }
+    const int dimIndex = splineIndex;
+    if (dimIndex < DIM)
+    {
+        const float n = realGridSizeFP[dimIndex];
+        sm_forces[atomIndexLocal][dimIndex] = fx * n;
+    }
+}
+
+/*! \brief Calculate the sum of the force partial components (in X, Y and Z).
+ *
+ * \tparam order        The PME order (must be 4).
+ * \tparam atomsPerWarp The number of atoms per GPU warp.
+ * \tparam wrapX        Tells if the grid is wrapped in the X dimension.
+ * \tparam wrapY        Tells if the grid is wrapped in the Y dimension.
+ * \param[out] fx                 The force partial component in the X dimension.
+ * \param[out] fy                 The force partial component in the Y dimension.
+ * \param[out] fz                 The force partial component in the Z dimension.
+ * \param[in]  ithyMin            The thread minimum index in the Y dimension.
+ * \param[in]  ithyMax            The thread maximum index in the Y dimension.
+ * \param[in]  ixBase             The grid line index base value in the X dimension.
+ * \param[in]  iz                 The grid line index in the Z dimension.
+ * \param[in]  nx                 The grid real size in the X dimension.
+ * \param[in]  ny                 The grid real size in the Y dimension.
+ * \param[in]  pny                The padded grid real size in the Y dimension.
+ * \param[in]  pnz                The padded grid real size in the Z dimension.
+ * \param[in]  atomIndexLocal     The atom index for this thread.
+ * \param[in]  splineIndexBase    The base value of the spline parameter index.
+ * \param[in]  tdz                The theta and dtheta in the Z dimension.
+ * \param[in]  sm_gridlineIndices Shared memory array of grid line indices.
+ * \param[in]  sm_theta           Shared memory array of atom theta values.
+ * \param[in]  sm_dtheta          Shared memory array of atom dtheta values.
+ * \param[in]  gm_grid            Global memory array of the grid to use.
+ */
+template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
+inline void sumForceComponents(cl::sycl::private_ptr<float> fx,
+                               cl::sycl::private_ptr<float> fy,
+                               cl::sycl::private_ptr<float> fz,
+                               const int                    ithyMin,
+                               const int                    ithyMax,
+                               const int                    ixBase,
+                               const int                    iz,
+                               const int                    nx,
+                               const int                    ny,
+                               const int                    pny,
+                               const int                    pnz,
+                               const int                    atomIndexLocal,
+                               const int                    splineIndexBase,
+                               const cl::sycl::float2       tdz,
+                               const cl::sycl::local_ptr<const int>    sm_gridlineIndices,
+                               const cl::sycl::local_ptr<const float>  sm_theta,
+                               const cl::sycl::local_ptr<const float>  sm_dtheta,
+                               const cl::sycl::global_ptr<const float> gm_grid)
+{
+    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
+    {
+        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+        const cl::sycl::float2 tdy{ sm_theta[splineIndexY], sm_dtheta[splineIndexY] };
+
+        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+        if (wrapY & (iy >= ny))
+        {
+            iy -= ny;
+        }
+        const int constOffset = iy * pnz + iz;
+
+#pragma unroll
+        for (int ithx = 0; ithx < order; ithx++)
+        {
+            int ix = ixBase + ithx;
+            if (wrapX & (ix >= nx))
+            {
+                ix -= nx;
+            }
+            const int gridIndexGlobal = ix * pny * pnz + constOffset;
+            assert(gridIndexGlobal >= 0);
+            const float gridValue = gm_grid[gridIndexGlobal];
+            assertIsFinite(gridValue);
+            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+            const cl::sycl::float2 tdx{ sm_theta[splineIndexX], sm_dtheta[splineIndexX] };
+            const float fxy1 = tdz[XX] * gridValue;
+            const float fz1  = tdz[YY] * gridValue;
+            *fx += tdx[YY] * tdy[XX] * fxy1;
+            *fy += tdx[XX] * tdy[YY] * fxy1;
+            *fz += tdx[XX] * tdy[XX] * fz1;
+        }
+    }
+}
+
+
+/*! \brief Calculate the grid forces and store them in shared memory.
+ *
+ * \param[in,out] sm_forces       Shared memory array with the output forces.
+ * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
+ * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
+ * \param[in] recipBox0           The reciprocal box (first vector).
+ * \param[in] recipBox1           The reciprocal box (second vector).
+ * \param[in] recipBox2           The reciprocal box (third vector).
+ * \param[in] scale               The scale to use when calculating the forces. For gm_coefficientsB
+ *                                (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
+ * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
+ *                                or FEP in state A if a single grid is used
+ *                                (\p multiCoefficientsSingleGrid == true). If two separate grids
+ *                                are used, this should be the coefficients of the grid in question.
+ */
+inline void calculateAndStoreGridForces(cl::sycl::local_ptr<Float3>             sm_forces,
+                                        const int                               forceIndexLocal,
+                                        const int                               forceIndexGlobal,
+                                        const Float3&                           recipBox0,
+                                        const Float3&                           recipBox1,
+                                        const Float3&                           recipBox2,
+                                        const float                             scale,
+                                        const cl::sycl::global_ptr<const float> gm_coefficients)
+{
+    const Float3 atomForces     = sm_forces[forceIndexLocal];
+    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
+    Float3       result;
+    result[XX] = negCoefficient * recipBox0[XX] * atomForces[XX];
+    result[YY] = negCoefficient * (recipBox0[YY] * atomForces[XX] + recipBox1[YY] * atomForces[YY]);
+    result[ZZ] = negCoefficient
+                 * (recipBox0[ZZ] * atomForces[XX] + recipBox1[ZZ] * atomForces[YY]
+                    + recipBox2[ZZ] * atomForces[ZZ]);
+    sm_forces[forceIndexLocal] = result;
+}
+
+/*! \brief
+ * A SYCL kernel which gathers the atom forces from the grid.
+ * The grid is assumed to be wrapped in dimension Z.
+ *
+ * \tparam order          PME interpolation order.
+ * \tparam wrapX          A boolean which tells if the grid overlap in dimension X should
+ *                        be wrapped.
+ * \tparam wrapY          A boolean which tells if the grid overlap in dimension Y should
+ *                        be wrapped.
+ * \tparam numGrids       The number of grids to use in the kernel. Can be 1 or 2.
+ * \tparam readGlobal     Tells if we should read spline values from global memory.
+ * \tparam threadsPerAtom How many threads work on each atom.
+ * \tparam subGroupSize   Size of the sub-group.
+ */
+template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+auto pmeGatherKernel(cl::sycl::handler&                                 cgh,
+                     const int                                          nAtoms,
+                     DeviceAccessor<float, mode::read>                  a_gridA,
+                     OptionalAccessor<float, mode::read, numGrids == 2> a_gridB,
+                     DeviceAccessor<float, mode::read>                  a_coefficientsA,
+                     OptionalAccessor<float, mode::read, numGrids == 2> a_coefficientsB,
+                     OptionalAccessor<Float3, mode::read, !readGlobal>  a_coordinates,
+                     DeviceAccessor<Float3, mode::read_write>           a_forces,
+                     DeviceAccessor<float, mode::read>                  a_theta,
+                     DeviceAccessor<float, mode::read>                  a_dtheta,
+                     DeviceAccessor<int, mode::read>                    a_gridlineIndices,
+                     OptionalAccessor<float, mode::read, !readGlobal>   a_fractShiftsTable,
+                     OptionalAccessor<int, mode::read, !readGlobal>     a_gridlineIndicesTable,
+                     const gmx::IVec                                    tablesOffsets,
+                     const gmx::IVec                                    realGridSize,
+                     const gmx::RVec                                    realGridSizeFP,
+                     const gmx::IVec                                    realGridSizePadded,
+                     const gmx::RVec                                    currentRecipBox0,
+                     const gmx::RVec                                    currentRecipBox1,
+                     const gmx::RVec                                    currentRecipBox2,
+                     const float                                        scale)
+{
+    static_assert(numGrids == 1 || numGrids == 2);
+
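+    // Work partitioning for the constants below: e.g., with order == 4 and subGroupSize == 16
+    // (the DPC++ instantiation at the end of this file), ThreadsPerAtom::OrderSquared gives
+    // atomDataSize == 16, i.e. a whole sub-group per atom, while ThreadsPerAtom::Order gives
+    // atomDataSize == 4, i.e. four atoms sharing one sub-group (atomsPerWarp == 4).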
+    constexpr int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
+    constexpr int atomDataSize  = threadsPerAtomValue;
+    constexpr int atomsPerBlock = (c_gatherMaxWarpsPerBlock * subGroupSize) / atomDataSize;
+    // Number of atoms processed by a single warp in spread and gather
+    static_assert(subGroupSize >= atomDataSize);
+    constexpr int atomsPerWarp        = subGroupSize / atomDataSize;
+    constexpr int blockSize           = atomsPerBlock * atomDataSize;
+    constexpr int splineParamsSize    = atomsPerBlock * DIM * order;
+    constexpr int gridlineIndicesSize = atomsPerBlock * DIM;
+
+    cgh.require(a_gridA);
+    cgh.require(a_coefficientsA);
+    cgh.require(a_forces);
+
+    if constexpr (numGrids == 2)
+    {
+        cgh.require(a_gridB);
+        cgh.require(a_coefficientsB);
+    }
+
+    if constexpr (readGlobal)
+    {
+        cgh.require(a_theta);
+        cgh.require(a_dtheta);
+        cgh.require(a_gridlineIndices);
+    }
+    else
+    {
+        cgh.require(a_coordinates);
+        cgh.require(a_fractShiftsTable);
+        cgh.require(a_gridlineIndicesTable);
+    }
+
+    // Gridline indices, ivec
+    cl::sycl::accessor<int, 1, mode::read_write, target::local> sm_gridlineIndices(
+            cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+    // Spline values
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_theta(
+            cl::sycl::range<1>(atomsPerBlock * DIM * order), cgh);
+    // Spline derivatives
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_dtheta(
+            cl::sycl::range<1>(atomsPerBlock * DIM * order), cgh);
+    // Coefficients prefetch cache
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_coefficients(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+    // Coordinates prefetch cache
+    cl::sycl::accessor<Float3, 1, mode::read_write, target::local> sm_coordinates(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+    // Reduction of partial force contributions
+    cl::sycl::accessor<Float3, 1, mode::read_write, target::local> sm_forces(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+
+    auto sm_fractCoords = [&]() {
+        if constexpr (!readGlobal)
+        {
+            return cl::sycl::accessor<float, 1, mode::read_write, target::local>(
+                    cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+        }
+        else
+        {
+            return nullptr;
+        }
+    }();
+
+    return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+    {
+        assert(blockSize == itemIdx.get_local_range().size());
+        /* These are the atom indices - for the shared and global memory */
+        const int atomIndexLocal = itemIdx.get_local_id(XX);
+        const int blockIndex =
+                itemIdx.get_group(YY) * itemIdx.get_group_range(ZZ) + itemIdx.get_group(ZZ);
+        // itemIdx.get_group_linear_id();
+
+        const int atomIndexOffset = blockIndex * atomsPerBlock;
+        const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+        /* Early return for fully empty blocks at the end
+         * (should only happen for billions of input atoms)
+         */
+        if (atomIndexOffset >= nAtoms)
+        {
+            return;
+        }
+        /* Spline Z coordinates */
+        const int ithz = itemIdx.get_local_id(ZZ);
+        /* These are the spline contribution indices in shared memory */
+        const int splineIndex =
+                itemIdx.get_local_id(YY) * itemIdx.get_local_range(ZZ) + itemIdx.get_local_id(ZZ);
+
+        const int threadLocalId    = itemIdx.get_local_linear_id();
+        const int threadLocalIdMax = blockSize;
+        assert(threadLocalId < threadLocalIdMax);
+
+        const int lineIndex =
+                (itemIdx.get_local_id(XX) * (itemIdx.get_local_range(ZZ) * itemIdx.get_local_range(YY)))
+                + splineIndex; // And to all the block's particles
+        assert(lineIndex == threadLocalId);
+
+        if constexpr (readGlobal)
+        {
+            /* Read splines */
+            const int localGridlineIndicesIndex = threadLocalId;
+            const int globalGridlineIndicesIndex =
+                    blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
+            // itemIdx.get_group(ZZ) * gridlineIndicesSize + localGridlineIndicesIndex;
+            if (localGridlineIndicesIndex < gridlineIndicesSize)
+            {
+                sm_gridlineIndices[localGridlineIndicesIndex] =
+                        a_gridlineIndices[globalGridlineIndicesIndex];
+                assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+            }
+            /* With order threads per atom, this loop is needed to make sure each thread loads
+             * all the spline values it is responsible for; with order*order threads per atom,
+             * each thread only needs to load a single value. */
+
+            const int iMin = 0;
+            const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;
+
+            for (int i = iMin; i < iMax; i++)
+            {
+                // i will always be zero for order*order threads per atom
+                const int localSplineParamsIndex = threadLocalId + i * threadLocalIdMax;
+                const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
+                // const int globalSplineParamsIndex = itemIdx.get_group(ZZ) * splineParamsSize + localSplineParamsIndex;
+                if (localSplineParamsIndex < splineParamsSize)
+                {
+                    sm_theta[localSplineParamsIndex]  = a_theta[globalSplineParamsIndex];
+                    sm_dtheta[localSplineParamsIndex] = a_dtheta[globalSplineParamsIndex];
+                    assertIsFinite(sm_theta[localSplineParamsIndex]);
+                    assertIsFinite(sm_dtheta[localSplineParamsIndex]);
+                }
+            }
+
+            itemIdx.barrier(fence_space::local_space);
+        }
+        else
+        {
+            /* Recalculate Splines */
+            /* Staging coefficients/charges */
+            pmeGpuStageAtomData<float, atomsPerBlock, 1>(
+                    sm_coefficients.get_pointer(), a_coefficientsA.get_pointer(), itemIdx);
+            /* Staging coordinates */
+            pmeGpuStageAtomData<Float3, atomsPerBlock, 1>(
+                    sm_coordinates.get_pointer(), a_coordinates.get_pointer(), itemIdx);
+            itemIdx.barrier(fence_space::local_space);
+            const Float3 atomX      = sm_coordinates[atomIndexLocal];
+            const float  atomCharge = sm_coefficients[atomIndexLocal];
+
+            calculateSplines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids, subGroupSize>(
+                    atomIndexOffset,
+                    atomX,
+                    atomCharge,
+                    tablesOffsets,
+                    realGridSizeFP,
+                    currentRecipBox0,
+                    currentRecipBox1,
+                    currentRecipBox2,
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    a_fractShiftsTable.get_pointer(),
+                    a_gridlineIndicesTable.get_pointer(),
+                    sm_theta.get_pointer(),
+                    sm_dtheta.get_pointer(),
+                    sm_gridlineIndices.get_pointer(),
+                    sm_fractCoords.get_pointer(),
+                    itemIdx);
+            subGroupBarrier(itemIdx);
+        }
+        float fx = 0.0F;
+        float fy = 0.0F;
+        float fz = 0.0F;
+
+        const int chargeCheck = pmeGpuCheckAtomCharge(a_coefficientsA[atomIndexGlobal]);
+
+        const int nx  = realGridSize[XX];
+        const int ny  = realGridSize[YY];
+        const int nz  = realGridSize[ZZ];
+        const int pny = realGridSizePadded[YY];
+        const int pnz = realGridSizePadded[ZZ];
+
+        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+        const int warpIndex     = atomIndexLocal / atomsPerWarp;
+
+        const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+        const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
+        const cl::sycl::float2 tdz{ sm_theta[splineIndexZ], sm_dtheta[splineIndexZ] };
+
+        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+
+        if (iz >= nz)
+        {
+            iz -= nz;
+        }
+
+        const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : itemIdx.get_local_id(YY);
+        const int ithyMax =
+                (threadsPerAtom == ThreadsPerAtom::Order) ? order : itemIdx.get_local_id(YY) + 1;
+        if (chargeCheck)
+        {
+            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
+                                                                  &fy,
+                                                                  &fz,
+                                                                  ithyMin,
+                                                                  ithyMax,
+                                                                  ixBase,
+                                                                  iz,
+                                                                  nx,
+                                                                  ny,
+                                                                  pny,
+                                                                  pnz,
+                                                                  atomIndexLocal,
+                                                                  splineIndexBase,
+                                                                  tdz,
+                                                                  sm_gridlineIndices.get_pointer(),
+                                                                  sm_theta.get_pointer(),
+                                                                  sm_dtheta.get_pointer(),
+                                                                  a_gridA.get_pointer());
+        }
+        reduceAtomForces<order, atomDataSize, blockSize, subGroupSize>(
+                itemIdx, sm_forces.get_pointer(), atomIndexLocal, splineIndex, lineIndex, realGridSizeFP, fx, fy, fz);
+        itemIdx.barrier(fence_space::local_space);
+
+        /* Calculating the final forces with no component branching, atomsPerBlock threads */
+        const int forceIndexLocal  = threadLocalId;
+        const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+        if (forceIndexLocal < atomsPerBlock)
+        {
+            calculateAndStoreGridForces(sm_forces.get_pointer(),
+                                        forceIndexLocal,
+                                        forceIndexGlobal,
+                                        currentRecipBox0,
+                                        currentRecipBox1,
+                                        currentRecipBox2,
+                                        scale,
+                                        a_coefficientsA.get_pointer());
+        }
+        itemIdx.barrier(fence_space::local_space);
+
+        static_assert(atomsPerBlock <= subGroupSize);
+
+        /* Writing or adding the final forces component-wise, single warp */
+        constexpr int blockForcesSize = atomsPerBlock * DIM;
+        constexpr int numIter         = (blockForcesSize + subGroupSize - 1) / subGroupSize;
+        constexpr int iterThreads     = blockForcesSize / numIter;
+        if (threadLocalId < iterThreads)
+        {
+#pragma unroll
+            for (int i = 0; i < numIter; i++)
+            {
+                const int floatIndexLocal  = i * iterThreads + threadLocalId;
+                const int float3IndexLocal = floatIndexLocal / 3;
+                const int dimLocal         = floatIndexLocal % 3;
+                static_assert(blockForcesSize % DIM == 0); // Ensures that dimGlobal == dimLocal
+                const int float3IndexGlobal = blockIndex * atomsPerBlock + float3IndexLocal;
+                // const int float3IndexGlobal = itemIdx.get_group(ZZ) * atomsPerBlock + float3IndexLocal;
+                a_forces[float3IndexGlobal][dimLocal] = sm_forces[float3IndexLocal][dimLocal];
+            }
+        }
+
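+        // Second-grid pass: with numGrids == 2 (FEP with two separate grids), the same
+        // per-atom partial forces are accumulated again below, this time from a_gridB with
+        // the complementary coefficient scale (1.0F - scale), and added to the forces
+        // already written above.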
+        if constexpr (numGrids == 2)
+        {
+            /* We must sync here since the same shared memory is used as above. */
+            itemIdx.barrier(fence_space::local_space);
+            fx = 0.0F;
+            fy = 0.0F;
+            fz = 0.0F;
+            const bool chargeCheck = pmeGpuCheckAtomCharge(a_coefficientsB[atomIndexGlobal]);
+            if (chargeCheck)
+            {
+                sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
+                                                                      &fy,
+                                                                      &fz,
+                                                                      ithyMin,
+                                                                      ithyMax,
+                                                                      ixBase,
+                                                                      iz,
+                                                                      nx,
+                                                                      ny,
+                                                                      pny,
+                                                                      pnz,
+                                                                      atomIndexLocal,
+                                                                      splineIndexBase,
+                                                                      tdz,
+                                                                      sm_gridlineIndices.get_pointer(),
+                                                                      sm_theta.get_pointer(),
+                                                                      sm_dtheta.get_pointer(),
+                                                                      a_gridB.get_pointer());
+            }
+            // Reduction of partial force contributions
+            reduceAtomForces<order, atomDataSize, blockSize, subGroupSize>(
+                    itemIdx, sm_forces.get_pointer(), atomIndexLocal, splineIndex, lineIndex, realGridSizeFP, fx, fy, fz);
+            itemIdx.barrier(fence_space::local_space);
+
+            /* Calculating the final forces with no component branching, atomsPerBlock threads */
+            if (forceIndexLocal < atomsPerBlock)
+            {
+                calculateAndStoreGridForces(sm_forces.get_pointer(),
+                                            forceIndexLocal,
+                                            forceIndexGlobal,
+                                            currentRecipBox0,
+                                            currentRecipBox1,
+                                            currentRecipBox2,
+                                            1.0F - scale,
+                                            a_coefficientsB.get_pointer());
+            }
+
+            itemIdx.barrier(fence_space::local_space);
+
+            /* Writing or adding the final forces component-wise, single warp */
+            if (threadLocalId < iterThreads)
+            {
+#pragma unroll
+                for (int i = 0; i < numIter; i++)
+                {
+                    const int floatIndexLocal  = i * iterThreads + threadLocalId;
+                    const int float3IndexLocal = floatIndexLocal / 3;
+                    const int dimLocal         = floatIndexLocal % 3;
+                    static_assert(blockForcesSize % DIM == 0); // Ensures that dimGlobal == dimLocal
+                    const int float3IndexGlobal = blockIndex * atomsPerBlock + float3IndexLocal;
+                    a_forces[float3IndexGlobal][dimLocal] += sm_forces[float3IndexLocal][dimLocal];
+                }
+            }
+        }
+    };
+}
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::PmeGatherKernel()
+{
+    reset();
+}
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::setArg(
+        size_t argIndex,
+        void*  arg)
+{
+    if (argIndex == 0)
+    {
+        auto* params   = reinterpret_cast<PmeGpuKernelParams*>(arg);
+        gridParams_    = &params->grid;
+        atomParams_    = &params->atoms;
+        dynamicParams_ = &params->current;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(argIndex == 0, "Trying to pass too many args to the kernel");
+    }
+}
+
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+cl::sycl::event PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::launch(
+        const KernelLaunchConfig& config,
+        const DeviceStream&       deviceStream)
+{
+    GMX_RELEASE_ASSERT(gridParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(atomParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(dynamicParams_, "Can not launch the kernel before setting its args");
+
+    using kernelNameType =
+            PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
+
+    // SYCL has a different multidimensional layout than OpenCL/CUDA.
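+    // In SYCL's nd_range<3>, dimension 2 is the fastest-varying one (it plays the role of
+    // CUDA's x), so the CUDA-style {x, y, z} sizes from KernelLaunchConfig are reversed
+    // below. E.g., a hypothetical config.blockSize of {8, 4, 4} becomes a local range of
+    // {4, 4, 8}.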
+    const cl::sycl::range<3> localSize{ config.blockSize[2], config.blockSize[1], config.blockSize[0] };
+    const cl::sycl::range<3> groupRange{ config.gridSize[2], config.gridSize[1], config.gridSize[0] };
+    const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
+
+    cl::sycl::queue q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel = pmeGatherKernel<order, true, true, numGrids, readGlobal, threadsPerAtom, subGroupSize>(
+                cgh,
+                atomParams_->nAtoms,
+                gridParams_->d_realGrid[0],
+                gridParams_->d_realGrid[1],
+                atomParams_->d_coefficients[0],
+                atomParams_->d_coefficients[1],
+                atomParams_->d_coordinates,
+                atomParams_->d_forces,
+                atomParams_->d_theta,
+                atomParams_->d_dtheta,
+                atomParams_->d_gridlineIndices,
+                gridParams_->d_fractShiftsTable,
+                gridParams_->d_gridlineIndicesTable,
+                gridParams_->tablesOffsets,
+                gridParams_->realGridSize,
+                gridParams_->realGridSizeFP,
+                gridParams_->realGridSizePadded,
+                dynamicParams_->recipBox[0],
+                dynamicParams_->recipBox[1],
+                dynamicParams_->recipBox[2],
+                dynamicParams_->scale);
+        cgh.parallel_for<kernelNameType>(range, kernel);
+    });
+
+    // Delete set args, so we don't forget to set them before the next launch.
+    reset();
+
+    return e;
+}
+
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::reset()
+{
+    gridParams_    = nullptr;
+    atomParams_    = nullptr;
+    dynamicParams_ = nullptr;
+}
+
+
+//! Kernel instantiations
+/* Disable the "explicit template instantiation 'PmeGatherKernel<...>' will emit a vtable in every
+ * translation unit [-Wweak-template-vtables]" warning.
+ * It is only explicitly instantiated in this translation unit, so we should be safe.
+ */
+#ifdef __clang__
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
+
+#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize) \
+    template class PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)     \
+    INSTANTIATE_3(order, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_3(order, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)                                 \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#if GMX_SYCL_DPCPP
+INSTANTIATE(4, 16); // TODO: Choose best value, Issue #4153.
+#elif GMX_SYCL_HIPSYCL
+INSTANTIATE(4, 32);
+INSTANTIATE(4, 64);
+#endif
+
+#ifdef __clang__
+#    pragma clang diagnostic pop
+#endif
diff --git a/src/gromacs/ewald/pme_gather_sycl.h b/src/gromacs/ewald/pme_gather_sycl.h
new file mode 100644
index 0000000000..c6b07f34ea
--- /dev/null
+++ b/src/gromacs/ewald/pme_gather_sycl.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the PME GPU gather kernel for SYCL.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_grid.h"
+#include "pme_gpu_types_host.h"
+
+struct PmeGpuGridParams;
+struct PmeGpuAtomParams;
+struct PmeGpuDynamicParams;
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+class PmeGatherKernel : public ISyclKernelFunctor
+{
+public:
+    PmeGatherKernel();
+    void setArg(size_t argIndex, void* arg) override;
+    cl::sycl::event launch(const KernelLaunchConfig& config, const DeviceStream& deviceStream) override;
+
+private:
+    PmeGpuGridParams*    gridParams_;
+    PmeGpuAtomParams*    atomParams_;
+    PmeGpuDynamicParams* dynamicParams_;
+    void reset();
+};
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
index 07da0a848c..196cff26ae 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
@@ -49,6 +49,7 @@
 #include "gromacs/gpu_utils/syclutils.h"
 
 #include "pme_gpu_program_impl.h"
+#include "pme_gather_sycl.h"
 #include "pme_spread_sycl.h"
 
 #include "pme_gpu_constants.h"
@@ -72,20 +73,32 @@ static int subGroupSizeFromVendor(const DeviceInformation& deviceInfo)
     }
 }
 
-#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
+#define INSTANTIATE_SPREAD_2(                                                                      \
+        order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
     extern template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
 
-#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)                \
-    INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize); \
-    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
-    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
-    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
+#define INSTANTIATE_SPREAD(order, numGrids, threadsPerAtom, subGroupSize)                   \
+    INSTANTIATE_SPREAD_2(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_SPREAD_2(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_SPREAD_2(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_SPREAD_2(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
 
-#define INSTANTIATE(order, subGroupSize)                                 \
-    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
-    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
-    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
-    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+#define INSTANTIATE_GATHER_2(order, numGrids, readGlobal, threadsPerAtom, subGroupSize) \
+    extern template class PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_GATHER(order, numGrids, threadsPerAtom, subGroupSize)      \
+    INSTANTIATE_GATHER_2(order, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_GATHER_2(order, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE_X(x, order, subGroupSize)                              \
+    INSTANTIATE_##x(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_##x(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_##x(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_##x(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)        \
+    INSTANTIATE_X(SPREAD, order, subGroupSize); \
+    INSTANTIATE_X(GATHER, order, subGroupSize);
 
 #if GMX_SYCL_DPCPP
 INSTANTIATE(4, 16);
@@ -135,6 +148,22 @@ static void setKernelPointers(struct PmeGpuProgramImpl* pmeGpuProgram)
             new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, 1, true, ThreadsPerAtom::Order, subGroupSize>();
     pmeGpuProgram->spreadKernelThPerAtom4Dual =
             new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelSingle =
+            new PmeGatherKernel<c_pmeOrder, 1, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelThPerAtom4Single =
+            new PmeGatherKernel<c_pmeOrder, 1, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesSingle =
+            new PmeGatherKernel<c_pmeOrder, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesThPerAtom4Single =
+            new PmeGatherKernel<c_pmeOrder, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelDual =
+            new PmeGatherKernel<c_pmeOrder, 2, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelThPerAtom4Dual =
+            new PmeGatherKernel<c_pmeOrder, 2, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesDual =
+            new PmeGatherKernel<c_pmeOrder, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesThPerAtom4Dual =
+            new PmeGatherKernel<c_pmeOrder, 2, true, ThreadsPerAtom::Order, subGroupSize>();
 }
 
 PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp
index d1d3ffcc7f..444ff904cf 100644
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -254,7 +254,14 @@ public:
             }
             s_inputAtomDataSets_[atomCount] = atomData;
         }
-        s_pmeTestHardwareContexts = createPmeTestHardwareContextList();
+        s_pmeTestHardwareContexts    = createPmeTestHardwareContextList();
+        g_allowPmeWithSyclForTesting = true; // We support PmeGather with SYCL
+    }
+
+    static void TearDownTestSuite()
+    {
+        // Revert the value back.
+        g_allowPmeWithSyclForTesting = false;
     }
 
     //! The test
-- 
2.22.0