From: Andrey Alekseenko
Date: Tue, 21 Sep 2021 18:54:59 +0000 (+0000)
Subject: SYCL PME Spread kernel
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=f190fa59b4a72390b2aea2e74b4960941098ce4d;p=alexxy%2Fgromacs.git

SYCL PME Spread kernel
---

diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt
index ce81582dcb..27dc116a28 100644
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -88,20 +88,21 @@ elseif (GMX_GPU_OPENCL)
 elseif (GMX_GPU_SYCL)
     # SYCL-TODO: proper implementation
     gmx_add_libgromacs_sources(
-        # Files that implement stubs
-        pme_gpu_sycl_stubs.cpp
         # GPU-specific sources
         pme_gpu.cpp
         pme_gpu_internal.cpp
+        pme_gpu_program_impl_sycl.cpp
+        pme_spread_sycl.cpp
         pme_gpu_timings.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
         pme_gpu_internal.cpp
         pme_gpu_program.cpp
-        pme_gpu_sycl_stubs.cpp
+        pme_gpu_program_impl_sycl.cpp
         pme_gpu_3dfft_sycl.cpp
         pme_gpu_timings.cpp
-        )
+        pme_spread_sycl.cpp
+        )
 else()
     gmx_add_libgromacs_sources(
         # Files that implement stubs
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index 5eba296172..fe65e9d6e3 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -127,6 +127,8 @@
 #include "pme_spline_work.h"
 #include "pme_spread.h"
 
+bool g_allowPmeWithSyclForTesting = false;
+
 bool pme_gpu_supports_build(std::string* error)
 {
     gmx::MessageStringCollector errorReasons;
@@ -134,7 +136,7 @@ bool pme_gpu_supports_build(std::string* error)
     errorReasons.startContext("PME GPU does not support:");
     errorReasons.appendIf(GMX_DOUBLE, "Double-precision build of GROMACS.");
     errorReasons.appendIf(!GMX_GPU, "Non-GPU build of GROMACS.");
-    errorReasons.appendIf(GMX_GPU_SYCL, "SYCL build."); // SYCL-TODO
+    errorReasons.appendIf(GMX_GPU_SYCL && !g_allowPmeWithSyclForTesting, "SYCL build."); // SYCL-TODO
     errorReasons.finishContext();
     if (error != nullptr)
     {
@@ -196,7 +198,7 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t* pme, std::string* error)
     errorReasons.appendIf(pme->doLJ, "Lennard-Jones PME.");
     errorReasons.appendIf(GMX_DOUBLE, "Double precision build of GROMACS.");
     errorReasons.appendIf(!GMX_GPU, "Non-GPU build of GROMACS.");
-    errorReasons.appendIf(GMX_GPU_SYCL, "SYCL build of GROMACS."); // SYCL-TODO
+    errorReasons.appendIf(GMX_GPU_SYCL && !g_allowPmeWithSyclForTesting, "SYCL build of GROMACS."); // SYCL-TODO
     errorReasons.finishContext();
     if (error != nullptr)
     {
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index 1e32caf265..0576e51b19 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -75,6 +75,18 @@ enum class GpuTaskCompletion;
 class PmeGpuProgram;
 class GpuEventSynchronizer;
 
+/*! \brief Hack to selectively enable some parts of PME during unit testing.
+ *
+ * Set to \c false by default. If any of the tests sets it to \c true, it will
+ * make the compatibility check consider PME to be supported in SYCL builds.
+ *
+ * Currently we don't have a proper PME implementation with SYCL, but we still want
+ * to run tests for some of the kernels.
+ *
+ * \todo Remove after #3927 is done and PME is fully enabled in SYCL builds.
+ */
+extern bool g_allowPmeWithSyclForTesting;
+
 namespace gmx
 {
 template<typename>
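
The testing-only flag introduced above is meant to be flipped around the compatibility checks and restored afterwards. A minimal sketch of the intended usage, assuming only the declarations this patch adds to pme.h (the helper function itself is hypothetical):

    #include <string>

    extern bool g_allowPmeWithSyclForTesting;        // added to pme.h by this patch
    bool pme_gpu_supports_build(std::string* error); // declared in pme.h

    // Hypothetical helper: opt into the SYCL PME paths, check support, restore the default.
    void runWithSyclPmeEnabled()
    {
        g_allowPmeWithSyclForTesting = true;
        std::string error;
        const bool  supported = pme_gpu_supports_build(&error);
        (void)supported; // ... run the spline/spread kernel tests only if supported ...
        g_allowPmeWithSyclForTesting = false; // revert, as TearDownTestSuite() does below
    }
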
diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines_sycl.h b/src/gromacs/ewald/pme_gpu_calculate_splines_sycl.h
new file mode 100644
index 0000000000..998ce6b2e4
--- /dev/null
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines_sycl.h
@@ -0,0 +1,403 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements helper routines for PME gather and spline routines.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
+
+#include "pme_grid.h"
+#include "pme_gpu_constants.h"
+#include "pme_gpu_types.h"
+
+namespace
+{
+
+/*! \brief Asserts that the argument is finite.
+ *
+ * The function works for any data type that can be cast to float. Note that there is also
+ * a specialized implementation for the Float3 data type.
+ *
+ * \param[in] arg Argument to check.
+ */
+template<typename T>
+inline void assertIsFinite(T arg);
+
+#if defined(NDEBUG) || GMX_SYCL_HIPSYCL
+// We have no cl::sycl::isfinite in hipSYCL yet
+template<typename T>
+inline void assertIsFinite(T /* arg */)
+{
+}
+#else
+template<>
+inline void assertIsFinite(Float3 gmx_used_in_debug arg)
+{
+    assert(cl::sycl::isfinite(arg[0]));
+    assert(cl::sycl::isfinite(arg[1]));
+    assert(cl::sycl::isfinite(arg[2]));
+}
+
+template<typename T>
+inline void assertIsFinite(T gmx_used_in_debug arg)
+{
+    assert(cl::sycl::isfinite(static_cast<float>(arg)));
+}
+#endif
+
+} // namespace
+
+using cl::sycl::access::fence_space;
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+/*! \internal \brief
+ * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index
+ * within the execution block.
+ * Feed the result into getSplineParamIndex() to get a full index.
+ * TODO: it's likely that both parameters can be just replaced with a single atom index, as they
+ * are derived from it. Do that, verifying that the generated code is not bloated, and/or revise
+ * the spline indexing scheme.
+ * Removing the warp dependency would also be nice (and would probably coincide with removing
+ * c_pmeSpreadGatherAtomsPerWarp).
+ *
+ * \tparam order            PME order
+ * \tparam atomsPerSubGroup Number of atoms processed by a sub group
+ * \param[in] subGroupIndex     Sub group index in the work group.
+ * \param[in] atomSubGroupIndex Atom index in the sub group (from 0 to atomsPerSubGroup - 1).
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerSubGroup>
+static inline int getSplineParamIndexBase(int subGroupIndex, int atomSubGroupIndex)
+{
+    assert((atomSubGroupIndex >= 0) && (atomSubGroupIndex < atomsPerSubGroup));
+    constexpr int dimIndex    = 0;
+    constexpr int splineIndex = 0;
+    // The zeroes are here to preserve the full index formula for reference
+    return (((splineIndex + order * subGroupIndex) * DIM + dimIndex) * atomsPerSubGroup + atomSubGroupIndex);
+}
+
+/*! \internal \brief
+ * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The index is w.r.t. the execution block,
+ * in the range [0, atomsPerBlock * order * DIM).
+ * This function consumes the result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
+ *
+ * \tparam order            PME order
+ * \tparam atomsPerSubGroup Number of atoms processed by a sub group
+ * \param[in] paramIndexBase Must be the result of getSplineParamIndexBase().
+ * \param[in] dimIndex       Dimension index (from 0 to 2)
+ * \param[in] splineIndex    Spline contribution index (from 0 to \p order - 1)
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerSubGroup>
+static inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+{
+    assert((dimIndex >= XX) && (dimIndex < DIM));
+    assert((splineIndex >= 0) && (splineIndex < order));
+    return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerSubGroup);
+}
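
To make the layout concrete: the two helpers above interleave atoms within a sub-group, so consecutive buffer elements belong to different atoms. A standalone host-side sketch (constants inlined; in GROMACS, DIM comes from the vec-type headers) that prints the indices for order = 4 and two atoms per sub-group:

    #include <cstdio>

    constexpr int c_dim = 3; // stand-in for DIM

    // Same formulas as getSplineParamIndexBase()/getSplineParamIndex() above.
    template<int order, int atomsPerSubGroup>
    constexpr int paramIndexBase(int subGroupIndex, int atomSubGroupIndex)
    {
        return ((order * subGroupIndex) * c_dim) * atomsPerSubGroup + atomSubGroupIndex;
    }

    template<int order, int atomsPerSubGroup>
    constexpr int paramIndex(int base, int dimIndex, int splineIndex)
    {
        return base + (splineIndex * c_dim + dimIndex) * atomsPerSubGroup;
    }

    int main()
    {
        // Atom 0 of sub-group 0 gets the even indices, atom 1 the odd ones.
        for (int splineIndex = 0; splineIndex < 4; splineIndex++)
        {
            for (int dimIndex = 0; dimIndex < c_dim; dimIndex++)
            {
                std::printf("spline %d dim %d -> atom0 %2d, atom1 %2d\n",
                            splineIndex,
                            dimIndex,
                            paramIndex<4, 2>(paramIndexBase<4, 2>(0, 0), dimIndex, splineIndex),
                            paramIndex<4, 2>(paramIndexBase<4, 2>(0, 1), dimIndex, splineIndex));
            }
        }
        return 0;
    }
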
+
+/*! \internal \brief
+ * An inline function for skipping the zero-charge atoms when \c c_skipNeutralAtoms is \c true.
+ *
+ * \returns \c true if atom should be processed, \c false otherwise.
+ * \param[in] charge The atom charge.
+ */
+static inline bool pmeGpuCheckAtomCharge(const float charge)
+{
+    assertIsFinite(charge);
+    return c_skipNeutralAtoms ? (charge != 0.0F) : true;
+}
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam T                 Data type (float/int/...).
+ * \tparam atomsPerWorkGroup Number of atoms processed by a block - should be accounted for
+ *                           in the size of the shared memory array.
+ * \tparam dataCountPerAtom  Number of data elements per single atom (e.g. \c DIM for an rvec
+ *                           coordinates array).
+ * \param[out] sm_destination Shared memory array for output.
+ * \param[in]  gm_source      Global memory array for input.
+ * \param[in]  itemIdx        SYCL thread ID.
+ */
+template<typename T, int atomsPerWorkGroup, int dataCountPerAtom>
+static inline void pmeGpuStageAtomData(cl::sycl::local_ptr<T>              sm_destination,
+                                       const cl::sycl::global_ptr<const T> gm_source,
+                                       cl::sycl::nd_item<3>                itemIdx)
+{
+    const int blockIndex      = itemIdx.get_group_linear_id();
+    const int localIndex      = itemIdx.get_local_linear_id();
+    const int globalIndexBase = blockIndex * atomsPerWorkGroup * dataCountPerAtom;
+    const int globalIndex     = globalIndexBase + localIndex;
+    if (localIndex < atomsPerWorkGroup * dataCountPerAtom)
+    {
+        assertIsFinite(gm_source[globalIndex]);
+        sm_destination[localIndex] = gm_source[globalIndex];
+    }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam order         PME interpolation order.
+ * \tparam atomsPerBlock Number of atoms processed by a block - should be accounted for
+ *                       in the sizes of the shared memory arrays.
+ * \tparam atomsPerWarp  Number of atoms processed by a warp.
+ * \tparam writeSmDtheta Bool controlling if the theta derivative should be written to
+ *                       shared memory. Enables calculation of dtheta if set.
+ * \tparam writeGlobal   A boolean which tells if the theta values and gridlines should
+ *                       be written to global memory. Enables calculation of dtheta if set.
+ * \tparam numGrids      The number of grids using the splines.
+ * \tparam subGroupSize  The size of a sub-group (warp).
+ * \param[in]  atomIndexOffset         Starting atom index for the execution block in the global memory.
+ * \param[in]  atomX                   Coordinates of atom processed by thread.
+ * \param[in]  atomCharge              Charge/coefficient of atom processed by thread.
+ * \param[in]  tablesOffsets           Offsets for X/Y/Z components of \p gm_fractShiftsTable and
+ *                                     \p gm_gridlineIndicesTable.
+ * \param[in]  realGridSizeFP          Real-space grid dimensions, converted to floating point.
+ * \param[in]  currentRecipBox0        Current reciprocal (inverted unit cell) box, vector 1.
+ * \param[in]  currentRecipBox1        Current reciprocal (inverted unit cell) box, vector 2.
+ * \param[in]  currentRecipBox2        Current reciprocal (inverted unit cell) box, vector 3.
+ * \param[out] gm_theta                Atom spline values in the global memory.
+ *                                     Used only if \p writeGlobal is \c true.
+ * \param[out] gm_dtheta               Derivatives of atom spline values in the global memory.
+ *                                     Used only if \p writeGlobal is \c true.
+ * \param[out] gm_gridlineIndices      Atom gridline indices in the global memory.
+ *                                     Used only if \p writeGlobal is \c true.
+ * \param[in]  gm_fractShiftsTable     Fractional shifts lookup table in the global memory.
+ * \param[in]  gm_gridlineIndicesTable Gridline indices lookup table in the global memory.
+ * \param[out] sm_theta                Atom spline values in the local memory.
+ * \param[out] sm_dtheta               Derivatives of atom spline values in the local memory.
+ * \param[out] sm_gridlineIndices      Atom gridline indices in the local memory.
+ * \param[out] sm_fractCoords          Fractional coordinates in the local memory.
+ * \param[in]  itemIdx                 SYCL thread ID.
+ */
+template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids, int subGroupSize>
+static inline void calculateSplines(const int                               atomIndexOffset,
+                                    const Float3                            atomX,
+                                    const float                             atomCharge,
+                                    const gmx::IVec                         tablesOffsets,
+                                    const gmx::RVec                         realGridSizeFP,
+                                    const gmx::RVec                         currentRecipBox0,
+                                    const gmx::RVec                         currentRecipBox1,
+                                    const gmx::RVec                         currentRecipBox2,
+                                    cl::sycl::global_ptr<float>             gm_theta,
+                                    cl::sycl::global_ptr<float>             gm_dtheta,
+                                    cl::sycl::global_ptr<int>               gm_gridlineIndices,
+                                    const cl::sycl::global_ptr<const float> gm_fractShiftsTable,
+                                    const cl::sycl::global_ptr<const int>   gm_gridlineIndicesTable,
+                                    cl::sycl::local_ptr<float>              sm_theta,
+                                    cl::sycl::local_ptr<float>              sm_dtheta,
+                                    cl::sycl::local_ptr<int>                sm_gridlineIndices,
+                                    cl::sycl::local_ptr<float>              sm_fractCoords,
+                                    cl::sycl::nd_item<3>                    itemIdx)
+{
+    static_assert(numGrids == 1 || numGrids == 2);
+    static_assert(numGrids == 1 || c_skipNeutralAtoms == false);
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = itemIdx.get_local_linear_id();
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / subGroupSize;
+    /* Atom index w.r.t. warp - alternating 0 1 0 1 ... */
+    const int atomWarpIndex = itemIdx.get_local_id(0) % atomsPerWarp;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+
+    /* Spline contribution index in one dimension */
+    const int threadLocalIdXY =
+            (itemIdx.get_local_id(1) * itemIdx.get_group_range(2)) + itemIdx.get_local_id(2);
+    const int orderIndex = threadLocalIdXY / DIM;
+    /* Dimension index */
+    const int dimIndex = threadLocalIdXY % DIM;
+
+    /* Multi-purpose index of rvec/ivec atom data */
+    const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
+    float splineData[order];
+
+    const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
+
+    /* we have 4 threads per atom, but can only use 3 here for the dimensions */
+    if (localCheck)
+    {
+        /* Indices interpolation */
+        if (orderIndex == 0)
+        {
+            int   tableIndex, tInt;
+            float n, t;
+            assert(atomIndexLocal < DIM * atomsPerBlock);
+            // Switch structure inherited from CUDA.
+            // TODO: Issue #4153: Direct indexing with dimIndex can be better with SYCL
+            switch (dimIndex)
+            {
+                case XX:
+                    tableIndex = tablesOffsets[XX];
+                    n          = realGridSizeFP[XX];
+                    t          = atomX[XX] * currentRecipBox0[XX] + atomX[YY] * currentRecipBox0[YY]
+                        + atomX[ZZ] * currentRecipBox0[ZZ];
+                    break;
+
+                case YY:
+                    tableIndex = tablesOffsets[YY];
+                    n          = realGridSizeFP[YY];
+                    t          = atomX[YY] * currentRecipBox1[YY] + atomX[ZZ] * currentRecipBox1[ZZ];
+                    break;
+
+                case ZZ:
+                    tableIndex = tablesOffsets[ZZ];
+                    n          = realGridSizeFP[ZZ];
+                    t          = atomX[ZZ] * currentRecipBox2[ZZ];
+                    break;
+            }
+            const float shift = c_pmeMaxUnitcellShift;
+            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+            t    = (t + shift) * n;
+            tInt = static_cast<int>(t);
+            assert(sharedMemoryIndex < atomsPerBlock * DIM);
+            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            tableIndex += tInt;
+            assert(tInt >= 0);
+            assert(tInt < c_pmeNeighborUnitcellCount * n);
+
+            // TODO: Issue #4153: use shared table for both parameters to share the fetch, as index is always same.
+            sm_fractCoords[sharedMemoryIndex] += gm_fractShiftsTable[tableIndex];
+            sm_gridlineIndices[sharedMemoryIndex] = gm_gridlineIndicesTable[tableIndex];
+            if constexpr (writeGlobal)
+            {
+                gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] =
+                        sm_gridlineIndices[sharedMemoryIndex];
+            }
+        }
+
+        /* B-spline calculation */
+        const int chargeCheck = pmeGpuCheckAtomCharge(atomCharge);
+        /* With FEP (numGrids == 2), we might have 0 charge in state A, but != 0 in state B, so we always calculate splines */
+        if (numGrids == 2 || chargeCheck)
+        {
+            const float dr = sm_fractCoords[sharedMemoryIndex];
+            assertIsFinite(dr);
+
+            /* dr is relative offset from lower cell limit */
+            splineData[order - 1] = 0.0F;
+            splineData[1]         = dr;
+            splineData[0]         = 1.0F - dr;
+
+#pragma unroll
+            for (int k = 3; k < order; k++)
+            {
+                const float div   = 1.0F / (k - 1.0F);
+                splineData[k - 1] = div * dr * splineData[k - 2];
+#pragma unroll
+                for (int l = 1; l < (k - 1); l++)
+                {
+                    splineData[k - l - 1] =
+                            div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]);
+                }
+                splineData[0] = div * (1.0F - dr) * splineData[0];
+            }
+
+            const int thetaIndexBase =
+                    getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+            /* only calculate dtheta if we are saving it to shared or global memory */
+            if constexpr (writeSmDtheta || writeGlobal)
+            {
+                /* Differentiation and storing the spline derivatives (dtheta) */
+#pragma unroll
+                for (int o = 0; o < order; o++)
+                {
+                    const int thetaIndex =
+                            getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+
+                    const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0F) - splineData[o];
+                    assertIsFinite(dtheta);
+                    assert(thetaIndex < order * DIM * atomsPerBlock);
+                    if constexpr (writeSmDtheta)
+                    {
+                        sm_dtheta[thetaIndex] = dtheta;
+                    }
+                    if constexpr (writeGlobal)
+                    {
+                        const int thetaGlobalIndex  = thetaGlobalOffsetBase + thetaIndex;
+                        gm_dtheta[thetaGlobalIndex] = dtheta;
+                    }
+                }
+            }
+
+            const float div       = 1.0F / (order - 1.0F);
+            splineData[order - 1] = div * dr * splineData[order - 2];
+#pragma unroll
+            for (int k = 1; k < (order - 1); k++)
+            {
+                splineData[order - k - 1] = div
+                                            * ((dr + k) * splineData[order - k - 2]
+                                               + (order - k - dr) * splineData[order - k - 1]);
+            }
+            splineData[0] = div * (1.0F - dr) * splineData[0];
+
+            /* Storing the spline values (theta) */
+#pragma unroll
+            for (int o = 0; o < order; o++)
+            {
+                const int thetaIndex =
+                        getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+                assert(thetaIndex < order * DIM * atomsPerBlock);
+                sm_theta[thetaIndex] = splineData[o];
+                assertIsFinite(sm_theta[thetaIndex]);
+                if constexpr (writeGlobal)
+                {
+                    const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+                    gm_theta[thetaGlobalIndex] = splineData[o];
+                }
+            }
+        }
+    }
+}
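
For reference, the two-stage recursion used in calculateSplines() can be checked on the host in isolation. A standalone sketch of the same order-4 weight construction (same arithmetic, no GPU types); for dr = 0.5 it yields the cubic B-spline weights 1/48, 23/48, 23/48, 1/48, which sum to 1:

    #include <cstdio>

    // Host-side copy of the order-4 B-spline weight construction used above.
    void makeBsplineWeights(const float dr, float splineData[4])
    {
        constexpr int order   = 4;
        splineData[order - 1] = 0.0F;
        splineData[1]         = dr;
        splineData[0]         = 1.0F - dr;

        for (int k = 3; k < order; k++)
        {
            const float div   = 1.0F / (k - 1.0F);
            splineData[k - 1] = div * dr * splineData[k - 2];
            for (int l = 1; l < (k - 1); l++)
            {
                splineData[k - l - 1] = div
                                        * ((dr + l) * splineData[k - l - 2]
                                           + (k - l - dr) * splineData[k - l - 1]);
            }
            splineData[0] = div * (1.0F - dr) * splineData[0];
        }
        // Final-order pass, the last stage of the recursion in calculateSplines()
        const float div       = 1.0F / (order - 1.0F);
        splineData[order - 1] = div * dr * splineData[order - 2];
        for (int k = 1; k < (order - 1); k++)
        {
            splineData[order - k - 1] = div
                                        * ((dr + k) * splineData[order - k - 2]
                                           + (order - k - dr) * splineData[order - k - 1]);
        }
        splineData[0] = div * (1.0F - dr) * splineData[0];
    }

    int main()
    {
        float w[4];
        makeBsplineWeights(0.5F, w);
        std::printf("%g %g %g %g (sum %g)\n", w[0], w[1], w[2], w[3], w[0] + w[1] + w[2] + w[3]);
        return 0;
    }
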
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h
index 8a1e8afaf1..621a7b9631 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl.h
+++ b/src/gromacs/ewald/pme_gpu_program_impl.h
@@ -49,6 +49,7 @@
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/utility/classhelpers.h"
 
+class ISyclKernelFunctor;
 class DeviceContext;
 struct DeviceInformation;
 
@@ -86,7 +87,7 @@ struct PmeGpuProgramImpl
 #elif GMX_GPU_OPENCL
     using PmeKernelHandle = cl_kernel;
 #else
-    using PmeKernelHandle = void*;
+    using PmeKernelHandle = ISyclKernelFunctor*;
 #endif
 
     /*! \brief
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
new file mode 100644
index 0000000000..07da0a848c
--- /dev/null
+++ b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
@@ -0,0 +1,179 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl, which stores permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov
+ * \author Andrey Alekseenko
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "gromacs/hardware/device_information.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_gpu_program_impl.h"
+#include "pme_spread_sycl.h"
+
+#include "pme_gpu_constants.h"
+#include "pme_gpu_internal.h" // for GridOrdering enum
+#include "pme_gpu_types_host.h"
+
+// PME interpolation order
+constexpr int c_pmeOrder = 4;
+// These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
+constexpr bool c_wrapX = true;
+constexpr bool c_wrapY = true;
+
+static int subGroupSizeFromVendor(const DeviceInformation& deviceInfo)
+{
+    switch (deviceInfo.deviceVendor)
+    {
+        case DeviceVendor::Amd: return 64;   // Handle RDNA2 devices, Issue #3972.
+        case DeviceVendor::Intel: return 16; // TODO: Choose best value, Issue #4153.
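
The vendor table above hard-codes known-good sub-group sizes for the kernels compiled in this file. For reference only, SYCL 2020 also lets one query the sizes a device supports at run time; a hedged sketch of such a query (not what this patch does, since the kernel instantiations below are compiled for fixed sizes):

    #include <CL/sycl.hpp>
    #include <cstdio>

    int main()
    {
        const cl::sycl::device dev{ cl::sycl::default_selector{} };
        // SYCL 2020 device descriptor listing the supported sub-group sizes.
        const auto sizes = dev.get_info<cl::sycl::info::device::sub_group_sizes>();
        for (const size_t s : sizes)
        {
            std::printf("supported sub-group size: %zu\n", s);
        }
        return 0;
    }
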
+        case DeviceVendor::Nvidia: return 32;
+        default: GMX_RELEASE_ASSERT(false, "Unknown device vendor"); return 0;
+    }
+}
+
+#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
+    extern template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, true, true, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)                 \
+    INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)                                 \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#if GMX_SYCL_DPCPP
+INSTANTIATE(4, 16);
+#elif GMX_SYCL_HIPSYCL
+INSTANTIATE(4, 32);
+INSTANTIATE(4, 64);
+#endif
+
+
+//! Helper function to set proper kernel functor pointers
+template<int subGroupSize>
+static void setKernelPointers(struct PmeGpuProgramImpl* pmeGpuProgram)
+{
+    /* Not all combinations of the splineAndSpread, spline and spread kernels are required.
+     * If running only the spline kernel (without the spread), it does not make sense not to
+     * write the data to global memory. Similarly, the spread kernel (without the spline)
+     * implies that we should read the spline data from global memory.
+     */
+    pmeGpuProgram->splineAndSpreadKernelSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineKernelSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineKernelThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->spreadKernelSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->spreadKernelThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineKernelDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineKernelThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->spreadKernelDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->spreadKernelThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+}
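
setKernelPointers() is templated on the sub-group size while the device only reports its size at run time, so the constructor below bridges the two with a switch over the supported values. The pattern in isolation:

    #include <cstdio>

    // The runtime value selects among compile-time instantiations - a common
    // bridge pattern between runtime device properties and template parameters.
    template<int subGroupSize>
    static void doWork()
    {
        std::printf("running the %d-wide instantiation\n", subGroupSize);
    }

    static void dispatch(const int runtimeSize)
    {
        switch (runtimeSize)
        {
            case 16: doWork<16>(); break;
            case 32: doWork<32>(); break;
            case 64: doWork<64>(); break;
            default: std::printf("unsupported size %d\n", runtimeSize);
        }
    }

    int main()
    {
        dispatch(32);
        return 0;
    }
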
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
+    deviceContext_(deviceContext)
+{
+    // kernel parameters
+    warpSize_             = subGroupSizeFromVendor(deviceContext.deviceInfo());
+    spreadWorkGroupSize   = c_spreadMaxWarpsPerBlock * warpSize_;
+    solveMaxWorkGroupSize = c_solveMaxWarpsPerBlock * warpSize_;
+    gatherWorkGroupSize   = c_gatherMaxWarpsPerBlock * warpSize_;
+
+    switch (warpSize_)
+    {
+#if GMX_SYCL_DPCPP
+        case 16: setKernelPointers<16>(this); break;
+#elif GMX_SYCL_HIPSYCL
+        case 32: setKernelPointers<32>(this); break;
+        case 64: setKernelPointers<64>(this); break;
+#endif
+        default: GMX_RELEASE_ASSERT(false, "Invalid sub group size");
+    }
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl()
+{
+    delete splineKernelSingle;
+    delete splineKernelThPerAtom4Single;
+    delete spreadKernelSingle;
+    delete spreadKernelThPerAtom4Single;
+    delete splineAndSpreadKernelSingle;
+    delete splineAndSpreadKernelThPerAtom4Single;
+    delete splineAndSpreadKernelWriteSplinesSingle;
+    delete splineAndSpreadKernelWriteSplinesThPerAtom4Single;
+    delete splineKernelDual;
+    delete splineKernelThPerAtom4Dual;
+    delete spreadKernelDual;
+    delete spreadKernelThPerAtom4Dual;
+    delete splineAndSpreadKernelDual;
+    delete splineAndSpreadKernelThPerAtom4Dual;
+    delete splineAndSpreadKernelWriteSplinesDual;
+    delete splineAndSpreadKernelWriteSplinesThPerAtom4Dual;
+}
diff --git a/src/gromacs/ewald/pme_spread_sycl.cpp b/src/gromacs/ewald/pme_spread_sycl.cpp
new file mode 100644
index 0000000000..c88c1de374
--- /dev/null
+++ b/src/gromacs/ewald/pme_spread_sycl.cpp
@@ -0,0 +1,501 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME GPU spline calculation and charge spreading in SYCL.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gmxpre.h"
+
+#include "pme_spread_sycl.h"
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_gpu_calculate_splines_sycl.h"
+#include "pme_grid.h"
+#include "pme_gpu_types_host.h"
+
+/*! \brief
+ * Charge spreading onto the grid.
+ * This corresponds to the CPU function spread_coefficients_bsplines_thread().
+ * Optional second stage of the spline_and_spread_kernel.
+ *
+ * \tparam order          PME interpolation order.
+ * \tparam wrapX          Whether the grid overlap in dimension X should be wrapped.
+ * \tparam wrapY          Whether the grid overlap in dimension Y should be wrapped.
+ * \tparam threadsPerAtom How many threads work on each atom.
+ * \tparam subGroupSize   Size of the sub-group.
+ *
+ * \param[in]     atomCharge         Atom charge/coefficient of atom processed by thread.
+ * \param[in]     realGridSize       Size of the real grid.
+ * \param[in]     realGridSizePadded Padded size of the real grid.
+ * \param[in,out] gm_grid            Device pointer to the real grid to which charges are added.
+ * \param[in]     sm_gridlineIndices Atom gridline indices in the local memory.
+ * \param[in]     sm_theta           Atom spline values in the local memory.
+ * \param[in]     itemIdx            Current thread ID.
+ */
+template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+inline void spread_charges(const float                      atomCharge,
+                           const int                        realGridSize[DIM],
+                           const int                        realGridSizePadded[DIM],
+                           cl::sycl::global_ptr<float>      gm_grid,
+                           const cl::sycl::local_ptr<int>   sm_gridlineIndices,
+                           const cl::sycl::local_ptr<float> sm_theta,
+                           const cl::sycl::nd_item<3>&      itemIdx)
+{
+    //! Number of atoms processed by a single warp in spread and gather
+    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
+    const int atomsPerWarp        = subGroupSize / threadsPerAtomValue;
+
+    const int nx  = realGridSize[XX];
+    const int ny  = realGridSize[YY];
+    const int nz  = realGridSize[ZZ];
+    const int pny = realGridSizePadded[YY];
+    const int pnz = realGridSizePadded[ZZ];
+
+    const int atomIndexLocal = itemIdx.get_local_id(0);
+
+    const bool chargeCheck = pmeGpuCheckAtomCharge(atomCharge);
+
+    if (chargeCheck)
+    {
+        // Spline Z coordinates
+        const int ithz = itemIdx.get_local_id(2);
+
+        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+        const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY];
+        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+        if (iz >= nz)
+        {
+            iz -= nz;
+        }
+        /* Atom index w.r.t. warp - alternating 0 1 0 1 ... */
+        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+        /* Warp index w.r.t. block - could probably be obtained easier? */
+        const int warpIndex = atomIndexLocal / atomsPerWarp;
+
+        const int splineIndexBase =
+                getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+        const int   splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
+        const float thetaZ       = sm_theta[splineIndexZ];
+
+        /* loop not used if order*order threads per atom */
+        const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : itemIdx.get_local_id(YY);
+        const int ithyMax =
+                (threadsPerAtom == ThreadsPerAtom::Order) ? order : itemIdx.get_local_id(YY) + 1;
+        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
+        {
+            int iy = iyBase + ithy;
+            if (wrapY & (iy >= ny))
+            {
+                iy -= ny;
+            }
+
+            const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+            const float thetaY = sm_theta[splineIndexY];
+            const float Val    = thetaZ * thetaY * atomCharge;
+            assertIsFinite(Val);
+            const int offset = iy * pnz + iz;
+
+#pragma unroll
+            for (int ithx = 0; (ithx < order); ithx++)
+            {
+                int ix = ixBase + ithx;
+                if (wrapX & (ix >= nx))
+                {
+                    ix -= nx;
+                }
+                const int gridIndexGlobal = ix * pny * pnz + offset;
+                const int splineIndexX =
+                        getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+                const float thetaX = sm_theta[splineIndexX];
+                assertIsFinite(thetaX);
+                assertIsFinite(gm_grid[gridIndexGlobal]);
+                atomicFetchAdd(gm_grid[gridIndexGlobal], thetaX * Val);
+            }
+        }
+    }
+}
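
A note on the conditional wrap used above: a gridline index can exceed the grid size by less than the interpolation order (the grid carries an overlap region of that width), so a single conditional subtraction replaces a general modulo, which is cheaper on the GPU. In isolation:

    #include <cassert>

    // One conditional subtraction wraps an index that is at most (order - 1)
    // cells past the edge.
    inline int wrapGridIndex(const int i, const int n)
    {
        return (i >= n) ? (i - n) : i;
    }

    int main()
    {
        const int nz = 64;
        assert(wrapGridIndex(63, nz) == 63); // interior point, untouched
        assert(wrapGridIndex(64, nz) == 0);  // first cell past the edge wraps to 0
        assert(wrapGridIndex(66, nz) == 2);  // still < nz + order for order = 4
        return 0;
    }
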
+
+
+/*! \brief
+ * A spline computation and charge spreading kernel function.
+ *
+ * Two tuning parameters can be used for additional performance. For small systems and for
+ * debugging, \p writeGlobal should be used, removing the need to recalculate the theta values
+ * in the gather kernel. Similarly, for large systems, using \c order threads per atom
+ * (\p threadsPerAtom == ThreadsPerAtom::Order) gives higher performance than order*order threads.
+ *
+ * \tparam order          PME interpolation order.
+ * \tparam computeSplines A boolean which tells if the spline parameter and gridline indices'
+ *                        computation should be performed.
+ * \tparam spreadCharges  A boolean which tells if the charge spreading should be performed.
+ * \tparam wrapX          A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam wrapY          A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam numGrids       The number of grids to use in the kernel. Can be 1 or 2.
+ * \tparam writeGlobal    A boolean which tells if the theta values and gridlines should be written
+ *                        to global memory.
+ * \tparam threadsPerAtom How many threads work on each atom.
+ * \tparam subGroupSize   Size of the sub-group.
+ */
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+auto pmeSplineAndSpreadKernel(
+        cl::sycl::handler&                                                           cgh,
+        const int                                                                    nAtoms,
+        OptionalAccessor<float, mode::read_write, spreadCharges>                     a_realGrid_0,
+        OptionalAccessor<float, mode::read_write, numGrids == 2 && spreadCharges>    a_realGrid_1,
+        OptionalAccessor<float, mode::read_write, writeGlobal || computeSplines>     a_theta,
+        OptionalAccessor<float, mode::discard_write, computeSplines && writeGlobal>  a_dtheta,
+        OptionalAccessor<int, mode::read_write, writeGlobal>                         a_gridlineIndices,
+        OptionalAccessor<float, mode::read, computeSplines>                          a_fractShiftsTable,
+        OptionalAccessor<int, mode::read, computeSplines>                            a_gridlineIndicesTable,
+        DeviceAccessor<float, mode::read>                                            a_coefficients_0,
+        OptionalAccessor<float, mode::read, numGrids == 2 && spreadCharges>          a_coefficients_1,
+        OptionalAccessor<Float3, mode::read, computeSplines>                         a_coordinates,
+        const gmx::IVec tablesOffsets,
+        const gmx::IVec realGridSize,
+        const gmx::RVec realGridSizeFP,
+        const gmx::IVec realGridSizePadded,
+        const gmx::RVec currentRecipBox0,
+        const gmx::RVec currentRecipBox1,
+        const gmx::RVec currentRecipBox2)
+{
+    constexpr int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
+    constexpr int spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * subGroupSize;
+    constexpr int atomsPerBlock            = spreadMaxThreadsPerBlock / threadsPerAtomValue;
+    // Number of atoms processed by a single warp in spread and gather
+    static_assert(subGroupSize >= threadsPerAtomValue);
+    constexpr int atomsPerWarp = subGroupSize / threadsPerAtomValue;
+
+    if constexpr (spreadCharges)
+    {
+        cgh.require(a_realGrid_0);
+    }
+    if constexpr (writeGlobal || computeSplines)
+    {
+        cgh.require(a_theta);
+    }
+    if constexpr (computeSplines && writeGlobal)
+    {
+        cgh.require(a_dtheta);
+    }
+    if constexpr (writeGlobal)
+    {
+        cgh.require(a_gridlineIndices);
+    }
+    if constexpr (computeSplines)
+    {
+        cgh.require(a_fractShiftsTable);
+        cgh.require(a_gridlineIndicesTable);
+        cgh.require(a_coordinates);
+    }
+    cgh.require(a_coefficients_0);
+    if constexpr (numGrids == 2 && spreadCharges)
+    {
+        cgh.require(a_realGrid_1);
+        cgh.require(a_coefficients_1);
+    }
+
+    // Gridline indices, ivec
+    cl::sycl::accessor<int, 1, mode::read_write, target::local> sm_gridlineIndices(
+            cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+    // Charges
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_coefficients(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+    // Spline values
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_theta(
+            cl::sycl::range<1>(atomsPerBlock * DIM * order), cgh);
+    auto sm_fractCoords = [&]() {
+        if constexpr (computeSplines)
+        {
+            return cl::sycl::accessor<float, 1, mode::read_write, target::local>(
+                    cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+        }
+        else
+        {
+            return nullptr;
+        }
+    }();
+
+    return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+    {
+        const int blockIndex      = itemIdx.get_group_linear_id();
+        const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+        /* Thread index w.r.t. block */
+        const int threadLocalId = itemIdx.get_local_linear_id();
+        /* Warp index w.r.t. block - could probably be obtained easier? */
+        const int warpIndex = threadLocalId / subGroupSize;
+
+        /* Atom index w.r.t. warp */
+        const int atomWarpIndex = itemIdx.get_local_id(XX) % atomsPerWarp;
+        /* Atom index w.r.t. block/shared memory */
+        const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+        /* Atom index w.r.t. global memory */
+        const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+
+        /* Early return for fully empty blocks at the end
+         * (should only happen for billions of input atoms) */
+        if (atomIndexOffset >= nAtoms)
+        {
+            return;
+        }
+
+        /* Charges, required for both spline and spread */
+        pmeGpuStageAtomData<float, atomsPerBlock, 1>(
+                sm_coefficients.get_pointer(), a_coefficients_0.get_pointer(), itemIdx);
+        itemIdx.barrier(fence_space::local_space);
+        const float atomCharge = sm_coefficients[atomIndexLocal];
+
+        if constexpr (computeSplines)
+        {
+            // SYCL-TODO: Use prefetching? Issue #4153.
+            const Float3 atomX = a_coordinates[atomIndexGlobal];
+            // Lambdas below can be avoided when hipSYCL merges https://github.com/illuhad/hipSYCL/pull/629.
+            cl::sycl::global_ptr<float> gm_dtheta = [&]() {
+                if constexpr (writeGlobal)
+                {
+                    return a_dtheta.get_pointer();
+                }
+                else
+                {
+                    return nullptr;
+                }
+            }();
+            cl::sycl::global_ptr<int> gm_gridlineIndices = [&]() {
+                if constexpr (writeGlobal)
+                {
+                    return a_gridlineIndices.get_pointer();
+                }
+                else
+                {
+                    return nullptr;
+                }
+            }();
+            calculateSplines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal, numGrids, subGroupSize>(
+                    atomIndexOffset,
+                    atomX,
+                    atomCharge,
+                    tablesOffsets,
+                    realGridSizeFP,
+                    currentRecipBox0,
+                    currentRecipBox1,
+                    currentRecipBox2,
+                    a_theta.get_pointer(),
+                    gm_dtheta,
+                    gm_gridlineIndices,
+                    a_fractShiftsTable.get_pointer(),
+                    a_gridlineIndicesTable.get_pointer(),
+                    sm_theta.get_pointer(),
+                    nullptr,
+                    sm_gridlineIndices.get_pointer(),
+                    sm_fractCoords.get_pointer(),
+                    itemIdx);
+            subGroupBarrier(itemIdx);
+        }
+        else
+        {
+            /* Staging the data for spread
+             * (the data is assumed to be in GPU global memory with proper layout already,
+             * as in after running the spline kernel)
+             */
+            /* Spline data - only thetas (dthetas will only be needed in gather) */
+            pmeGpuStageAtomData<float, atomsPerBlock, DIM * order>(
+                    sm_theta.get_pointer(), a_theta.get_pointer(), itemIdx);
+            /* Gridline indices */
+            pmeGpuStageAtomData<int, atomsPerBlock, DIM>(
+                    sm_gridlineIndices.get_pointer(), a_gridlineIndices.get_pointer(), itemIdx);
+
+            itemIdx.barrier(fence_space::local_space);
+        }
+
+        /* Spreading */
+        if constexpr (spreadCharges)
+        {
+            spread_charges<order, wrapX, wrapY, threadsPerAtom, subGroupSize>(
+                    atomCharge,
+                    realGridSize,
+                    realGridSizePadded,
+                    a_realGrid_0.get_pointer(),
+                    sm_gridlineIndices.get_pointer(),
+                    sm_theta.get_pointer(),
+                    itemIdx);
+        }
+        if constexpr (numGrids == 2 && spreadCharges)
+        {
+            itemIdx.barrier(fence_space::local_space);
+            pmeGpuStageAtomData<float, atomsPerBlock, 1>(
+                    sm_coefficients.get_pointer(), a_coefficients_1.get_pointer(), itemIdx);
+            itemIdx.barrier(fence_space::local_space);
+            const float atomCharge = sm_coefficients[atomIndexLocal];
+
+            spread_charges<order, wrapX, wrapY, threadsPerAtom, subGroupSize>(
+                    atomCharge,
+                    realGridSize,
+                    realGridSizePadded,
+                    a_realGrid_1.get_pointer(),
+                    sm_gridlineIndices.get_pointer(),
+                    sm_theta.get_pointer(),
+                    itemIdx);
+        }
+    };
+}
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::PmeSplineAndSpreadKernel()
+{
+    reset();
+}
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::setArg(
+        size_t argIndex, void* arg)
+{
+    if (argIndex == 0)
+    {
+        auto* params   = reinterpret_cast<PmeGpuKernelParams*>(arg);
+        gridParams_    = &params->grid;
+        atomParams_    = &params->atoms;
+        dynamicParams_ = &params->current;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(argIndex == 0, "Trying to pass too many args to the kernel");
+    }
+}
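
For orientation, setArg()/launch() implement the OpenCL-style argument protocol that the PmeKernelHandle table expects of every kernel functor. A hedged sketch of a call site, assuming the PmeGpuKernelParams type used in the cast above (actual launches go through the GROMACS kernel-launch helpers):

    // Hypothetical call site: the whole parameter pack travels as argument 0,
    // and launch() consumes the stored pointers and then resets them.
    void launchSpread(PmeGpuProgramImpl*        program,
                      PmeGpuKernelParams*       kernelParams,
                      const KernelLaunchConfig& config,
                      const DeviceStream&       stream)
    {
        ISyclKernelFunctor* kernel = program->splineAndSpreadKernelSingle;
        kernel->setArg(0, kernelParams);
        cl::sycl::event e = kernel->launch(config, stream); // args are reset after this
        e.wait_and_throw(); // synchronous here only for illustration
    }
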
+
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+cl::sycl::event
+PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::launch(
+        const KernelLaunchConfig& config, const DeviceStream& deviceStream)
+{
+    GMX_RELEASE_ASSERT(gridParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(atomParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(dynamicParams_, "Can not launch the kernel before setting its args");
+
+    using kernelNameType =
+            PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
+
+    // SYCL has different multidimensional layout than OpenCL/CUDA.
+    const cl::sycl::range<3> localSize{ config.blockSize[2], config.blockSize[1], config.blockSize[0] };
+    const cl::sycl::range<3> groupRange{ config.gridSize[2], config.gridSize[1], config.gridSize[0] };
+    const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
+
+    cl::sycl::queue q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel =
+                pmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>(
+                        cgh,
+                        atomParams_->nAtoms,
+                        gridParams_->d_realGrid[0],
+                        gridParams_->d_realGrid[1],
+                        atomParams_->d_theta,
+                        atomParams_->d_dtheta,
+                        atomParams_->d_gridlineIndices,
+                        gridParams_->d_fractShiftsTable,
+                        gridParams_->d_gridlineIndicesTable,
+                        atomParams_->d_coefficients[0],
+                        atomParams_->d_coefficients[1],
+                        atomParams_->d_coordinates,
+                        gridParams_->tablesOffsets,
+                        gridParams_->realGridSize,
+                        gridParams_->realGridSizeFP,
+                        gridParams_->realGridSizePadded,
+                        dynamicParams_->recipBox[0],
+                        dynamicParams_->recipBox[1],
+                        dynamicParams_->recipBox[2]);
+        cgh.parallel_for<kernelNameType>(range, kernel);
+    });
+
+    // Delete set args, so we don't forget to set them before the next launch.
+    reset();
+
+    return e;
+}
+
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::reset()
+{
+    gridParams_    = nullptr;
+    atomParams_    = nullptr;
+    dynamicParams_ = nullptr;
+}
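
The index reversal in launch() reflects that KernelLaunchConfig keeps the CUDA/OpenCL convention, where component 0 varies fastest, while in a SYCL range the last component varies fastest. A worked example of the mapping, with illustrative values only:

    #include <CL/sycl.hpp>

    // A CUDA-style block of (x=16, y=4, z=1) threads becomes a SYCL local range
    // of {1, 4, 16}, keeping x as the fastest-varying dimension.
    void exampleRangeMapping()
    {
        const size_t blockSize[3] = { 16, 4, 1 };  // CUDA-style: x, y, z
        const size_t gridSize[3]  = { 128, 1, 1 }; // number of blocks per dimension

        const cl::sycl::range<3>    localSize{ blockSize[2], blockSize[1], blockSize[0] };
        const cl::sycl::range<3>    groupRange{ gridSize[2], gridSize[1], gridSize[0] };
        const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
        (void)range; // would be passed to cgh.parallel_for(...)
    }
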
+
+//! Kernel instantiations
+/* Disable the "explicit template instantiation 'PmeSplineAndSpreadKernel<...>' will emit a vtable in every
+ * translation unit [-Wweak-template-vtables]" warning.
+ * It is only explicitly instantiated in this translation unit, so we should be safe.
+ */
+#ifdef __clang__
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
+
+#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
+    template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, true, true, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)                 \
+    INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)                                 \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#if GMX_SYCL_DPCPP
+INSTANTIATE(4, 16); // TODO: Choose best value, Issue #4153.
+#elif GMX_SYCL_HIPSYCL
+INSTANTIATE(4, 32);
+INSTANTIATE(4, 64);
+#endif
+
+#ifdef __clang__
+#    pragma clang diagnostic pop
+#endif
diff --git a/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp b/src/gromacs/ewald/pme_spread_sycl.h
similarity index 60%
rename from src/gromacs/ewald/pme_gpu_sycl_stubs.cpp
rename to src/gromacs/ewald/pme_spread_sycl.h
index 832395e084..fdf65fb8e4 100644
--- a/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp
+++ b/src/gromacs/ewald/pme_spread_sycl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Copyright (c) 2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -34,26 +34,34 @@
  */
 
 /*! \internal \file
- * \brief Implements stubs of high-level PME GPU functions for SYCL.
+ * \brief Implements PME GPU spline calculation and charge spreading in SYCL.
+ * TODO: consider always pre-sorting particles (as in DD case).
  *
- * \author Andrey Alekseenko
- *
- * \ingroup module_ewald
+ * \author Andrey Alekseenko
  */
-#include "gmxpre.h"
 
-#include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_grid.h"
+#include "pme_gpu_types_host.h"
 
-#include "pme_gpu_program_impl.h"
+struct PmeGpuGridParams;
+struct PmeGpuAtomParams;
+struct PmeGpuDynamicParams;
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
-    deviceContext_(deviceContext),
-    warpSize_(0),
-    spreadWorkGroupSize(0),
-    gatherWorkGroupSize(0),
-    solveMaxWorkGroupSize(0)
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+class PmeSplineAndSpreadKernel : public ISyclKernelFunctor
 {
-    // SYCL-TODO
-}
+public:
+    PmeSplineAndSpreadKernel();
+    void setArg(size_t argIndex, void* arg) override;
+    cl::sycl::event launch(const KernelLaunchConfig& config, const DeviceStream& deviceStream) override;
 
-PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
+private:
+    PmeGpuGridParams*    gridParams_;
+    PmeGpuAtomParams*    atomParams_;
+    PmeGpuDynamicParams* dynamicParams_;
+    void reset();
+};
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
index 98c70599f2..d56cc256b8 100644
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -85,7 +85,17 @@ public:
     PmeSplineAndSpreadTest() = default;
 
     //! Sets the programs once
-    static void SetUpTestSuite() { s_pmeTestHardwareContexts = createPmeTestHardwareContextList(); }
+    static void SetUpTestSuite()
+    {
+        s_pmeTestHardwareContexts    = createPmeTestHardwareContextList();
+        g_allowPmeWithSyclForTesting = true; // We support PmeSplineAndSpread with SYCL
+    }
+
+    static void TearDownTestSuite()
+    {
+        // Revert the value back.
+        g_allowPmeWithSyclForTesting = false;
+    }
 
     //! The test
     static void runTest()