From 69d9df0fb1f51869e7f266416c1c3dbe637360ac Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Tue, 2 Nov 2021 13:20:21 +0100 Subject: [PATCH] SYCL: Use acc.bind(cgh) instead of cgh.require(acc) A few other minor changes to SYCL version of DeviceBuffer. This is a prerequisite for adding support for USM. Refs #3847, #3965. --- src/gromacs/ewald/pme_gather_sycl.cpp | 22 +++++++-------- src/gromacs/ewald/pme_solve_sycl.cpp | 8 +++--- src/gromacs/ewald/pme_spread_sycl.cpp | 20 ++++++------- src/gromacs/gpu_utils/devicebuffer_sycl.h | 19 +++++++------ .../gpuforcereduction_impl_internal_sycl.cpp | 8 +++--- .../mdlib/leapfrog_gpu_internal_sycl.cpp | 14 +++++----- src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp | 22 +++++++-------- .../mdlib/settle_gpu_internal_sycl.cpp | 10 +++---- .../update_constrain_gpu_internal_sycl.cpp | 2 +- .../nbnxm_gpu_buffer_ops_internal_sycl.cpp | 10 +++---- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 28 +++++++++---------- .../sycl/nbnxm_sycl_kernel_pruneonly.cpp | 10 +++---- 12 files changed, 87 insertions(+), 86 deletions(-) diff --git a/src/gromacs/ewald/pme_gather_sycl.cpp b/src/gromacs/ewald/pme_gather_sycl.cpp index b03a7804b3..d133daaeea 100644 --- a/src/gromacs/ewald/pme_gather_sycl.cpp +++ b/src/gromacs/ewald/pme_gather_sycl.cpp @@ -284,27 +284,27 @@ auto pmeGatherKernel(cl::sycl::handler& cgh, constexpr int splineParamsSize = atomsPerBlock * DIM * order; constexpr int gridlineIndicesSize = atomsPerBlock * DIM; - cgh.require(a_gridA); - cgh.require(a_coefficientsA); - cgh.require(a_forces); + a_gridA.bind(cgh); + a_coefficientsA.bind(cgh); + a_forces.bind(cgh); if constexpr (numGrids == 2) { - cgh.require(a_gridB); - cgh.require(a_coefficientsB); + a_gridB.bind(cgh); + a_coefficientsB.bind(cgh); } if constexpr (readGlobal) { - cgh.require(a_theta); - cgh.require(a_dtheta); - cgh.require(a_gridlineIndices); + a_theta.bind(cgh); + a_dtheta.bind(cgh); + a_gridlineIndices.bind(cgh); } else { - cgh.require(a_coordinates); - cgh.require(a_fractShiftsTable); - cgh.require(a_gridlineIndicesTable); + a_coordinates.bind(cgh); + a_fractShiftsTable.bind(cgh); + a_gridlineIndicesTable.bind(cgh); } // Gridline indices, ivec diff --git a/src/gromacs/ewald/pme_solve_sycl.cpp b/src/gromacs/ewald/pme_solve_sycl.cpp index 46883060e9..5a0125c44d 100644 --- a/src/gromacs/ewald/pme_solve_sycl.cpp +++ b/src/gromacs/ewald/pme_solve_sycl.cpp @@ -68,13 +68,13 @@ auto makeSolveKernel(cl::sycl::handler& cgh, OptionalAccessor a_virialAndEnergy, DeviceAccessor a_fourierGrid) { - cgh.require(a_splineModuli); - cgh.require(a_solveKernelParams); + a_splineModuli.bind(cgh); + a_solveKernelParams.bind(cgh); if constexpr (computeEnergyAndVirial) { - cgh.require(a_virialAndEnergy); + a_virialAndEnergy.bind(cgh); } - cgh.require(a_fourierGrid); + a_fourierGrid.bind(cgh); /* Reduce 7 outputs per warp in the shared memory */ const int stride = diff --git a/src/gromacs/ewald/pme_spread_sycl.cpp b/src/gromacs/ewald/pme_spread_sycl.cpp index c88c1de374..41244d591f 100644 --- a/src/gromacs/ewald/pme_spread_sycl.cpp +++ b/src/gromacs/ewald/pme_spread_sycl.cpp @@ -205,31 +205,31 @@ auto pmeSplineAndSpreadKernel( if constexpr (spreadCharges) { - cgh.require(a_realGrid_0); + a_realGrid_0.bind(cgh); } if constexpr (writeGlobal || computeSplines) { - cgh.require(a_theta); + a_theta.bind(cgh); } if constexpr (computeSplines && writeGlobal) { - cgh.require(a_dtheta); + a_dtheta.bind(cgh); } if constexpr (writeGlobal) { - cgh.require(a_gridlineIndices); + a_gridlineIndices.bind(cgh); } if constexpr (computeSplines) { - cgh.require(a_fractShiftsTable); - cgh.require(a_gridlineIndicesTable); - cgh.require(a_coordinates); + a_fractShiftsTable.bind(cgh); + a_gridlineIndicesTable.bind(cgh); + a_coordinates.bind(cgh); } - cgh.require(a_coefficients_0); + a_coefficients_0.bind(cgh); if constexpr (numGrids == 2 && spreadCharges) { - cgh.require(a_realGrid_1); - cgh.require(a_coefficients_1); + a_realGrid_1.bind(cgh); + a_coefficients_1.bind(cgh); } // Gridline indices, ivec diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h index cb3277b82f..0bc8b7c59e 100644 --- a/src/gromacs/gpu_utils/devicebuffer_sycl.h +++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h @@ -176,6 +176,7 @@ public: static_assert(mode == cl::sycl::access::mode::read, "Can not create non-read-only accessor from a const DeviceBuffer"); } + void bind(cl::sycl::handler& cgh) { cgh.require(*this); } private: //! Helper function to get sycl:buffer object from DeviceBuffer wrapper, with a sanity check. @@ -188,15 +189,16 @@ private: namespace gmx::internal { -//! A "blackhole" class to be used when we want to ignore an argument to a function. -struct EmptyClassThatIgnoresConstructorArguments +//! A non-functional class that can be used instead of real accessors +template +struct NullAccessor { - template - [[maybe_unused]] EmptyClassThatIgnoresConstructorArguments(Args&&... /*args*/) - { - } + NullAccessor(const DeviceBuffer& /*buffer*/) {} //! Allow casting to nullptr constexpr operator std::nullptr_t() const { return nullptr; } + //! Placeholder implementation of \c cl::sycl::accessor::get_pointer. + T* get_pointer() const noexcept { return nullptr; } + void bind(cl::sycl::handler& /*cgh*/) { assert(false); } }; } // namespace gmx::internal @@ -230,7 +232,7 @@ struct EmptyClassThatIgnoresConstructorArguments */ template using OptionalAccessor = - std::conditional_t, gmx::internal::EmptyClassThatIgnoresConstructorArguments>; + std::conditional_t, gmx::internal::NullAccessor>; #endif // #ifndef DOXYGEN @@ -441,7 +443,6 @@ inline cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer& buffe false; #endif - if constexpr (usingHipSycl) { // hipSYCL does not support reinterpret but allows using Float3 directly. @@ -541,7 +542,7 @@ void initParamLookupTable(DeviceBuffer* deviceBuffer, template void destroyParamLookupTable(DeviceBuffer* deviceBuffer, DeviceTexture* /* deviceTexture */) { - deviceBuffer->buffer_.reset(nullptr); + freeDeviceBuffer(deviceBuffer); } #endif // GMX_GPU_UTILS_DEVICEBUFFER_SYCL_H diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp index bd76fbf7d6..b8a2bcbb0c 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp @@ -72,13 +72,13 @@ static auto reduceKernel(cl::sycl::handler& cgh, DeviceAccessor a_cell, const int atomStart) { - cgh.require(a_nbnxmForce); + a_nbnxmForce.bind(cgh); if constexpr (addRvecForce) { - cgh.require(a_rvecForceToAdd); + a_rvecForceToAdd.bind(cgh); } - cgh.require(a_forceTotal); - cgh.require(a_cell); + a_forceTotal.bind(cgh); + a_cell.bind(cgh); return [=](cl::sycl::id<1> itemIdx) { // Set to nbnxnm force, then perhaps accumulate further to it diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp index b5572dcfe1..25b3199e76 100644 --- a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp @@ -98,18 +98,18 @@ auto leapFrogKernel( OptionalAccessor a_tempScaleGroups, Float3 prVelocityScalingMatrixDiagonal) { - cgh.require(a_x); - cgh.require(a_xp); - cgh.require(a_v); - cgh.require(a_f); - cgh.require(a_inverseMasses); + a_x.bind(cgh); + a_xp.bind(cgh); + a_v.bind(cgh); + a_f.bind(cgh); + a_inverseMasses.bind(cgh); if constexpr (numTempScaleValues != NumTempScaleValues::None) { - cgh.require(a_lambdas); + a_lambdas.bind(cgh); } if constexpr (numTempScaleValues == NumTempScaleValues::Multiple) { - cgh.require(a_tempScaleGroups); + a_tempScaleGroups.bind(cgh); } return [=](cl::sycl::id<1> itemIdx) { diff --git a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp index 658a45b32c..60b10f3f84 100644 --- a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp @@ -124,25 +124,25 @@ auto lincsKernel(cl::sycl::handler& cgh, OptionalAccessor a_virialScaled, PbcAiuc pbcAiuc) { - cgh.require(a_constraints); - cgh.require(a_constraintsTargetLengths); + a_constraints.bind(cgh); + a_constraintsTargetLengths.bind(cgh); if constexpr (haveCoupledConstraints) { - cgh.require(a_coupledConstraintsCounts); - cgh.require(a_coupledConstraintsIndices); - cgh.require(a_massFactors); - cgh.require(a_matrixA); + a_coupledConstraintsCounts.bind(cgh); + a_coupledConstraintsIndices.bind(cgh); + a_massFactors.bind(cgh); + a_matrixA.bind(cgh); } - cgh.require(a_inverseMasses); - cgh.require(a_x); - cgh.require(a_xp); + a_inverseMasses.bind(cgh); + a_x.bind(cgh); + a_xp.bind(cgh); if constexpr (updateVelocities) { - cgh.require(a_v); + a_v.bind(cgh); } if constexpr (computeVirial) { - cgh.require(a_virialScaled); + a_virialScaled.bind(cgh); } /* Shared local memory buffer. Corresponds to sh_r, sm_rhs, and sm_threadVirial in CUDA. diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp index 6a32e856c0..2f7ae81b4c 100644 --- a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp @@ -72,16 +72,16 @@ auto settleKernel(cl::sycl::handler& c OptionalAccessor a_virialScaled, PbcAiuc pbcAiuc) { - cgh.require(a_settles); - cgh.require(a_x); - cgh.require(a_xp); + a_settles.bind(cgh); + a_x.bind(cgh); + a_xp.bind(cgh); if constexpr (updateVelocities) { - cgh.require(a_v); + a_v.bind(cgh); } if constexpr (computeVirial) { - cgh.require(a_virialScaled); + a_virialScaled.bind(cgh); } // shmem buffer for i x+q pre-loading diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp index f7113a203f..12f60ea7ee 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp @@ -60,7 +60,7 @@ static auto scaleKernel(cl::sycl::handler& DeviceAccessor a_x, const ScalingMatrix scalingMatrix) { - cgh.require(a_x); + a_x.bind(cgh); return [=](cl::sycl::id<1> itemIdx) { Float3 x = a_x[itemIdx]; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp index d95bd6f06a..b4cda5e263 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_gpu_buffer_ops_internal_sycl.cpp @@ -76,11 +76,11 @@ static auto nbnxmKernelTransformXToXq(cl::sycl::handler& c int numAtomsPerCell, int columnsOffset) { - cgh.require(a_xq); - cgh.require(a_x); - cgh.require(a_atomIndex); - cgh.require(a_numAtoms); - cgh.require(a_cellIndex); + a_xq.bind(cgh); + a_x.bind(cgh); + a_atomIndex.bind(cgh); + a_numAtoms.bind(cgh); + a_cellIndex.bind(cgh); return [=](cl::sycl::id<2> itemIdx) { // Map cell-level parallelism to y component of block index. diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index ea5321645d..a7b2c6cde6 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -583,34 +583,34 @@ auto nbnxmKernel(cl::sycl::handler& cgh, { static constexpr EnergyFunctionProperties props; - cgh.require(a_xq); - cgh.require(a_f); - cgh.require(a_shiftVec); - cgh.require(a_fShift); + a_xq.bind(cgh); + a_f.bind(cgh); + a_shiftVec.bind(cgh); + a_fShift.bind(cgh); if constexpr (doCalcEnergies) { - cgh.require(a_energyElec); - cgh.require(a_energyVdw); + a_energyElec.bind(cgh); + a_energyVdw.bind(cgh); } - cgh.require(a_plistCJ4); - cgh.require(a_plistSci); - cgh.require(a_plistExcl); + a_plistCJ4.bind(cgh); + a_plistSci.bind(cgh); + a_plistExcl.bind(cgh); if constexpr (!props.vdwComb) { - cgh.require(a_atomTypes); - cgh.require(a_nbfp); + a_atomTypes.bind(cgh); + a_nbfp.bind(cgh); } else { - cgh.require(a_ljComb); + a_ljComb.bind(cgh); } if constexpr (props.vdwEwald) { - cgh.require(a_nbfpComb); + a_nbfpComb.bind(cgh); } if constexpr (props.elecEwaldTab) { - cgh.require(a_coulombTab); + a_coulombTab.bind(cgh); } // shmem buffer for i x+q pre-loading diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index b779ad79e3..4b2ca582c3 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -76,11 +76,11 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, const int numParts, const int part) { - cgh.require(a_xq); - cgh.require(a_shiftVec); - cgh.require(a_plistCJ4); - cgh.require(a_plistSci); - cgh.require(a_plistIMask); + a_xq.bind(cgh); + a_shiftVec.bind(cgh); + a_plistCJ4.bind(cgh); + a_plistSci.bind(cgh); + a_plistIMask.bind(cgh); /* shmem buffer for i x+q pre-loading */ cl::sycl::accessor sm_xq( -- 2.22.0