From: Andrey Alekseenko Date: Thu, 16 Sep 2021 16:36:00 +0000 (+0300) Subject: SYCL: prepareGpuKernelArgument/launchGpuKernel X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=8be0787d177b4cd4d63f5cb84f7f25f8e9ab5631;p=alexxy%2Fgromacs.git SYCL: prepareGpuKernelArgument/launchGpuKernel Implement the functions for abstracting the kernel launch. The abstraction is not idiomatic in SYCL, but it is conformant with what is used in CUDA and OpenCL for PME kernel launches. Refs #3927. --- diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h index b6b76c7a44..65734c7c1a 100644 --- a/src/gromacs/gpu_utils/gputraits_sycl.h +++ b/src/gromacs/gpu_utils/gputraits_sycl.h @@ -64,9 +64,13 @@ using Float3 = gmx::RVec; using Float2 = cl::sycl::float2; /*! \internal \brief - * GPU kernels scheduling description. This is same in OpenCL/CUDA. - * Provides reasonable defaults, one typically only needs to set the GPU stream - * and non-1 work sizes. + * GPU kernels scheduling description. + * One typically only needs to set non-1 work sizes. + * + * \note This struct uses CUDA/OpenCL layout, with the first dimension being contiguous. + * It is different from the SYCL standard, where the last dimension is contiguous. + * The transpose is to be performed internally in ISyclKernelFunctor::launch. + * \note \c sharedMemorySize is ignored in SYCL. */ struct KernelLaunchConfig { @@ -79,8 +83,13 @@ struct KernelLaunchConfig }; /*! \brief Sets whether device code can use arrays that are embedded in structs. - * \todo Probably can, must check + * + * That is not technically true for SYCL: the device code needs dedicated + * \c sycl::buffer/accessor objects. + * But our \c prepareGpuKernelArguments and \c launchGpuKernel functions deal + * with that, so we can pass embedded buffers to them, which is what this + * constant actually controls. */ -#define c_canEmbedBuffers false +#define c_canEmbedBuffers true #endif diff --git a/src/gromacs/gpu_utils/syclutils.h b/src/gromacs/gpu_utils/syclutils.h index ffa4be2e83..92a51271fc 100644 --- a/src/gromacs/gpu_utils/syclutils.h +++ b/src/gromacs/gpu_utils/syclutils.h @@ -67,10 +67,67 @@ struct gmx_device_runtime_data_t #ifndef DOXYGEN -/* To properly mark function as [[noreturn]], we must do it everywhere it is declared, which - * will pollute common headers.*/ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wmissing-noreturn" +//! \brief Interface for SYCL kernel function objects. +class ISyclKernelFunctor +{ +public: + //! \brief Virtual destructor. + virtual ~ISyclKernelFunctor() = default; + /*! \brief Set the kernel argument number \p argIndex to \p arg. + * + * \param argIndex Index of the argument. Maximum allowed value depends + * on the specific concrete class implementing this interface. + * \param arg Pointer to the argument value. + * + * \note Valid values of \p argIndex and types of \p arg depend on the + * specific concrete class implementing this interface. Passing + * illegal values is undefined behavior. + * \note Similar to \c clSetKernelArg, it is not safe to call this + * function on the same kernel object from multiple host threads. + */ + virtual void setArg(size_t argIndex, void* arg) = 0; + /*! \brief Launch the kernel. + * + * \param config Work-group configuration. + * \param deviceStream \c DeviceStream to use. + */ + virtual cl::sycl::event launch(const KernelLaunchConfig& /*config*/, + const DeviceStream& /*deviceStream*/) = 0; +}; + + +/*! \brief + * A function for setting up a single SYCL kernel argument. + * This is the tail of the compile-time recursive function below. + * It has to be seen by the compiler first. + * + * \param[in] kernel Kernel function handle + * \param[in] argIndex Index of the current argument + */ +void inline prepareGpuKernelArgument(ISyclKernelFunctor* /*kernel*/, size_t /*argIndex*/) {} + +/*! \brief + * Compile-time recursive function for setting up a single SYCL kernel argument. + * This function uses one kernel argument pointer \p argPtr to call + * \c ISyclKernelFunctor::setArg, and calls itself on the next argument, eventually + * calling the tail function above. + * + * \tparam CurrentArg Type of the current argument + * \tparam RemainingArgs Types of remaining arguments after the current one + * \param[in] kernel Kernel function handle + * \param[in] argIndex Index of the current argument + * \param[in] argPtr Pointer to the current argument + * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one + */ +template +void prepareGpuKernelArgument(ISyclKernelFunctor* kernel, + size_t argIndex, + const CurrentArg* argPtr, + const RemainingArgs*... otherArgsPtrs) +{ + kernel->setArg(argIndex, const_cast(reinterpret_cast(argPtr))); + prepareGpuKernelArgument(kernel, argIndex + 1, otherArgsPtrs...); +} /*! \brief * A wrapper function for setting up all the SYCL kernel arguments. @@ -80,12 +137,14 @@ struct gmx_device_runtime_data_t * \param[in] kernel Kernel function handle * \param[in] config Kernel configuration for launching * \param[in] argsPtrs Pointers to all the kernel arguments - * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument. + * \returns A dummy value to be used with launchGpuKernel() as the last argument. */ template -void* prepareGpuKernelArguments(void* /*kernel*/, const KernelLaunchConfig& /*config*/, const Args*... /*argsPtrs*/) +void* prepareGpuKernelArguments(void* kernel, const KernelLaunchConfig& /*config*/, const Args*... argsPtrs) { - GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet")); + auto* kernelFunctor = reinterpret_cast(kernel); + prepareGpuKernelArgument(kernelFunctor, 0, argsPtrs...); + return nullptr; } /*! \brief Launches the SYCL kernel and handles the errors. @@ -93,20 +152,27 @@ void* prepareGpuKernelArguments(void* /*kernel*/, const KernelLaunchConfig& /*co * \param[in] kernel Kernel function handle * \param[in] config Kernel configuration for launching * \param[in] deviceStream GPU stream to launch kernel in - * \param[in] timingEvent Timing event, fetched from GpuRegionTimer - * \param[in] kernelName Human readable kernel description, for error handling only + * \param[in] timingEvent Timing event, fetched from GpuRegionTimer. Unused. + * \param[in] kernelName Human readable kernel description, for error handling only. Unused. + * \param[in] kernelArgs Unused. * \throws gmx::InternalError on kernel launch failure */ -inline void launchGpuKernel(void* /*kernel*/, - const KernelLaunchConfig& /*config*/, - const DeviceStream& /*deviceStream*/, +inline void launchGpuKernel(void* kernel, + const KernelLaunchConfig& config, + const DeviceStream& deviceStream, CommandEvent* /*timingEvent*/, const char* /*kernelName*/, const void* /*kernelArgs*/) { - GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet")); + auto* kernelFunctor = reinterpret_cast(kernel); + cl::sycl::event event = kernelFunctor->launch(config, deviceStream); } +/* To properly mark function as [[noreturn]], we must do it everywhere it is declared, which + * will pollute common headers.*/ +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wmissing-noreturn" + /*! \brief Pretend to check a SYCL stream for unfinished work (dummy implementation). * * \returns Not implemented in SYCL. diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 13aea52f4b..96e2e48939 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -57,6 +57,7 @@ #if GMX_GPU_SYCL # include "sycl/nbnxm_sycl_types.h" +# include "gromacs/gpu_utils/syclutils.h" #endif #include "gromacs/gpu_utils/gpu_utils.h" diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h index babea21881..5b0abc44b2 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h @@ -48,7 +48,6 @@ #include "gromacs/gpu_utils/gmxsycl.h" #include "gromacs/gpu_utils/gpueventsynchronizer.h" #include "gromacs/gpu_utils/gputraits.h" -#include "gromacs/gpu_utils/syclutils.h" #include "gromacs/nbnxm/gpu_types_common.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/nbnxm/pairlist.h"