From 20b93925023052ce6205e0dbdd3b3e83792a411f Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Wed, 30 Sep 2020 17:49:11 +0200 Subject: [PATCH] Squash SYCL DeviceContext and DeviceStream --- src/gromacs/analysisdata/datamodule.h | 2 +- src/gromacs/ewald/CMakeLists.txt | 9 ++++ src/gromacs/ewald/tests/pmetestcommon.cpp | 4 +- src/gromacs/gpu_utils/CMakeLists.txt | 7 ++- src/gromacs/gpu_utils/device_context.h | 15 +++++++ src/gromacs/gpu_utils/device_context_sycl.cpp | 25 +++-------- src/gromacs/gpu_utils/device_stream.h | 34 ++++++++++---- src/gromacs/gpu_utils/device_stream_sycl.cpp | 45 +++++++++++++++++-- .../gpu_utils/tests/device_stream_manager.cpp | 2 +- src/gromacs/mdlib/gpuforcereduction.h | 4 +- src/gromacs/mdrun/runner.cpp | 1 - src/gromacs/mdtypes/CMakeLists.txt | 5 +++ src/testutils/CMakeLists.txt | 5 +++ 13 files changed, 122 insertions(+), 36 deletions(-) diff --git a/src/gromacs/analysisdata/datamodule.h b/src/gromacs/analysisdata/datamodule.h index b22380bce8..220be7d559 100644 --- a/src/gromacs/analysisdata/datamodule.h +++ b/src/gromacs/analysisdata/datamodule.h @@ -213,7 +213,7 @@ public: * This method is called after frameFinished(), but with an additional * constraint that it is always called in serial and with an increasing * \p frameIndex. Parallel data modules need this to serialize their - * data for downsteam serial modules; AnalysisDataModuleSerial provides + * data for downstream serial modules; AnalysisDataModuleSerial provides * an empty implementation, as there frameFinished() can be used for * the same purpose. */ diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt index c5977c14b4..5c11f0529c 100644 --- a/src/gromacs/ewald/CMakeLists.txt +++ b/src/gromacs/ewald/CMakeLists.txt @@ -80,6 +80,15 @@ elseif (GMX_GPU_OPENCL) pme_gpu_internal.cpp pme_gpu_timings.cpp ) +elseif (GMX_GPU_SYCL) + # SYCL-TODO: proper implementation + gmx_add_libgromacs_sources( + pme_gpu_program_impl.cpp + ) + _gmx_add_files_to_property(SYCL_SOURCES + pme_gpu_program_impl.cpp + pme_gpu_program.cpp + ) else() gmx_add_libgromacs_sources( # Files that implement stubs diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index c3f5bd2514..eb1e170411 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -59,8 +59,8 @@ #include "gromacs/ewald/pme_solve.h" #include "gromacs/ewald/pme_spread.h" #include "gromacs/fft/parallel_3dfft.h" -#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/device_management.h" #include "gromacs/math/invertmatrix.h" #include "gromacs/mdtypes/commrec.h" #include "gromacs/pbcutil/pbc.h" @@ -73,6 +73,8 @@ #include "testutils/test_hardware_environment.h" #include "testutils/testasserts.h" +class DeviceContext; + namespace gmx { namespace test diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt index a9b1a89f52..22c0a4cb82 100644 --- a/src/gromacs/gpu_utils/CMakeLists.txt +++ b/src/gromacs/gpu_utils/CMakeLists.txt @@ -63,7 +63,12 @@ elseif(GMX_GPU_SYCL) device_context_sycl.cpp device_stream_sycl.cpp ) - _gmx_add_files_to_property(SYCL_SOURCES device_context_sycl.cpp device_stream_sycl.cpp) + _gmx_add_files_to_property(SYCL_SOURCES + device_context_manager.cpp + device_context_sycl.cpp + device_stream_manager.cpp + device_stream_sycl.cpp + ) else() gmx_add_libgromacs_sources( device_context.cpp diff --git a/src/gromacs/gpu_utils/device_context.h b/src/gromacs/gpu_utils/device_context.h index c70536dd65..c4ab967b8f 100644 --- a/src/gromacs/gpu_utils/device_context.h +++ b/src/gromacs/gpu_utils/device_context.h @@ -53,6 +53,9 @@ #if GMX_GPU_OPENCL # include "gromacs/gpu_utils/gmxopencl.h" #endif +#if GMX_GPU_SYCL +# include "gromacs/gpu_utils/gmxsycl.h" +#endif #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/hardware/device_management.h" #include "gromacs/utility/classhelpers.h" @@ -87,6 +90,18 @@ private: cl_context context_ = nullptr; #endif +#if GMX_GPU_SYCL +public: + //! Const getter + const cl::sycl::context& context() const { return context_; } + //! Getter + cl::sycl::context& context() { return context_; } + +private: + //! SYCL context object + cl::sycl::context context_; +#endif + GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext); }; diff --git a/src/gromacs/gpu_utils/device_context_sycl.cpp b/src/gromacs/gpu_utils/device_context_sycl.cpp index 06f78f7d20..7b4f88f31d 100644 --- a/src/gromacs/gpu_utils/device_context_sycl.cpp +++ b/src/gromacs/gpu_utils/device_context_sycl.cpp @@ -37,33 +37,22 @@ * \brief Implements the DeviceContext for SYCL builds. * * \author Erik Lindahl + * \author Andrey Alekseenko * * \ingroup module_gpu_utils */ #include "gmxpre.h" -#include - -#include - #include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/hardware/device_information.h" -//! Constructor. -DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) : deviceInfo_(deviceInfo) +//! Constructor +DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) : + deviceInfo_(deviceInfo), + context_(cl::sycl::context(deviceInfo.syclDevice)) { - // This code is just a meaningless placeholder for now, but to actually test that - // compilation with and without SYCL works, and that the correct flags are set, it's - // important to have a file that actually contains SYCL code. - std::vector platforms = cl::sycl::platform::get_platforms(); - if (platforms.size() > 0) - { - for (const auto& platform : platforms) - { - printf("Found SYCL platform %s.\n", - platform.get_info().c_str()); - } - } } //! Destructor diff --git a/src/gromacs/gpu_utils/device_stream.h b/src/gromacs/gpu_utils/device_stream.h index 1a662db07d..d1aca30236 100644 --- a/src/gromacs/gpu_utils/device_stream.h +++ b/src/gromacs/gpu_utils/device_stream.h @@ -53,6 +53,8 @@ #elif GMX_GPU_OPENCL # include "gromacs/gpu_utils/gmxopencl.h" +#elif GMX_GPU_SYCL +# include "gromacs/gpu_utils/gmxsycl.h" #endif #include "gromacs/utility/classhelpers.h" @@ -73,7 +75,7 @@ enum class DeviceStreamPriority : int /*! \libinternal \brief Declaration of platform-agnostic device stream/queue. * * The command stream (or command queue) is a sequence of operations that are executed - * in they order they were issued. Several streams may co-exist to represent concurency. + * in they order they were issued. Several streams may co-exist to represent concurrency. * This class declares the interfaces, that are exposed to platform-agnostic code and * it should be implemented for each compute architecture (e.g. CUDA and OpenCL). * @@ -89,9 +91,9 @@ class DeviceStream public: /*! \brief Construct and init. * - * \param[in] deviceContext Device context (only used in OpenCL). + * \param[in] deviceContext Device context (only used in OpenCL and SYCL). * \param[in] priority Stream priority: high or normal (only used in CUDA). - * \param[in] useTiming If the timing should be enabled (only used in OpenCL). + * \param[in] useTiming If the timing should be enabled (only used in OpenCL and SYCL). */ DeviceStream(const DeviceContext& deviceContext, DeviceStreamPriority priority, bool useTiming); @@ -104,25 +106,41 @@ public: */ bool isValid() const; - //! Synchronize the steam + //! Synchronize the stream void synchronize() const; #if GMX_GPU_CUDA //! Getter cudaStream_t stream() const; - //! Setter (temporary, will be removed in the follow-up) - void setStream(cudaStream_t stream) { stream_ = stream; } private: cudaStream_t stream_ = nullptr; +#elif GMX_GPU_SYCL + /*! \brief + * Getter for the underlying \c cl::sycl:queue object. + * + * Returns a copy instead of const-reference, because it's impossible to submit to or wait + * on a \c const cl::sycl::queue. SYCL standard guarantees that operating on copy is + * equivalent to operating on the original queue. + * + * \throws std::bad_optional_access if the stream is not valid. + * + * \returns A copy of the internal \c cl::sycl:queue. + */ + cl::sycl::queue stream() const { return cl::sycl::queue(stream_); } + //! Getter. Can throw std::bad_optional_access if the stream is not valid. + cl::sycl::queue& stream() { return stream_; } + //! Synchronize the stream. Non-const version of \c ::synchronize() for SYCL that does not do unnecessary copying. + void synchronize(); + +private: + cl::sycl::queue stream_; #elif GMX_GPU_OPENCL || defined DOXYGEN //! Getter cl_command_queue stream() const; - //! Setter (temporary, will be removed in the follow-up) - void setStream(cl_command_queue stream) { stream_ = stream; } private: cl_command_queue stream_ = nullptr; diff --git a/src/gromacs/gpu_utils/device_stream_sycl.cpp b/src/gromacs/gpu_utils/device_stream_sycl.cpp index ecb76d87b9..72a431e121 100644 --- a/src/gromacs/gpu_utils/device_stream_sycl.cpp +++ b/src/gromacs/gpu_utils/device_stream_sycl.cpp @@ -37,17 +37,33 @@ * \brief Implements the DeviceStream for SYCL builds. * * \author Erik Lindahl + * \author Andrey Alekseenko * * \ingroup module_gpu_utils */ #include "gmxpre.h" +#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" -DeviceStream::DeviceStream(const DeviceContext& /* deviceContext */, +DeviceStream::DeviceStream(const DeviceContext& deviceContext, DeviceStreamPriority /* priority */, - const bool /* useTiming */) + const bool useTiming) { + const std::vector devicesInContext = deviceContext.context().get_devices(); + // The context is constructed to have exactly one device + const cl::sycl::device device = devicesInContext[0]; + + cl::sycl::property_list propertyList = {}; + if (useTiming) + { + const bool deviceSupportsTiming = device.get_info(); + if (deviceSupportsTiming) + { + propertyList = cl::sycl::property::queue::enable_profiling(); + } + } + stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList); } DeviceStream::~DeviceStream() = default; @@ -55,7 +71,28 @@ DeviceStream::~DeviceStream() = default; // NOLINTNEXTLINE readability-convert-member-functions-to-static bool DeviceStream::isValid() const { - return false; + return true; } -void DeviceStream::synchronize() const {}; +void DeviceStream::synchronize() +{ + stream_.wait_and_throw(); +}; + +void DeviceStream::synchronize() const +{ + /* cl::sycl::queue::wait is a non-const function. However, a lot of code in GROMACS + * assumes DeviceStream is const, yet wants to synchronize with it. + * The chapter "4.3.2 Common reference semantics" of SYCL 1.2.1 specification says: + * > Each of the following SYCL runtime classes: [...] queue, [...] must obey the following + * > statements, where T is the runtime class type: + * > - T must be copy constructible and copy assignable on the host application [...]. + * > Any instance of T that is constructed as a copy of another instance, via either the + * > copy constructor or copy assignment operator, must behave as-if it were the original + * > instance and as-if any action performed on it were also performed on the original + * > instance [...]. + * Same in chapter "4.5.3" of provisional SYCL 2020 specification (June 30, 2020). + * So, we can copy-construct a new queue and wait() on it. + */ + cl::sycl::queue(stream_).wait_and_throw(); +} diff --git a/src/gromacs/gpu_utils/tests/device_stream_manager.cpp b/src/gromacs/gpu_utils/tests/device_stream_manager.cpp index 06c03fe8b1..d01aff20ea 100644 --- a/src/gromacs/gpu_utils/tests/device_stream_manager.cpp +++ b/src/gromacs/gpu_utils/tests/device_stream_manager.cpp @@ -73,7 +73,7 @@ const EnumerationArray c_deviceStreamNames = { /*! \brief Non-GPU builds return nullptr instead of streams, * so we have to expect that in such build configurations. */ -constexpr bool c_canExpectValidStreams = (GMX_GPU != 0 && !GMX_GPU_SYCL); // SYCL-TODO +constexpr bool c_canExpectValidStreams = (GMX_GPU != 0); //! Helper function to implement readable testing void expectValidStreams(DeviceStreamManager* manager, std::initializer_list types) diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h index de8aebbf76..da55bdce5c 100644 --- a/src/gromacs/mdlib/gpuforcereduction.h +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -43,13 +43,15 @@ #ifndef GMX_MDLIB_GPUFORCEREDUCTION_H #define GMX_MDLIB_GPUFORCEREDUCTION_H -#include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/arrayref.h" +#include "gromacs/utility/classhelpers.h" #include "gromacs/utility/fixedcapacityvector.h" class GpuEventSynchronizer; +class DeviceStream; +class DeviceContext; namespace gmx { diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index c88d8adbea..d32550e182 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -73,7 +73,6 @@ #include "gromacs/fileio/tpxio.h" #include "gromacs/gmxlib/network.h" #include "gromacs/gmxlib/nrnb.h" -#include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream_manager.h" #include "gromacs/hardware/cpuinfo.h" #include "gromacs/hardware/detecthardware.h" diff --git a/src/gromacs/mdtypes/CMakeLists.txt b/src/gromacs/mdtypes/CMakeLists.txt index ac7b8d191c..a1446f88c8 100644 --- a/src/gromacs/mdtypes/CMakeLists.txt +++ b/src/gromacs/mdtypes/CMakeLists.txt @@ -50,6 +50,11 @@ if(GMX_GPU) gmx_add_libgromacs_sources( state_propagator_data_gpu_impl_gpu.cpp ) + if(GMX_GPU_SYCL) + _gmx_add_files_to_property(SYCL_SOURCES + state_propagator_data_gpu_impl_gpu.cpp + ) + endif() else() gmx_add_libgromacs_sources( state_propagator_data_gpu_impl.cpp diff --git a/src/testutils/CMakeLists.txt b/src/testutils/CMakeLists.txt index 1928150068..5e1c1ad329 100644 --- a/src/testutils/CMakeLists.txt +++ b/src/testutils/CMakeLists.txt @@ -82,6 +82,11 @@ else() add_library(testutils STATIC ${UNITTEST_TARGET_OPTIONS} ${TESTUTILS_SOURCES}) endif() +if (GMX_GPU_SYCL) + set_source_files_properties(test_device.cpp + PROPERTIES COMPILE_FLAGS "${SYCL_CXX_FLAGS}") +endif() + gmx_target_compile_options(testutils) target_compile_definitions(testutils PRIVATE HAVE_CONFIG_H) target_include_directories(testutils SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include) -- 2.22.0