Squash SYCL DeviceContext and DeviceStream
authorAndrey Alekseenko <al42and@gmail.com>
Wed, 30 Sep 2020 15:49:11 +0000 (17:49 +0200)
committerAndrey Alekseenko <al42and@gmail.com>
Wed, 30 Sep 2020 15:49:11 +0000 (17:49 +0200)
13 files changed:
src/gromacs/analysisdata/datamodule.h
src/gromacs/ewald/CMakeLists.txt
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/gpu_utils/CMakeLists.txt
src/gromacs/gpu_utils/device_context.h
src/gromacs/gpu_utils/device_context_sycl.cpp
src/gromacs/gpu_utils/device_stream.h
src/gromacs/gpu_utils/device_stream_sycl.cpp
src/gromacs/gpu_utils/tests/device_stream_manager.cpp
src/gromacs/mdlib/gpuforcereduction.h
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/CMakeLists.txt
src/testutils/CMakeLists.txt

index b22380bce860e85557d863a8f7c51460105898bd..220be7d5595661dadfb5ac9720065204e809a135 100644 (file)
@@ -213,7 +213,7 @@ public:
      * This method is called after frameFinished(), but with an additional
      * constraint that it is always called in serial and with an increasing
      * \p frameIndex.  Parallel data modules need this to serialize their
-     * data for downsteam serial modules; AnalysisDataModuleSerial provides
+     * data for downstream serial modules; AnalysisDataModuleSerial provides
      * an empty implementation, as there frameFinished() can be used for
      * the same purpose.
      */
index c5977c14b4f4e1ac8306c4034bc1745dee49b964..5c11f0529cc88cfc77f9afb105e1f20c364d77c3 100644 (file)
@@ -80,6 +80,15 @@ elseif (GMX_GPU_OPENCL)
         pme_gpu_internal.cpp
         pme_gpu_timings.cpp
         )
+elseif (GMX_GPU_SYCL)
+    # SYCL-TODO: proper implementation
+    gmx_add_libgromacs_sources(
+        pme_gpu_program_impl.cpp
+        )
+    _gmx_add_files_to_property(SYCL_SOURCES
+        pme_gpu_program_impl.cpp
+        pme_gpu_program.cpp
+        )
 else()
     gmx_add_libgromacs_sources(
         # Files that implement stubs
index c3f5bd2514b7cfa8d0188d60833798f9d5689340..eb1e170411bbf090582852ed283fe65824919143 100644 (file)
@@ -59,8 +59,8 @@
 #include "gromacs/ewald/pme_solve.h"
 #include "gromacs/ewald/pme_spread.h"
 #include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/device_management.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/pbcutil/pbc.h"
@@ -73,6 +73,8 @@
 #include "testutils/test_hardware_environment.h"
 #include "testutils/testasserts.h"
 
+class DeviceContext;
+
 namespace gmx
 {
 namespace test
index a9b1a89f52f494687a2f29026550ab13a589b344..22c0a4cb820120e1da473c615906185f4151a20e 100644 (file)
@@ -63,7 +63,12 @@ elseif(GMX_GPU_SYCL)
         device_context_sycl.cpp
         device_stream_sycl.cpp
         )
-    _gmx_add_files_to_property(SYCL_SOURCES device_context_sycl.cpp device_stream_sycl.cpp)
+    _gmx_add_files_to_property(SYCL_SOURCES
+        device_context_manager.cpp
+        device_context_sycl.cpp
+        device_stream_manager.cpp
+        device_stream_sycl.cpp
+        )
 else()
     gmx_add_libgromacs_sources(
         device_context.cpp
index c70536dd65fb339ad19ecf163e13790061be2dfd..c4ab967b8f2e72934b553e3c63cd3fdbfee949fb 100644 (file)
@@ -53,6 +53,9 @@
 #if GMX_GPU_OPENCL
 #    include "gromacs/gpu_utils/gmxopencl.h"
 #endif
+#if GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gmxsycl.h"
+#endif
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/device_management.h"
 #include "gromacs/utility/classhelpers.h"
@@ -87,6 +90,18 @@ private:
     cl_context context_ = nullptr;
 #endif
 
+#if GMX_GPU_SYCL
+public:
+    //! Const getter
+    const cl::sycl::context& context() const { return context_; }
+    //! Getter
+    cl::sycl::context& context() { return context_; }
+
+private:
+    //! SYCL context object
+    cl::sycl::context context_;
+#endif
+
     GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
 };
 
index 06f78f7d20d59799237e69bc9c8c806663a96e2c..7b4f88f31d9c82619f04247a6d1b563adb376801 100644 (file)
  * \brief Implements the DeviceContext for SYCL builds.
  *
  * \author Erik Lindahl <erik.lindahl@gmail.com>
+ * \author Andrey Alekseenko <al42and@gmail.com>
  *
  * \ingroup module_gpu_utils
  */
 #include "gmxpre.h"
 
-#include <vector>
-
-#include <CL/sycl.hpp>
-
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/hardware/device_information.h"
 
 
-//! Constructor.
-DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) : deviceInfo_(deviceInfo)
+//! Constructor
+DeviceContext::DeviceContext(const DeviceInformation& deviceInfo) :
+    deviceInfo_(deviceInfo),
+    context_(cl::sycl::context(deviceInfo.syclDevice))
 {
-    // This code is just a meaningless placeholder for now, but to actually test that
-    // compilation with and without SYCL works, and that the correct flags are set, it's
-    // important to have a file that actually contains SYCL code.
-    std::vector<cl::sycl::platform> platforms = cl::sycl::platform::get_platforms();
-    if (platforms.size() > 0)
-    {
-        for (const auto& platform : platforms)
-        {
-            printf("Found SYCL platform %s.\n",
-                   platform.get_info<cl::sycl::info::platform::name>().c_str());
-        }
-    }
 }
 
 //! Destructor
index 1a662db07d6d52ed73225d6e3b82c3c93d92cb21..d1aca30236af675016bfef30ea6e4163d957c094 100644 (file)
@@ -53,6 +53,8 @@
 
 #elif GMX_GPU_OPENCL
 #    include "gromacs/gpu_utils/gmxopencl.h"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gmxsycl.h"
 #endif
 #include "gromacs/utility/classhelpers.h"
 
@@ -73,7 +75,7 @@ enum class DeviceStreamPriority : int
 /*! \libinternal \brief Declaration of platform-agnostic device stream/queue.
  *
  * The command stream (or command queue) is a sequence of operations that are executed
- * in they order they were issued. Several streams may co-exist to represent concurency.
+ * in they order they were issued. Several streams may co-exist to represent concurrency.
  * This class declares the interfaces, that are exposed to platform-agnostic code and
  * it should be implemented for each compute architecture (e.g. CUDA and OpenCL).
  *
@@ -89,9 +91,9 @@ class DeviceStream
 public:
     /*! \brief Construct and init.
      *
-     * \param[in] deviceContext  Device context (only used in OpenCL).
+     * \param[in] deviceContext  Device context (only used in OpenCL and SYCL).
      * \param[in] priority       Stream priority: high or normal (only used in CUDA).
-     * \param[in] useTiming      If the timing should be enabled (only used in OpenCL).
+     * \param[in] useTiming      If the timing should be enabled (only used in OpenCL and SYCL).
      */
     DeviceStream(const DeviceContext& deviceContext, DeviceStreamPriority priority, bool useTiming);
 
@@ -104,25 +106,41 @@ public:
      */
     bool isValid() const;
 
-    //! Synchronize the steam
+    //! Synchronize the stream
     void synchronize() const;
 
 #if GMX_GPU_CUDA
 
     //! Getter
     cudaStream_t stream() const;
-    //! Setter (temporary, will be removed in the follow-up)
-    void setStream(cudaStream_t stream) { stream_ = stream; }
 
 private:
     cudaStream_t stream_ = nullptr;
+#elif GMX_GPU_SYCL
 
+    /*! \brief
+     * Getter for the underlying \c cl::sycl:queue object.
+     *
+     * Returns a copy instead of const-reference, because it's impossible to submit to or wait
+     * on a \c const cl::sycl::queue. SYCL standard guarantees that operating on copy is
+     * equivalent to operating on the original queue.
+     *
+     * \throws std::bad_optional_access if the stream is not valid.
+     *
+     * \returns A copy of the internal \c cl::sycl:queue.
+     */
+    cl::sycl::queue stream() const { return cl::sycl::queue(stream_); }
+    //! Getter. Can throw std::bad_optional_access if the stream is not valid.
+    cl::sycl::queue& stream() { return stream_; }
+    //! Synchronize the stream. Non-const version of \c ::synchronize() for SYCL that does not do unnecessary copying.
+    void synchronize();
+
+private:
+    cl::sycl::queue stream_;
 #elif GMX_GPU_OPENCL || defined DOXYGEN
 
     //! Getter
     cl_command_queue stream() const;
-    //! Setter (temporary, will be removed in the follow-up)
-    void setStream(cl_command_queue stream) { stream_ = stream; }
 
 private:
     cl_command_queue stream_ = nullptr;
index ecb76d87b96b94baa2d601ab09e75d9d567f376f..72a431e121b920b55f4642a46c4e9bb13c6dcd2c 100644 (file)
  * \brief Implements the DeviceStream for SYCL builds.
  *
  * \author Erik Lindahl <erik.lindahl@gmail.com>
+ * \author Andrey Alekseenko <al42and@gmail.com>
  *
  * \ingroup module_gpu_utils
  */
 #include "gmxpre.h"
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream.h"
 
-DeviceStream::DeviceStream(const DeviceContext& /* deviceContext */,
+DeviceStream::DeviceStream(const DeviceContext& deviceContext,
                            DeviceStreamPriority /* priority */,
-                           const bool /* useTiming */)
+                           const bool useTiming)
 {
+    const std::vector<cl::sycl::device> devicesInContext = deviceContext.context().get_devices();
+    // The context is constructed to have exactly one device
+    const cl::sycl::device device = devicesInContext[0];
+
+    cl::sycl::property_list propertyList = {};
+    if (useTiming)
+    {
+        const bool deviceSupportsTiming = device.get_info<cl::sycl::info::device::queue_profiling>();
+        if (deviceSupportsTiming)
+        {
+            propertyList = cl::sycl::property::queue::enable_profiling();
+        }
+    }
+    stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList);
 }
 
 DeviceStream::~DeviceStream() = default;
@@ -55,7 +71,28 @@ DeviceStream::~DeviceStream() = default;
 // NOLINTNEXTLINE readability-convert-member-functions-to-static
 bool DeviceStream::isValid() const
 {
-    return false;
+    return true;
 }
 
-void DeviceStream::synchronize() const {};
+void DeviceStream::synchronize()
+{
+    stream_.wait_and_throw();
+};
+
+void DeviceStream::synchronize() const
+{
+    /* cl::sycl::queue::wait is a non-const function. However, a lot of code in GROMACS
+     * assumes DeviceStream is const, yet wants to synchronize with it.
+     * The chapter "4.3.2 Common reference semantics" of SYCL 1.2.1 specification says:
+     * > Each of the following SYCL runtime classes: [...] queue, [...] must obey the following
+     * > statements, where T is the runtime class type:
+     * > - T must be copy constructible and copy assignable on the host application [...].
+     * >   Any instance of T that is constructed as a copy of another instance, via either the
+     * >   copy constructor or copy assignment operator, must behave as-if it were the original
+     * >   instance and as-if any action performed on it were also performed on the original
+     * >   instance [...].
+     * Same in chapter "4.5.3" of provisional SYCL 2020 specification (June 30, 2020).
+     * So, we can copy-construct a new queue and wait() on it.
+     */
+    cl::sycl::queue(stream_).wait_and_throw();
+}
index 06c03fe8b1f72a30eb9b3694d9dfaf4cd0dbbeef..d01aff20ead9771860aa31cdf0f19893cafe2eea 100644 (file)
@@ -73,7 +73,7 @@ const EnumerationArray<DeviceStreamType, std::string> c_deviceStreamNames = {
 
 /*! \brief Non-GPU builds return nullptr instead of streams,
  * so we have to expect that in such build configurations. */
-constexpr bool c_canExpectValidStreams = (GMX_GPU != 0 && !GMX_GPU_SYCL); // SYCL-TODO
+constexpr bool c_canExpectValidStreams = (GMX_GPU != 0);
 
 //! Helper function to implement readable testing
 void expectValidStreams(DeviceStreamManager* manager, std::initializer_list<DeviceStreamType> types)
index de8aebbf76480fd148a21d0755e10814838c123f..da55bdce5c07547fabf55d7215e1b72256e383f8 100644 (file)
 #ifndef GMX_MDLIB_GPUFORCEREDUCTION_H
 #define GMX_MDLIB_GPUFORCEREDUCTION_H
 
-#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/fixedcapacityvector.h"
 
 class GpuEventSynchronizer;
+class DeviceStream;
+class DeviceContext;
 
 namespace gmx
 {
index c88d8adbea63a6ecc04245c3a7071a7b347cfc91..d32550e1824d140e3face616f8364fdeeb77db8a 100644 (file)
@@ -73,7 +73,6 @@
 #include "gromacs/fileio/tpxio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/hardware/cpuinfo.h"
 #include "gromacs/hardware/detecthardware.h"
index ac7b8d191c23177e2d3b09d344af2139fefc5e33..a1446f88c836a42368c1cb92b12fabb768ff083c 100644 (file)
@@ -50,6 +50,11 @@ if(GMX_GPU)
     gmx_add_libgromacs_sources(
        state_propagator_data_gpu_impl_gpu.cpp
        )
+   if(GMX_GPU_SYCL)
+       _gmx_add_files_to_property(SYCL_SOURCES
+           state_propagator_data_gpu_impl_gpu.cpp
+           )
+   endif()
 else()
     gmx_add_libgromacs_sources(
       state_propagator_data_gpu_impl.cpp
index 1928150068c2f249ea87618092f2848525f97604..5e1c1ad329c173fa377735a5a20d4c2501716197 100644 (file)
@@ -82,6 +82,11 @@ else()
     add_library(testutils STATIC ${UNITTEST_TARGET_OPTIONS} ${TESTUTILS_SOURCES})
 endif()
 
+if (GMX_GPU_SYCL)
+    set_source_files_properties(test_device.cpp
+      PROPERTIES COMPILE_FLAGS "${SYCL_CXX_FLAGS}")
+endif()
+
 gmx_target_compile_options(testutils)
 target_compile_definitions(testutils PRIVATE HAVE_CONFIG_H)
 target_include_directories(testutils SYSTEM BEFORE PRIVATE ${PROJECT_SOURCE_DIR}/src/external/thread_mpi/include)