From 446d77d250c7587c59fe6e30525c36c0a10d2111 Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Sat, 12 Jun 2021 01:10:54 +0300 Subject: [PATCH] SYCL: Shorten mangled kernel name types Because they are used, for example, in the profiler output, and long names make it hard to read. Before: - _ZTSZZL18isDeviceFunctionalRKN2cl4sycl6deviceEPNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEENK3 - _ZTSN3gmx18LeapFrogKernelNameILNS_18NumTempScaleValuesE0ELNS_19VelocityScalingTypeE0EEE - _ZTSN5Nbnxm15NbnxmKernelNameILb0ELb1ELNS_8ElecTypeE0ELNS_7VdwTypeE1EEE - _ZTSN5Nbnxm24NbnxmKernelPruneOnlyNameILb1EEE After: - _ZTS11DummyKernel - _ZTS14LeapFrogKernelILN3gmx18NumTempScaleValuesE2ELNS0_19VelocityScalingTypeE1EE - _ZTS11NbnxmKernelILb0ELb1ELN5Nbnxm8ElecTypeE0ELNS0_7VdwTypeE1EE - _ZTS20NbnxmKernelPruneOnlyILb1EE Can be shortened further by casting enums to integers in template arguments, but not sure it will improve readability much. --- src/gromacs/hardware/device_management_sycl.cpp | 5 ++++- .../mdlib/gpuforcereduction_impl_internal_sycl.cpp | 9 +++++---- src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp | 10 +++++----- .../mdlib/update_constrain_gpu_internal_sycl.cpp | 5 ++++- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 11 +++++------ .../nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp | 11 +++++------ 6 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp index 8923fe3e40..3b687f3660 100644 --- a/src/gromacs/hardware/device_management_sycl.cpp +++ b/src/gromacs/hardware/device_management_sycl.cpp @@ -145,6 +145,9 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice) } } +// Declaring the class here to avoid long unreadable name in the profiler report +//! \brief Class name for test kernel +class DummyKernel; /*! * \brief Checks that device \c deviceInfo is sane (ie can run a kernel). @@ -168,7 +171,7 @@ static bool isDeviceFunctional(const cl::sycl::device& syclDevice, std::string* queue.submit([&](cl::sycl::handler& cgh) { auto d_buffer = buffer.get_access(cgh); cl::sycl::range<1> range{ numThreads }; - cgh.parallel_for(range, [=](cl::sycl::id<1> threadId) { + cgh.parallel_for(range, [=](cl::sycl::id<1> threadId) { d_buffer[threadId] = threadId.get(0); }); }).wait_and_throw(); diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp index 3bc21c43de..e2ee7b4697 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp @@ -54,6 +54,10 @@ #include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h" #include "gromacs/utility/template_mp.h" +//! \brief Class name for reduction kernel +template +class ReduceKernel; + namespace gmx { @@ -93,9 +97,6 @@ static auto reduceKernel(cl::sycl::handler& cgh, }; } -template -class ReduceKernelName; - template static void launchReductionKernel_(const int numAtoms, const int atomStart, @@ -114,7 +115,7 @@ static void launchReductionKernel_(const int numAtoms, queue.submit([&](cl::sycl::handler& cgh) { auto kernel = reduceKernel( cgh, b_nbnxmForce, b_rvecForceToAdd, b_forceTotal, b_cell, atomStart); - cgh.parallel_for>(rangeNumAtoms, kernel); + cgh.parallel_for>(rangeNumAtoms, kernel); }); } diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp index 566c92b1b4..d0d019c25e 100644 --- a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp @@ -56,6 +56,10 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/template_mp.h" +//! \brief Class name for leap-frog kernel +template +class LeapFrogKernel; + namespace gmx { @@ -156,15 +160,11 @@ auto leapFrogKernel( }; } -// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020. -template -class LeapFrogKernelName; - template static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args) { // Should not be needed for SYCL2020. - using kernelNameType = LeapFrogKernelName; + using kernelNameType = LeapFrogKernel; const cl::sycl::range<1> rangeAllAtoms(numAtoms); cl::sycl::queue q = deviceStream.stream(); diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp index 5f2233a7f1..1843b2f75b 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp @@ -49,6 +49,9 @@ #include "gromacs/gpu_utils/gputraits_sycl.h" #include "gromacs/utility/gmxassert.h" +//! \brief Class name for scaling kernel +class ScaleKernel; + namespace gmx { @@ -77,7 +80,7 @@ void launchScaleCoordinatesKernel(const int numAtoms, cl::sycl::event e = queue.submit([&](cl::sycl::handler& cgh) { auto kernel = scaleKernel(cgh, d_coordinates, mu); - cgh.parallel_for(rangeAllAtoms, kernel); + cgh.parallel_for(rangeAllAtoms, kernel); }); // TODO: Although this only happens on the pressure coupling steps, this synchronization // can affect the performance if nstpcouple is small. See Issue #4018 diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 664540f857..1a37f49dff 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -53,6 +53,10 @@ #include "nbnxm_sycl_kernel_utils.h" #include "nbnxm_sycl_types.h" +//! \brief Class name for NBNXM kernel +template +class NbnxmKernel; + namespace Nbnxm { @@ -997,15 +1001,10 @@ auto nbnxmKernel(cl::sycl::handler& cgh, }; } -// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020. -template -class NbnxmKernelName; - template cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args) { - // Should not be needed for SYCL2020. - using kernelNameType = NbnxmKernelName; + using kernelNameType = NbnxmKernel; /* Kernel launch config: * - The thread block dimensions match the size of i-clusters, j-clusters, diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index a2cc1f8a0d..7770915c4d 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -54,6 +54,10 @@ using cl::sycl::access::fence_space; using cl::sycl::access::mode; using cl::sycl::access::target; +//! \brief Class name for NBNXM prune-only kernel +template +class NbnxmKernelPruneOnly; + namespace Nbnxm { @@ -216,17 +220,12 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, }; } -// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020. -template -class NbnxmKernelPruneOnlyName; - template cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream, const int numSciInPart, Args&&... args) { - // Should not be needed for SYCL2020. - using kernelNameType = NbnxmKernelPruneOnlyName; + using kernelNameType = NbnxmKernelPruneOnly; /* Kernel launch config: * - The thread block dimensions match the size of i-clusters, j-clusters, -- 2.22.0