From 446d77d250c7587c59fe6e30525c36c0a10d2111 Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Sat, 12 Jun 2021 01:10:54 +0300
Subject: [PATCH] SYCL: Shorten mangled kernel name types

Because they are used, for example, in the profiler output, and long
names make it hard to read.

Before:
- _ZTSZZL18isDeviceFunctionalRKN2cl4sycl6deviceEPNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEENK3
- _ZTSN3gmx18LeapFrogKernelNameILNS_18NumTempScaleValuesE0ELNS_19VelocityScalingTypeE0EEE
- _ZTSN5Nbnxm15NbnxmKernelNameILb0ELb1ELNS_8ElecTypeE0ELNS_7VdwTypeE1EEE
- _ZTSN5Nbnxm24NbnxmKernelPruneOnlyNameILb1EEE

After:
- _ZTS11DummyKernel
- _ZTS14LeapFrogKernelILN3gmx18NumTempScaleValuesE2ELNS0_19VelocityScalingTypeE1EE
- _ZTS11NbnxmKernelILb0ELb1ELN5Nbnxm8ElecTypeE0ELNS0_7VdwTypeE1EE
- _ZTS20NbnxmKernelPruneOnlyILb1EE

Can be shortened further by casting enums to integers in template
arguments, but not sure it will improve readability much.
---
 src/gromacs/hardware/device_management_sycl.cpp       |  5 ++++-
 .../mdlib/gpuforcereduction_impl_internal_sycl.cpp    |  9 +++++----
 src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp      | 10 +++++-----
 .../mdlib/update_constrain_gpu_internal_sycl.cpp      |  5 ++++-
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp          | 11 +++++------
 .../nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp        | 11 +++++------
 6 files changed, 28 insertions(+), 23 deletions(-)
diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp
index 8923fe3e40..3b687f3660 100644
--- a/src/gromacs/hardware/device_management_sycl.cpp
+++ b/src/gromacs/hardware/device_management_sycl.cpp
@@ -145,6 +145,9 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice)
     }
 }
 
+// Declaring the class here to avoid long unreadable name in the profiler report
+//! \brief Class name for test kernel
+class DummyKernel;
 
 /*!
  * \brief Checks that device \c deviceInfo is sane (ie can run a kernel).
@@ -168,7 +171,7 @@ static bool isDeviceFunctional(const cl::sycl::device& syclDevice, std::string*
         queue.submit([&](cl::sycl::handler& cgh) {
                  auto d_buffer = buffer.get_access<cl::sycl::access::mode::discard_write>(cgh);
                  cl::sycl::range<1> range{ numThreads };
-                 cgh.parallel_for<class DummyKernel>(range, [=](cl::sycl::id<1> threadId) {
+                 cgh.parallel_for<DummyKernel>(range, [=](cl::sycl::id<1> threadId) {
                      d_buffer[threadId] = threadId.get(0);
                  });
              }).wait_and_throw();
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp
index 3bc21c43de..e2ee7b4697 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl_internal_sycl.cpp
@@ -54,6 +54,10 @@
 #include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
 #include "gromacs/utility/template_mp.h"
 
+//! \brief Class name for reduction kernel
+template<bool addRvecForce, bool accumulateForce>
+class ReduceKernel;
+
 namespace gmx
 {
 
@@ -93,9 +97,6 @@ static auto reduceKernel(cl::sycl::handler&                                 cgh,
     };
 }
 
-template<bool addRvecForce, bool accumulateForce>
-class ReduceKernelName;
-
 template<bool addRvecForce, bool accumulateForce>
 static void launchReductionKernel_(const int                   numAtoms,
                                    const int                   atomStart,
@@ -114,7 +115,7 @@ static void launchReductionKernel_(const int                   numAtoms,
     queue.submit([&](cl::sycl::handler& cgh) {
         auto kernel = reduceKernel<addRvecForce, accumulateForce>(
                 cgh, b_nbnxmForce, b_rvecForceToAdd, b_forceTotal, b_cell, atomStart);
-        cgh.parallel_for<ReduceKernelName<addRvecForce, accumulateForce>>(rangeNumAtoms, kernel);
+        cgh.parallel_for<ReduceKernel<addRvecForce, accumulateForce>>(rangeNumAtoms, kernel);
     });
 }
 
diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
index 566c92b1b4..d0d019c25e 100644
--- a/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
@@ -56,6 +56,10 @@
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/template_mp.h"
 
+//! \brief Class name for leap-frog kernel
+template<gmx::NumTempScaleValues numTempScaleValues, gmx::VelocityScalingType velocityScaling>
+class LeapFrogKernel;
+
 namespace gmx
 {
 
@@ -156,15 +160,11 @@ auto leapFrogKernel(
     };
 }
 
-// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
-template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
-class LeapFrogKernelName;
-
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling, class... Args>
 static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args)
 {
     // Should not be needed for SYCL2020.
-    using kernelNameType = LeapFrogKernelName<numTempScaleValues, velocityScaling>;
+    using kernelNameType = LeapFrogKernel<numTempScaleValues, velocityScaling>;
 
     const cl::sycl::range<1> rangeAllAtoms(numAtoms);
     cl::sycl::queue          q = deviceStream.stream();
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
index 5f2233a7f1..1843b2f75b 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
@@ -49,6 +49,9 @@
 #include "gromacs/gpu_utils/gputraits_sycl.h"
 #include "gromacs/utility/gmxassert.h"
 
+//! \brief Class name for scaling kernel
+class ScaleKernel;
+
 namespace gmx
 {
 
@@ -77,7 +80,7 @@ void launchScaleCoordinatesKernel(const int            numAtoms,
 
     cl::sycl::event e = queue.submit([&](cl::sycl::handler& cgh) {
         auto kernel = scaleKernel(cgh, d_coordinates, mu);
-        cgh.parallel_for<class ScaleKernelName>(rangeAllAtoms, kernel);
+        cgh.parallel_for<ScaleKernel>(rangeAllAtoms, kernel);
     });
     // TODO: Although this only happens on the pressure coupling steps, this synchronization
     //       can affect the performance if nstpcouple is small. See Issue #4018
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
index 664540f857..1a37f49dff 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -53,6 +53,10 @@
 #include "nbnxm_sycl_kernel_utils.h"
 #include "nbnxm_sycl_types.h"
 
+//! \brief Class name for NBNXM kernel
+template<bool doPruneNBL, bool doCalcEnergies, enum Nbnxm::ElecType elecType, enum Nbnxm::VdwType vdwType>
+class NbnxmKernel;
+
 namespace Nbnxm
 {
 
@@ -997,15 +1001,10 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
     };
 }
 
-// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
-template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
-class NbnxmKernelName;
-
 template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
 cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args)
 {
-    // Should not be needed for SYCL2020.
-    using kernelNameType = NbnxmKernelName<doPruneNBL, doCalcEnergies, elecType, vdwType>;
+    using kernelNameType = NbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>;
 
     /* Kernel launch config:
      * - The thread block dimensions match the size of i-clusters, j-clusters,
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
index a2cc1f8a0d..7770915c4d 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
@@ -54,6 +54,10 @@ using cl::sycl::access::fence_space;
 using cl::sycl::access::mode;
 using cl::sycl::access::target;
 
+//! \brief Class name for NBNXM prune-only kernel
+template<bool haveFreshList>
+class NbnxmKernelPruneOnly;
+
 namespace Nbnxm
 {
 
@@ -216,17 +220,12 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
     };
 }
 
-// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
-template<bool haveFreshList>
-class NbnxmKernelPruneOnlyName;
-
 template<bool haveFreshList, class... Args>
 cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
                                            const int           numSciInPart,
                                            Args&&... args)
 {
-    // Should not be needed for SYCL2020.
-    using kernelNameType = NbnxmKernelPruneOnlyName<haveFreshList>;
+    using kernelNameType = NbnxmKernelPruneOnly<haveFreshList>;
 
     /* Kernel launch config:
      * - The thread block dimensions match the size of i-clusters, j-clusters,
-- 
2.22.0