Fix SYCL PME Solve kernel

[alexxy/gromacs.git] / src / gromacs / gpu_utils / sycl_kernel_utils.h
diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h

index 8a2d6bace13724e5a5b1bec77fd110e87b1d4449..1f44ca24d01ce39ccb2c829b3b67a4b4cabe9d2c 100644 (file)
--- a/src/gromacs/gpu_utils/sycl_kernel_utils.h
+++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h
@@ -44,59 +44,37 @@
   *  \author Andrey Alekseenko <al42and@gmail.com>
   */
  
-/*! \brief Access mode to use for atomic accessors.
- *
- * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats.
- * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use
- * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref.
- *
- * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which
- * requires using atomic accessors. Thus, we use \c mode::atomic accessors.
- *
- * The \ref atomicFetchAdd function could be used for doing operations on such accessors.
- */
-static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
-                                                   /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
-
  //! \brief Full warp active thread mask used in CUDA warp-level primitives.
  static constexpr unsigned int c_cudaFullWarpMask = 0xffffffff;
  
  /*! \brief Convenience wrapper to do atomic addition to a global buffer.
- *
- * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
   */
-template<class IndexType>
-static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const IndexType idx, const float val)
+template<typename T, sycl_2020::memory_scope MemoryScope = sycl_2020::memory_scope::device>
+static inline void atomicFetchAdd(T& val, const T delta)
  {
-#if GMX_SYCL_DPCPP
-    sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
-            fout_atomic(acc[idx]);
-    fout_atomic.fetch_add(val);
-#elif GMX_SYCL_HIPSYCL
-#    ifdef SYCL_DEVICE_ONLY
-    /* While there is support for float atomics on device, the host implementation uses
-     * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support
-     * floats. Luckily, we don't want to run on host. */
-    // The pragmas below can be removed once we switch to sycl::atomic
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Wdeprecated-declarations"
-    acc[idx].fetch_add(val);
-#        pragma clang diagnostic push
-#    else
-    GMX_ASSERT(false, "hipSYCL host codepath not supported");
-    GMX_UNUSED_VALUE(val);
-    GMX_UNUSED_VALUE(acc);
-    GMX_UNUSED_VALUE(idx);
-#    endif
-#endif
+    sycl_2020::atomic_ref<T, sycl_2020::memory_order::relaxed, MemoryScope, cl::sycl::access::address_space::global_space> ref(
+            val);
+    ref.fetch_add(delta);
+}
+
+/*! \brief Convenience wrapper to do atomic loads from a global buffer.
+ */
+template<typename T, sycl_2020::memory_scope MemoryScope = sycl_2020::memory_scope::device>
+static inline T atomicLoad(T& val)
+{
+    sycl_2020::atomic_ref<T, sycl_2020::memory_order::relaxed, MemoryScope, cl::sycl::access::address_space::global_space> ref(
+            val);
+    return ref.load();
  }
  
+
  /*! \brief Issue an intra sub-group barrier.
   *
   * Equivalent with CUDA's \c syncwarp(c_cudaFullWarpMask).
   *
   */
-static inline void subGroupBarrier(const cl::sycl::nd_item<1> itemIdx)
+template<int Dim>
+static inline void subGroupBarrier(const cl::sycl::nd_item<Dim> itemIdx)
  {
  #if GMX_SYCL_HIPSYCL
      cl::sycl::group_barrier(itemIdx.get_sub_group(), cl::sycl::memory_scope::sub_group);
@@ -166,6 +144,113 @@ static inline float shift_right(sycl_2020::sub_group sg, float var, sycl_2020::s
      return sg.shuffle_up(var, delta);
  }
  #endif
+
+#if GMX_SYCL_HIPSYCL
+/*! \brief Polyfill for sycl::isfinite missing from hipSYCL
+ *
+ * Does not follow GROMACS style because it should follow the name for
+ * which it is a polyfill. */
+template<typename Real>
+__device__ __host__ static inline bool isfinite(Real value)
+{
+    // This is not yet implemented in hipSYCL pending
+    // https://github.com/illuhad/hipSYCL/issues/636
+#    ifdef SYCL_DEVICE_ONLY
+#        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
+    return ::isfinite(value);
+#        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
+    return ::isfinite(value);
+#        else
+#            error "Unsupported hipSYCL target"
+#        endif
+#    else
+    // Should never be called
+    assert(false);
+    GMX_UNUSED_VALUE(value);
+    return false;
+#    endif
+}
+#elif GMX_SYCL_DPCPP
+template<typename Real>
+static inline bool isfinite(Real value)
+{
+    return cl::sycl::isfinite(value);
+}
+
+#endif
+
+#if GMX_SYCL_HIPSYCL
+
+/*! \brief Polyfill for sycl::vec::load buggy in hipSYCL
+ *
+ * Loads from the address \c ptr offset in elements of type T by
+ * NumElements * offset, into the components of \c v.
+ *
+ * Can probably be removed when
+ * https://github.com/illuhad/hipSYCL/issues/647 is resolved. */
+template<cl::sycl::access::address_space AddressSpace, typename T, int NumElements>
+static inline void loadToVec(size_t                                     offset,
+                             cl::sycl::multi_ptr<const T, AddressSpace> ptr,
+                             cl::sycl::vec<T, NumElements>*             v)
+{
+    for (int i = 0; i < NumElements; ++i)
+    {
+        (*v)[i] = ptr.get()[offset * NumElements + i];
+    }
+}
+
+/*! \brief Polyfill for sycl::vec::store buggy in hipSYCL
+ *
+ * Loads from the address \c ptr offset in elements of type T by
+ * NumElements * offset, into the components of \c v.
+ *
+ * Can probably be removed when
+ * https://github.com/illuhad/hipSYCL/issues/647 is resolved. */
+template<cl::sycl::access::address_space AddressSpace, typename T, int NumElements>
+static inline void storeFromVec(const cl::sycl::vec<T, NumElements>& v,
+                                size_t                               offset,
+                                cl::sycl::multi_ptr<T, AddressSpace> ptr)
+{
+    for (int i = 0; i < NumElements; ++i)
+    {
+        ptr.get()[offset * NumElements + i] = v[i];
+    }
+}
+
+#elif GMX_SYCL_DPCPP
+
+/*! \brief Polyfill for sycl::vec::load buggy in hipSYCL
+ *
+ * Loads from the address \c ptr offset in elements of type T by
+ * NumElements * offset, into the components of \c v.
+ *
+ * Can probably be removed when
+ * https://github.com/illuhad/hipSYCL/issues/647 is resolved. */
+template<cl::sycl::access::address_space AddressSpace, typename T, int NumElements>
+static inline void loadToVec(size_t offset,
+                             cl::sycl::multi_ptr<const T, AddressSpace> ptr,
+                             cl::sycl::vec<T, NumElements>* v)
+{
+    v->load(offset, ptr);
+}
+
+/*! \brief Polyfill for sycl::vec::store buggy in hipSYCL
+ *
+ * Loads from the address \c ptr offset in elements of type T by
+ * NumElements * offset, into the components of \c v.
+ *
+ * Can probably be removed when
+ * https://github.com/illuhad/hipSYCL/issues/647 is resolved. */
+template<cl::sycl::access::address_space AddressSpace, typename T, int NumElements>
+static inline void storeFromVec(const cl::sycl::vec<T, NumElements>& v,
+                                size_t offset,
+                                cl::sycl::multi_ptr<T, AddressSpace> ptr)
+{
+    v.store(offset, ptr);
+}
+
+#endif
+
  } // namespace sycl_2020
  
  #endif /* GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H */