Fix hipSYCL build with CUDA target

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 1 Nov 2021 13:23:44 +0000 (13:23 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Mon, 1 Nov 2021 13:23:44 +0000 (13:23 +0000)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 1 Nov 2021 13:23:44 +0000 (13:23 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Mon, 1 Nov 2021 13:23:44 +0000 (13:23 +0000)
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 48d36ac9d5cb0e4d125924704afacc0d36640a43..61755b09cfd57c74b1decc0e3b9cee472b8aad4d 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -230,6 +230,12 @@
  /* Define if hipSYCL is used for SYCL support (otherwise DPCPP is assumed) */
  #cmakedefine01 GMX_SYCL_HIPSYCL
  
  /* Define if hipSYCL is used for SYCL support (otherwise DPCPP is assumed) */
  #cmakedefine01 GMX_SYCL_HIPSYCL
  
+// Define if hipSYCL has HIP target(s)
+#cmakedefine01 GMX_HIPSYCL_HAVE_HIP_TARGET
+
+// Define if hipSYCL has CUDA target(s)
+#cmakedefine01 GMX_HIPSYCL_HAVE_CUDA_TARGET
+
  /* Define if Intel's DPCPP is used (default unless hipSYCL is chosen) */
  #define GMX_SYCL_DPCPP (!GMX_SYCL_HIPSYCL && GMX_GPU_SYCL)
  
  /* Define if Intel's DPCPP is used (default unless hipSYCL is chosen) */
  #define GMX_SYCL_DPCPP (!GMX_SYCL_HIPSYCL && GMX_GPU_SYCL)
  
diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt

index e3876a38e6a8417fde94bfdc746c2a0c455305df..bf530db08138c8ef39e526f80dcdf25c4fdd4959 100644 (file)
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@ -190,7 +190,7 @@ if (TARGET Heffte::Heffte)
      target_link_libraries(libgromacs PRIVATE Heffte::Heffte)
  endif()
  
      target_link_libraries(libgromacs PRIVATE Heffte::Heffte)
  endif()
  
-if (GMX_SYCL_HIPSYCL)
+if (GMX_SYCL_HIPSYCL AND GMX_HIPSYCL_HAVE_HIP_TARGET)
      target_link_libraries(libgromacs PUBLIC roc::rocfft)
  endif()
  
      target_link_libraries(libgromacs PUBLIC roc::rocfft)
  endif()
  
diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt

index a1c2e6899a8831b07fc9bf80533f0b258e5545d9..16a5a4b7056c3454da1856faf9a737bf5405fe22 100644 (file)
--- a/src/gromacs/fft/CMakeLists.txt
+++ b/src/gromacs/fft/CMakeLists.txt
@@ -82,7 +82,7 @@ elseif (GMX_GPU_SYCL)
              gpu_3dfft_sycl_mkl.cpp
              )
      endif()
              gpu_3dfft_sycl_mkl.cpp
              )
      endif()
-    if (GMX_SYCL_HIPSYCL)
+    if (GMX_SYCL_HIPSYCL AND GMX_HIPSYCL_HAVE_HIP_TARGET)
              gmx_add_libgromacs_sources(
              gpu_3dfft_sycl_rocfft.cpp
              )
              gmx_add_libgromacs_sources(
              gpu_3dfft_sycl_rocfft.cpp
              )
diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp

index 7dd21185e720d37356f876d62fa0770213041106..3eeca1a78440392e0d39cbd88b409ec4b2e00967 100644 (file)
--- a/src/gromacs/fft/gpu_3dfft.cpp
+++ b/src/gromacs/fft/gpu_3dfft.cpp
@@ -55,7 +55,7 @@
  #    if GMX_SYCL_DPCPP && GMX_FFT_MKL
  #        include "gpu_3dfft_sycl_mkl.h"
  #    endif
  #    if GMX_SYCL_DPCPP && GMX_FFT_MKL
  #        include "gpu_3dfft_sycl_mkl.h"
  #    endif
-#    if GMX_SYCL_HIPSYCL
+#    if GMX_SYCL_HIPSYCL && GMX_HIPSYCL_HAVE_HIP_TARGET
  #        include "gpu_3dfft_sycl_rocfft.h"
  #    endif
  #endif
  #        include "gpu_3dfft_sycl_rocfft.h"
  #    endif
  #endif
@@ -155,7 +155,7 @@ Gpu3dFft::Gpu3dFft(FftBackend           backend,
                                                              complexGrid);
              break;
  #        endif
                                                              complexGrid);
              break;
  #        endif
-#        if GMX_SYCL_HIPSYCL
+#        if GMX_SYCL_HIPSYCL && GMX_HIPSYCL_HAVE_HIP_TARGET
          case FftBackend::SyclRocfft:
              impl_ = std::make_unique<Gpu3dFft::ImplSyclRocfft>(allocateGrids,
                                                                 comm,
          case FftBackend::SyclRocfft:
              impl_ = std::make_unique<Gpu3dFft::ImplSyclRocfft>(allocateGrids,
                                                                 comm,
diff --git a/src/gromacs/fft/tests/fft.cpp b/src/gromacs/fft/tests/fft.cpp

index e06d89e0d27794ba4a504a1f974eab331223ca0e..b2dc98e4b5aece3dd7314e4e5f3ac7327e0a92b7 100644 (file)
--- a/src/gromacs/fft/tests/fft.cpp
+++ b/src/gromacs/fft/tests/fft.cpp
@@ -72,6 +72,8 @@ namespace gmx
  {
  namespace test
  {
  {
  namespace test
  {
+namespace
+{
  
  /*! \brief Input data for FFT tests.
   *
  
  /*! \brief Input data for FFT tests.
   *
@@ -364,8 +366,44 @@ TEST_F(FFTTest3D, Real5_6_9)
      checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
  }
  
      checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
  }
  
-#if GMX_GPU_CUDA || GMX_GPU_OPENCL \
-        || (GMX_GPU_SYCL && (GMX_SYCL_HIPSYCL || (GMX_SYCL_DPCPP && GMX_FFT_MKL)))
+#if GMX_GPU
+
+/*! \brief Whether the FFT is in- or out-of-place
+ *
+ *  DPCPP uses oneMKL, which seems to have troubles with out-of-place
+ *  transforms. */
+static constexpr bool sc_performOutOfPlaceFFT = !(GMX_SYCL_DPCPP && GMX_FFT_MKL);
+
+/*! \brief Return the output grid depending on whether in- or out-of
+ * place FFT is used
+ *
+ * Some versions of clang complain of unused code if we would just
+ * branch on the value of sc_performOutOfPlaceFFT at run time, because
+ * in any single configuration there would indeed be unused code. So
+ * the two template specializations are needed so that the compiler
+ * only compiles the template that is used. */
+template<bool performOutOfPlaceFFT>
+DeviceBuffer<float>* actualOutputGrid(DeviceBuffer<float>* realGrid, DeviceBuffer<float>* complexGrid);
+
+#    if GMX_SYCL_DPCPP && GMX_FFT_MKL
+
+template<>
+DeviceBuffer<float>* actualOutputGrid<false>(DeviceBuffer<float>* realGrid,
+                                             DeviceBuffer<float>* /* complexGrid */)
+{
+    return realGrid;
+};
+
+#    else
+
+template<>
+DeviceBuffer<float>* actualOutputGrid<true>(DeviceBuffer<float>* /* realGrid */, DeviceBuffer<float>* complexGrid)
+{
+    return complexGrid;
+}
+
+#    endif
+
  TEST_F(FFTTest3D, GpuReal5_6_9)
  {
      // Ensure library resources are managed appropriately
  TEST_F(FFTTest3D, GpuReal5_6_9)
  {
      // Ensure library resources are managed appropriately
@@ -398,28 +436,46 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
          // Use std::copy to convert from double to real easily
          std::copy(inputdata, inputdata + sizeInReals, in_.begin());
  
          // Use std::copy to convert from double to real easily
          std::copy(inputdata, inputdata + sizeInReals, in_.begin());
  
-        // DPCPP uses oneMKL, which seems to have troubles with out-of-place transforms
-        const bool performOutOfPlaceFFT = !GMX_SYCL_DPCPP;
-
-        SCOPED_TRACE("Allocating the device buffers");
-        DeviceBuffer<float> realGrid, complexGrid;
-        allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
-        if (performOutOfPlaceFFT)
-        {
-            allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
-        }
-
  #    if GMX_GPU_CUDA
          const FftBackend backend = FftBackend::Cufft;
  #    elif GMX_GPU_OPENCL
          const FftBackend backend = FftBackend::Ocl;
  #    elif GMX_GPU_SYCL
  #        if GMX_SYCL_HIPSYCL
  #    if GMX_GPU_CUDA
          const FftBackend backend = FftBackend::Cufft;
  #    elif GMX_GPU_OPENCL
          const FftBackend backend = FftBackend::Ocl;
  #    elif GMX_GPU_SYCL
  #        if GMX_SYCL_HIPSYCL
+#            if GMX_HIPSYCL_HAVE_HIP_TARGET
          const FftBackend backend = FftBackend::SyclRocfft;
          const FftBackend backend = FftBackend::SyclRocfft;
-#        elif GMX_SYCL_DPCPP && GMX_FFT_MKL
+#            else
+        // Use stub backend so compilation succeeds
+        const FftBackend backend = FftBackend::Sycl;
+        // Don't complain about unused reference data
+        checker.disableUnusedEntriesCheck();
+        // Skip the rest of the test
+        GTEST_SKIP() << "Only rocFFT backend is supported with hipSYCL";
+#            endif
+#        elif GMX_SYCL_DPCPP
+#            if GMX_FFT_MKL
          const FftBackend backend = FftBackend::SyclMkl;
          const FftBackend backend = FftBackend::SyclMkl;
+#            else
+        // Use stub backend so compilation succeeds
+        const FftBackend backend = FftBackend::Sycl;
+        // Don't complain about unused reference data
+        checker.disableUnusedEntriesCheck();
+        // Skip the rest of the test
+        GTEST_SKIP() << "Only MKL backend is supported with DPC++";
+#            endif
+#        else
+#            error "Unsupported SYCL implementation"
  #        endif
  #    endif
  #        endif
  #    endif
+
+        SCOPED_TRACE("Allocating the device buffers");
+        DeviceBuffer<float> realGrid, complexGrid;
+        allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
+        if (sc_performOutOfPlaceFFT)
+        {
+            allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
+        }
+
          MPI_Comm           comm                    = MPI_COMM_NULL;
          const bool         allocateGrid            = false;
          std::array<int, 1> gridSizesInXForEachRank = { 0 };
          MPI_Comm           comm                    = MPI_COMM_NULL;
          const bool         allocateGrid            = false;
          std::array<int, 1> gridSizesInXForEachRank = { 0 };
@@ -431,14 +487,14 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
                            gridSizesInXForEachRank,
                            gridSizesInYForEachRank,
                            nz,
                            gridSizesInXForEachRank,
                            gridSizesInYForEachRank,
                            nz,
-                          performOutOfPlaceFFT,
+                          sc_performOutOfPlaceFFT,
                            deviceContext,
                            deviceStream,
                            realGridSize,
                            realGridSizePadded,
                            complexGridSizePadded,
                            &realGrid,
                            deviceContext,
                            deviceStream,
                            realGridSize,
                            realGridSizePadded,
                            complexGridSizePadded,
                            &realGrid,
-                          performOutOfPlaceFFT ? &complexGrid : &realGrid);
+                          actualOutputGrid<sc_performOutOfPlaceFFT>(&realGrid, &complexGrid));
  
          // Transfer the real grid input data for the FFT
          copyToDeviceBuffer(
  
          // Transfer the real grid input data for the FFT
          copyToDeviceBuffer(
@@ -451,7 +507,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
  
          // Check the complex grid (NB this data has not been normalized)
          copyFromDeviceBuffer(complexGridValues.data(),
  
          // Check the complex grid (NB this data has not been normalized)
          copyFromDeviceBuffer(complexGridValues.data(),
-                             performOutOfPlaceFFT ? &complexGrid : &realGrid,
+                             actualOutputGrid<sc_performOutOfPlaceFFT>(&realGrid, &complexGrid),
                               0,
                               complexGridValues.size(),
                               deviceStream,
                               0,
                               complexGridValues.size(),
                               deviceStream,
@@ -461,7 +517,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
                  complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
  
          std::vector<float> outputRealGridValues(in_.size());
                  complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
  
          std::vector<float> outputRealGridValues(in_.size());
-        if (performOutOfPlaceFFT)
+        if (sc_performOutOfPlaceFFT)
          {
              // Clear the real grid input data for the FFT so we can
              // compute the back transform into it and observe that it did
          {
              // Clear the real grid input data for the FFT so we can
              // compute the back transform into it and observe that it did
@@ -492,7 +548,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
  
          SCOPED_TRACE("Cleaning up");
          freeDeviceBuffer(&realGrid);
  
          SCOPED_TRACE("Cleaning up");
          freeDeviceBuffer(&realGrid);
-        if (performOutOfPlaceFFT)
+        if (sc_performOutOfPlaceFFT)
          {
              freeDeviceBuffer(&complexGrid);
          }
          {
              freeDeviceBuffer(&complexGrid);
          }
@@ -501,5 +557,6 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
  
  #endif
  
  
  #endif
  
+} // namespace
  } // namespace test
  } // namespace gmx
  } // namespace test
  } // namespace gmx
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 1 Nov 2021 13:23:44 +0000 (13:23 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 1 Nov 2021 13:23:44 +0000 (13:23 +0000)
src/config.h.cmakein		patch \| blob \| history
src/gromacs/CMakeLists.txt		patch \| blob \| history
src/gromacs/fft/CMakeLists.txt		patch \| blob \| history
src/gromacs/fft/gpu_3dfft.cpp		patch \| blob \| history
src/gromacs/fft/tests/fft.cpp		patch \| blob \| history