message(FATAL_ERROR "Cannot compile with SYCL Intel compiler. Try a different compiler or disable SYCL.")
endif()
+ include(gmxManageFFTLibraries)
+ if(NOT GMX_FFT_MKL)
+ message(WARNING "Building SYCL version with ${GMX_FFT_LIBRARY} instead of MKL. GPU FFT is disabled!")
+ endif()
+
# Add function wrapper similar to the one used by ComputeCPP and hipSYCL
function(add_sycl_to_target)
cmake_parse_arguments(
#elif GMX_GPU_OPENCL
const gmx::FftBackend backend = gmx::FftBackend::Ocl;
#elif GMX_GPU_SYCL
-# if GMX_SYCL_HIPSYCL
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+ const gmx::FftBackend backend = gmx::FftBackend::SyclMkl;
+# elif GMX_SYCL_HIPSYCL
const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft;
# else
const gmx::FftBackend backend = gmx::FftBackend::Sycl;
gpu_3dfft_ocl.cpp
)
elseif (GMX_GPU_SYCL)
- if (GMX_SYCL_HIPSYCL)
- set(3dfft_sycl_source gpu_3dfft_sycl_rocfft.cpp)
- else()
- set(3dfft_sycl_source gpu_3dfft.cpp gpu_3dfft_sycl.cpp)
+ if (NOT GMX_GPU_HIPSYCL AND GMX_FFT_MKL)
+ gmx_add_libgromacs_sources(
+ gpu_3dfft_sycl_mkl.cpp
+ )
+ _gmx_add_files_to_property(SYCL_SOURCES
+ gpu_3dfft_sycl_mkl.cpp
+ )
+ endif()
+ if (GMX_GPU_HIPSYCL)
+ gmx_add_libgromacs_sources(
+ gpu_3dfft_sycl_rocfft.cpp
+ )
+ _gmx_add_files_to_property(SYCL_SOURCES
+ gpu_3dfft_sycl_rocfft.cpp
+ )
endif()
gmx_add_libgromacs_sources(
- # SYCL-specific sources
- ${3dfft_sycl_source}
+ gpu_3dfft_sycl.cpp
)
_gmx_add_files_to_property(SYCL_SOURCES
- ${3dfft_sycl_source}
+ gpu_3dfft_sycl.cpp
+ gpu_3dfft.cpp
+ )
+else()
+ gmx_add_libgromacs_sources(
+ # Stub sources for CPU-only build
+ gpu_3dfft.cpp
)
endif()
# include "gpu_3dfft_ocl.h"
#elif GMX_GPU_SYCL
# include "gpu_3dfft_sycl.h"
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+# include "gpu_3dfft_sycl_mkl.h"
+# endif
# if GMX_SYCL_HIPSYCL
# include "gpu_3dfft_sycl_rocfft.h"
# endif
# elif GMX_GPU_SYCL
switch (backend)
{
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+ case FftBackend::SyclMkl:
+ impl_ = std::make_unique<Gpu3dFft::ImplSyclMkl>(allocateGrids,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
+ performOutOfPlaceFFT,
+ context,
+ pmeStream,
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ realGrid,
+ complexGrid);
+ break;
+# endif
# if GMX_SYCL_HIPSYCL
case FftBackend::SyclRocfft:
impl_ = std::make_unique<Gpu3dFft::ImplSyclRocfft>(allocateGrids,
class Impl;
class ImplCuFft;
class ImplOcl;
+ class ImplSyclMkl;
class ImplSyclRocfft;
class ImplSycl;
DeviceBuffer<float>* /*realGrid*/,
DeviceBuffer<float>* /*complexGrid*/)
{
- GMX_THROW(NotImplementedError("GPU 3DFFT is not implemented in SYCL"));
+ GMX_THROW(NotImplementedError("Using SYCL build without GPU 3DFFT support"));
}
Gpu3dFft::ImplSycl::~ImplSycl() = default;
void Gpu3dFft::ImplSycl::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
{
- GMX_THROW(NotImplementedError("Not implemented on SYCL yet"));
+ GMX_THROW(NotImplementedError("Using SYCL build without GPU 3DFFT support"));
}
#pragma clang diagnostic pop
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements GPU 3D FFT routines for SYCL.
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \ingroup module_fft
+ *
+ * In DPC++, we use Intel oneMKL to perform the FFT. It requires using the binary version of
+ * MKL, since the open-source one does not support FFT yet (https://github.com/oneapi-src/oneMKL/issues/27).
+ *
+ * There are issues with out-of-place transform, existing at least in oneAPI 2021.2-2021.4, so
+ * we allow only in-place transforms.
+ */
+
+#include "gmxpre.h"
+
+#include "gpu_3dfft_sycl_mkl.h"
+
+#include "config.h"
+
+#include "gromacs/gpu_utils/devicebuffer_sycl.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+class DeviceContext;
+
+#if (!GMX_SYCL_DPCPP)
+# error This file can only be compiled with Intel DPC++ compiler
+#endif
+
+#if (!GMX_FFT_MKL)
+# error Must use MKL library for FFT when compiling with Intel DPC++ compiler
+#endif
+
+#include <cstddef>
+#include <oneapi/mkl/dfti.hpp>
+#include <oneapi/mkl/exceptions.hpp>
+#include <mkl_version.h>
+
+// oneAPI 2021.2.0 to 2021.4.0 have issues with backward out-of-place transform
+static constexpr bool sc_mklHasBuggyOutOfPlaceFFT = (INTEL_MKL_VERSION <= 20210004);
+
+namespace gmx
+{
+
+Gpu3dFft::ImplSyclMkl::Descriptor Gpu3dFft::ImplSyclMkl::initDescriptor(const ivec realGridSize)
+{
+ try
+ {
+ const std::vector<std::int64_t> realGridDimensions{ realGridSize[XX],
+ realGridSize[YY],
+ realGridSize[ZZ] };
+ return { realGridDimensions };
+ }
+ catch (oneapi::mkl::exception& exc)
+ {
+ GMX_THROW(InternalError(formatString("MKL failure while constructing descriptor: %s", exc.what())));
+ }
+}
+
+Gpu3dFft::ImplSyclMkl::ImplSyclMkl(bool allocateGrids,
+ MPI_Comm /*comm*/,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ int /*nz*/,
+ const bool performOutOfPlaceFFT,
+ const DeviceContext& /*context*/,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid) :
+ realGrid_(*realGrid->buffer_),
+ complexGrid_(*complexGrid->buffer_),
+ queue_(pmeStream.stream()),
+ r2cDescriptor_(initDescriptor(realGridSize)),
+ c2rDescriptor_(initDescriptor(realGridSize))
+{
+ GMX_RELEASE_ASSERT(!allocateGrids, "Grids needs to be pre-allocated");
+ GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+ "Multi-rank FFT decomposition not implemented with SYCL MKL backend");
+
+ GMX_RELEASE_ASSERT(!(sc_mklHasBuggyOutOfPlaceFFT && performOutOfPlaceFFT),
+ "The version of MKL used does not properly support out-of-place FFTs");
+
+ GMX_ASSERT(checkDeviceBuffer(*realGrid,
+ realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]),
+ "Real grid buffer is too small for the declared padded size");
+
+ GMX_ASSERT(checkDeviceBuffer(*complexGrid,
+ complexGridSizePadded[XX] * complexGridSizePadded[YY]
+ * complexGridSizePadded[ZZ] * 2),
+ "Complex grid buffer is too small for the declared padded size");
+
+ // MKL expects row-major
+ const std::array<MKL_LONG, 4> realGridStrides = {
+ 0, static_cast<MKL_LONG>(realGridSizePadded[YY] * realGridSizePadded[ZZ]), realGridSizePadded[ZZ], 1
+ };
+ const std::array<MKL_LONG, 4> complexGridStrides = {
+ 0,
+ static_cast<MKL_LONG>(complexGridSizePadded[YY] * complexGridSizePadded[ZZ]),
+ complexGridSizePadded[ZZ],
+ 1
+ };
+
+ const auto placement = performOutOfPlaceFFT ? DFTI_NOT_INPLACE : DFTI_INPLACE;
+
+ try
+ {
+ using oneapi::mkl::dft::config_param;
+ r2cDescriptor_.set_value(config_param::INPUT_STRIDES, realGridStrides.data());
+ r2cDescriptor_.set_value(config_param::OUTPUT_STRIDES, complexGridStrides.data());
+ r2cDescriptor_.set_value(config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
+ r2cDescriptor_.set_value(config_param::PLACEMENT, placement);
+ r2cDescriptor_.commit(queue_);
+ }
+ catch (oneapi::mkl::exception& exc)
+ {
+ GMX_THROW(InternalError(
+ formatString("MKL failure while configuring R2C descriptor: %s", exc.what())));
+ }
+
+ try
+ {
+ using oneapi::mkl::dft::config_param;
+ c2rDescriptor_.set_value(config_param::INPUT_STRIDES, complexGridStrides.data());
+ c2rDescriptor_.set_value(config_param::OUTPUT_STRIDES, realGridStrides.data());
+ c2rDescriptor_.set_value(config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
+ c2rDescriptor_.set_value(config_param::PLACEMENT, placement);
+ c2rDescriptor_.commit(queue_);
+ }
+ catch (oneapi::mkl::exception& exc)
+ {
+ GMX_THROW(InternalError(
+ formatString("MKL failure while configuring C2R descriptor: %s", exc.what())));
+ }
+}
+
+Gpu3dFft::ImplSyclMkl::~ImplSyclMkl() = default;
+
+void Gpu3dFft::ImplSyclMkl::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+{
+ switch (dir)
+ {
+ case GMX_FFT_REAL_TO_COMPLEX:
+ try
+ {
+ oneapi::mkl::dft::compute_forward(r2cDescriptor_, realGrid_, complexGrid_);
+ }
+ catch (oneapi::mkl::exception& exc)
+ {
+ GMX_THROW(InternalError(
+ formatString("MKL failure while executing R2C transform: %s", exc.what())));
+ }
+ break;
+ case GMX_FFT_COMPLEX_TO_REAL:
+ try
+ {
+ oneapi::mkl::dft::compute_backward(c2rDescriptor_, complexGrid_, realGrid_);
+ }
+ catch (oneapi::mkl::exception& exc)
+ {
+ GMX_THROW(InternalError(
+ formatString("MKL failure while executing C2R transform: %s", exc.what())));
+ }
+ break;
+ default:
+ GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
+ }
+}
+
+} // namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the GPU 3D FFT routines.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_SYCL_MKL_H
+#define GMX_FFT_GPU_3DFFT_SYCL_MKL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include <oneapi/mkl/dfti.hpp>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using SYCL + MKL.
+ */
+class Gpu3dFft::ImplSyclMkl : public Gpu3dFft::Impl
+{
+public:
+ //! \copydoc Gpu3dFft::Impl::Impl
+ ImplSyclMkl(bool allocateGrids,
+ MPI_Comm comm,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ int nz,
+ bool performOutOfPlaceFFT,
+ const DeviceContext& context,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid);
+
+ //! \copydoc Gpu3dFft::Impl::~Impl
+ ~ImplSyclMkl() override;
+
+ //! \copydoc Gpu3dFft::Impl::perform3dFft
+ void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+ // Shorthand for the FFT descriptor (a.k.a. plan) type
+ using Descriptor =
+ oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>;
+
+ cl::sycl::buffer<float, 1> realGrid_;
+ cl::sycl::buffer<float, 1> complexGrid_;
+ cl::sycl::queue queue_;
+ Descriptor r2cDescriptor_, c2rDescriptor_;
+
+ static Descriptor initDescriptor(const ivec realGridSize);
+};
+
+} // namespace gmx
+
+#endif
checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
}
-#if GMX_GPU_CUDA || GMX_GPU_OPENCL || (GMX_GPU_SYCL && GMX_SYCL_HIPSYCL)
+#if GMX_GPU_CUDA || GMX_GPU_OPENCL \
+ || (GMX_GPU_SYCL && (GMX_SYCL_HIPSYCL || (GMX_SYCL_DPCPP && GMX_FFT_MKL)))
TEST_F(FFTTest3D, GpuReal5_6_9)
{
// Ensure library resources are managed appropriately
// Use std::copy to convert from double to real easily
std::copy(inputdata, inputdata + sizeInReals, in_.begin());
+ // DPCPP uses oneMKL, which seems to have troubles with out-of-place transforms
+ const bool performOutOfPlaceFFT = !GMX_SYCL_DPCPP;
+
SCOPED_TRACE("Allocating the device buffers");
DeviceBuffer<float> realGrid, complexGrid;
allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
- allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
+ if (performOutOfPlaceFFT)
+ {
+ allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
+ }
# if GMX_GPU_CUDA
const FftBackend backend = FftBackend::Cufft;
# elif GMX_GPU_SYCL
# if GMX_SYCL_HIPSYCL
const FftBackend backend = FftBackend::SyclRocfft;
+# elif GMX_SYCL_DPCPP && GMX_FFT_MKL
+ const FftBackend backend = FftBackend::SyclMkl;
# endif
# endif
- const bool performOutOfPlaceFFT = true;
MPI_Comm comm = MPI_COMM_NULL;
const bool allocateGrid = false;
std::array<int, 1> gridSizesInXForEachRank = { 0 };
realGridSizePadded,
complexGridSizePadded,
&realGrid,
- &complexGrid);
+ performOutOfPlaceFFT ? &complexGrid : &realGrid);
// Transfer the real grid input data for the FFT
copyToDeviceBuffer(
// Check the complex grid (NB this data has not been normalized)
copyFromDeviceBuffer(complexGridValues.data(),
- &complexGrid,
+ performOutOfPlaceFFT ? &complexGrid : &realGrid,
0,
complexGridValues.size(),
deviceStream,
checker.checkSequence(
complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
- // Clear the real grid input data for the FFT so we can
- // compute the back transform into it and observe that it did
- // the work expected.
std::vector<float> outputRealGridValues(in_.size());
- copyToDeviceBuffer(&realGrid,
- outputRealGridValues.data(),
- 0,
- outputRealGridValues.size(),
- deviceStream,
- GpuApiCallBehavior::Sync,
- nullptr);
+ if (performOutOfPlaceFFT)
+ {
+ // Clear the real grid input data for the FFT so we can
+ // compute the back transform into it and observe that it did
+ // the work expected.
+ copyToDeviceBuffer(&realGrid,
+ outputRealGridValues.data(),
+ 0,
+ outputRealGridValues.size(),
+ deviceStream,
+ GpuApiCallBehavior::Sync,
+ nullptr);
+ }
SCOPED_TRACE("Doing the back transform");
gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
SCOPED_TRACE("Cleaning up");
freeDeviceBuffer(&realGrid);
- freeDeviceBuffer(&complexGrid);
+ if (performOutOfPlaceFFT)
+ {
+ freeDeviceBuffer(&complexGrid);
+ }
}
}