From: Andrey Alekseenko Date: Thu, 30 Sep 2021 18:27:37 +0000 (+0200) Subject: SYCL: 3D FFT using oneMKL X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=54ff816269ce5d0a3364aeb88ab7c753664088c8;p=alexxy%2Fgromacs.git SYCL: 3D FFT using oneMKL Requires Intel oneAPI binary installation (the open-source version of oneMKL does not support FFT yet). Only in-place transforms are supported, du to a bug in MKL up to, at least, 2021.4.0. Also removes errorneous buffer clearing on "in-place" codepath in FFT tests. Refs #3927. --- diff --git a/cmake/gmxManageSYCL.cmake b/cmake/gmxManageSYCL.cmake index 3ca117d491..79da7359be 100644 --- a/cmake/gmxManageSYCL.cmake +++ b/cmake/gmxManageSYCL.cmake @@ -290,6 +290,11 @@ else() message(FATAL_ERROR "Cannot compile with SYCL Intel compiler. Try a different compiler or disable SYCL.") endif() + include(gmxManageFFTLibraries) + if(NOT GMX_FFT_MKL) + message(WARNING "Building SYCL version with ${GMX_FFT_LIBRARY} instead of MKL. GPU FFT is disabled!") + endif() + # Add function wrapper similar to the one used by ComputeCPP and hipSYCL function(add_sycl_to_target) cmake_parse_arguments( diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index fa750002f5..96dc1f4db9 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -616,7 +616,9 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu) #elif GMX_GPU_OPENCL const gmx::FftBackend backend = gmx::FftBackend::Ocl; #elif GMX_GPU_SYCL -# if GMX_SYCL_HIPSYCL +# if GMX_SYCL_DPCPP && GMX_FFT_MKL + const gmx::FftBackend backend = gmx::FftBackend::SyclMkl; +# elif GMX_SYCL_HIPSYCL const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft; # else const gmx::FftBackend backend = gmx::FftBackend::Sycl; diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt index 13571c1564..6ad3558840 100644 --- a/src/gromacs/fft/CMakeLists.txt +++ b/src/gromacs/fft/CMakeLists.txt @@ -74,17 +74,33 @@ elseif (GMX_GPU_OPENCL) gpu_3dfft_ocl.cpp ) elseif (GMX_GPU_SYCL) - if (GMX_SYCL_HIPSYCL) - set(3dfft_sycl_source gpu_3dfft_sycl_rocfft.cpp) - else() - set(3dfft_sycl_source gpu_3dfft.cpp gpu_3dfft_sycl.cpp) + if (NOT GMX_GPU_HIPSYCL AND GMX_FFT_MKL) + gmx_add_libgromacs_sources( + gpu_3dfft_sycl_mkl.cpp + ) + _gmx_add_files_to_property(SYCL_SOURCES + gpu_3dfft_sycl_mkl.cpp + ) + endif() + if (GMX_GPU_HIPSYCL) + gmx_add_libgromacs_sources( + gpu_3dfft_sycl_rocfft.cpp + ) + _gmx_add_files_to_property(SYCL_SOURCES + gpu_3dfft_sycl_rocfft.cpp + ) endif() gmx_add_libgromacs_sources( - # SYCL-specific sources - ${3dfft_sycl_source} + gpu_3dfft_sycl.cpp ) _gmx_add_files_to_property(SYCL_SOURCES - ${3dfft_sycl_source} + gpu_3dfft_sycl.cpp + gpu_3dfft.cpp + ) +else() + gmx_add_libgromacs_sources( + # Stub sources for CPU-only build + gpu_3dfft.cpp ) endif() diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp index 17ffe13f68..7dd21185e7 100644 --- a/src/gromacs/fft/gpu_3dfft.cpp +++ b/src/gromacs/fft/gpu_3dfft.cpp @@ -52,6 +52,9 @@ # include "gpu_3dfft_ocl.h" #elif GMX_GPU_SYCL # include "gpu_3dfft_sycl.h" +# if GMX_SYCL_DPCPP && GMX_FFT_MKL +# include "gpu_3dfft_sycl_mkl.h" +# endif # if GMX_SYCL_HIPSYCL # include "gpu_3dfft_sycl_rocfft.h" # endif @@ -135,6 +138,23 @@ Gpu3dFft::Gpu3dFft(FftBackend backend, # elif GMX_GPU_SYCL switch (backend) { +# if GMX_SYCL_DPCPP && GMX_FFT_MKL + case FftBackend::SyclMkl: + impl_ = std::make_unique(allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid); + break; +# endif # if GMX_SYCL_HIPSYCL case FftBackend::SyclRocfft: impl_ = std::make_unique(allocateGrids, diff --git a/src/gromacs/fft/gpu_3dfft.h b/src/gromacs/fft/gpu_3dfft.h index 2643be275f..729dac12e0 100644 --- a/src/gromacs/fft/gpu_3dfft.h +++ b/src/gromacs/fft/gpu_3dfft.h @@ -128,6 +128,7 @@ private: class Impl; class ImplCuFft; class ImplOcl; + class ImplSyclMkl; class ImplSyclRocfft; class ImplSycl; diff --git a/src/gromacs/fft/gpu_3dfft_sycl.cpp b/src/gromacs/fft/gpu_3dfft_sycl.cpp index ff2abfd485..a24c5bb718 100644 --- a/src/gromacs/fft/gpu_3dfft_sycl.cpp +++ b/src/gromacs/fft/gpu_3dfft_sycl.cpp @@ -69,14 +69,14 @@ Gpu3dFft::ImplSycl::ImplSycl(bool /*allocateGrids*/, DeviceBuffer* /*realGrid*/, DeviceBuffer* /*complexGrid*/) { - GMX_THROW(NotImplementedError("GPU 3DFFT is not implemented in SYCL")); + GMX_THROW(NotImplementedError("Using SYCL build without GPU 3DFFT support")); } Gpu3dFft::ImplSycl::~ImplSycl() = default; void Gpu3dFft::ImplSycl::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/) { - GMX_THROW(NotImplementedError("Not implemented on SYCL yet")); + GMX_THROW(NotImplementedError("Using SYCL build without GPU 3DFFT support")); } #pragma clang diagnostic pop diff --git a/src/gromacs/fft/gpu_3dfft_sycl_mkl.cpp b/src/gromacs/fft/gpu_3dfft_sycl_mkl.cpp new file mode 100644 index 0000000000..981f1172fa --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_sycl_mkl.cpp @@ -0,0 +1,209 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements GPU 3D FFT routines for SYCL. + * + * \author Andrey Alekseenko + * \author Mark Abraham + * \ingroup module_fft + * + * In DPC++, we use Intel oneMKL to perform the FFT. It requires using the binary version of + * MKL, since the open-source one does not support FFT yet (https://github.com/oneapi-src/oneMKL/issues/27). + * + * There are issues with out-of-place transform, existing at least in oneAPI 2021.2-2021.4, so + * we allow only in-place transforms. + */ + +#include "gmxpre.h" + +#include "gpu_3dfft_sycl_mkl.h" + +#include "config.h" + +#include "gromacs/gpu_utils/devicebuffer_sycl.h" +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" + +class DeviceContext; + +#if (!GMX_SYCL_DPCPP) +# error This file can only be compiled with Intel DPC++ compiler +#endif + +#if (!GMX_FFT_MKL) +# error Must use MKL library for FFT when compiling with Intel DPC++ compiler +#endif + +#include +#include +#include +#include + +// oneAPI 2021.2.0 to 2021.4.0 have issues with backward out-of-place transform +static constexpr bool sc_mklHasBuggyOutOfPlaceFFT = (INTEL_MKL_VERSION <= 20210004); + +namespace gmx +{ + +Gpu3dFft::ImplSyclMkl::Descriptor Gpu3dFft::ImplSyclMkl::initDescriptor(const ivec realGridSize) +{ + try + { + const std::vector realGridDimensions{ realGridSize[XX], + realGridSize[YY], + realGridSize[ZZ] }; + return { realGridDimensions }; + } + catch (oneapi::mkl::exception& exc) + { + GMX_THROW(InternalError(formatString("MKL failure while constructing descriptor: %s", exc.what()))); + } +} + +Gpu3dFft::ImplSyclMkl::ImplSyclMkl(bool allocateGrids, + MPI_Comm /*comm*/, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int /*nz*/, + const bool performOutOfPlaceFFT, + const DeviceContext& /*context*/, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) : + realGrid_(*realGrid->buffer_), + complexGrid_(*complexGrid->buffer_), + queue_(pmeStream.stream()), + r2cDescriptor_(initDescriptor(realGridSize)), + c2rDescriptor_(initDescriptor(realGridSize)) +{ + GMX_RELEASE_ASSERT(!allocateGrids, "Grids needs to be pre-allocated"); + GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1, + "Multi-rank FFT decomposition not implemented with SYCL MKL backend"); + + GMX_RELEASE_ASSERT(!(sc_mklHasBuggyOutOfPlaceFFT && performOutOfPlaceFFT), + "The version of MKL used does not properly support out-of-place FFTs"); + + GMX_ASSERT(checkDeviceBuffer(*realGrid, + realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]), + "Real grid buffer is too small for the declared padded size"); + + GMX_ASSERT(checkDeviceBuffer(*complexGrid, + complexGridSizePadded[XX] * complexGridSizePadded[YY] + * complexGridSizePadded[ZZ] * 2), + "Complex grid buffer is too small for the declared padded size"); + + // MKL expects row-major + const std::array realGridStrides = { + 0, static_cast(realGridSizePadded[YY] * realGridSizePadded[ZZ]), realGridSizePadded[ZZ], 1 + }; + const std::array complexGridStrides = { + 0, + static_cast(complexGridSizePadded[YY] * complexGridSizePadded[ZZ]), + complexGridSizePadded[ZZ], + 1 + }; + + const auto placement = performOutOfPlaceFFT ? DFTI_NOT_INPLACE : DFTI_INPLACE; + + try + { + using oneapi::mkl::dft::config_param; + r2cDescriptor_.set_value(config_param::INPUT_STRIDES, realGridStrides.data()); + r2cDescriptor_.set_value(config_param::OUTPUT_STRIDES, complexGridStrides.data()); + r2cDescriptor_.set_value(config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + r2cDescriptor_.set_value(config_param::PLACEMENT, placement); + r2cDescriptor_.commit(queue_); + } + catch (oneapi::mkl::exception& exc) + { + GMX_THROW(InternalError( + formatString("MKL failure while configuring R2C descriptor: %s", exc.what()))); + } + + try + { + using oneapi::mkl::dft::config_param; + c2rDescriptor_.set_value(config_param::INPUT_STRIDES, complexGridStrides.data()); + c2rDescriptor_.set_value(config_param::OUTPUT_STRIDES, realGridStrides.data()); + c2rDescriptor_.set_value(config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + c2rDescriptor_.set_value(config_param::PLACEMENT, placement); + c2rDescriptor_.commit(queue_); + } + catch (oneapi::mkl::exception& exc) + { + GMX_THROW(InternalError( + formatString("MKL failure while configuring C2R descriptor: %s", exc.what()))); + } +} + +Gpu3dFft::ImplSyclMkl::~ImplSyclMkl() = default; + +void Gpu3dFft::ImplSyclMkl::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/) +{ + switch (dir) + { + case GMX_FFT_REAL_TO_COMPLEX: + try + { + oneapi::mkl::dft::compute_forward(r2cDescriptor_, realGrid_, complexGrid_); + } + catch (oneapi::mkl::exception& exc) + { + GMX_THROW(InternalError( + formatString("MKL failure while executing R2C transform: %s", exc.what()))); + } + break; + case GMX_FFT_COMPLEX_TO_REAL: + try + { + oneapi::mkl::dft::compute_backward(c2rDescriptor_, complexGrid_, realGrid_); + } + catch (oneapi::mkl::exception& exc) + { + GMX_THROW(InternalError( + formatString("MKL failure while executing C2R transform: %s", exc.what()))); + } + break; + default: + GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs")); + } +} + +} // namespace gmx diff --git a/src/gromacs/fft/gpu_3dfft_sycl_mkl.h b/src/gromacs/fft/gpu_3dfft_sycl_mkl.h new file mode 100644 index 0000000000..08cd488aa8 --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_sycl_mkl.h @@ -0,0 +1,106 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines. + * + * \author Aleksei Iupinov + * \author Mark Abraham + * \author Gaurav Garg + * \ingroup module_fft + */ + +#ifndef GMX_FFT_GPU_3DFFT_SYCL_MKL_H +#define GMX_FFT_GPU_3DFFT_SYCL_MKL_H + +#include "gpu_3dfft_impl.h" + +#include + +#include "gromacs/fft/fft.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gmxsycl.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/gmxmpi.h" + +class DeviceContext; +class DeviceStream; + +namespace gmx +{ + +/*! \internal \brief + * A 3D FFT wrapper class for performing R2C/C2R transforms using SYCL + MKL. + */ +class Gpu3dFft::ImplSyclMkl : public Gpu3dFft::Impl +{ +public: + //! \copydoc Gpu3dFft::Impl::Impl + ImplSyclMkl(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + + //! \copydoc Gpu3dFft::Impl::~Impl + ~ImplSyclMkl() override; + + //! \copydoc Gpu3dFft::Impl::perform3dFft + void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override; + +private: + // Shorthand for the FFT descriptor (a.k.a. plan) type + using Descriptor = + oneapi::mkl::dft::descriptor; + + cl::sycl::buffer realGrid_; + cl::sycl::buffer complexGrid_; + cl::sycl::queue queue_; + Descriptor r2cDescriptor_, c2rDescriptor_; + + static Descriptor initDescriptor(const ivec realGridSize); +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/fft/tests/fft.cpp b/src/gromacs/fft/tests/fft.cpp index dfe7189795..e06d89e0d2 100644 --- a/src/gromacs/fft/tests/fft.cpp +++ b/src/gromacs/fft/tests/fft.cpp @@ -364,7 +364,8 @@ TEST_F(FFTTest3D, Real5_6_9) checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues); } -#if GMX_GPU_CUDA || GMX_GPU_OPENCL || (GMX_GPU_SYCL && GMX_SYCL_HIPSYCL) +#if GMX_GPU_CUDA || GMX_GPU_OPENCL \ + || (GMX_GPU_SYCL && (GMX_SYCL_HIPSYCL || (GMX_SYCL_DPCPP && GMX_FFT_MKL))) TEST_F(FFTTest3D, GpuReal5_6_9) { // Ensure library resources are managed appropriately @@ -397,10 +398,16 @@ TEST_F(FFTTest3D, GpuReal5_6_9) // Use std::copy to convert from double to real easily std::copy(inputdata, inputdata + sizeInReals, in_.begin()); + // DPCPP uses oneMKL, which seems to have troubles with out-of-place transforms + const bool performOutOfPlaceFFT = !GMX_SYCL_DPCPP; + SCOPED_TRACE("Allocating the device buffers"); DeviceBuffer realGrid, complexGrid; allocateDeviceBuffer(&realGrid, in_.size(), deviceContext); - allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext); + if (performOutOfPlaceFFT) + { + allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext); + } # if GMX_GPU_CUDA const FftBackend backend = FftBackend::Cufft; @@ -409,9 +416,10 @@ TEST_F(FFTTest3D, GpuReal5_6_9) # elif GMX_GPU_SYCL # if GMX_SYCL_HIPSYCL const FftBackend backend = FftBackend::SyclRocfft; +# elif GMX_SYCL_DPCPP && GMX_FFT_MKL + const FftBackend backend = FftBackend::SyclMkl; # endif # endif - const bool performOutOfPlaceFFT = true; MPI_Comm comm = MPI_COMM_NULL; const bool allocateGrid = false; std::array gridSizesInXForEachRank = { 0 }; @@ -430,7 +438,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9) realGridSizePadded, complexGridSizePadded, &realGrid, - &complexGrid); + performOutOfPlaceFFT ? &complexGrid : &realGrid); // Transfer the real grid input data for the FFT copyToDeviceBuffer( @@ -443,7 +451,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9) // Check the complex grid (NB this data has not been normalized) copyFromDeviceBuffer(complexGridValues.data(), - &complexGrid, + performOutOfPlaceFFT ? &complexGrid : &realGrid, 0, complexGridValues.size(), deviceStream, @@ -452,17 +460,20 @@ TEST_F(FFTTest3D, GpuReal5_6_9) checker.checkSequence( complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex"); - // Clear the real grid input data for the FFT so we can - // compute the back transform into it and observe that it did - // the work expected. std::vector outputRealGridValues(in_.size()); - copyToDeviceBuffer(&realGrid, - outputRealGridValues.data(), - 0, - outputRealGridValues.size(), - deviceStream, - GpuApiCallBehavior::Sync, - nullptr); + if (performOutOfPlaceFFT) + { + // Clear the real grid input data for the FFT so we can + // compute the back transform into it and observe that it did + // the work expected. + copyToDeviceBuffer(&realGrid, + outputRealGridValues.data(), + 0, + outputRealGridValues.size(), + deviceStream, + GpuApiCallBehavior::Sync, + nullptr); + } SCOPED_TRACE("Doing the back transform"); gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent); @@ -481,7 +492,10 @@ TEST_F(FFTTest3D, GpuReal5_6_9) SCOPED_TRACE("Cleaning up"); freeDeviceBuffer(&realGrid); - freeDeviceBuffer(&complexGrid); + if (performOutOfPlaceFFT) + { + freeDeviceBuffer(&complexGrid); + } } }