From 4a4ade895955cba997b6b4b8136bffe36697178d Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Wed, 8 Sep 2021 18:40:06 +0000 Subject: [PATCH] Redesign GPU FFT abstraction - Modify interface to allow distributed FFT implementation in future - Provide support for choosing FFT backend at runtime. E.g. CUFFT backend can be instantiated for single-GPU FFT but HeFFTe can be instantiated in case PME decomposition is used. This is a pre-requisite for GPU PME-decomposition implementation. Refs #3884 --- src/gromacs/ewald/pme_gpu_internal.cpp | 37 ++++-- src/gromacs/fft/CMakeLists.txt | 13 +- src/gromacs/fft/gpu_3dfft.cpp | 120 ++++++++++++++++-- src/gromacs/fft/gpu_3dfft.h | 64 +++++++--- .../fft/{gpu_3dfft.cu => gpu_3dfft_cufft.cu} | 87 ++++--------- src/gromacs/fft/gpu_3dfft_cufft.h | 100 +++++++++++++++ src/gromacs/fft/gpu_3dfft_impl.h | 103 +++++++++++++++ src/gromacs/fft/gpu_3dfft_ocl.cpp | 95 ++++---------- src/gromacs/fft/gpu_3dfft_ocl.h | 101 +++++++++++++++ src/gromacs/fft/gpu_3dfft_sycl.cpp | 33 ++--- src/gromacs/fft/gpu_3dfft_sycl.h | 91 +++++++++++++ src/gromacs/fft/tests/fft.cpp | 30 +++-- src/gromacs/utility/binaryinformation.cpp | 36 +++++- 13 files changed, 709 insertions(+), 201 deletions(-) rename src/gromacs/fft/{gpu_3dfft.cu => gpu_3dfft_cufft.cu} (66%) create mode 100644 src/gromacs/fft/gpu_3dfft_cufft.h create mode 100644 src/gromacs/fft/gpu_3dfft_impl.h create mode 100644 src/gromacs/fft/gpu_3dfft_ocl.h create mode 100644 src/gromacs/fft/gpu_3dfft_sycl.h diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index cee5a349c8..e9998c803e 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -606,21 +606,40 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu) if (pme_gpu_settings(pmeGpu).performGPUFFT) { pmeGpu->archSpecific->fftSetup.resize(0); - const bool useDecomposition = pme_gpu_settings(pmeGpu).useDecomposition; - const bool performOutOfPlaceFFT = 
pmeGpu->archSpecific->performOutOfPlaceFFT; - PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid; + const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT; + const bool allocateGrid = false; + MPI_Comm comm = MPI_COMM_NULL; + std::array gridOffsetsInXForEachRank = { 0 }; + std::array gridOffsetsInYForEachRank = { 0 }; +#if GMX_GPU_CUDA + const gmx::FftBackend backend = gmx::FftBackend::Cufft; +#elif GMX_GPU_OPENCL + const gmx::FftBackend backend = gmx::FftBackend::Ocl; +#elif GMX_GPU_SYCL + const gmx::FftBackend backend = gmx::FftBackend::Sycl; +#else + GMX_RELEASE_ASSERT(false, "Unknown GPU backend"); + const gmx::FftBackend backend = gmx::FftBackend::Count; +#endif + + PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid; for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { pmeGpu->archSpecific->fftSetup.push_back( - std::make_unique(grid.realGridSize, - grid.realGridSizePadded, - grid.complexGridSizePadded, - useDecomposition, + std::make_unique(backend, + allocateGrid, + comm, + gridOffsetsInXForEachRank, + gridOffsetsInYForEachRank, + grid.realGridSize[ZZ], performOutOfPlaceFFT, pmeGpu->archSpecific->deviceContext_, pmeGpu->archSpecific->pmeStream_, - grid.d_realGrid[gridIndex], - grid.d_fourierGrid[gridIndex])); + grid.realGridSize, + grid.realGridSizePadded, + grid.complexGridSizePadded, + &(grid.d_realGrid[gridIndex]), + &(grid.d_fourierGrid[gridIndex]))); } } } diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt index 70dd951e6f..73a6d4e5cd 100644 --- a/src/gromacs/fft/CMakeLists.txt +++ b/src/gromacs/fft/CMakeLists.txt @@ -40,6 +40,7 @@ gmx_add_libgromacs_sources( fft.cpp fft5d.cpp parallel_3dfft.cpp + gpu_3dfft.cpp ) if (GMX_FFT_FFTPACK) @@ -57,7 +58,11 @@ endif() if (GMX_GPU_CUDA) gmx_add_libgromacs_sources( # CUDA-specific sources - gpu_3dfft.cu + gpu_3dfft_cufft.cu + ) + _gmx_add_files_to_property(CUDA_SOURCES + # Must add these files so 
they can include cuda_runtime.h + gpu_3dfft.cpp ) elseif (GMX_GPU_OPENCL) gmx_add_libgromacs_sources( @@ -71,12 +76,8 @@ elseif (GMX_GPU_SYCL) gpu_3dfft_sycl.cpp ) _gmx_add_files_to_property(SYCL_SOURCES - gpu_3dfft_sycl.cpp - ) -else() - gmx_add_libgromacs_sources( - # Stub sources for CPU-only build gpu_3dfft.cpp + gpu_3dfft_sycl.cpp ) endif() diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp index 3b896eb796..c027c5d08a 100644 --- a/src/gromacs/fft/gpu_3dfft.cpp +++ b/src/gromacs/fft/gpu_3dfft.cpp @@ -37,13 +37,24 @@ * \brief Implements stub GPU 3D FFT routines for CPU-only builds * * \author Mark Abraham + * \author Gaurav Garg * \ingroup module_fft */ #include "gmxpre.h" #include "gpu_3dfft.h" +#include "gpu_3dfft_impl.h" +#if GMX_GPU_CUDA +# include "gpu_3dfft_cufft.h" +#elif GMX_GPU_OPENCL +# include "gpu_3dfft_ocl.h" +#elif GMX_GPU_SYCL +# include "gpu_3dfft_sycl.h" +#endif + +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" namespace gmx @@ -55,29 +66,114 @@ namespace gmx # pragma clang diagnostic ignored "-Wmissing-noreturn" #endif -class Gpu3dFft::Impl +#if (GMX_GPU_CUDA || GMX_GPU_OPENCL || GMX_GPU_SYCL) + +Gpu3dFft::Gpu3dFft(FftBackend backend, + bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + const int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) { -}; +# if GMX_GPU_CUDA + switch (backend) + { + case FftBackend::Cufft: + impl_ = std::make_unique(allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid); + break; + default: GMX_THROW(InternalError("Unsupported FFT backend requested")); + } +# 
elif GMX_GPU_OPENCL + switch (backend) + { + case FftBackend::Ocl: + impl_ = std::make_unique(allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid); + break; + default: GMX_THROW(InternalError("Unsupported FFT backend requested")); + } +# elif GMX_GPU_SYCL + switch (backend) + { + case FftBackend::Sycl: + impl_ = std::make_unique(allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid); + break; + default: GMX_THROW(InternalError("Unsupported FFT backend requested")); + } +# endif +} -Gpu3dFft::Gpu3dFft(ivec /*realGridSize*/, - ivec /*realGridSizePadded*/, - ivec /*complexGridSizePadded*/, - const bool /*useDecomposition*/, - const bool /*performOutOfPlaceFFT*/, +#else + +Gpu3dFft::Gpu3dFft(FftBackend /*backend */, + bool /*allocateGrids*/, + MPI_Comm /*comm*/, + ArrayRef /*gridSizesInXForEachRank*/, + ArrayRef /*gridSizesInYForEachRank*/, + const int /*nz*/, + bool /*performOutOfPlaceFFT*/, const DeviceContext& /*context*/, const DeviceStream& /*pmeStream*/, - DeviceBuffer /*realGrid*/, - DeviceBuffer /*complexGrid*/) + ivec /*realGridSize*/, + ivec /*realGridSizePadded*/, + ivec /*complexGridSizePadded*/, + DeviceBuffer* /*realGrid*/, + DeviceBuffer* /*complexGrid*/) { GMX_THROW(InternalError("Cannot run GPU routines in a CPU-only configuration")); } +#endif + Gpu3dFft::~Gpu3dFft() = default; -// NOLINTNEXTLINE readability-convert-member-functions-to-static -void Gpu3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/) +void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) { - GMX_THROW(InternalError("Cannot run GPU routines in a CPU-only configuration")); + GMX_RELEASE_ASSERT(impl_ != 
nullptr, "Cannot run GPU routines in a CPU-only configuration"); + impl_->perform3dFft(dir, timingEvent); } #ifdef __clang__ diff --git a/src/gromacs/fft/gpu_3dfft.h b/src/gromacs/fft/gpu_3dfft.h index 65d3f6f03b..7b2c637654 100644 --- a/src/gromacs/fft/gpu_3dfft.h +++ b/src/gromacs/fft/gpu_3dfft.h @@ -38,6 +38,7 @@ * * \author Aleksei Iupinov * \author Mark Abraham + * \author Gaurav Garg * \ingroup module_fft */ @@ -49,6 +50,7 @@ #include "gromacs/fft/fft.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/gmxmpi.h" class DeviceContext; class DeviceStream; @@ -56,35 +58,59 @@ class DeviceStream; namespace gmx { +template +class ArrayRef; + +/*! \internal \brief + * Enum specifying all GPU FFT backends supported by GROMACS + * Some of the backends support only single GPU, some only multi-node, multi-GPU + */ +enum class FftBackend +{ + Cufft, // supports only single-GPU + Ocl, // supports only single-GPU + Sycl, // Not supported currently + Count +}; + /*! \internal \brief * A 3D FFT class for performing R2C/C2R transforms - * \todo Make this class actually parallel over multiple GPUs */ class Gpu3dFft { public: /*! \brief - * Constructs GPU FFT plans for performing 3D FFT on a PME grid. + * Construct 3D FFT object for given backend * - * \param[in] realGridSize Dimensions of the real grid - * \param[in] realGridSizePadded Dimensions of the real grid with padding - * \param[in] complexGridSizePadded Dimensions of the real grid with padding - * \param[in] useDecomposition Whether PME decomposition will be used - * \param[in] performOutOfPlaceFFT Whether the FFT will be performed out-of-place - * \param[in] context GPU context. - * \param[in] pmeStream GPU stream for PME. 
- * \param[in] realGrid Device buffer of floats for the real grid - * \param[in] complexGrid Device buffer of complex floats for the complex grid + * \param[in] backend FFT backend to be instantiated + * \param[in] allocateGrids True if fft grids are to be allocated, false if pre-allocated + * \param[in] comm MPI communicator, used with distributed-FFT backends + * \param[in] gridSizesInXForEachRank Number of grid points used with each rank in X-dimension + * \param[in] gridSizesInYForEachRank Number of grid points used with each rank in Y-dimension + * \param[in] nz Grid dimension in Z + * \param[in] performOutOfPlaceFFT Whether the FFT will be performed out-of-place + * \param[in] context GPU context. + * \param[in] pmeStream GPU stream for PME. + * \param[in,out] realGridSize Dimensions of the local real grid, out if allocateGrids=true + * \param[in,out] realGridSizePadded Dimensions of the local real grid with padding, out if allocateGrids=true + * \param[in,out] complexGridSizePadded Dimensions of the local complex grid with padding, out if allocateGrids=true + * \param[in,out] realGrid Device buffer of floats for the local real grid, out if allocateGrids=true + * \param[in,out] complexGrid Device buffer of complex floats for the local complex grid, out if allocateGrids=true */ - Gpu3dFft(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - bool useDecomposition, + Gpu3dFft(FftBackend backend, + bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, bool performOutOfPlaceFFT, const DeviceContext& context, const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid); + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); /*! \brief Destroys the FFT plans. 
*/ ~Gpu3dFft(); @@ -97,6 +123,10 @@ public: private: class Impl; + class ImplCuFft; + class ImplOcl; + class ImplSycl; + std::unique_ptr impl_; }; diff --git a/src/gromacs/fft/gpu_3dfft.cu b/src/gromacs/fft/gpu_3dfft_cufft.cu similarity index 66% rename from src/gromacs/fft/gpu_3dfft.cu rename to src/gromacs/fft/gpu_3dfft_cufft.cu index 78f3ba90dc..5ccdb9842e 100644 --- a/src/gromacs/fft/gpu_3dfft.cu +++ b/src/gromacs/fft/gpu_3dfft_cufft.cu @@ -43,37 +43,15 @@ #include "gmxpre.h" -#include "gpu_3dfft.h" - -#include +#include "gpu_3dfft_cufft.h" #include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" namespace gmx { - -class Gpu3dFft::Impl -{ -public: - Impl(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - bool useDecomposition, - bool performOutOfPlaceFFT, - const DeviceContext& context, - const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid); - ~Impl(); - - cufftHandle planR2C_; - cufftHandle planC2R_; - cufftReal* realGrid_; - cufftComplex* complexGrid_; -}; - static void handleCufftError(cufftResult_t status, const char* msg) { if (status != CUFFT_SUCCESS) @@ -82,19 +60,25 @@ static void handleCufftError(cufftResult_t status, const char* msg) } } -Gpu3dFft::Impl::Impl(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - const bool useDecomposition, - const bool /*performOutOfPlaceFFT*/, - const DeviceContext& /*context*/, - const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid) : - realGrid_(reinterpret_cast(realGrid)), - complexGrid_(reinterpret_cast(complexGrid)) +Gpu3dFft::ImplCuFft::ImplCuFft(bool allocateGrids, + MPI_Comm /*comm*/, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + const int /*nz*/, + bool /*performOutOfPlaceFFT*/, + const DeviceContext& /*context*/, + const DeviceStream& pmeStream, + ivec 
realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) : + realGrid_(reinterpret_cast(*realGrid)), + complexGrid_(reinterpret_cast(*complexGrid)) { - GMX_RELEASE_ASSERT(!useDecomposition, "FFT decomposition not implemented"); + GMX_RELEASE_ASSERT(allocateGrids == false, "Grids needs to be pre-allocated"); + GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1, + "FFT decomposition not implemented with cuFFT backend"); const int complexGridSizePaddedTotal = complexGridSizePadded[XX] * complexGridSizePadded[YY] * complexGridSizePadded[ZZ]; @@ -151,7 +135,7 @@ Gpu3dFft::Impl::Impl(ivec realGridSize, handleCufftError(result, "cufftSetStream C2R failure"); } -Gpu3dFft::Impl::~Impl() +Gpu3dFft::ImplCuFft::~ImplCuFft() { cufftResult_t result; result = cufftDestroy(planR2C_); @@ -160,42 +144,19 @@ Gpu3dFft::Impl::~Impl() handleCufftError(result, "cufftDestroy C2R failure"); } -void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/) +void Gpu3dFft::ImplCuFft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/) { cufftResult_t result; if (dir == GMX_FFT_REAL_TO_COMPLEX) { - result = cufftExecR2C(impl_->planR2C_, impl_->realGrid_, impl_->complexGrid_); + result = cufftExecR2C(planR2C_, realGrid_, complexGrid_); handleCufftError(result, "cuFFT R2C execution failure"); } else { - result = cufftExecC2R(impl_->planC2R_, impl_->complexGrid_, impl_->realGrid_); + result = cufftExecC2R(planC2R_, complexGrid_, realGrid_); handleCufftError(result, "cuFFT C2R execution failure"); } } -Gpu3dFft::Gpu3dFft(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - const bool useDecomposition, - const bool performOutOfPlaceFFT, - const DeviceContext& context, - const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid) : - impl_(std::make_unique(realGridSize, - realGridSizePadded, - 
complexGridSizePadded, - useDecomposition, - performOutOfPlaceFFT, - context, - pmeStream, - realGrid, - complexGrid)) -{ -} - -Gpu3dFft::~Gpu3dFft() = default; - } // namespace gmx diff --git a/src/gromacs/fft/gpu_3dfft_cufft.h b/src/gromacs/fft/gpu_3dfft_cufft.h new file mode 100644 index 0000000000..5ebdd16820 --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_cufft.h @@ -0,0 +1,100 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. 
+ * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines. + * + * \author Aleksei Iupinov + * \author Mark Abraham + * \author Gaurav Garg + * \ingroup module_fft + */ + +#ifndef GMX_FFT_GPU_3DFFT_CUFFT_H +#define GMX_FFT_GPU_3DFFT_CUFFT_H + +#include + +#include "gromacs/fft/fft.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/gmxmpi.h" +#include "gpu_3dfft_impl.h" + +#include + +class DeviceContext; +class DeviceStream; + +namespace gmx +{ + +/*! \internal \brief + * A 3D FFT wrapper class for performing R2C/C2R transforms using cuFFT + */ +class Gpu3dFft::ImplCuFft : public Gpu3dFft::Impl +{ +public: + //! \copydoc Gpu3dFft::Impl::Impl + ImplCuFft(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + + //! \copydoc Gpu3dFft::Impl::~Impl + ~ImplCuFft() override; + + //! \copydoc Gpu3dFft::Impl::perform3dFft + void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override; + +private: + cufftHandle planR2C_; + cufftHandle planC2R_; + cufftReal* realGrid_; + cufftComplex* complexGrid_; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/fft/gpu_3dfft_impl.h b/src/gromacs/fft/gpu_3dfft_impl.h new file mode 100644 index 0000000000..2e95cc012e --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_impl.h @@ -0,0 +1,103 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines. 
+ * + * \author Gaurav Garg + * \ingroup module_fft + */ + +#ifndef GMX_FFT_GPU_3DFFT_IMPL_H +#define GMX_FFT_GPU_3DFFT_IMPL_H + +#include "gromacs/fft/fft.h" +#include "gromacs/fft/gpu_3dfft.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" + + +namespace gmx +{ +/*! \internal \brief + * Impl base class for all FFT backends + */ +class Gpu3dFft::Impl +{ +public: + //! Default constructor + Impl() = default; + + /*! \brief + * Constructs GPU FFT plans for performing 3D FFT on a PME grid. + * + * \param[in] allocateGrids True if fft grids are to be allocated, false if pre-allocated + * \param[in] comm MPI communicator, used with distributed-FFT backends + * \param[in] gridSizesInXForEachRank Number of grid points used with each rank in X-dimension + * \param[in] gridSizesInYForEachRank Number of grid points used with each rank in Y-dimension + * \param[in] nz Grid dimension in Z + * \param[in] performOutOfPlaceFFT Whether the FFT will be performed out-of-place + * \param[in] context GPU context. + * \param[in] pmeStream GPU stream for PME. 
+ * \param[in,out] realGridSize Dimensions of the local real grid, out if allocateGrids=true + * \param[in,out] realGridSizePadded Dimensions of the local real grid with padding, out if allocateGrids=true + * \param[in,out] complexGridSizePadded Dimensions of the local complex grid with padding, out if allocateGrids=true + * \param[in,out] realGrid Device buffer of floats for the local real grid, out if allocateGrids=true + * \param[in,out] complexGrid Device buffer of complex floats for the local complex grid, out if allocateGrids=true + */ + Impl(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + + /*! \brief Default destructor */ + virtual ~Impl() = default; + + //! \copydoc Gpu3dFft::perform3dFft + virtual void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) = 0; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/fft/gpu_3dfft_ocl.cpp b/src/gromacs/fft/gpu_3dfft_ocl.cpp index 69a4497445..76ae535608 100644 --- a/src/gromacs/fft/gpu_3dfft_ocl.cpp +++ b/src/gromacs/fft/gpu_3dfft_ocl.cpp @@ -43,7 +43,7 @@ #include "gmxpre.h" -#include "gpu_3dfft.h" +#include "gpu_3dfft_ocl.h" #include #include @@ -59,28 +59,6 @@ namespace gmx { - -class Gpu3dFft::Impl -{ -public: - Impl(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - bool useDecomposition, - bool performOutOfPlaceFFT, - const DeviceContext& context, - const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid); - ~Impl(); - - clfftPlanHandle planR2C_; - clfftPlanHandle planC2R_; - std::vector commandStreams_; - cl_mem realGrid_; - cl_mem complexGrid_; -}; - //! 
Throws the exception on clFFT error static void handleClfftError(clfftStatus status, const char* msg) { @@ -91,18 +69,24 @@ static void handleClfftError(clfftStatus status, const char* msg) } } -Gpu3dFft::Impl::Impl(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - const bool useDecomposition, - const bool performOutOfPlaceFFT, - const DeviceContext& context, - const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid) : - realGrid_(realGrid), complexGrid_(complexGrid) +Gpu3dFft::ImplOcl::ImplOcl(bool allocateGrids, + MPI_Comm /*comm*/, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + const int /*nz*/, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) : + realGrid_(*realGrid), complexGrid_(*complexGrid) { - GMX_RELEASE_ASSERT(!useDecomposition, "FFT decomposition not implemented"); + GMX_RELEASE_ASSERT(allocateGrids == false, "Grids needs to be pre-allocated"); + GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1, + "FFT decomposition not implemented with OpenCL backend"); cl_context clContext = context.context(); commandStreams_.push_back(pmeStream.stream()); @@ -157,13 +141,13 @@ Gpu3dFft::Impl::Impl(ivec realGridSize, // TODO: disable last transpose (clfftSetPlanTransposeResult) } -Gpu3dFft::Impl::~Impl() +Gpu3dFft::ImplOcl::~ImplOcl() { clfftDestroyPlan(&planR2C_); clfftDestroyPlan(&planC2R_); } -void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) +void Gpu3dFft::ImplOcl::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) { cl_mem tempBuffer = nullptr; constexpr std::array waitEvents{ {} }; @@ -175,24 +159,24 @@ void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) switch (dir) { case 
GMX_FFT_REAL_TO_COMPLEX: - plan = impl_->planR2C_; + plan = planR2C_; direction = CLFFT_FORWARD; - inputGrids = &impl_->realGrid_; - outputGrids = &impl_->complexGrid_; + inputGrids = &realGrid_; + outputGrids = &complexGrid_; break; case GMX_FFT_COMPLEX_TO_REAL: - plan = impl_->planC2R_; + plan = planC2R_; direction = CLFFT_BACKWARD; - inputGrids = &impl_->complexGrid_; - outputGrids = &impl_->realGrid_; + inputGrids = &complexGrid_; + outputGrids = &realGrid_; break; default: GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs")); } handleClfftError(clfftEnqueueTransform(plan, direction, - impl_->commandStreams_.size(), - impl_->commandStreams_.data(), + commandStreams_.size(), + commandStreams_.data(), waitEvents.size(), waitEvents.data(), timingEvent, @@ -202,27 +186,4 @@ void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) "clFFT execution failure"); } -Gpu3dFft::Gpu3dFft(ivec realGridSize, - ivec realGridSizePadded, - ivec complexGridSizePadded, - const bool useDecomposition, - const bool performOutOfPlaceFFT, - const DeviceContext& context, - const DeviceStream& pmeStream, - DeviceBuffer realGrid, - DeviceBuffer complexGrid) : - impl_(std::make_unique(realGridSize, - realGridSizePadded, - complexGridSizePadded, - useDecomposition, - performOutOfPlaceFFT, - context, - pmeStream, - realGrid, - complexGrid)) -{ -} - -Gpu3dFft::~Gpu3dFft() = default; - } // namespace gmx diff --git a/src/gromacs/fft/gpu_3dfft_ocl.h b/src/gromacs/fft/gpu_3dfft_ocl.h new file mode 100644 index 0000000000..0ffb042069 --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_ocl.h @@ -0,0 +1,101 @@ +/* + * This file is part of the GROMACS molecular simulation package. 
+ * + * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines. 
+ * + * \author Aleksei Iupinov + * \author Mark Abraham + * \author Gaurav Garg + * \ingroup module_fft + */ + +#ifndef GMX_FFT_GPU_3DFFT_OCL_H +#define GMX_FFT_GPU_3DFFT_OCL_H + +#include "gpu_3dfft_impl.h" + +#include "gromacs/fft/fft.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/gmxmpi.h" + +#include + +class DeviceContext; +class DeviceStream; + +namespace gmx +{ + +/*! \internal \brief + * A 3D FFT wrapper class for performing R2C/C2R transforms using clFFT + */ +class Gpu3dFft::ImplOcl : public Gpu3dFft::Impl +{ +public: + //! \copydoc Gpu3dFft::Impl::Impl + ImplOcl(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + + //! \copydoc Gpu3dFft::Impl::~Impl + ~ImplOcl() override; + + //! 
\copydoc Gpu3dFft::Impl::perform3dFft + void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override; + +private: + clfftPlanHandle planR2C_; + clfftPlanHandle planC2R_; + std::vector commandStreams_; + cl_mem realGrid_; + cl_mem complexGrid_; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/fft/gpu_3dfft_sycl.cpp b/src/gromacs/fft/gpu_3dfft_sycl.cpp index 394aaac5a4..ff2abfd485 100644 --- a/src/gromacs/fft/gpu_3dfft_sycl.cpp +++ b/src/gromacs/fft/gpu_3dfft_sycl.cpp @@ -43,8 +43,9 @@ #include "gmxpre.h" -#include "gpu_3dfft.h" +#include "gpu_3dfft_sycl.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" namespace gmx @@ -54,26 +55,26 @@ namespace gmx #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wmissing-noreturn" -class Gpu3dFft::Impl -{ -}; - -Gpu3dFft::Gpu3dFft(ivec /*realGridSize*/, - ivec /*realGridSizePadded*/, - ivec /*complexGridSizePadded*/, - const bool /*useDecomposition*/, - const bool /*performOutOfPlaceFFT*/, - const DeviceContext& /*context*/, - const DeviceStream& /*pmeStream*/, - DeviceBuffer /*realGrid*/, - DeviceBuffer /*complexGrid*/) +Gpu3dFft::ImplSycl::ImplSycl(bool /*allocateGrids*/, + MPI_Comm /*comm*/, + ArrayRef /*gridSizesInXForEachRank*/, + ArrayRef /*gridSizesInYForEachRank*/, + const int /*nz*/, + bool /*performOutOfPlaceFFT*/, + const DeviceContext& /*context*/, + const DeviceStream& /*pmeStream*/, + ivec /*realGridSize*/, + ivec /*realGridSizePadded*/, + ivec /*complexGridSizePadded*/, + DeviceBuffer* /*realGrid*/, + DeviceBuffer* /*complexGrid*/) { GMX_THROW(NotImplementedError("GPU 3DFFT is not implemented in SYCL")); } -Gpu3dFft::~Gpu3dFft() = default; +Gpu3dFft::ImplSycl::~ImplSycl() = default; -void Gpu3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/) +void Gpu3dFft::ImplSycl::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/) { GMX_THROW(NotImplementedError("Not implemented on SYCL yet")); } diff --git 
a/src/gromacs/fft/gpu_3dfft_sycl.h b/src/gromacs/fft/gpu_3dfft_sycl.h new file mode 100644 index 0000000000..8bc398b69e --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_sycl.h @@ -0,0 +1,91 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines. 
+ *
+ * \author Aleksei Iupinov
+ * \author Mark Abraham
+ * \author Gaurav Garg
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_SYCL_H
+#define GMX_FFT_GPU_3DFFT_SYCL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for R2C/C2R transforms using SYCL. Not yet implemented: the constructor and perform3dFft() throw NotImplementedError
+ */
+class Gpu3dFft::ImplSycl : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplSycl(bool allocateGrids,
+             MPI_Comm comm,
+             ArrayRef<const int> gridSizesInXForEachRank,
+             ArrayRef<const int> gridSizesInYForEachRank,
+             int nz,
+             bool performOutOfPlaceFFT,
+             const DeviceContext& context,
+             const DeviceStream& pmeStream,
+             ivec realGridSize,
+             ivec realGridSizePadded,
+             ivec complexGridSizePadded,
+             DeviceBuffer<float>* realGrid,
+             DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplSycl() override;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/fft/tests/fft.cpp b/src/gromacs/fft/tests/fft.cpp
index 7d0c7186d4..ce3717e210 100644
--- a/src/gromacs/fft/tests/fft.cpp
+++ b/src/gromacs/fft/tests/fft.cpp
@@ -400,17 +400,31 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
         allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
 
-        const bool useDecomposition = false;
-        const bool performOutOfPlaceFFT = true;
-        Gpu3dFft gpu3dFft(realGridSize,
-                          realGridSizePadded,
-                          complexGridSizePadded,
-                          useDecomposition,
+# if GMX_GPU_CUDA
+        const FftBackend backend = FftBackend::Cufft;
+# elif GMX_GPU_OPENCL
+        const FftBackend backend = FftBackend::Ocl;
+# endif
+        const bool performOutOfPlaceFFT = true;
+        const MPI_Comm comm = MPI_COMM_NULL;
+        const bool allocateGrid = false;
+        std::array<int, 1> gridSizesInXForEachRank = { 0 };
+        std::array<int, 1> gridSizesInYForEachRank = { 0 };
+        const int nz = realGridSize[ZZ];
+        Gpu3dFft gpu3dFft(backend,
+                          allocateGrid,
+                          comm,
+                          gridSizesInXForEachRank,
+                          gridSizesInYForEachRank,
+                          nz,
                           performOutOfPlaceFFT,
                           deviceContext,
                           deviceStream,
-                          realGrid,
-                          complexGrid);
+                          realGridSize,
+                          realGridSizePadded,
+                          complexGridSizePadded,
+                          &realGrid,
+                          &complexGrid);
 
         // Transfer the real grid input data for the FFT
         copyToDeviceBuffer(
diff --git a/src/gromacs/utility/binaryinformation.cpp b/src/gromacs/utility/binaryinformation.cpp
index 935a6d553a..2ca50c0186 100644
--- a/src/gromacs/utility/binaryinformation.cpp
+++ b/src/gromacs/utility/binaryinformation.cpp
@@ -204,8 +204,8 @@ void printCopyright(gmx::TextWriter* writer)
     }
 }
 
-// Construct a string that describes the library that provides FFT support to this build
-const char* getFftDescriptionString()
+//! 
Construct a string that describes the library that provides CPU FFT support to this build
+const char* getCpuFftDescriptionString()
 {
 // Define the FFT description string
 #if GMX_FFT_FFTW3 || GMX_FFT_ARMPL_FFTW3
@@ -229,6 +229,35 @@ const char* getFftDescriptionString()
 #endif
 };
 
+//! Describe the GPU FFT library of this build: "cuFFT" (CUDA), "clFFT" (OpenCL), "unknown" (SYCL) or "none" (no GPU support)
+const char* getGpuFftDescriptionString()
+{
+    if (GMX_GPU)
+    {
+        if (GMX_GPU_CUDA)
+        {
+            return "cuFFT";
+        }
+        else if (GMX_GPU_OPENCL)
+        {
+            return "clFFT";
+        }
+        else if (GMX_GPU_SYCL)
+        {
+            return "unknown";
+        }
+        else
+        {
+            GMX_RELEASE_ASSERT(false, "Unknown GPU configuration");
+            return "impossible";
+        }
+    }
+    else
+    {
+        return "none";
+    }
+}
+
 void gmx_print_version_info(gmx::TextWriter* writer)
 {
     writer->writeLine(formatString("GROMACS version: %s", gmx_version()));
@@ -309,7 +338,8 @@ void gmx_print_version_info(gmx::TextWriter* writer)
 #endif
     writer->writeLine(formatString("GPU support: %s", getGpuImplementationString()));
     writer->writeLine(formatString("SIMD instructions: %s", GMX_SIMD_STRING));
-    writer->writeLine(formatString("FFT library: %s", getFftDescriptionString()));
+    writer->writeLine(formatString("CPU FFT library: %s", getCpuFftDescriptionString()));
+    writer->writeLine(formatString("GPU FFT library: %s", getGpuFftDescriptionString()));
 #if GMX_TARGET_X86
     writer->writeLine(formatString("RDTSCP usage: %s", GMX_USE_RDTSCP ? "enabled" : "disabled"));
 #endif
-- 
2.22.0