From f8a05bc43449f683a1e64a273d9c699d488d596c Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Tue, 28 Sep 2021 10:26:36 +0000 Subject: [PATCH] Add HeFFTe based FFT backend --- CMakeLists.txt | 16 ++ admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml | 1 + .../gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml | 10 +- .../gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml | 58 +++++ src/config.h.cmakein | 3 + src/gromacs/CMakeLists.txt | 4 + src/gromacs/fft/CMakeLists.txt | 6 +- src/gromacs/fft/gpu_3dfft.cpp | 37 ++- src/gromacs/fft/gpu_3dfft.h | 4 + src/gromacs/fft/gpu_3dfft_heffte.cpp | 198 +++++++++++++++ src/gromacs/fft/gpu_3dfft_heffte.h | 106 ++++++++ src/gromacs/fft/tests/CMakeLists.txt | 7 + src/gromacs/fft/tests/fft_mpi.cpp | 232 ++++++++++++++++++ .../mdrun/tests/exactcontinuation.cpp | 6 +- 14 files changed, 678 insertions(+), 10 deletions(-) create mode 100644 admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml create mode 100644 src/gromacs/fft/gpu_3dfft_heffte.cpp create mode 100644 src/gromacs/fft/gpu_3dfft_heffte.h create mode 100644 src/gromacs/fft/tests/fft_mpi.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b18f414895..0d6a4b7f27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,6 +229,12 @@ gmx_dependent_option( mark_as_advanced(GMX_BUILD_OWN_FFTW) mark_as_advanced(GMX_DISABLE_FFTW_MEASURE) +gmx_dependent_option( + GMX_USE_HEFFTE + "Use HeFFTe for FFT support. 
Used with CUDA backend" + OFF + "GMX_GPU STREQUAL CUDA;GMX_MPI") + gmx_dependent_cache_variable(GMX_SIMD_REF_FLOAT_WIDTH "Reference SIMD single precision width" STRING "4" "GMX_SIMD STREQUAL REFERENCE") gmx_dependent_cache_variable(GMX_SIMD_REF_DOUBLE_WIDTH "Reference SIMD double precision width" STRING "2" "GMX_SIMD STREQUAL REFERENCE") @@ -620,6 +626,16 @@ if(CYGWIN) set(GMX_CYGWIN 1) endif() +if(GMX_USE_HEFFTE) + if(NOT GMX_GPU_CUDA) + message(FATAL_ERROR "HeFFTe support requires a CUDA build") + endif() + if(NOT GMX_LIB_MPI) + message(FATAL_ERROR "HeFFTe support requires a library MPI build") + endif() + find_package(Heffte 2.1.0 REQUIRED CUDA) +endif() + if(WIN32) set(GMX_NATIVE_WINDOWS 1) # This makes windows.h not declare min/max as macros that would break diff --git a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml index 1b8892e49a..31cd3d223b 100644 --- a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml +++ b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml @@ -273,6 +273,7 @@ include: - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-coverage.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0.gitlab-ci.yml' + - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1-release.gitlab-ci.yml' - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-release.gitlab-ci.yml' diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml index 582dfbb60a..29b6d5a229 100644 --- a/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml +++ b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml @@ -15,7 +15,7 @@ # FFT: 
FFTW3 # Parallelism np/ntomp: 4/1 (regression tests with dual GPU) -gromacs:gcc-11-cuda-11.4:configureMPI: +gromacs:gcc-11-cuda-11.4.1:configureMPI: extends: - .gromacs:base:configure - .use-gcc:base @@ -28,7 +28,7 @@ gromacs:gcc-11-cuda-11.4:configureMPI: CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=SSE4.1" COMPILER_MAJOR_VERSION: 11 -gromacs:gcc-11-cuda-11.4:buildMPI: +gromacs:gcc-11-cuda-11.4.1:buildMPI: extends: - .variables:default - .gromacs:base:build @@ -39,9 +39,9 @@ gromacs:gcc-11-cuda-11.4:buildMPI: variables: CMAKE: /usr/local/cmake-3.18.4/bin/cmake needs: - - job: gromacs:gcc-11-cuda-11.4:configureMPI + - job: gromacs:gcc-11-cuda-11.4.1:configureMPI -gromacs:gcc-11-cuda-11.4:regressiontest-gpucommupd-MPI: +gromacs:gcc-11-cuda-11.4.1:regressiontest-gpucommupd-MPI: # Test parallelism np/ntomp: 4/1 # Test parallelism GPU: direct communications, update extends: @@ -62,7 +62,7 @@ gromacs:gcc-11-cuda-11.4:regressiontest-gpucommupd-MPI: tags: - k8s-scilifelab needs: - - job: gromacs:gcc-11-cuda-11.4:buildMPI + - job: gromacs:gcc-11-cuda-11.4.1:buildMPI - job: regressiontests:prepare artifacts: paths: diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml new file mode 100644 index 0000000000..352390c170 --- /dev/null +++ b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml @@ -0,0 +1,58 @@ +# Test goal: old versions of GCC with CUDA; GPU communications with OpenMPI +# Test intents (should change rarely and conservatively): +# OS: Ubuntu oldest supported +# Compiler: GCC oldest supported +# GPU: CUDA oldest supported +# HW: NVIDIA GPU, single NVIDIA GPU +# MPI: OpenMPI +# Features: GPU direct communications + update (unit tests), HeFFTe support +# Scope: configure, build, unit tests +# Test implementation choices (free to change as needed): +# OS: Ubuntu 20.04 +# Build type: Debug +# Compiler: GCC 7 +# GPU: CUDA 11.0 +# SIMD: SSE 4.1 +# FFT: FFTW3 
+# Parallelism nt/ntomp: 4/2 (unit tests) + +gromacs:gcc-7-cuda-11.0:configureMPI: + extends: + - .gromacs:base:configure + - .use-gcc:base + - .use-cuda + - .use-mpi + - .rules:merge-and-post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0 + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=SSE4.1" + CMAKE_EXTRA_OPTIONS: "-DGMX_USE_HEFFTE=ON" + COMPILER_MAJOR_VERSION: 7 + +gromacs:gcc-7-cuda-11.0:buildMPI: + extends: + - .variables:default + - .gromacs:base:build + - .before_script:default + - .use-ccache + - .rules:merge-and-post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0 + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + needs: + - job: gromacs:gcc-7-cuda-11.0:configureMPI + +gromacs:gcc-7-cuda-11.0:testMPI: + extends: + - .gromacs:base:test + - .rules:merge-requests + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0 + variables: + CMAKE: /usr/local/cmake-3.17.2/bin/cmake + KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu" + KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1 + tags: + - k8s-scilifelab + needs: + - job: gromacs:gcc-7-cuda-11.0:buildMPI diff --git a/src/config.h.cmakein b/src/config.h.cmakein index b082d5e702..48d36ac9d5 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -239,6 +239,9 @@ /* Use CUDA-aware MPI. */ #cmakedefine01 HAVE_CUDA_AWARE_MPI +/* Define if HeFFTe library found */ +#cmakedefine01 Heffte_FOUND + /* Cluster size used by nonbonded kernel. 
Should be 8 for NVIDIA/AMD and 4 for Intel */ #define GMX_GPU_NB_CLUSTER_SIZE @GMX_GPU_NB_CLUSTER_SIZE@ diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt index 8d1680da00..e3876a38e6 100644 --- a/src/gromacs/CMakeLists.txt +++ b/src/gromacs/CMakeLists.txt @@ -186,6 +186,10 @@ else() add_library(libgromacs ${LIBGROMACS_SOURCES}) endif() +if (TARGET Heffte::Heffte) + target_link_libraries(libgromacs PRIVATE Heffte::Heffte) +endif() + if (GMX_SYCL_HIPSYCL) target_link_libraries(libgromacs PUBLIC roc::rocfft) endif() diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt index 73a6d4e5cd..060f9b8d6b 100644 --- a/src/gromacs/fft/CMakeLists.txt +++ b/src/gromacs/fft/CMakeLists.txt @@ -54,7 +54,11 @@ endif() if (GMX_FFT_MKL) gmx_add_libgromacs_sources(fft_mkl.cpp) endif() - +if(Heffte_FOUND) + gmx_add_libgromacs_sources( + gpu_3dfft_heffte.cpp + ) +endif() if (GMX_GPU_CUDA) gmx_add_libgromacs_sources( # CUDA-specific sources diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp index c027c5d08a..9b931cfb2f 100644 --- a/src/gromacs/fft/gpu_3dfft.cpp +++ b/src/gromacs/fft/gpu_3dfft.cpp @@ -54,6 +54,10 @@ # include "gpu_3dfft_sycl.h" #endif +#if Heffte_FOUND +# include "gpu_3dfft_heffte.h" +#endif + #include "gromacs/utility/arrayref.h" #include "gromacs/utility/exceptions.h" @@ -101,7 +105,9 @@ Gpu3dFft::Gpu3dFft(FftBackend backend, realGrid, complexGrid); break; - default: GMX_THROW(InternalError("Unsupported FFT backend requested")); + default: + GMX_RELEASE_ASSERT(backend == FftBackend::HeFFTe_CUDA, + "Unsupported FFT backend requested"); } # elif GMX_GPU_OPENCL switch (backend) @@ -144,6 +150,35 @@ Gpu3dFft::Gpu3dFft(FftBackend backend, default: GMX_THROW(InternalError("Unsupported FFT backend requested")); } # endif + +# if Heffte_FOUND + switch (backend) + { + case FftBackend::HeFFTe_CUDA: + GMX_RELEASE_ASSERT( + GMX_GPU_CUDA, + "HeFFTe_CUDA FFT backend is supported only with GROMACS compiled with CUDA"); + 
GMX_RELEASE_ASSERT(heffte::backend::is_enabled::value, + "HeFFTe not compiled with CUDA support"); + impl_ = std::make_unique>( + allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid); + + break; + default: GMX_RELEASE_ASSERT(impl_ != nullptr, "Unsupported FFT backend requested"); + } +# endif } #else diff --git a/src/gromacs/fft/gpu_3dfft.h b/src/gromacs/fft/gpu_3dfft.h index 7b2c637654..a26dc8af3f 100644 --- a/src/gromacs/fft/gpu_3dfft.h +++ b/src/gromacs/fft/gpu_3dfft.h @@ -70,6 +70,7 @@ enum class FftBackend Cufft, // supports only single-GPU Ocl, // supports only single-GPU Sycl, // Not supported currently + HeFFTe_CUDA, Count }; @@ -127,6 +128,9 @@ private: class ImplOcl; class ImplSycl; + template + class ImplHeFfte; + std::unique_ptr impl_; }; diff --git a/src/gromacs/fft/gpu_3dfft_heffte.cpp b/src/gromacs/fft/gpu_3dfft_heffte.cpp new file mode 100644 index 0000000000..4c392bd65d --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_heffte.cpp @@ -0,0 +1,198 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements GPU 3D FFT routines using HeFFTe. + * + * \author Gaurav Garg + * \ingroup module_fft + */ + +#include "gmxpre.h" + +#include "gpu_3dfft_heffte.h" + +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/gmxassert.h" + +namespace gmx +{ +template +Gpu3dFft::ImplHeFfte::ImplHeFfte(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + const int nz, + bool performOutOfPlaceFFT, + const DeviceContext& /*context*/, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) : + stream_(pmeStream) +{ + const int numDomainsX = gridSizesInXForEachRank.size(); + const int numDomainsY = gridSizesInYForEachRank.size(); + + GMX_RELEASE_ASSERT(allocateGrids == true, "Grids cannot be pre-allocated"); + GMX_RELEASE_ASSERT(performOutOfPlaceFFT == true, "Only out-of-place FFT supported"); + GMX_RELEASE_ASSERT(numDomainsX * 
numDomainsY > 1, + "HeFFTe backend is expected to be used only with more than 1 rank"); + + // calculate grid offsets + std::vector gridOffsetsInX(numDomainsX + 1); + std::vector gridOffsetsInY(numDomainsY + 1); + + gridOffsetsInX[0] = 0; + for (unsigned int i = 0; i < gridSizesInXForEachRank.size(); ++i) + { + gridOffsetsInX[i + 1] = gridOffsetsInX[i] + gridSizesInXForEachRank[i]; + } + + gridOffsetsInY[0] = 0; + for (unsigned int i = 0; i < gridSizesInYForEachRank.size(); ++i) + { + gridOffsetsInY[i + 1] = gridOffsetsInY[i] + gridSizesInYForEachRank[i]; + } + + int rank, nProcs; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nProcs); + + GMX_RELEASE_ASSERT(nProcs == numDomainsX * numDomainsY, + "Mismatch in communicator size and expected domain decomposition"); + + // define how ranks are mapped to 2d domain + int procY = rank % numDomainsY; + int procX = rank / numDomainsY; + + // local real grid boxes + heffte::box3d<> const realBox = { { 0, gridOffsetsInY[procY], gridOffsetsInX[procX] }, + { nz - 1, gridOffsetsInY[procY + 1] - 1, gridOffsetsInX[procX + 1] - 1 } }; + + const int nx = gridOffsetsInX[numDomainsX]; + const int ny = gridOffsetsInY[numDomainsY]; + + // define shape of local complex grid boxes + std::vector gridOffsetsInY_transformed(numDomainsX + 1); + std::vector gridOffsetsInZ_transformed(numDomainsY + 1); + + for (int i = 0; i < numDomainsX; i++) + { + gridOffsetsInY_transformed[i] = (i * ny + 0) / numDomainsX; + } + gridOffsetsInY_transformed[numDomainsX] = ny; + + const int complexZDim = nz / 2 + 1; + for (int i = 0; i < numDomainsY; i++) + { + gridOffsetsInZ_transformed[i] = (i * complexZDim + 0) / numDomainsY; + } + gridOffsetsInZ_transformed[numDomainsY] = complexZDim; + + // output order - YZX + // this avoids reordering of data in final fft as final fft is done along x-dimension with + // x being contiguous, leave the data as is in YZX order and don't bring it back in XYZ + heffte::box3d<> const complexBox = { + { 
gridOffsetsInZ_transformed[procY], gridOffsetsInY_transformed[procX], 0 }, + { gridOffsetsInZ_transformed[procY + 1] - 1, gridOffsetsInY_transformed[procX + 1] - 1, nx - 1 }, + { 2, 0, 1 } + }; + + // ToDo: useReorder=true and useAlltoall=true gave me best results in past but, verify it once again + const bool useReorder = true; + const bool useAlltoall = true; + const bool usePencils = false; // Not-used as GROMACS doesn't work with brick decomposition + heffte::plan_options options(useReorder, useAlltoall, usePencils); + + // Define 3D FFT plan + fftPlan_ = std::make_unique>(realBox, complexBox, 0, comm, options); + + // allocate grid and workspace_ + localRealGrid_ = heffte::gpu::vector(fftPlan_->size_inbox()); + localComplexGrid_ = heffte::gpu::vector>(fftPlan_->size_outbox()); + workspace_ = heffte::gpu::vector>(fftPlan_->size_workspace()); + + // write back the output data + *realGrid = localRealGrid_.data(); + *complexGrid = (float*)localComplexGrid_.data(); + + realGridSize[XX] = gridSizesInXForEachRank[procX]; + realGridSize[YY] = gridSizesInYForEachRank[procY]; + realGridSize[ZZ] = nz; + + realGridSizePadded[XX] = fftPlan_->inbox().size[2]; + realGridSizePadded[YY] = fftPlan_->inbox().size[1]; + realGridSizePadded[ZZ] = fftPlan_->inbox().size[0]; + + complexGridSizePadded[XX] = fftPlan_->outbox().size[2]; + complexGridSizePadded[YY] = fftPlan_->outbox().size[1]; + complexGridSizePadded[ZZ] = fftPlan_->outbox().size[0]; +} + +template +void Gpu3dFft::ImplHeFfte::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/) +{ + // HeFFTe does all the computations in the default stream + // ToDo: We need some way to create DeviceStream class in GROMACS with default stream + // This way we can synchronize PME and default streams using events + stream_.synchronize(); + + switch (dir) + { + case GMX_FFT_REAL_TO_COMPLEX: + fftPlan_->forward(localRealGrid_.data(), localComplexGrid_.data(), workspace_.data()); + break; + case GMX_FFT_COMPLEX_TO_REAL: + 
fftPlan_->backward(localComplexGrid_.data(), localRealGrid_.data(), workspace_.data()); + break; + default: + GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs")); + } + + // ToDo: Same as above, we need some way to create DeviceStream from default stream + heffte::gpu::synchronize_default_stream(); +} + +// instantiate relevant HeFFTe backend +#if GMX_GPU_CUDA +template class Gpu3dFft::ImplHeFfte; +#endif + +} // namespace gmx diff --git a/src/gromacs/fft/gpu_3dfft_heffte.h b/src/gromacs/fft/gpu_3dfft_heffte.h new file mode 100644 index 0000000000..ebc92e554a --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_heffte.h @@ -0,0 +1,106 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. 
We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines. + * \author Gaurav Garg + * \ingroup module_fft + */ + +#ifndef GMX_FFT_GPU_3DFFT_HEFFTE_H +#define GMX_FFT_GPU_3DFFT_HEFFTE_H + +#include + +#include "gromacs/fft/fft.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/hostallocator.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/gmxmpi.h" +#include "gpu_3dfft_impl.h" + +#include + +class DeviceContext; +class DeviceStream; + +namespace gmx +{ + +/*! \internal \brief + * A 3D FFT wrapper class for performing R2C/C2R transforms using HeFFTe + */ +template +class Gpu3dFft::ImplHeFfte : public Gpu3dFft::Impl +{ +public: + //! \copydoc Gpu3dFft::Impl::Impl + ImplHeFfte(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + + /*! \brief Destroys the FFT plans. */ + ~ImplHeFfte() override = default; + + /*! 
\brief Performs the FFT transform in given direction + * + * \param[in] dir FFT transform direction specifier + * \param[out] timingEvent pointer to the timing event where timing data is recorded + */ + void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override; + +private: + heffte::gpu::vector localRealGrid_; + heffte::gpu::vector> localComplexGrid_; + heffte::gpu::vector> workspace_; + + std::unique_ptr> fftPlan_; + + const DeviceStream& stream_; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/fft/tests/CMakeLists.txt b/src/gromacs/fft/tests/CMakeLists.txt index 319a6ab7f3..681fd38cbe 100644 --- a/src/gromacs/fft/tests/CMakeLists.txt +++ b/src/gromacs/fft/tests/CMakeLists.txt @@ -36,3 +36,10 @@ gmx_add_unit_test(FFTUnitTests fft-test HARDWARE_DETECTION GPU_CPP_SOURCE_FILES fft.cpp ) + +if(Heffte_FOUND) +gmx_add_mpi_unit_test(FFTMpiUnitTests fft-mpi-test 4 HARDWARE_DETECTION + GPU_CPP_SOURCE_FILES + fft_mpi.cpp + ) +endif() diff --git a/src/gromacs/fft/tests/fft_mpi.cpp b/src/gromacs/fft/tests/fft_mpi.cpp new file mode 100644 index 0000000000..1772102196 --- /dev/null +++ b/src/gromacs/fft/tests/fft_mpi.cpp @@ -0,0 +1,232 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Tests utilities for fft calculations. + * + * \author Gaurav Garg + * \ingroup module_fft + */ +#include "gmxpre.h" + +#include "gromacs/fft/fft.h" + +#include "config.h" + +#include +#include +#include + +#include +#include + +#include "gromacs/fft/gpu_3dfft.h" +#include "gromacs/gpu_utils/clfftinitializer.h" +#if GMX_GPU +# include "gromacs/gpu_utils/devicebuffer.h" +#endif +#include "gromacs/utility/stringutil.h" + +#include "testutils/refdata.h" +#include "testutils/mpitest.h" +#include "testutils/test_hardware_environment.h" +#include "testutils/testasserts.h" +#include "testutils/testmatchers.h" + +namespace gmx +{ +namespace test +{ +using GpuFftTestParams = std::tuple; + +/*! \brief Check that the real grid after forward and backward + * 3D transforms matches the input real grid. 
*/ +static void checkRealGrid(const IVec realGridSizeFull, + const ivec realGridSize, + const ivec realGridSizePadded, + ArrayRef inputRealGrid, + ArrayRef outputRealGridValues) +{ + // Normalize the output (as the implementation does not + // normalize either FFT) + const real normalizationConstant = + 1.0 / (realGridSizeFull[XX] * realGridSizeFull[YY] * realGridSizeFull[ZZ]); + std::transform(outputRealGridValues.begin(), + outputRealGridValues.end(), + outputRealGridValues.begin(), + [normalizationConstant](const real r) { return r * normalizationConstant; }); + // Check the real grid, skipping unused data from the padding + const auto realGridTolerance = relativeToleranceAsFloatingPoint(10, 1e-6); + for (int i = 0; i < realGridSize[XX] * realGridSize[YY]; i++) + { + auto expected = + arrayRefFromArray(inputRealGrid.data() + i * realGridSizePadded[ZZ], realGridSize[ZZ]); + auto actual = arrayRefFromArray(outputRealGridValues.data() + i * realGridSizePadded[ZZ], + realGridSize[ZZ]); + EXPECT_THAT(actual, Pointwise(RealEq(realGridTolerance), expected)) + << formatString("checking backward transform part %d", i); + } +} + +class GpuFftTest3D : public ::testing::Test, public ::testing::WithParamInterface +{ +public: + GpuFftTest3D() = default; + + + //! 
The whole logic being tested is contained here + static void runTest(const GpuFftTestParams& param) + { + const auto& deviceList = getTestHardwareEnvironment()->getTestDeviceList(); + + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + const auto& testDevice = deviceList[rank % deviceList.size()]; + + const DeviceContext& deviceContext = testDevice->deviceContext(); + setActiveDevice(testDevice->deviceInfo()); + const DeviceStream& deviceStream = testDevice->deviceStream(); + + FftBackend backend; + + int numDomainsX; + int numDomainsY; + IVec realGridSizeFull; + std::tie(realGridSizeFull, numDomainsX, numDomainsY, backend) = param; + + // define local grid sizes - this follows same logic as GROMACS implementation + std::vector localGridSizesX(numDomainsX); + for (unsigned int i = 0; i < localGridSizesX.size(); ++i) + { + localGridSizesX[i] = ((i + 1) * realGridSizeFull[XX] / numDomainsX) + - (i * realGridSizeFull[XX] / numDomainsX); + ASSERT_GT(localGridSizesX[i], 0); + } + + std::vector localGridSizesY(numDomainsY); + for (unsigned int i = 0; i < localGridSizesY.size(); ++i) + { + localGridSizesY[i] = ((i + 1) * realGridSizeFull[YY] / numDomainsY) + - (i * realGridSizeFull[YY] / numDomainsY); + ASSERT_GT(localGridSizesY[i], 0); + } + + ivec realGridSize; + ivec realGridSizePadded; + ivec complexGridSizePadded; + + // Allocate the device buffers + DeviceBuffer realGrid, complexGrid; + + const bool performOutOfPlaceFFT = true; + const MPI_Comm comm = MPI_COMM_WORLD; + const bool allocateGrid = true; + const int nz = realGridSizeFull[ZZ]; + Gpu3dFft gpu3dFft(backend, + allocateGrid, + comm, + localGridSizesX, + localGridSizesY, + nz, + performOutOfPlaceFFT, + deviceContext, + deviceStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + &realGrid, + &complexGrid); + + int sizeInReals = realGridSizePadded[0] * realGridSizePadded[1] * realGridSizePadded[2]; + + // initialize random input data + std::vector in(sizeInReals); + 
std::uniform_real_distribution<> dis(-10.0f, 10.0f); + std::minstd_rand gen(time(NULL) + rank); + std::generate(in.begin(), in.end(), [&dis, &gen]() { + // random number between -10 to 10 + return dis(gen); + }); + + // Transfer the real grid input data for the FFT + copyToDeviceBuffer( + &realGrid, in.data(), 0, in.size(), deviceStream, GpuApiCallBehavior::Sync, nullptr); + + // Do the forward FFT to compute the complex grid + CommandEvent* timingEvent = nullptr; + gpu3dFft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX, timingEvent); + + // clear real grid after the forward FFT, so that we know the + // final grid is one produced by the complex FFT, not just leftovers + clearDeviceBufferAsync(&realGrid, 0, sizeInReals, deviceStream); + + // Do the back transform + gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent); + deviceStream.synchronize(); + + // Transfer the real grid back from the device + std::vector outputRealGridValues(in.size()); + copyFromDeviceBuffer(outputRealGridValues.data(), + &realGrid, + 0, + outputRealGridValues.size(), + deviceStream, + GpuApiCallBehavior::Sync, + nullptr); + + checkRealGrid(realGridSizeFull, realGridSize, realGridSizePadded, in, outputRealGridValues); + } +}; + +TEST_P(GpuFftTest3D, GpuFftDecomposition) +{ + GMX_MPI_TEST(4); + GpuFftTestParams params = GetParam(); + runTest(params); +} + +std::vector const inputs{ + { IVec{ 5, 6, 9 }, 4, 1, FftBackend::HeFFTe_CUDA}, // slab decomposition + { IVec{ 5, 6, 9 }, 2, 2, FftBackend::HeFFTe_CUDA} // pencil decomposition +}; + +INSTANTIATE_TEST_SUITE_P(GpuFft, GpuFftTest3D, ::testing::ValuesIn(inputs)); + +} // namespace test +} // namespace gmx diff --git a/src/programs/mdrun/tests/exactcontinuation.cpp b/src/programs/mdrun/tests/exactcontinuation.cpp index 8f79bde29b..efd2e80645 100644 --- a/src/programs/mdrun/tests/exactcontinuation.cpp +++ b/src/programs/mdrun/tests/exactcontinuation.cpp @@ -395,14 +395,14 @@ TEST_P(MdrunNoAppendContinuationIsExact, WithinTolerances) 
mdpFieldValues["init-lambda-state"] = "3"; mdpFieldValues["nsteps"] = "16"; - // Forces on GPUs are generally not reproducible enough for a tight - // tolerance. Similarly, the propagation of sd and bd are not as + // Forces and update on GPUs are generally not reproducible enough for a tight + // tolerance. Similarly, the propagation of bd is not as // reproducible as the others. So we use several ULP tolerance // in all cases. This is looser than needed e.g. for md and md-vv // with forces on CPUs, but there is no real risk of a bug with // those propagators that would only be caught with a tighter // tolerance in this particular test. - int ulpToleranceInMixed = 32; + int ulpToleranceInMixed = 128; int ulpToleranceInDouble = 64; if (integrator == "bd") { -- 2.22.0