mark_as_advanced(GMX_BUILD_OWN_FFTW)
mark_as_advanced(GMX_DISABLE_FFTW_MEASURE)
+# HeFFTe FFT backend switch (default OFF). Its dependency list names
+# a CUDA value for GMX_GPU together with GMX_MPI.
+gmx_dependent_option(GMX_USE_HEFFTE
+    "Use HeFFTe for FFT support. Used with CUDA backend"
+    OFF
+    "GMX_GPU STREQUAL CUDA;GMX_MPI"
+    )
+
gmx_dependent_cache_variable(GMX_SIMD_REF_FLOAT_WIDTH "Reference SIMD single precision width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
gmx_dependent_cache_variable(GMX_SIMD_REF_DOUBLE_WIDTH "Reference SIMD double precision width" STRING "2" "GMX_SIMD STREQUAL REFERENCE")
set(GMX_CYGWIN 1)
endif()
+# Validate the prerequisites of the HeFFTe backend, then locate the library.
+if(GMX_USE_HEFFTE)
+    # message(FATAL_ERROR) stops processing, so at most one of these is ever reported.
+    if(NOT GMX_GPU_CUDA)
+        message(FATAL_ERROR "HeFFTe support requires a CUDA build")
+    elseif(NOT GMX_LIB_MPI)
+        message(FATAL_ERROR "HeFFTe support requires a library MPI build")
+    endif()
+    # Requests version 2.1.0 with the CUDA component; configuration fails if not found.
+    find_package(Heffte 2.1.0 REQUIRED CUDA)
+endif()
+
if(WIN32)
set(GMX_NATIVE_WINDOWS 1)
# This makes windows.h not declare min/max as macros that would break
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-coverage.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0.gitlab-ci.yml'
+ - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1-release.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-release.gitlab-ci.yml'
# FFT: FFTW3
# Parallelism np/ntomp: 4/1 (regression tests with dual GPU)
-gromacs:gcc-11-cuda-11.4:configureMPI:
+gromacs:gcc-11-cuda-11.4.1:configureMPI:
extends:
- .gromacs:base:configure
- .use-gcc:base
CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=SSE4.1"
COMPILER_MAJOR_VERSION: 11
-gromacs:gcc-11-cuda-11.4:buildMPI:
+gromacs:gcc-11-cuda-11.4.1:buildMPI:
extends:
- .variables:default
- .gromacs:base:build
variables:
CMAKE: /usr/local/cmake-3.18.4/bin/cmake
needs:
- - job: gromacs:gcc-11-cuda-11.4:configureMPI
+ - job: gromacs:gcc-11-cuda-11.4.1:configureMPI
-gromacs:gcc-11-cuda-11.4:regressiontest-gpucommupd-MPI:
+gromacs:gcc-11-cuda-11.4.1:regressiontest-gpucommupd-MPI:
# Test parallelism np/ntomp: 4/1
# Test parallelism GPU: direct communications, update
extends:
tags:
- k8s-scilifelab
needs:
- - job: gromacs:gcc-11-cuda-11.4:buildMPI
+ - job: gromacs:gcc-11-cuda-11.4.1:buildMPI
- job: regressiontests:prepare
artifacts:
paths:
--- /dev/null
+# Test goal: old versions of GCC with CUDA; GPU communications with OpenMPI
+# Test intents (should change rarely and conservatively):
+# OS: Ubuntu oldest supported
+# Compiler: GCC oldest supported
+# GPU: CUDA oldest supported
+# HW: NVIDIA GPU, single NVIDIA GPU
+# MPI: OpenMPI
+# Features: GPU direct communications + update (unit tests), HeFFTe support
+# Scope: configure, build, unit tests
+# Test implementation choices (free to change as needed):
+# OS: Ubuntu 20.04
+# Build type: Debug
+# Compiler: GCC 7
+# GPU: CUDA 11.0
+# SIMD: SSE 4.1
+# FFT: FFTW3
+# Parallelism nt/ntomp: 4/2 (unit tests)
+
+# Configure job: GCC 7 + CUDA 11.0 + MPI, with HeFFTe requested through CMAKE_EXTRA_OPTIONS.
+gromacs:gcc-7-cuda-11.0:configureMPI:
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0
+  extends:
+    - .gromacs:base:configure
+    - .use-gcc:base
+    - .use-cuda
+    - .use-mpi
+    - .rules:merge-and-post-merge-acceptance
+  variables:
+    CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+    CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=SSE4.1"
+    CMAKE_EXTRA_OPTIONS: "-DGMX_USE_HEFFTE=ON"
+    COMPILER_MAJOR_VERSION: 7
+
+# Build job; depends on the configureMPI job of the same matrix entry.
+gromacs:gcc-7-cuda-11.0:buildMPI:
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0
+  extends:
+    - .variables:default
+    - .gromacs:base:build
+    - .before_script:default
+    - .use-ccache
+    - .rules:merge-and-post-merge-acceptance
+  variables:
+    CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+  needs:
+    - job: gromacs:gcc-7-cuda-11.0:configureMPI
+
+# Test job; depends on buildMPI and requests one "nvidia.com/gpu" extended resource.
+gromacs:gcc-7-cuda-11.0:testMPI:
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0
+  extends:
+    - .gromacs:base:test
+    - .rules:merge-requests
+  tags:
+    - k8s-scilifelab
+  variables:
+    CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+    KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+    KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1
+  needs:
+    - job: gromacs:gcc-7-cuda-11.0:buildMPI
/* Use CUDA-aware MPI. */
#cmakedefine01 HAVE_CUDA_AWARE_MPI
+/* Define if HeFFTe library found */
+#cmakedefine01 Heffte_FOUND
+
/* Cluster size used by nonbonded kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
#define GMX_GPU_NB_CLUSTER_SIZE @GMX_GPU_NB_CLUSTER_SIZE@
add_library(libgromacs ${LIBGROMACS_SOURCES})
endif()
+# Link HeFFTe into libgromacs (privately) only when its imported target exists.
+if(TARGET Heffte::Heffte)
+    target_link_libraries(libgromacs
+        PRIVATE
+            Heffte::Heffte
+        )
+endif()
+
if (GMX_SYCL_HIPSYCL)
target_link_libraries(libgromacs PUBLIC roc::rocfft)
endif()
if (GMX_FFT_MKL)
gmx_add_libgromacs_sources(fft_mkl.cpp)
endif()
-
+# The HeFFTe-backed GPU FFT source is compiled only when HeFFTe was found.
+if(Heffte_FOUND)
+    gmx_add_libgromacs_sources(gpu_3dfft_heffte.cpp)
+endif()
if (GMX_GPU_CUDA)
gmx_add_libgromacs_sources(
# CUDA-specific sources
# include "gpu_3dfft_sycl.h"
#endif
+#if Heffte_FOUND
+# include "gpu_3dfft_heffte.h"
+#endif
+
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
realGrid,
complexGrid);
break;
- default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+ default:
+ GMX_RELEASE_ASSERT(backend == FftBackend::HeFFTe_CUDA,
+ "Unsupported FFT backend requested");
}
# elif GMX_GPU_OPENCL
switch (backend)
default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
}
# endif
+
+# if Heffte_FOUND
+ switch (backend)
+ {
+ case FftBackend::HeFFTe_CUDA:
+ GMX_RELEASE_ASSERT(
+ GMX_GPU_CUDA,
+ "HeFFTe_CUDA FFT backend is supported only with GROMACS compiled with CUDA");
+ GMX_RELEASE_ASSERT(heffte::backend::is_enabled<heffte::backend::cufft>::value,
+ "HeFFTe not compiled with CUDA support");
+ impl_ = std::make_unique<Gpu3dFft::ImplHeFfte<heffte::backend::cufft>>(
+ allocateGrids,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
+ performOutOfPlaceFFT,
+ context,
+ pmeStream,
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ realGrid,
+ complexGrid);
+
+ break;
+ default: GMX_RELEASE_ASSERT(impl_ != nullptr, "Unsupported FFT backend requested");
+ }
+# endif
}
#else
Cufft, // supports only single-GPU
Ocl, // supports only single-GPU
Sycl, // Not supported currently
+ HeFFTe_CUDA,
Count
};
class ImplOcl;
class ImplSycl;
+ template<typename backend_tag>
+ class ImplHeFfte;
+
std::unique_ptr<Impl> impl_;
};
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements GPU 3D FFT routines using HeFFTe.
+ *
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#include "gmxpre.h"
+
+#include "gpu_3dfft_heffte.h"
+
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+namespace gmx
+{
+/*! \brief Sets up a distributed real-to-complex 3D FFT plan with HeFFTe.
+ *
+ * Builds the local real and complex boxes of the calling rank from the per-rank
+ * grid sizes, creates the HeFFTe plan on \p comm, allocates the device grids and
+ * workspace, and reports grid pointers and sizes through the output arguments.
+ *
+ * \param[in]  allocateGrids            Must be true; the grids are owned by this object.
+ * \param[in]  comm                     Communicator; its size must equal the number of domains.
+ * \param[in]  gridSizesInXForEachRank  Real-grid extent in X of each domain along X.
+ * \param[in]  gridSizesInYForEachRank  Real-grid extent in Y of each domain along Y.
+ * \param[in]  nz                       Real-grid extent in Z (not decomposed).
+ * \param[in]  performOutOfPlaceFFT     Must be true.
+ * \param[in]  pmeStream                Stream that is synchronized before each transform.
+ * \param[out] realGridSize             Local real-grid size.
+ * \param[out] realGridSizePadded       Local real-grid size taken from the plan's input box.
+ * \param[out] complexGridSizePadded    Local complex-grid size taken from the plan's output box.
+ * \param[out] realGrid                 Receives the device pointer of the local real grid.
+ * \param[out] complexGrid              Receives the device pointer of the local complex grid.
+ */
+template<typename backend_tag>
+Gpu3dFft::ImplHeFfte<backend_tag>::ImplHeFfte(bool                 allocateGrids,
+                                              MPI_Comm             comm,
+                                              ArrayRef<const int>  gridSizesInXForEachRank,
+                                              ArrayRef<const int>  gridSizesInYForEachRank,
+                                              const int            nz,
+                                              bool                 performOutOfPlaceFFT,
+                                              const DeviceContext& /*context*/,
+                                              const DeviceStream&  pmeStream,
+                                              ivec                 realGridSize,
+                                              ivec                 realGridSizePadded,
+                                              ivec                 complexGridSizePadded,
+                                              DeviceBuffer<float>* realGrid,
+                                              DeviceBuffer<float>* complexGrid) :
+    stream_(pmeStream)
+{
+    // One entry per domain along each decomposed dimension.
+    const int numDomainsX = gridSizesInXForEachRank.size();
+    const int numDomainsY = gridSizesInYForEachRank.size();
+
+    GMX_RELEASE_ASSERT(allocateGrids == true, "Grids cannot be pre-allocated");
+    GMX_RELEASE_ASSERT(performOutOfPlaceFFT == true, "Only out-of-place FFT supported");
+    GMX_RELEASE_ASSERT(numDomainsX * numDomainsY > 1,
+                       "HeFFTe backend is expected to be used only with more than 1 rank");
+
+    // Running sums of the per-domain sizes: entry i is the first index of domain i,
+    // and the last entry is the full grid extent in that dimension.
+    std::vector<int> gridOffsetsInX(numDomainsX + 1, 0);
+    for (int domain = 0; domain < numDomainsX; ++domain)
+    {
+        gridOffsetsInX[domain + 1] = gridOffsetsInX[domain] + gridSizesInXForEachRank[domain];
+    }
+
+    std::vector<int> gridOffsetsInY(numDomainsY + 1, 0);
+    for (int domain = 0; domain < numDomainsY; ++domain)
+    {
+        gridOffsetsInY[domain + 1] = gridOffsetsInY[domain] + gridSizesInYForEachRank[domain];
+    }
+
+    int rank   = 0;
+    int nProcs = 0;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nProcs);
+
+    GMX_RELEASE_ASSERT(nProcs == numDomainsX * numDomainsY,
+                       "Mismatch in communicator size and expected domain decomposition");
+
+    // Map the linear rank onto the 2D domain grid; the Y index varies fastest.
+    const int domainIndexX = rank / numDomainsY;
+    const int domainIndexY = rank % numDomainsY;
+
+    // Local real grid box; corners are given as (z, y, x) triples with inclusive bounds.
+    heffte::box3d<> const realBox = {
+        { 0, gridOffsetsInY[domainIndexY], gridOffsetsInX[domainIndexX] },
+        { nz - 1, gridOffsetsInY[domainIndexY + 1] - 1, gridOffsetsInX[domainIndexX + 1] - 1 }
+    };
+
+    // Global extents.
+    const int nx          = gridOffsetsInX[numDomainsX];
+    const int ny          = gridOffsetsInY[numDomainsY];
+    const int complexZDim = nz / 2 + 1;
+
+    // Shape of the local complex grid boxes: Y is split over the X domains
+    // and the complex Z extent over the Y domains.
+    std::vector<int> gridOffsetsInY_transformed(numDomainsX + 1);
+    for (int domain = 0; domain < numDomainsX; ++domain)
+    {
+        gridOffsetsInY_transformed[domain] = (domain * ny) / numDomainsX;
+    }
+    gridOffsetsInY_transformed[numDomainsX] = ny;
+
+    std::vector<int> gridOffsetsInZ_transformed(numDomainsY + 1);
+    for (int domain = 0; domain < numDomainsY; ++domain)
+    {
+        gridOffsetsInZ_transformed[domain] = (domain * complexZDim) / numDomainsY;
+    }
+    gridOffsetsInZ_transformed[numDomainsY] = complexZDim;
+
+    // output order - YZX
+    // this avoids reordering of data in final fft as final fft is done along x-dimension with
+    // x being contiguous, leave the data as is in YZX order and don't bring it back in XYZ
+    heffte::box3d<> const complexBox = { { gridOffsetsInZ_transformed[domainIndexY],
+                                           gridOffsetsInY_transformed[domainIndexX],
+                                           0 },
+                                         { gridOffsetsInZ_transformed[domainIndexY + 1] - 1,
+                                           gridOffsetsInY_transformed[domainIndexX + 1] - 1,
+                                           nx - 1 },
+                                         { 2, 0, 1 } };
+
+    // ToDo: useReorder=true and useAlltoall=true gave the best results in the past,
+    // but this should be verified again
+    const bool useReorder  = true;
+    const bool useAlltoall = true;
+    const bool usePencils  = false; // Not used, as GROMACS doesn't work with brick decomposition
+    heffte::plan_options options(useReorder, useAlltoall, usePencils);
+
+    // Define the 3D FFT plan
+    fftPlan_ = std::make_unique<heffte::fft3d_r2c<backend_tag, int>>(realBox, complexBox, 0, comm, options);
+
+    // Allocate the grids and the workspace with the sizes requested by the plan
+    localRealGrid_    = heffte::gpu::vector<float>(fftPlan_->size_inbox());
+    localComplexGrid_ = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_outbox());
+    workspace_        = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_workspace());
+
+    // Hand the device pointers back to the caller; the complex grid is exposed as floats
+    *realGrid    = localRealGrid_.data();
+    *complexGrid = reinterpret_cast<float*>(localComplexGrid_.data());
+
+    realGridSize[XX] = gridSizesInXForEachRank[domainIndexX];
+    realGridSize[YY] = gridSizesInYForEachRank[domainIndexY];
+    realGridSize[ZZ] = nz;
+
+    // Box sizes are stored in the opposite index order to XX/YY/ZZ
+    const auto planInBox       = fftPlan_->inbox();
+    realGridSizePadded[XX]     = planInBox.size[2];
+    realGridSizePadded[YY]     = planInBox.size[1];
+    realGridSizePadded[ZZ]     = planInBox.size[0];
+
+    const auto planOutBox      = fftPlan_->outbox();
+    complexGridSizePadded[XX]  = planOutBox.size[2];
+    complexGridSizePadded[YY]  = planOutBox.size[1];
+    complexGridSizePadded[ZZ]  = planOutBox.size[0];
+}
+
+/*! \brief Runs the planned transform on the local grids in the requested direction.
+ *
+ * The PME stream is synchronized before the transform and the default stream after it.
+ * Throws NotImplementedError for any direction other than real-to-complex or
+ * complex-to-real. The timing event argument is not used.
+ */
+template<typename backend_tag>
+void Gpu3dFft::ImplHeFfte<backend_tag>::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+{
+    // HeFFTe does all the computations in the default stream
+    // ToDo: We need some way to create DeviceStream class in GROMACS with default stream
+    // This way we can synchronize PME and default streams using events
+    stream_.synchronize();
+
+    if (dir == GMX_FFT_REAL_TO_COMPLEX)
+    {
+        fftPlan_->forward(localRealGrid_.data(), localComplexGrid_.data(), workspace_.data());
+    }
+    else if (dir == GMX_FFT_COMPLEX_TO_REAL)
+    {
+        fftPlan_->backward(localComplexGrid_.data(), localRealGrid_.data(), workspace_.data());
+    }
+    else
+    {
+        GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
+    }
+
+    // ToDo: Same as above, we need some way to create DeviceStream from default stream
+    heffte::gpu::synchronize_default_stream();
+}
+
+// Explicitly instantiate the template for the HeFFTe backend in use (cuFFT in CUDA builds)
+#if GMX_GPU_CUDA
+template class Gpu3dFft::ImplHeFfte<heffte::backend::cufft>;
+#endif
+
+} // namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the GPU 3D FFT routines using HeFFTe.
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_HEFFTE_H
+#define GMX_FFT_GPU_3DFFT_HEFFTE_H
+
+#include <memory>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/hostallocator.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gpu_3dfft_impl.h"
+
+#include <heffte.h>
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using HeFFTe
+ */
+template<typename backend_tag>
+class Gpu3dFft::ImplHeFfte : public Gpu3dFft::Impl
+{
+public:
+ //! \copydoc Gpu3dFft::Impl::Impl
+ ImplHeFfte(bool allocateGrids,
+ MPI_Comm comm,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ int nz,
+ bool performOutOfPlaceFFT,
+ const DeviceContext& context,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid);
+
+ /*! \brief Destroys the FFT plan and the grids owned by this object. */
+ ~ImplHeFfte() override = default;
+
+ /*! \brief Performs the FFT transform in the given direction
+ *
+ * \param[in] dir FFT transform direction specifier
+ * \param[out] timingEvent pointer to the timing event (not used by the HeFFTe implementation)
+ */
+ void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+ heffte::gpu::vector<float> localRealGrid_; //!< Local real grid, sized from the plan's input box
+ heffte::gpu::vector<std::complex<float>> localComplexGrid_; //!< Local complex grid, sized from the plan's output box
+ heffte::gpu::vector<std::complex<float>> workspace_; //!< Scratch buffer passed to each transform
+
+ std::unique_ptr<heffte::fft3d_r2c<backend_tag, int>> fftPlan_; //!< HeFFTe real-to-complex 3D FFT plan
+
+ const DeviceStream& stream_; //!< PME stream, synchronized before each transform
+};
+
+} // namespace gmx
+
+#endif
GPU_CPP_SOURCE_FILES
fft.cpp
)
+
+# MPI test of the GPU FFT; registered only when HeFFTe was found.
+# NOTE(review): the "4" is presumably the rank count, matching GMX_MPI_TEST(4) in fft_mpi.cpp.
+if(Heffte_FOUND)
+    gmx_add_mpi_unit_test(FFTMpiUnitTests fft-mpi-test 4 HARDWARE_DETECTION
+        GPU_CPP_SOURCE_FILES
+            fft_mpi.cpp
+        )
+endif()
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for GPU 3D FFT calculations decomposed over MPI ranks.
+ *
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+#include "gmxpre.h"
+
+#include "gromacs/fft/fft.h"
+
+#include "config.h"
+
+#include <algorithm>
+#include <vector>
+#include <random>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "gromacs/fft/gpu_3dfft.h"
+#include "gromacs/gpu_utils/clfftinitializer.h"
+#if GMX_GPU
+# include "gromacs/gpu_utils/devicebuffer.h"
+#endif
+#include "gromacs/utility/stringutil.h"
+
+#include "testutils/refdata.h"
+#include "testutils/mpitest.h"
+#include "testutils/test_hardware_environment.h"
+#include "testutils/testasserts.h"
+#include "testutils/testmatchers.h"
+
+namespace gmx
+{
+namespace test
+{
+using GpuFftTestParams = std::tuple<IVec, // size of grid
+ int, // domains in x
+ int, // domains in y
+ FftBackend>;
+
+/*! \brief Compares the round-trip (forward + backward FFT) real grid with the input grid.
+ *
+ * The output is first scaled in place by 1/(NX*NY*NZ) of the full grid, because the
+ * transforms themselves are unnormalized. The grids are then compared in contiguous
+ * runs of \c realGridSize[ZZ] values, one run every \c realGridSizePadded[ZZ] values,
+ * so padding at the end of each run is skipped.
+ */
+static void checkRealGrid(const IVec           realGridSizeFull,
+                          const ivec           realGridSize,
+                          const ivec           realGridSizePadded,
+                          ArrayRef<const real> inputRealGrid,
+                          ArrayRef<real>       outputRealGridValues)
+{
+    // Undo the scaling accumulated by the two unnormalized transforms
+    const real normalizationConstant =
+            1.0 / (realGridSizeFull[XX] * realGridSizeFull[YY] * realGridSizeFull[ZZ]);
+    for (real& value : outputRealGridValues)
+    {
+        value = value * normalizationConstant;
+    }
+
+    const auto realGridTolerance = relativeToleranceAsFloatingPoint(10, 1e-6);
+    const int  numLines          = realGridSize[XX] * realGridSize[YY];
+    const int  lineStride        = realGridSizePadded[ZZ];
+    const int  lineLength        = realGridSize[ZZ];
+    for (int line = 0; line < numLines; line++)
+    {
+        const int lineStart = line * lineStride;
+        auto      expected  = arrayRefFromArray(inputRealGrid.data() + lineStart, lineLength);
+        auto      actual = arrayRefFromArray(outputRealGridValues.data() + lineStart, lineLength);
+        EXPECT_THAT(actual, Pointwise(RealEq(realGridTolerance), expected))
+                << formatString("checking backward transform part %d", line);
+    }
+}
+
+/*! \brief Parameterized fixture that round-trips random data through the distributed GPU 3D FFT.
+ *
+ * The parameters are the full real grid size, the number of domains in X and in Y,
+ * and the FFT backend to use.
+ */
+class GpuFftTest3D : public ::testing::Test, public ::testing::WithParamInterface<GpuFftTestParams>
+{
+public:
+    GpuFftTest3D() = default;
+
+
+    /*! \brief The whole logic being tested is contained here
+     *
+     * Each rank picks a test device, builds the decomposed FFT, fills its local real grid
+     * with pseudo-random values, runs the forward and then the backward transform and
+     * checks that the (normalized) result matches the input.
+     *
+     * \param[in] param  Grid size, domain counts in X and Y, and FFT backend.
+     */
+    static void runTest(const GpuFftTestParams& param)
+    {
+        const auto& deviceList = getTestHardwareEnvironment()->getTestDeviceList();
+
+        int rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+        // Ranks are distributed round-robin over the available test devices
+        const auto& testDevice = deviceList[rank % deviceList.size()];
+
+        const DeviceContext& deviceContext = testDevice->deviceContext();
+        setActiveDevice(testDevice->deviceInfo());
+        const DeviceStream& deviceStream = testDevice->deviceStream();
+
+        FftBackend backend;
+
+        int  numDomainsX;
+        int  numDomainsY;
+        IVec realGridSizeFull;
+        std::tie(realGridSizeFull, numDomainsX, numDomainsY, backend) = param;
+
+        // define local grid sizes - this follows same logic as GROMACS implementation
+        std::vector<int> localGridSizesX(numDomainsX);
+        for (unsigned int i = 0; i < localGridSizesX.size(); ++i)
+        {
+            localGridSizesX[i] = ((i + 1) * realGridSizeFull[XX] / numDomainsX)
+                                 - (i * realGridSizeFull[XX] / numDomainsX);
+            ASSERT_GT(localGridSizesX[i], 0);
+        }
+
+        std::vector<int> localGridSizesY(numDomainsY);
+        for (unsigned int i = 0; i < localGridSizesY.size(); ++i)
+        {
+            localGridSizesY[i] = ((i + 1) * realGridSizeFull[YY] / numDomainsY)
+                                 - (i * realGridSizeFull[YY] / numDomainsY);
+            ASSERT_GT(localGridSizesY[i], 0);
+        }
+
+        // Filled in by the Gpu3dFft constructor
+        ivec realGridSize;
+        ivec realGridSizePadded;
+        ivec complexGridSizePadded;
+
+        // Device buffers, also provided by the Gpu3dFft constructor (allocateGrid is true)
+        DeviceBuffer<float> realGrid, complexGrid;
+
+        const bool     performOutOfPlaceFFT = true;
+        const MPI_Comm comm                 = MPI_COMM_WORLD;
+        const bool     allocateGrid         = true;
+        const int      nz                   = realGridSizeFull[ZZ];
+        Gpu3dFft       gpu3dFft(backend,
+                          allocateGrid,
+                          comm,
+                          localGridSizesX,
+                          localGridSizesY,
+                          nz,
+                          performOutOfPlaceFFT,
+                          deviceContext,
+                          deviceStream,
+                          realGridSize,
+                          realGridSizePadded,
+                          complexGridSizePadded,
+                          &realGrid,
+                          &complexGrid);
+
+        int sizeInReals = realGridSizePadded[0] * realGridSizePadded[1] * realGridSizePadded[2];
+
+        // initialize random input data
+        // A fixed base seed, offset by the rank so that ranks get different data, keeps
+        // any failure reproducible from run to run (a wall-clock seed would not).
+        constexpr int                    c_baseSeed = 1234;
+        std::vector<real>                in(sizeInReals);
+        std::uniform_real_distribution<> dis(-10.0f, 10.0f);
+        std::minstd_rand                 gen(c_baseSeed + rank);
+        std::generate(in.begin(), in.end(), [&dis, &gen]() {
+            // random number between -10 to 10
+            return dis(gen);
+        });
+
+        // Transfer the real grid input data for the FFT
+        copyToDeviceBuffer(
+                &realGrid, in.data(), 0, in.size(), deviceStream, GpuApiCallBehavior::Sync, nullptr);
+
+        // Do the forward FFT to compute the complex grid
+        CommandEvent* timingEvent = nullptr;
+        gpu3dFft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX, timingEvent);
+
+        // clear real grid after the forward FFT, so that we know the
+        // final grid is one produced by the complex FFT, not just leftovers
+        clearDeviceBufferAsync(&realGrid, 0, sizeInReals, deviceStream);
+
+        // Do the back transform
+        gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
+        deviceStream.synchronize();
+
+        // Transfer the real grid back from the device
+        std::vector<float> outputRealGridValues(in.size());
+        copyFromDeviceBuffer(outputRealGridValues.data(),
+                             &realGrid,
+                             0,
+                             outputRealGridValues.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Sync,
+                             nullptr);
+
+        checkRealGrid(realGridSizeFull, realGridSize, realGridSizePadded, in, outputRealGridValues);
+    }
+};
+
+//! Runs the FFT round-trip check for one decomposition from the parameter list.
+TEST_P(GpuFftTest3D, GpuFftDecomposition)
+{
+    // NOTE(review): presumably restricts the test to runs with 4 MPI ranks - confirm in mpitest.h
+    GMX_MPI_TEST(4);
+    runTest(GetParam());
+}
+
+//! Parameter sets for the test: grid size, domains in X, domains in Y, FFT backend.
+const std::vector<GpuFftTestParams> inputs{
+    { IVec{ 5, 6, 9 }, 4, 1, FftBackend::HeFFTe_CUDA }, // slab decomposition
+    { IVec{ 5, 6, 9 }, 2, 2, FftBackend::HeFFTe_CUDA }  // pencil decomposition
+};
+
+INSTANTIATE_TEST_SUITE_P(GpuFft, GpuFftTest3D, ::testing::ValuesIn(inputs));
+
+} // namespace test
+} // namespace gmx
mdpFieldValues["init-lambda-state"] = "3";
mdpFieldValues["nsteps"] = "16";
- // Forces on GPUs are generally not reproducible enough for a tight
- // tolerance. Similarly, the propagation of sd and bd are not as
+ // Forces and update on GPUs are generally not reproducible enough for a tight
+ // tolerance. Similarly, the propagation of bd is not as
// reproducible as the others. So we use several ULP tolerance
// in all cases. This is looser than needed e.g. for md and md-vv
// with forces on CPUs, but there is no real risk of a bug with
// those propagators that would only be caught with a tighter
// tolerance in this particular test.
- int ulpToleranceInMixed = 32;
+ int ulpToleranceInMixed = 128;
int ulpToleranceInDouble = 64;
if (integrator == "bd")
{