From f8a05bc43449f683a1e64a273d9c699d488d596c Mon Sep 17 00:00:00 2001
From: Gaurav Garg <gaugarg@nvidia.com>
Date: Tue, 28 Sep 2021 10:26:36 +0000
Subject: [PATCH] Add HeFFTe based FFT backend

---
 CMakeLists.txt                                |  16 ++
 admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml  |   1 +
 .../gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml  |  10 +-
 .../gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml |  58 +++++
 src/config.h.cmakein                          |   3 +
 src/gromacs/CMakeLists.txt                    |   4 +
 src/gromacs/fft/CMakeLists.txt                |   6 +-
 src/gromacs/fft/gpu_3dfft.cpp                 |  37 ++-
 src/gromacs/fft/gpu_3dfft.h                   |   4 +
 src/gromacs/fft/gpu_3dfft_heffte.cpp          | 198 +++++++++++++++
 src/gromacs/fft/gpu_3dfft_heffte.h            | 106 ++++++++
 src/gromacs/fft/tests/CMakeLists.txt          |   7 +
 src/gromacs/fft/tests/fft_mpi.cpp             | 232 ++++++++++++++++++
 .../mdrun/tests/exactcontinuation.cpp         |   6 +-
 14 files changed, 678 insertions(+), 10 deletions(-)
 create mode 100644 admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml
 create mode 100644 src/gromacs/fft/gpu_3dfft_heffte.cpp
 create mode 100644 src/gromacs/fft/gpu_3dfft_heffte.h
 create mode 100644 src/gromacs/fft/tests/fft_mpi.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b18f414895..0d6a4b7f27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -229,6 +229,12 @@ gmx_dependent_option(
 mark_as_advanced(GMX_BUILD_OWN_FFTW)
 mark_as_advanced(GMX_DISABLE_FFTW_MEASURE)
 
+gmx_dependent_option(
+    GMX_USE_HEFFTE
+    "Use HeFFTe for FFT support. Used with CUDA backend"
+    OFF
+    "GMX_GPU STREQUAL CUDA;GMX_MPI")
+
 gmx_dependent_cache_variable(GMX_SIMD_REF_FLOAT_WIDTH  "Reference SIMD single precision width" STRING "4" "GMX_SIMD STREQUAL REFERENCE")
 gmx_dependent_cache_variable(GMX_SIMD_REF_DOUBLE_WIDTH "Reference SIMD double precision width" STRING "2" "GMX_SIMD STREQUAL REFERENCE")
 
@@ -620,6 +626,16 @@ if(CYGWIN)
     set(GMX_CYGWIN 1)
 endif()
 
+if(GMX_USE_HEFFTE)
+    if(NOT GMX_GPU_CUDA)
+        message(FATAL_ERROR "HeFFTe support requires a CUDA build")
+    endif()
+    if(NOT GMX_LIB_MPI)
+        message(FATAL_ERROR "HeFFTe support requires a library MPI build")
+    endif()
+    find_package(Heffte 2.1.0 REQUIRED CUDA)
+endif()
+
 if(WIN32)
     set(GMX_NATIVE_WINDOWS 1)
     # This makes windows.h not declare min/max as macros that would break
diff --git a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
index 1b8892e49a..31cd3d223b 100644
--- a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
@@ -273,6 +273,7 @@ include:
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-coverage.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0.gitlab-ci.yml'
+  - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1-release.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-release.gitlab-ci.yml'
diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml
index 582dfbb60a..29b6d5a229 100644
--- a/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-11-cuda-11.4.1.gitlab-ci.yml
@@ -15,7 +15,7 @@
 #   FFT: FFTW3
 #   Parallelism np/ntomp: 4/1 (regression tests with dual GPU)
 
-gromacs:gcc-11-cuda-11.4:configureMPI:
+gromacs:gcc-11-cuda-11.4.1:configureMPI:
   extends:
     - .gromacs:base:configure
     - .use-gcc:base
@@ -28,7 +28,7 @@ gromacs:gcc-11-cuda-11.4:configureMPI:
     CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=SSE4.1"
     COMPILER_MAJOR_VERSION: 11
 
-gromacs:gcc-11-cuda-11.4:buildMPI:
+gromacs:gcc-11-cuda-11.4.1:buildMPI:
   extends:
     - .variables:default
     - .gromacs:base:build
@@ -39,9 +39,9 @@ gromacs:gcc-11-cuda-11.4:buildMPI:
   variables:
     CMAKE: /usr/local/cmake-3.18.4/bin/cmake
   needs:
-    - job: gromacs:gcc-11-cuda-11.4:configureMPI
+    - job: gromacs:gcc-11-cuda-11.4.1:configureMPI
 
-gromacs:gcc-11-cuda-11.4:regressiontest-gpucommupd-MPI:
+gromacs:gcc-11-cuda-11.4.1:regressiontest-gpucommupd-MPI:
   # Test parallelism np/ntomp: 4/1
   # Test parallelism GPU: direct communications, update
   extends:
@@ -62,7 +62,7 @@ gromacs:gcc-11-cuda-11.4:regressiontest-gpucommupd-MPI:
   tags:
     - k8s-scilifelab
   needs:
-    - job: gromacs:gcc-11-cuda-11.4:buildMPI
+    - job: gromacs:gcc-11-cuda-11.4.1:buildMPI
     - job: regressiontests:prepare
   artifacts:
     paths:
diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml
new file mode 100644
index 0000000000..352390c170
--- /dev/null
+++ b/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-7-cuda-11.0-mpi.gitlab-ci.yml
@@ -0,0 +1,58 @@
+# Test goal: old versions of GCC with CUDA; GPU communications with OpenMPI
+# Test intents (should change rarely and conservatively):
+#   OS: Ubuntu oldest supported
+#   Compiler: GCC oldest supported
+#   GPU: CUDA oldest supported
+#   HW: NVIDIA GPU, single NVIDIA GPU
+#   MPI: OpenMPI
+#   Features: GPU direct communications + update (unit tests), HeFFTe support
+#   Scope: configure, build, unit tests
+# Test implementation choices (free to change as needed):
+#   OS: Ubuntu 20.04
+#   Build type: Debug
+#   Compiler: GCC 7
+#   GPU: CUDA 11.0
+#   SIMD: SSE 4.1
+#   FFT: FFTW3
+#   Parallelism np/ntomp: 4/2 (unit tests)
+
+gromacs:gcc-7-cuda-11.0:configureMPI:
+  extends:
+    - .gromacs:base:configure
+    - .use-gcc:base
+    - .use-cuda
+    - .use-mpi
+    - .rules:merge-and-post-merge-acceptance
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0
+  variables:
+    CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+    CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=SSE4.1"
+    CMAKE_EXTRA_OPTIONS: "-DGMX_USE_HEFFTE=ON"
+    COMPILER_MAJOR_VERSION: 7
+
+gromacs:gcc-7-cuda-11.0:buildMPI:
+  extends:
+    - .variables:default
+    - .gromacs:base:build
+    - .before_script:default
+    - .use-ccache
+    - .rules:merge-and-post-merge-acceptance
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0
+  variables:
+    CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+  needs:
+    - job: gromacs:gcc-7-cuda-11.0:configureMPI
+
+gromacs:gcc-7-cuda-11.0:testMPI:
+  extends:
+    - .gromacs:base:test
+    - .rules:merge-requests
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-gcc-7-cuda-11.0
+  variables:
+    CMAKE: /usr/local/cmake-3.17.2/bin/cmake
+    KUBERNETES_EXTENDED_RESOURCE_NAME: "nvidia.com/gpu"
+    KUBERNETES_EXTENDED_RESOURCE_LIMIT: 1
+  tags:
+    - k8s-scilifelab
+  needs:
+    - job: gromacs:gcc-7-cuda-11.0:buildMPI
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index b082d5e702..48d36ac9d5 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -239,6 +239,9 @@
 /* Use CUDA-aware MPI.  */
 #cmakedefine01 HAVE_CUDA_AWARE_MPI
 
+/* Define if HeFFTe library found */
+#cmakedefine01 Heffte_FOUND
+
 /* Cluster size used by nonbonded kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
 #define GMX_GPU_NB_CLUSTER_SIZE @GMX_GPU_NB_CLUSTER_SIZE@
 
diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt
index 8d1680da00..e3876a38e6 100644
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@ -186,6 +186,10 @@ else()
     add_library(libgromacs ${LIBGROMACS_SOURCES})
 endif()
 
+if (TARGET Heffte::Heffte)
+    target_link_libraries(libgromacs PRIVATE Heffte::Heffte)
+endif()
+
 if (GMX_SYCL_HIPSYCL)
     target_link_libraries(libgromacs PUBLIC roc::rocfft)
 endif()
diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt
index 73a6d4e5cd..060f9b8d6b 100644
--- a/src/gromacs/fft/CMakeLists.txt
+++ b/src/gromacs/fft/CMakeLists.txt
@@ -54,7 +54,11 @@ endif()
 if (GMX_FFT_MKL)
     gmx_add_libgromacs_sources(fft_mkl.cpp)
 endif()
-
+if(Heffte_FOUND)
+    gmx_add_libgromacs_sources(
+        gpu_3dfft_heffte.cpp
+        )
+endif()
 if (GMX_GPU_CUDA)
     gmx_add_libgromacs_sources(
         # CUDA-specific sources
diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp
index c027c5d08a..9b931cfb2f 100644
--- a/src/gromacs/fft/gpu_3dfft.cpp
+++ b/src/gromacs/fft/gpu_3dfft.cpp
@@ -54,6 +54,10 @@
 #    include "gpu_3dfft_sycl.h"
 #endif
 
+#if Heffte_FOUND
+#    include "gpu_3dfft_heffte.h"
+#endif
+
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/exceptions.h"
 
@@ -101,7 +105,9 @@ Gpu3dFft::Gpu3dFft(FftBackend           backend,
                                                           realGrid,
                                                           complexGrid);
             break;
-        default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+        default:
+            GMX_RELEASE_ASSERT(backend == FftBackend::HeFFTe_CUDA,
+                               "Unsupported FFT backend requested");
     }
 #    elif GMX_GPU_OPENCL
     switch (backend)
@@ -144,6 +150,35 @@ Gpu3dFft::Gpu3dFft(FftBackend           backend,
         default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
     }
 #    endif
+
+#    if Heffte_FOUND
+    switch (backend)
+    {
+        case FftBackend::HeFFTe_CUDA:
+            GMX_RELEASE_ASSERT(
+                    GMX_GPU_CUDA,
+                    "HeFFTe_CUDA FFT backend is supported only with GROMACS compiled with CUDA");
+            GMX_RELEASE_ASSERT(heffte::backend::is_enabled<heffte::backend::cufft>::value,
+                               "HeFFTe not compiled with CUDA support");
+            impl_ = std::make_unique<Gpu3dFft::ImplHeFfte<heffte::backend::cufft>>(
+                    allocateGrids,
+                    comm,
+                    gridSizesInXForEachRank,
+                    gridSizesInYForEachRank,
+                    nz,
+                    performOutOfPlaceFFT,
+                    context,
+                    pmeStream,
+                    realGridSize,
+                    realGridSizePadded,
+                    complexGridSizePadded,
+                    realGrid,
+                    complexGrid);
+
+            break;
+        default: GMX_RELEASE_ASSERT(impl_ != nullptr, "Unsupported FFT backend requested");
+    }
+#    endif
 }
 
 #else
diff --git a/src/gromacs/fft/gpu_3dfft.h b/src/gromacs/fft/gpu_3dfft.h
index 7b2c637654..a26dc8af3f 100644
--- a/src/gromacs/fft/gpu_3dfft.h
+++ b/src/gromacs/fft/gpu_3dfft.h
@@ -70,6 +70,7 @@ enum class FftBackend
     Cufft, // supports only single-GPU
     Ocl,   // supports only single-GPU
     Sycl,  // Not supported currently
+    HeFFTe_CUDA,
     Count
 };
 
@@ -127,6 +128,9 @@ private:
     class ImplOcl;
     class ImplSycl;
 
+    template<typename backend_tag>
+    class ImplHeFfte;
+
     std::unique_ptr<Impl> impl_;
 };
 
diff --git a/src/gromacs/fft/gpu_3dfft_heffte.cpp b/src/gromacs/fft/gpu_3dfft_heffte.cpp
new file mode 100644
index 0000000000..4c392bd65d
--- /dev/null
+++ b/src/gromacs/fft/gpu_3dfft_heffte.cpp
@@ -0,0 +1,198 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements GPU 3D FFT routines using HeFFTe.
+ *
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#include "gmxpre.h"
+
+#include "gpu_3dfft_heffte.h"
+
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+
+namespace gmx
+{
+template<typename backend_tag>
+Gpu3dFft::ImplHeFfte<backend_tag>::ImplHeFfte(bool                allocateGrids,
+                                              MPI_Comm            comm,
+                                              ArrayRef<const int> gridSizesInXForEachRank,
+                                              ArrayRef<const int> gridSizesInYForEachRank,
+                                              const int           nz,
+                                              bool                performOutOfPlaceFFT,
+                                              const DeviceContext& /*context*/,
+                                              const DeviceStream&  pmeStream,
+                                              ivec                 realGridSize,
+                                              ivec                 realGridSizePadded,
+                                              ivec                 complexGridSizePadded,
+                                              DeviceBuffer<float>* realGrid,
+                                              DeviceBuffer<float>* complexGrid) :
+    stream_(pmeStream)
+{
+    const int numDomainsX = gridSizesInXForEachRank.size();
+    const int numDomainsY = gridSizesInYForEachRank.size();
+
+    GMX_RELEASE_ASSERT(allocateGrids == true, "Grids cannot be pre-allocated");
+    GMX_RELEASE_ASSERT(performOutOfPlaceFFT == true, "Only out-of-place FFT supported");
+    GMX_RELEASE_ASSERT(numDomainsX * numDomainsY > 1,
+                       "HeFFTe backend is expected to be used only with more than 1 rank");
+
+    // prefix sums of the per-rank sizes: offsets[i] is where domain i starts, offsets[n] the full extent
+    std::vector<int> gridOffsetsInX(numDomainsX + 1);
+    std::vector<int> gridOffsetsInY(numDomainsY + 1);
+
+    gridOffsetsInX[0] = 0;
+    for (int i = 0; i < numDomainsX; ++i)
+    {
+        gridOffsetsInX[i + 1] = gridOffsetsInX[i] + gridSizesInXForEachRank[i];
+    }
+
+    gridOffsetsInY[0] = 0;
+    for (int i = 0; i < numDomainsY; ++i)
+    {
+        gridOffsetsInY[i + 1] = gridOffsetsInY[i] + gridSizesInYForEachRank[i];
+    }
+
+    int rank = 0, nProcs = 0;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nProcs);
+
+    GMX_RELEASE_ASSERT(nProcs == numDomainsX * numDomainsY,
+                       "Mismatch in communicator size and expected domain decomposition");
+
+    // ranks are mapped onto the 2d (X, Y) domain grid with the Y index varying fastest
+    int procY = rank % numDomainsY;
+    int procX = rank / numDomainsY;
+
+    // local real grid box: inclusive low and high corners, indices given in (z, y, x) order
+    heffte::box3d<> const realBox = { { 0, gridOffsetsInY[procY], gridOffsetsInX[procX] },
+                                      { nz - 1, gridOffsetsInY[procY + 1] - 1, gridOffsetsInX[procX + 1] - 1 } };
+
+    const int nx = gridOffsetsInX[numDomainsX];
+    const int ny = gridOffsetsInY[numDomainsY];
+
+    // define shape of local complex grid boxes: y is split over the X domains, z over the Y domains
+    std::vector<int> gridOffsetsInY_transformed(numDomainsX + 1);
+    std::vector<int> gridOffsetsInZ_transformed(numDomainsY + 1);
+
+    for (int i = 0; i < numDomainsX; i++)
+    {
+        gridOffsetsInY_transformed[i] = (i * ny + 0) / numDomainsX;
+    }
+    gridOffsetsInY_transformed[numDomainsX] = ny;
+
+    const int complexZDim = nz / 2 + 1;
+    for (int i = 0; i < numDomainsY; i++)
+    {
+        gridOffsetsInZ_transformed[i] = (i * complexZDim + 0) / numDomainsY;
+    }
+    gridOffsetsInZ_transformed[numDomainsY] = complexZDim;
+
+    // output order - YZX
+    // this avoids reordering of data in final fft as final fft is done along x-dimension with
+    // x being contiguous, leave the data as is in YZX order and don't bring it back in XYZ
+    heffte::box3d<> const complexBox = {
+        { gridOffsetsInZ_transformed[procY], gridOffsetsInY_transformed[procX], 0 },
+        { gridOffsetsInZ_transformed[procY + 1] - 1, gridOffsetsInY_transformed[procX + 1] - 1, nx - 1 },
+        { 2, 0, 1 }
+    };
+
+    // TODO: useReorder=true and useAlltoall=true gave the best results in the past; re-verify this
+    const bool useReorder  = true;
+    const bool useAlltoall = true;
+    const bool usePencils  = false; // Not-used as GROMACS doesn't work with brick decomposition
+    heffte::plan_options options(useReorder, useAlltoall, usePencils);
+
+    // Define 3D FFT plan
+    fftPlan_ = std::make_unique<heffte::fft3d_r2c<backend_tag, int>>(realBox, complexBox, 0, comm, options);
+
+    // allocate the local grids and the workspace with the sizes requested by the plan
+    localRealGrid_    = heffte::gpu::vector<float>(fftPlan_->size_inbox());
+    localComplexGrid_ = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_outbox());
+    workspace_        = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_workspace());
+
+    // write back the output data; the complex grid is handed out as a buffer of floats
+    *realGrid    = localRealGrid_.data();
+    *complexGrid = reinterpret_cast<float*>(localComplexGrid_.data());
+
+    realGridSize[XX] = gridSizesInXForEachRank[procX];
+    realGridSize[YY] = gridSizesInYForEachRank[procY];
+    realGridSize[ZZ] = nz;
+
+    realGridSizePadded[XX] = fftPlan_->inbox().size[2];
+    realGridSizePadded[YY] = fftPlan_->inbox().size[1];
+    realGridSizePadded[ZZ] = fftPlan_->inbox().size[0];
+
+    complexGridSizePadded[XX] = fftPlan_->outbox().size[2];
+    complexGridSizePadded[YY] = fftPlan_->outbox().size[1];
+    complexGridSizePadded[ZZ] = fftPlan_->outbox().size[0];
+}
+
+template<typename backend_tag>
+void Gpu3dFft::ImplHeFfte<backend_tag>::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+{
+    // HeFFTe does all the computations in the default stream, so stream_ is synchronized first
+    // TODO: We need some way to create DeviceStream class in GROMACS with default stream
+    // This way we can synchronize PME and default streams using events
+    stream_.synchronize();
+
+    switch (dir)
+    {
+        case GMX_FFT_REAL_TO_COMPLEX:
+            fftPlan_->forward(localRealGrid_.data(), localComplexGrid_.data(), workspace_.data());
+            break;
+        case GMX_FFT_COMPLEX_TO_REAL:
+            fftPlan_->backward(localComplexGrid_.data(), localRealGrid_.data(), workspace_.data());
+            break;
+        default:
+            GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
+    }
+
+    // TODO: Same as above; until then the default stream is synchronized before returning
+    heffte::gpu::synchronize_default_stream();
+}
+
+// instantiate relevant HeFFTe backend
+#if GMX_GPU_CUDA
+template class Gpu3dFft::ImplHeFfte<heffte::backend::cufft>;
+#endif
+
+} // namespace gmx
diff --git a/src/gromacs/fft/gpu_3dfft_heffte.h b/src/gromacs/fft/gpu_3dfft_heffte.h
new file mode 100644
index 0000000000..ebc92e554a
--- /dev/null
+++ b/src/gromacs/fft/gpu_3dfft_heffte.h
@@ -0,0 +1,106 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the HeFFTe-based implementation of the GPU 3D FFT routines.
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_HEFFTE_H
+#define GMX_FFT_GPU_3DFFT_HEFFTE_H
+
+#include <memory>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/hostallocator.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gpu_3dfft_impl.h"
+
+#include <heffte.h>
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using HeFFTe
+ */
+template<typename backend_tag>
+class Gpu3dFft::ImplHeFfte : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplHeFfte(bool                 allocateGrids,
+               MPI_Comm             comm,
+               ArrayRef<const int>  gridSizesInXForEachRank,
+               ArrayRef<const int>  gridSizesInYForEachRank,
+               int                  nz,
+               bool                 performOutOfPlaceFFT,
+               const DeviceContext& context,
+               const DeviceStream&  pmeStream,
+               ivec                 realGridSize,
+               ivec                 realGridSizePadded,
+               ivec                 complexGridSizePadded,
+               DeviceBuffer<float>* realGrid,
+               DeviceBuffer<float>* complexGrid);
+
+    /*! \brief Destroys the FFT plans. */
+    ~ImplHeFfte() override = default;
+
+    /*! \brief Performs the FFT transform in given direction
+     *
+     * \param[in]  dir           FFT transform direction specifier
+     * \param[out] timingEvent   timing event pointer; not used by this backend
+     */
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    heffte::gpu::vector<float>               localRealGrid_;
+    heffte::gpu::vector<std::complex<float>> localComplexGrid_;
+    heffte::gpu::vector<std::complex<float>> workspace_;
+
+    std::unique_ptr<heffte::fft3d_r2c<backend_tag, int>> fftPlan_;
+
+    const DeviceStream& stream_;
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/fft/tests/CMakeLists.txt b/src/gromacs/fft/tests/CMakeLists.txt
index 319a6ab7f3..681fd38cbe 100644
--- a/src/gromacs/fft/tests/CMakeLists.txt
+++ b/src/gromacs/fft/tests/CMakeLists.txt
@@ -36,3 +36,10 @@ gmx_add_unit_test(FFTUnitTests fft-test HARDWARE_DETECTION
     GPU_CPP_SOURCE_FILES
         fft.cpp
     )
+
+if(Heffte_FOUND)
+gmx_add_mpi_unit_test(FFTMpiUnitTests fft-mpi-test 4 HARDWARE_DETECTION
+    GPU_CPP_SOURCE_FILES
+        fft_mpi.cpp
+        )
+endif()
diff --git a/src/gromacs/fft/tests/fft_mpi.cpp b/src/gromacs/fft/tests/fft_mpi.cpp
new file mode 100644
index 0000000000..1772102196
--- /dev/null
+++ b/src/gromacs/fft/tests/fft_mpi.cpp
@@ -0,0 +1,232 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for GPU FFT calculations distributed over MPI ranks.
+ *
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+#include "gmxpre.h"
+
+#include "gromacs/fft/fft.h"
+
+#include "config.h"
+
+#include <algorithm>
+#include <vector>
+#include <random>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "gromacs/fft/gpu_3dfft.h"
+#include "gromacs/gpu_utils/clfftinitializer.h"
+#if GMX_GPU
+#    include "gromacs/gpu_utils/devicebuffer.h"
+#endif
+#include "gromacs/utility/stringutil.h"
+
+#include "testutils/refdata.h"
+#include "testutils/mpitest.h"
+#include "testutils/test_hardware_environment.h"
+#include "testutils/testasserts.h"
+#include "testutils/testmatchers.h"
+
+namespace gmx
+{
+namespace test
+{
+using GpuFftTestParams = std::tuple<IVec, // size of grid
+                                    int,  // domains in x
+                                    int,  // domains in y
+                                    FftBackend>;
+
+/*! \brief Verify that a forward 3D transform followed by a backward
+ * 3D transform reproduces the input real grid. */
+static void checkRealGrid(const IVec           realGridSizeFull,
+                          const ivec           realGridSize,
+                          const ivec           realGridSizePadded,
+                          ArrayRef<const real> inputRealGrid,
+                          ArrayRef<real>       outputRealGridValues)
+{
+    // Neither transform is normalized by the implementation, so the
+    // round-tripped values are scaled by 1 / (nx * ny * nz) here
+    const int  numGridPoints = realGridSizeFull[XX] * realGridSizeFull[YY] * realGridSizeFull[ZZ];
+    const real scale         = 1.0 / numGridPoints;
+    for (real& value : outputRealGridValues)
+    {
+        value = value * scale;
+    }
+    // Compare one z-row at a time so that padding entries are never inspected
+    const auto tolerance = relativeToleranceAsFloatingPoint(10, 1e-6);
+    const int  numRows   = realGridSize[XX] * realGridSize[YY];
+    for (int row = 0; row < numRows; row++)
+    {
+        const int rowStart = row * realGridSizePadded[ZZ];
+        auto      expected = arrayRefFromArray(inputRealGrid.data() + rowStart, realGridSize[ZZ]);
+        auto      actual = arrayRefFromArray(outputRealGridValues.data() + rowStart, realGridSize[ZZ]);
+        EXPECT_THAT(actual, Pointwise(RealEq(tolerance), expected))
+                << formatString("checking backward transform part %d", row);
+    }
+}
+
+class GpuFftTest3D : public ::testing::Test, public ::testing::WithParamInterface<GpuFftTestParams>
+{
+public:
+    GpuFftTest3D() = default;
+
+
+    //! Runs a forward then backward distributed FFT and checks the round trip against the input
+    static void runTest(const GpuFftTestParams& param)
+    {
+        const auto& deviceList = getTestHardwareEnvironment()->getTestDeviceList();
+
+        int rank = 0;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+        const auto& testDevice = deviceList[rank % deviceList.size()];
+
+        const DeviceContext& deviceContext = testDevice->deviceContext();
+        setActiveDevice(testDevice->deviceInfo());
+        const DeviceStream& deviceStream = testDevice->deviceStream();
+
+        FftBackend backend;
+
+        int  numDomainsX;
+        int  numDomainsY;
+        IVec realGridSizeFull;
+        std::tie(realGridSizeFull, numDomainsX, numDomainsY, backend) = param;
+
+        // define local grid sizes - this follows same logic as GROMACS implementation
+        std::vector<int> localGridSizesX(numDomainsX);
+        for (unsigned int i = 0; i < localGridSizesX.size(); ++i)
+        {
+            localGridSizesX[i] = ((i + 1) * realGridSizeFull[XX] / numDomainsX)
+                                 - (i * realGridSizeFull[XX] / numDomainsX);
+            ASSERT_GT(localGridSizesX[i], 0);
+        }
+
+        std::vector<int> localGridSizesY(numDomainsY);
+        for (unsigned int i = 0; i < localGridSizesY.size(); ++i)
+        {
+            localGridSizesY[i] = ((i + 1) * realGridSizeFull[YY] / numDomainsY)
+                                 - (i * realGridSizeFull[YY] / numDomainsY);
+            ASSERT_GT(localGridSizesY[i], 0);
+        }
+
+        ivec realGridSize;
+        ivec realGridSizePadded;
+        ivec complexGridSizePadded;
+
+        // Device buffer handles; they are filled in by Gpu3dFft as allocateGrid is true
+        DeviceBuffer<float> realGrid, complexGrid;
+
+        const bool     performOutOfPlaceFFT = true;
+        const MPI_Comm comm                 = MPI_COMM_WORLD;
+        const bool     allocateGrid         = true;
+        const int      nz                   = realGridSizeFull[ZZ];
+        Gpu3dFft       gpu3dFft(backend,
+                          allocateGrid,
+                          comm,
+                          localGridSizesX,
+                          localGridSizesY,
+                          nz,
+                          performOutOfPlaceFFT,
+                          deviceContext,
+                          deviceStream,
+                          realGridSize,
+                          realGridSizePadded,
+                          complexGridSizePadded,
+                          &realGrid,
+                          &complexGrid);
+
+        int sizeInReals = realGridSizePadded[0] * realGridSizePadded[1] * realGridSizePadded[2];
+
+        // initialize random input data; a fixed per-rank seed keeps failures reproducible
+        std::vector<real>                in(sizeInReals);
+        std::uniform_real_distribution<> dis(-10.0f, 10.0f);
+        std::minstd_rand                 gen(1 + rank);
+        std::generate(in.begin(), in.end(), [&dis, &gen]() {
+            // random number between -10 to 10
+            return dis(gen);
+        });
+
+        // Transfer the real grid input data for the FFT
+        copyToDeviceBuffer(
+                &realGrid, in.data(), 0, in.size(), deviceStream, GpuApiCallBehavior::Sync, nullptr);
+
+        // Do the forward FFT to compute the complex grid
+        CommandEvent* timingEvent = nullptr;
+        gpu3dFft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX, timingEvent);
+
+        // clear real grid after the forward FFT, so that we know the
+        // final grid is one produced by the complex FFT, not just leftovers
+        clearDeviceBufferAsync(&realGrid, 0, sizeInReals, deviceStream);
+
+        // Do the back transform
+        gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
+        deviceStream.synchronize();
+
+        // Transfer the real grid back from the device
+        std::vector<float> outputRealGridValues(in.size());
+        copyFromDeviceBuffer(outputRealGridValues.data(),
+                             &realGrid,
+                             0,
+                             outputRealGridValues.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Sync,
+                             nullptr);
+
+        checkRealGrid(realGridSizeFull, realGridSize, realGridSizePadded, in, outputRealGridValues);
+    }
+};
+
+TEST_P(GpuFftTest3D, GpuFftDecomposition)
+{
+    GMX_MPI_TEST(4);
+    GpuFftTestParams params = GetParam();
+    runTest(params);
+}
+
+std::vector<GpuFftTestParams> const inputs{
+    { IVec{ 5, 6, 9 }, 4, 1, FftBackend::HeFFTe_CUDA}, // slab decomposition
+    { IVec{ 5, 6, 9 }, 2, 2, FftBackend::HeFFTe_CUDA} // pencil decomposition
+};
+
+INSTANTIATE_TEST_SUITE_P(GpuFft, GpuFftTest3D, ::testing::ValuesIn(inputs));
+
+} // namespace test
+} // namespace gmx
diff --git a/src/programs/mdrun/tests/exactcontinuation.cpp b/src/programs/mdrun/tests/exactcontinuation.cpp
index 8f79bde29b..efd2e80645 100644
--- a/src/programs/mdrun/tests/exactcontinuation.cpp
+++ b/src/programs/mdrun/tests/exactcontinuation.cpp
@@ -395,14 +395,14 @@ TEST_P(MdrunNoAppendContinuationIsExact, WithinTolerances)
     mdpFieldValues["init-lambda-state"] = "3";
     mdpFieldValues["nsteps"]            = "16";
 
-    // Forces on GPUs are generally not reproducible enough for a tight
-    // tolerance. Similarly, the propagation of sd and bd are not as
+    // Forces and update on GPUs are generally not reproducible enough for a tight
+    // tolerance. Similarly, the propagation of bd is not as
     // reproducible as the others. So we use several ULP tolerance
     // in all cases. This is looser than needed e.g. for md and md-vv
     // with forces on CPUs, but there is no real risk of a bug with
     // those propagators that would only be caught with a tighter
     // tolerance in this particular test.
-    int ulpToleranceInMixed  = 32;
+    int ulpToleranceInMixed  = 128;
     int ulpToleranceInDouble = 64;
     if (integrator == "bd")
     {
-- 
2.22.0