Redesign GPU FFT abstraction
author Gaurav Garg <gaugarg@nvidia.com>
Wed, 8 Sep 2021 18:40:06 +0000 (18:40 +0000)
committer Andrey Alekseenko <al42and@gmail.com>
Wed, 8 Sep 2021 18:40:06 +0000 (18:40 +0000)
- Modify the interface to allow a distributed FFT implementation in the future
- Provide support for choosing the FFT backend at runtime. E.g. the cuFFT backend can be instantiated for single-GPU FFT, while HeFFTe can be instantiated when PME decomposition is used.

This is a prerequisite for the GPU PME-decomposition implementation.

Refs #3884

13 files changed:
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/fft/CMakeLists.txt
src/gromacs/fft/gpu_3dfft.cpp
src/gromacs/fft/gpu_3dfft.h
src/gromacs/fft/gpu_3dfft_cufft.cu [moved from src/gromacs/fft/gpu_3dfft.cu with 66% similarity]
src/gromacs/fft/gpu_3dfft_cufft.h [new file with mode: 0644]
src/gromacs/fft/gpu_3dfft_impl.h [new file with mode: 0644]
src/gromacs/fft/gpu_3dfft_ocl.cpp
src/gromacs/fft/gpu_3dfft_ocl.h [new file with mode: 0644]
src/gromacs/fft/gpu_3dfft_sycl.cpp
src/gromacs/fft/gpu_3dfft_sycl.h [new file with mode: 0644]
src/gromacs/fft/tests/fft.cpp
src/gromacs/utility/binaryinformation.cpp

index cee5a349c8b9c17dc6b7757c1c7b19f5b57916f2..e9998c803e78c142d439703f7520510f2f44b38c 100644 (file)
@@ -606,21 +606,40 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
     if (pme_gpu_settings(pmeGpu).performGPUFFT)
     {
         pmeGpu->archSpecific->fftSetup.resize(0);
-        const bool        useDecomposition     = pme_gpu_settings(pmeGpu).useDecomposition;
-        const bool        performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
-        PmeGpuGridParams& grid                 = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid;
+        const bool         performOutOfPlaceFFT      = pmeGpu->archSpecific->performOutOfPlaceFFT;
+        const bool         allocateGrid              = false;
+        MPI_Comm           comm                      = MPI_COMM_NULL;
+        std::array<int, 1> gridOffsetsInXForEachRank = { 0 };
+        std::array<int, 1> gridOffsetsInYForEachRank = { 0 };
+#if GMX_GPU_CUDA
+        const gmx::FftBackend backend = gmx::FftBackend::Cufft;
+#elif GMX_GPU_OPENCL
+        const gmx::FftBackend backend = gmx::FftBackend::Ocl;
+#elif GMX_GPU_SYCL
+        const gmx::FftBackend backend = gmx::FftBackend::Sycl;
+#else
+        GMX_RELEASE_ASSERT(false, "Unknown GPU backend");
+        const gmx::FftBackend backend = gmx::FftBackend::Count;
+#endif
+
+        PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid;
         for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
         {
             pmeGpu->archSpecific->fftSetup.push_back(
-                    std::make_unique<gmx::Gpu3dFft>(grid.realGridSize,
-                                                    grid.realGridSizePadded,
-                                                    grid.complexGridSizePadded,
-                                                    useDecomposition,
+                    std::make_unique<gmx::Gpu3dFft>(backend,
+                                                    allocateGrid,
+                                                    comm,
+                                                    gridOffsetsInXForEachRank,
+                                                    gridOffsetsInYForEachRank,
+                                                    grid.realGridSize[ZZ],
                                                     performOutOfPlaceFFT,
                                                     pmeGpu->archSpecific->deviceContext_,
                                                     pmeGpu->archSpecific->pmeStream_,
-                                                    grid.d_realGrid[gridIndex],
-                                                    grid.d_fourierGrid[gridIndex]));
+                                                    grid.realGridSize,
+                                                    grid.realGridSizePadded,
+                                                    grid.complexGridSizePadded,
+                                                    &(grid.d_realGrid[gridIndex]),
+                                                    &(grid.d_fourierGrid[gridIndex])));
         }
     }
 }
index 70dd951e6f92e635173342fa745f1807ea4d587d..73a6d4e5cd32c6a8b7e6d5d6a3ebc385bde501f4 100644 (file)
@@ -40,6 +40,7 @@ gmx_add_libgromacs_sources(
      fft.cpp
      fft5d.cpp
      parallel_3dfft.cpp
+     gpu_3dfft.cpp
      )
 
 if (GMX_FFT_FFTPACK)
@@ -57,7 +58,11 @@ endif()
 if (GMX_GPU_CUDA)
     gmx_add_libgromacs_sources(
         # CUDA-specific sources
-        gpu_3dfft.cu
+        gpu_3dfft_cufft.cu
+        )
+    _gmx_add_files_to_property(CUDA_SOURCES
+        # Must add these files so they can include cuda_runtime.h
+        gpu_3dfft.cpp
         )
 elseif (GMX_GPU_OPENCL)
     gmx_add_libgromacs_sources(
@@ -71,12 +76,8 @@ elseif (GMX_GPU_SYCL)
         gpu_3dfft_sycl.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
-        gpu_3dfft_sycl.cpp
-        )
-else()
-    gmx_add_libgromacs_sources(
-        # Stub sources for CPU-only build
         gpu_3dfft.cpp
+        gpu_3dfft_sycl.cpp
         )
 endif()
 
index 3b896eb7967fad8bc8045d21fe26dee749046014..c027c5d08a80bd761d6b4a116d4b73a2a0578ca1 100644 (file)
  *  \brief Implements stub GPU 3D FFT routines for CPU-only builds
  *
  *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
  *  \ingroup module_fft
  */
 
 #include "gmxpre.h"
 
 #include "gpu_3dfft.h"
+#include "gpu_3dfft_impl.h"
 
+#if GMX_GPU_CUDA
+#    include "gpu_3dfft_cufft.h"
+#elif GMX_GPU_OPENCL
+#    include "gpu_3dfft_ocl.h"
+#elif GMX_GPU_SYCL
+#    include "gpu_3dfft_sycl.h"
+#endif
+
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/exceptions.h"
 
 namespace gmx
@@ -55,29 +66,114 @@ namespace gmx
 #    pragma clang diagnostic ignored "-Wmissing-noreturn"
 #endif
 
-class Gpu3dFft::Impl
+#if (GMX_GPU_CUDA || GMX_GPU_OPENCL || GMX_GPU_SYCL)
+
+Gpu3dFft::Gpu3dFft(FftBackend           backend,
+                   bool                 allocateGrids,
+                   MPI_Comm             comm,
+                   ArrayRef<const int>  gridSizesInXForEachRank,
+                   ArrayRef<const int>  gridSizesInYForEachRank,
+                   const int            nz,
+                   bool                 performOutOfPlaceFFT,
+                   const DeviceContext& context,
+                   const DeviceStream&  pmeStream,
+                   ivec                 realGridSize,
+                   ivec                 realGridSizePadded,
+                   ivec                 complexGridSizePadded,
+                   DeviceBuffer<float>* realGrid,
+                   DeviceBuffer<float>* complexGrid)
 {
-};
+#    if GMX_GPU_CUDA
+    switch (backend)
+    {
+        case FftBackend::Cufft:
+            impl_ = std::make_unique<Gpu3dFft::ImplCuFft>(allocateGrids,
+                                                          comm,
+                                                          gridSizesInXForEachRank,
+                                                          gridSizesInYForEachRank,
+                                                          nz,
+                                                          performOutOfPlaceFFT,
+                                                          context,
+                                                          pmeStream,
+                                                          realGridSize,
+                                                          realGridSizePadded,
+                                                          complexGridSizePadded,
+                                                          realGrid,
+                                                          complexGrid);
+            break;
+        default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+    }
+#    elif GMX_GPU_OPENCL
+    switch (backend)
+    {
+        case FftBackend::Ocl:
+            impl_ = std::make_unique<Gpu3dFft::ImplOcl>(allocateGrids,
+                                                        comm,
+                                                        gridSizesInXForEachRank,
+                                                        gridSizesInYForEachRank,
+                                                        nz,
+                                                        performOutOfPlaceFFT,
+                                                        context,
+                                                        pmeStream,
+                                                        realGridSize,
+                                                        realGridSizePadded,
+                                                        complexGridSizePadded,
+                                                        realGrid,
+                                                        complexGrid);
+            break;
+        default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+    }
+#    elif GMX_GPU_SYCL
+    switch (backend)
+    {
+        case FftBackend::Sycl:
+            impl_ = std::make_unique<Gpu3dFft::ImplSycl>(allocateGrids,
+                                                         comm,
+                                                         gridSizesInXForEachRank,
+                                                         gridSizesInYForEachRank,
+                                                         nz,
+                                                         performOutOfPlaceFFT,
+                                                         context,
+                                                         pmeStream,
+                                                         realGridSize,
+                                                         realGridSizePadded,
+                                                         complexGridSizePadded,
+                                                         realGrid,
+                                                         complexGrid);
+            break;
+        default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+    }
+#    endif
+}
 
-Gpu3dFft::Gpu3dFft(ivec /*realGridSize*/,
-                   ivec /*realGridSizePadded*/,
-                   ivec /*complexGridSizePadded*/,
-                   const bool /*useDecomposition*/,
-                   const bool /*performOutOfPlaceFFT*/,
+#else
+
+Gpu3dFft::Gpu3dFft(FftBackend /*backend */,
+                   bool /*allocateGrids*/,
+                   MPI_Comm /*comm*/,
+                   ArrayRef<const int> /*gridSizesInXForEachRank*/,
+                   ArrayRef<const int> /*gridSizesInYForEachRank*/,
+                   const int /*nz*/,
+                   bool /*performOutOfPlaceFFT*/,
                    const DeviceContext& /*context*/,
                    const DeviceStream& /*pmeStream*/,
-                   DeviceBuffer<float> /*realGrid*/,
-                   DeviceBuffer<float> /*complexGrid*/)
+                   ivec /*realGridSize*/,
+                   ivec /*realGridSizePadded*/,
+                   ivec /*complexGridSizePadded*/,
+                   DeviceBuffer<float>* /*realGrid*/,
+                   DeviceBuffer<float>* /*complexGrid*/)
 {
     GMX_THROW(InternalError("Cannot run GPU routines in a CPU-only configuration"));
 }
 
+#endif
+
 Gpu3dFft::~Gpu3dFft() = default;
 
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void Gpu3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
+void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
 {
-    GMX_THROW(InternalError("Cannot run GPU routines in a CPU-only configuration"));
+    GMX_RELEASE_ASSERT(impl_ != nullptr, "Cannot run GPU routines in a CPU-only configuration");
+    impl_->perform3dFft(dir, timingEvent);
 }
 
 #ifdef __clang__
index 65d3f6f03bd18241005cdf1dfb9220aeef2f20d7..7b2c6376546336ff4c79f21ca3ddfed08a686268 100644 (file)
@@ -38,6 +38,7 @@
  *
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
  *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
  *  \ingroup module_fft
  */
 
@@ -49,6 +50,7 @@
 #include "gromacs/fft/fft.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
 class DeviceStream;
@@ -56,35 +58,59 @@ class DeviceStream;
 namespace gmx
 {
 
+template<typename T>
+class ArrayRef;
+
+/*! \internal \brief
+ * Enum specifying all GPU FFT backends supported by GROMACS
+ * Some of the backends support only single GPU, some only multi-node, multi-GPU
+ */
+enum class FftBackend
+{
+    Cufft, // supports only single-GPU
+    Ocl,   // supports only single-GPU
+    Sycl,  // Not supported currently
+    Count
+};
+
 /*! \internal \brief
  * A 3D FFT class for performing R2C/C2R transforms
- * \todo Make this class actually parallel over multiple GPUs
  */
 class Gpu3dFft
 {
 public:
     /*! \brief
-     * Constructs GPU FFT plans for performing 3D FFT on a PME grid.
+     * Construct 3D FFT object for given backend
      *
-     * \param[in]  realGridSize           Dimensions of the real grid
-     * \param[in]  realGridSizePadded     Dimensions of the real grid with padding
-     * \param[in]  complexGridSizePadded  Dimensions of the real grid with padding
-     * \param[in]  useDecomposition       Whether PME decomposition will be used
-     * \param[in]  performOutOfPlaceFFT   Whether the FFT will be performed out-of-place
-     * \param[in]  context                GPU context.
-     * \param[in]  pmeStream              GPU stream for PME.
-     * \param[in]  realGrid               Device buffer of floats for the real grid
-     * \param[in]  complexGrid            Device buffer of complex floats for the complex grid
+     * \param[in]  backend                      FFT backend to be instantiated
+     * \param[in]  allocateGrids                True if fft grids are to be allocated, false if pre-allocated
+     * \param[in]  comm                         MPI communicator, used with distributed-FFT backends
+     * \param[in]  gridSizesInXForEachRank      Number of grid points used with each rank in X-dimension
+     * \param[in]  gridSizesInYForEachRank      Number of grid points used with each rank in Y-dimension
+     * \param[in]  nz                           Grid dimension in Z
+     * \param[in]  performOutOfPlaceFFT         Whether the FFT will be performed out-of-place
+     * \param[in]  context                      GPU context.
+     * \param[in]  pmeStream                    GPU stream for PME.
+     * \param[in,out]  realGridSize             Dimensions of the local real grid, out if allocateGrids=true
+     * \param[in,out]  realGridSizePadded       Dimensions of the local real grid with padding, out if allocateGrids=true
+     * \param[in,out]  complexGridSizePadded    Dimensions of the local complex grid with padding, out if allocateGrids=true
+     * \param[in,out]  realGrid                 Device buffer of floats for the local real grid, out if allocateGrids=true
+     * \param[in,out]  complexGrid              Device buffer of complex floats for the local complex grid, out if allocateGrids=true
      */
-    Gpu3dFft(ivec                 realGridSize,
-             ivec                 realGridSizePadded,
-             ivec                 complexGridSizePadded,
-             bool                 useDecomposition,
+    Gpu3dFft(FftBackend           backend,
+             bool                 allocateGrids,
+             MPI_Comm             comm,
+             ArrayRef<const int>  gridSizesInXForEachRank,
+             ArrayRef<const int>  gridSizesInYForEachRank,
+             int                  nz,
              bool                 performOutOfPlaceFFT,
              const DeviceContext& context,
              const DeviceStream&  pmeStream,
-             DeviceBuffer<float>  realGrid,
-             DeviceBuffer<float>  complexGrid);
+             ivec                 realGridSize,
+             ivec                 realGridSizePadded,
+             ivec                 complexGridSizePadded,
+             DeviceBuffer<float>* realGrid,
+             DeviceBuffer<float>* complexGrid);
 
     /*! \brief Destroys the FFT plans. */
     ~Gpu3dFft();
@@ -97,6 +123,10 @@ public:
 
 private:
     class Impl;
+    class ImplCuFft;
+    class ImplOcl;
+    class ImplSycl;
+
     std::unique_ptr<Impl> impl_;
 };
 
similarity index 66%
rename from src/gromacs/fft/gpu_3dfft.cu
rename to src/gromacs/fft/gpu_3dfft_cufft.cu
index 78f3ba90dcc0cf0bb2f70e2274d73ff5d9f25c2a..5ccdb9842e9335967b8d161edb644302d9dc57e5 100644 (file)
 
 #include "gmxpre.h"
 
-#include "gpu_3dfft.h"
-
-#include <cufft.h>
+#include "gpu_3dfft_cufft.h"
 
 #include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 
 namespace gmx
 {
-
-class Gpu3dFft::Impl
-{
-public:
-    Impl(ivec                 realGridSize,
-         ivec                 realGridSizePadded,
-         ivec                 complexGridSizePadded,
-         bool                 useDecomposition,
-         bool                 performOutOfPlaceFFT,
-         const DeviceContext& context,
-         const DeviceStream&  pmeStream,
-         DeviceBuffer<float>  realGrid,
-         DeviceBuffer<float>  complexGrid);
-    ~Impl();
-
-    cufftHandle   planR2C_;
-    cufftHandle   planC2R_;
-    cufftReal*    realGrid_;
-    cufftComplex* complexGrid_;
-};
-
 static void handleCufftError(cufftResult_t status, const char* msg)
 {
     if (status != CUFFT_SUCCESS)
@@ -82,19 +60,25 @@ static void handleCufftError(cufftResult_t status, const char* msg)
     }
 }
 
-Gpu3dFft::Impl::Impl(ivec       realGridSize,
-                     ivec       realGridSizePadded,
-                     ivec       complexGridSizePadded,
-                     const bool useDecomposition,
-                     const bool /*performOutOfPlaceFFT*/,
-                     const DeviceContext& /*context*/,
-                     const DeviceStream& pmeStream,
-                     DeviceBuffer<float> realGrid,
-                     DeviceBuffer<float> complexGrid) :
-    realGrid_(reinterpret_cast<cufftReal*>(realGrid)),
-    complexGrid_(reinterpret_cast<cufftComplex*>(complexGrid))
+Gpu3dFft::ImplCuFft::ImplCuFft(bool allocateGrids,
+                               MPI_Comm /*comm*/,
+                               ArrayRef<const int> gridSizesInXForEachRank,
+                               ArrayRef<const int> gridSizesInYForEachRank,
+                               const int /*nz*/,
+                               bool /*performOutOfPlaceFFT*/,
+                               const DeviceContext& /*context*/,
+                               const DeviceStream&  pmeStream,
+                               ivec                 realGridSize,
+                               ivec                 realGridSizePadded,
+                               ivec                 complexGridSizePadded,
+                               DeviceBuffer<float>* realGrid,
+                               DeviceBuffer<float>* complexGrid) :
+    realGrid_(reinterpret_cast<cufftReal*>(*realGrid)),
+    complexGrid_(reinterpret_cast<cufftComplex*>(*complexGrid))
 {
-    GMX_RELEASE_ASSERT(!useDecomposition, "FFT decomposition not implemented");
+    GMX_RELEASE_ASSERT(allocateGrids == false, "Grids needs to be pre-allocated");
+    GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+                       "FFT decomposition not implemented with cuFFT backend");
 
     const int complexGridSizePaddedTotal =
             complexGridSizePadded[XX] * complexGridSizePadded[YY] * complexGridSizePadded[ZZ];
@@ -151,7 +135,7 @@ Gpu3dFft::Impl::Impl(ivec       realGridSize,
     handleCufftError(result, "cufftSetStream C2R failure");
 }
 
-Gpu3dFft::Impl::~Impl()
+Gpu3dFft::ImplCuFft::~ImplCuFft()
 {
     cufftResult_t result;
     result = cufftDestroy(planR2C_);
@@ -160,42 +144,19 @@ Gpu3dFft::Impl::~Impl()
     handleCufftError(result, "cufftDestroy C2R failure");
 }
 
-void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+void Gpu3dFft::ImplCuFft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
 {
     cufftResult_t result;
     if (dir == GMX_FFT_REAL_TO_COMPLEX)
     {
-        result = cufftExecR2C(impl_->planR2C_, impl_->realGrid_, impl_->complexGrid_);
+        result = cufftExecR2C(planR2C_, realGrid_, complexGrid_);
         handleCufftError(result, "cuFFT R2C execution failure");
     }
     else
     {
-        result = cufftExecC2R(impl_->planC2R_, impl_->complexGrid_, impl_->realGrid_);
+        result = cufftExecC2R(planC2R_, complexGrid_, realGrid_);
         handleCufftError(result, "cuFFT C2R execution failure");
     }
 }
 
-Gpu3dFft::Gpu3dFft(ivec                 realGridSize,
-                   ivec                 realGridSizePadded,
-                   ivec                 complexGridSizePadded,
-                   const bool           useDecomposition,
-                   const bool           performOutOfPlaceFFT,
-                   const DeviceContext& context,
-                   const DeviceStream&  pmeStream,
-                   DeviceBuffer<float>  realGrid,
-                   DeviceBuffer<float>  complexGrid) :
-    impl_(std::make_unique<Impl>(realGridSize,
-                                 realGridSizePadded,
-                                 complexGridSizePadded,
-                                 useDecomposition,
-                                 performOutOfPlaceFFT,
-                                 context,
-                                 pmeStream,
-                                 realGrid,
-                                 complexGrid))
-{
-}
-
-Gpu3dFft::~Gpu3dFft() = default;
-
 } // namespace gmx
diff --git a/src/gromacs/fft/gpu_3dfft_cufft.h b/src/gromacs/fft/gpu_3dfft_cufft.h
new file mode 100644 (file)
index 0000000..5ebdd16
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the GPU 3D FFT routines.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_CUFFT_H
+#define GMX_FFT_GPU_3DFFT_CUFFT_H
+
+#include <memory>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gpu_3dfft_impl.h"
+
+#include <cufft.h>
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using cuFFT
+ */
+class Gpu3dFft::ImplCuFft : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplCuFft(bool                 allocateGrids,
+              MPI_Comm             comm,
+              ArrayRef<const int>  gridSizesInXForEachRank,
+              ArrayRef<const int>  gridSizesInYForEachRank,
+              int                  nz,
+              bool                 performOutOfPlaceFFT,
+              const DeviceContext& context,
+              const DeviceStream&  pmeStream,
+              ivec                 realGridSize,
+              ivec                 realGridSizePadded,
+              ivec                 complexGridSizePadded,
+              DeviceBuffer<float>* realGrid,
+              DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplCuFft() override;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    cufftHandle   planR2C_;
+    cufftHandle   planC2R_;
+    cufftReal*    realGrid_;
+    cufftComplex* complexGrid_;
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/fft/gpu_3dfft_impl.h b/src/gromacs/fft/gpu_3dfft_impl.h
new file mode 100644 (file)
index 0000000..2e95cc0
--- /dev/null
@@ -0,0 +1,103 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the GPU 3D FFT routines.
+ *
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_IMPL_H
+#define GMX_FFT_GPU_3DFFT_IMPL_H
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/fft/gpu_3dfft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+
+
+namespace gmx
+{
+/*! \internal \brief
+ * Impl base class for all FFT backends
+ */
+class Gpu3dFft::Impl
+{
+public:
+    //! Default constructor
+    Impl() = default;
+
+    /*! \brief
+     * Constructs GPU FFT plans for performing 3D FFT on a PME grid.
+     *
+     * \param[in]  allocateGrids                True if fft grids are to be allocated, false if pre-allocated
+     * \param[in]  comm                         MPI communicator, used with distributed-FFT backends
+     * \param[in]  gridSizesInXForEachRank      Number of grid points used with each rank in X-dimension
+     * \param[in]  gridSizesInYForEachRank      Number of grid points used with each rank in Y-dimension
+     * \param[in]  nz                           Grid dimension in Z
+     * \param[in]  performOutOfPlaceFFT         Whether the FFT will be performed out-of-place
+     * \param[in]  context                      GPU context.
+     * \param[in]  pmeStream                    GPU stream for PME.
+     * \param[in,out]  realGridSize             Dimensions of the local real grid, out if allocateGrids=true
+     * \param[in,out]  realGridSizePadded       Dimensions of the local real grid with padding, out if allocateGrids=true
+     * \param[in,out]  complexGridSizePadded    Dimensions of the local complex grid with padding, out if allocateGrids=true
+     * \param[in,out]  realGrid                 Device buffer of floats for the local real grid, out if allocateGrids=true
+     * \param[in,out]  complexGrid              Device buffer of complex floats for the local complex grid, out if allocateGrids=true
+     */
+    Impl(bool                 allocateGrids,
+         MPI_Comm             comm,
+         ArrayRef<const int>  gridSizesInXForEachRank,
+         ArrayRef<const int>  gridSizesInYForEachRank,
+         int                  nz,
+         bool                 performOutOfPlaceFFT,
+         const DeviceContext& context,
+         const DeviceStream&  pmeStream,
+         ivec                 realGridSize,
+         ivec                 realGridSizePadded,
+         ivec                 complexGridSizePadded,
+         DeviceBuffer<float>* realGrid,
+         DeviceBuffer<float>* complexGrid);
+
+    /*! \brief Default destructor */
+    virtual ~Impl() = default;
+
+    //! \copydoc Gpu3dFft::perform3dFft
+    virtual void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) = 0;
+};
+
+} // namespace gmx
+
+#endif
index 69a44974459fcf9cd26c288317b4688fb2338be1..76ae53560874d7d141b7bdcaf0ccbeb8834005de 100644 (file)
@@ -43,7 +43,7 @@
 
 #include "gmxpre.h"
 
-#include "gpu_3dfft.h"
+#include "gpu_3dfft_ocl.h"
 
 #include <array>
 #include <vector>
 
 namespace gmx
 {
-
-class Gpu3dFft::Impl
-{
-public:
-    Impl(ivec                 realGridSize,
-         ivec                 realGridSizePadded,
-         ivec                 complexGridSizePadded,
-         bool                 useDecomposition,
-         bool                 performOutOfPlaceFFT,
-         const DeviceContext& context,
-         const DeviceStream&  pmeStream,
-         DeviceBuffer<float>  realGrid,
-         DeviceBuffer<float>  complexGrid);
-    ~Impl();
-
-    clfftPlanHandle               planR2C_;
-    clfftPlanHandle               planC2R_;
-    std::vector<cl_command_queue> commandStreams_;
-    cl_mem                        realGrid_;
-    cl_mem                        complexGrid_;
-};
-
 //! Throws the exception on clFFT error
 static void handleClfftError(clfftStatus status, const char* msg)
 {
@@ -91,18 +69,24 @@ static void handleClfftError(clfftStatus status, const char* msg)
     }
 }
 
-Gpu3dFft::Impl::Impl(ivec                 realGridSize,
-                     ivec                 realGridSizePadded,
-                     ivec                 complexGridSizePadded,
-                     const bool           useDecomposition,
-                     const bool           performOutOfPlaceFFT,
-                     const DeviceContext& context,
-                     const DeviceStream&  pmeStream,
-                     DeviceBuffer<float>  realGrid,
-                     DeviceBuffer<float>  complexGrid) :
-    realGrid_(realGrid), complexGrid_(complexGrid)
+Gpu3dFft::ImplOcl::ImplOcl(bool allocateGrids,
+                           MPI_Comm /*comm*/,
+                           ArrayRef<const int> gridSizesInXForEachRank,
+                           ArrayRef<const int> gridSizesInYForEachRank,
+                           const int /*nz*/,
+                           bool                 performOutOfPlaceFFT,
+                           const DeviceContext& context,
+                           const DeviceStream&  pmeStream,
+                           ivec                 realGridSize,
+                           ivec                 realGridSizePadded,
+                           ivec                 complexGridSizePadded,
+                           DeviceBuffer<float>* realGrid,
+                           DeviceBuffer<float>* complexGrid) :
+    realGrid_(*realGrid), complexGrid_(*complexGrid)
 {
-    GMX_RELEASE_ASSERT(!useDecomposition, "FFT decomposition not implemented");
+    GMX_RELEASE_ASSERT(allocateGrids == false, "Grids needs to be pre-allocated");
+    GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+                       "FFT decomposition not implemented with OpenCL backend");
 
     cl_context clContext = context.context();
     commandStreams_.push_back(pmeStream.stream());
@@ -157,13 +141,13 @@ Gpu3dFft::Impl::Impl(ivec                 realGridSize,
     // TODO: disable last transpose (clfftSetPlanTransposeResult)
 }
 
-Gpu3dFft::Impl::~Impl()
+Gpu3dFft::ImplOcl::~ImplOcl()
 {
     clfftDestroyPlan(&planR2C_);
     clfftDestroyPlan(&planC2R_);
 }
 
-void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
+void Gpu3dFft::ImplOcl::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
 {
     cl_mem                            tempBuffer = nullptr;
     constexpr std::array<cl_event, 0> waitEvents{ {} };
@@ -175,24 +159,24 @@ void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
     switch (dir)
     {
         case GMX_FFT_REAL_TO_COMPLEX:
-            plan        = impl_->planR2C_;
+            plan        = planR2C_;
             direction   = CLFFT_FORWARD;
-            inputGrids  = &impl_->realGrid_;
-            outputGrids = &impl_->complexGrid_;
+            inputGrids  = &realGrid_;
+            outputGrids = &complexGrid_;
             break;
         case GMX_FFT_COMPLEX_TO_REAL:
-            plan        = impl_->planC2R_;
+            plan        = planC2R_;
             direction   = CLFFT_BACKWARD;
-            inputGrids  = &impl_->complexGrid_;
-            outputGrids = &impl_->realGrid_;
+            inputGrids  = &complexGrid_;
+            outputGrids = &realGrid_;
             break;
         default:
             GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
     }
     handleClfftError(clfftEnqueueTransform(plan,
                                            direction,
-                                           impl_->commandStreams_.size(),
-                                           impl_->commandStreams_.data(),
+                                           commandStreams_.size(),
+                                           commandStreams_.data(),
                                            waitEvents.size(),
                                            waitEvents.data(),
                                            timingEvent,
@@ -202,27 +186,4 @@ void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
                      "clFFT execution failure");
 }
 
-Gpu3dFft::Gpu3dFft(ivec                 realGridSize,
-                   ivec                 realGridSizePadded,
-                   ivec                 complexGridSizePadded,
-                   const bool           useDecomposition,
-                   const bool           performOutOfPlaceFFT,
-                   const DeviceContext& context,
-                   const DeviceStream&  pmeStream,
-                   DeviceBuffer<float>  realGrid,
-                   DeviceBuffer<float>  complexGrid) :
-    impl_(std::make_unique<Impl>(realGridSize,
-                                 realGridSizePadded,
-                                 complexGridSizePadded,
-                                 useDecomposition,
-                                 performOutOfPlaceFFT,
-                                 context,
-                                 pmeStream,
-                                 realGrid,
-                                 complexGrid))
-{
-}
-
-Gpu3dFft::~Gpu3dFft() = default;
-
 } // namespace gmx
diff --git a/src/gromacs/fft/gpu_3dfft_ocl.h b/src/gromacs/fft/gpu_3dfft_ocl.h
new file mode 100644 (file)
index 0000000..0ffb042
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the clFFT-based (OpenCL) implementation of the GPU 3D FFT routines.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_OCL_H
+#define GMX_FFT_GPU_3DFFT_OCL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/gmxmpi.h"
+
+#include <clFFT.h>
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using clFFT
+ *
+ * Concrete Gpu3dFft::Impl backend. The constructor asserts that
+ * \c allocateGrids is false and that both per-rank size arrays hold exactly
+ * one entry, i.e. the caller supplies the grids and no decomposition is used.
+ * The two clFFT plans are released with clfftDestroyPlan() in the destructor.
+ */
+class Gpu3dFft::ImplOcl : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplOcl(bool                 allocateGrids,
+            MPI_Comm             comm,
+            ArrayRef<const int>  gridSizesInXForEachRank,
+            ArrayRef<const int>  gridSizesInYForEachRank,
+            int                  nz,
+            bool                 performOutOfPlaceFFT,
+            const DeviceContext& context,
+            const DeviceStream&  pmeStream,
+            ivec                 realGridSize,
+            ivec                 realGridSizePadded,
+            ivec                 complexGridSizePadded,
+            DeviceBuffer<float>* realGrid,
+            DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplOcl() override;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    clfftPlanHandle               planR2C_; //!< clFFT plan used for GMX_FFT_REAL_TO_COMPLEX (CLFFT_FORWARD)
+    clfftPlanHandle               planC2R_; //!< clFFT plan used for GMX_FFT_COMPLEX_TO_REAL (CLFFT_BACKWARD)
+    std::vector<cl_command_queue> commandStreams_; //!< Queues handed to clfftEnqueueTransform(); the constructor adds the PME stream
+    cl_mem                        realGrid_; //!< Real grid buffer, copied from \c *realGrid at construction
+    cl_mem                        complexGrid_; //!< Complex grid buffer, copied from \c *complexGrid at construction
+};
+
+} // namespace gmx
+
+#endif
index 394aaac5a4e9acdd0d72b16c3144f5c833e1467e..ff2abfd48582b9648363bdc6c878b9e547365ce2 100644 (file)
@@ -43,8 +43,9 @@
 
 #include "gmxpre.h"
 
-#include "gpu_3dfft.h"
+#include "gpu_3dfft_sycl.h"
 
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/exceptions.h"
 
 namespace gmx
@@ -54,26 +55,26 @@ namespace gmx
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wmissing-noreturn"
 
-class Gpu3dFft::Impl
-{
-};
-
-Gpu3dFft::Gpu3dFft(ivec /*realGridSize*/,
-                   ivec /*realGridSizePadded*/,
-                   ivec /*complexGridSizePadded*/,
-                   const bool /*useDecomposition*/,
-                   const bool /*performOutOfPlaceFFT*/,
-                   const DeviceContext& /*context*/,
-                   const DeviceStream& /*pmeStream*/,
-                   DeviceBuffer<float> /*realGrid*/,
-                   DeviceBuffer<float> /*complexGrid*/)
+Gpu3dFft::ImplSycl::ImplSycl(bool /*allocateGrids*/,
+                             MPI_Comm /*comm*/,
+                             ArrayRef<const int> /*gridSizesInXForEachRank*/,
+                             ArrayRef<const int> /*gridSizesInYForEachRank*/,
+                             const int /*nz*/,
+                             bool /*performOutOfPlaceFFT*/,
+                             const DeviceContext& /*context*/,
+                             const DeviceStream& /*pmeStream*/,
+                             ivec /*realGridSize*/,
+                             ivec /*realGridSizePadded*/,
+                             ivec /*complexGridSizePadded*/,
+                             DeviceBuffer<float>* /*realGrid*/,
+                             DeviceBuffer<float>* /*complexGrid*/)
 {
     GMX_THROW(NotImplementedError("GPU 3DFFT is not implemented in SYCL"));
 }
 
-Gpu3dFft::~Gpu3dFft() = default;
+Gpu3dFft::ImplSycl::~ImplSycl() = default;
 
-void Gpu3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
+void Gpu3dFft::ImplSycl::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
 {
     GMX_THROW(NotImplementedError("Not implemented on SYCL yet"));
 }
diff --git a/src/gromacs/fft/gpu_3dfft_sycl.h b/src/gromacs/fft/gpu_3dfft_sycl.h
new file mode 100644 (file)
index 0000000..8bc398b
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the SYCL implementation (not yet functional) of the GPU 3D FFT routines.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_SYCL_H
+#define GMX_FFT_GPU_3DFFT_SYCL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using SYCL. Not yet implemented
+ *
+ * Placeholder Gpu3dFft::Impl backend: the constructor and perform3dFft()
+ * both throw NotImplementedError, and the class holds no data members.
+ */
+class Gpu3dFft::ImplSycl : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplSycl(bool                 allocateGrids,
+             MPI_Comm             comm,
+             ArrayRef<const int>  gridSizesInXForEachRank,
+             ArrayRef<const int>  gridSizesInYForEachRank,
+             int                  nz,
+             bool                 performOutOfPlaceFFT,
+             const DeviceContext& context,
+             const DeviceStream&  pmeStream,
+             ivec                 realGridSize,
+             ivec                 realGridSizePadded,
+             ivec                 complexGridSizePadded,
+             DeviceBuffer<float>* realGrid,
+             DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplSycl() override;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+};
+
+} // namespace gmx
+
+#endif
index 7d0c7186d4090dd6307f8ae87d29ecbd2ff9e4ee..ce3717e210ffdf72c29f90dd90ac50613801e087 100644 (file)
@@ -400,17 +400,31 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
         allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
 
-        const bool useDecomposition     = false;
-        const bool performOutOfPlaceFFT = true;
-        Gpu3dFft   gpu3dFft(realGridSize,
-                          realGridSizePadded,
-                          complexGridSizePadded,
-                          useDecomposition,
+#    if GMX_GPU_CUDA
+        const FftBackend backend = FftBackend::Cufft;
+#    elif GMX_GPU_OPENCL
+        const FftBackend backend = FftBackend::Ocl;
+#    endif
+        const bool         performOutOfPlaceFFT    = true;
+        const MPI_Comm     comm                    = MPI_COMM_NULL;
+        const bool         allocateGrid            = false;
+        std::array<int, 1> gridSizesInXForEachRank = { 0 };
+        std::array<int, 1> gridSizesInYForEachRank = { 0 };
+        const int          nz                      = realGridSize[ZZ];
+        Gpu3dFft           gpu3dFft(backend,
+                          allocateGrid,
+                          comm,
+                          gridSizesInXForEachRank,
+                          gridSizesInYForEachRank,
+                          nz,
                           performOutOfPlaceFFT,
                           deviceContext,
                           deviceStream,
-                          realGrid,
-                          complexGrid);
+                          realGridSize,
+                          realGridSizePadded,
+                          complexGridSizePadded,
+                          &realGrid,
+                          &complexGrid);
 
         // Transfer the real grid input data for the FFT
         copyToDeviceBuffer(
index 935a6d553add1c3cf2119ae6dfd08ac59cdc1368..2ca50c01860e6c78d80e17e1d6f879cd87e55cb7 100644 (file)
@@ -204,8 +204,8 @@ void printCopyright(gmx::TextWriter* writer)
     }
 }
 
-// Construct a string that describes the library that provides FFT support to this build
-const char* getFftDescriptionString()
+//! Construct a string that describes the library that provides CPU FFT support to this build
+const char* getCpuFftDescriptionString()
 {
 // Define the FFT description string
 #if GMX_FFT_FFTW3 || GMX_FFT_ARMPL_FFTW3
@@ -229,6 +229,35 @@ const char* getFftDescriptionString()
 #endif
 };
 
+/*! \brief Construct a string that describes the library that provides GPU FFT support to this build
+ *
+ * \returns "cuFFT" for CUDA builds, "clFFT" for OpenCL builds, "unknown" for
+ *          SYCL builds and "none" when GMX_GPU is false. The result always
+ *          points to a string literal.
+ */
+const char* getGpuFftDescriptionString()
+{
+    // The build-configuration macros are tested with a plain `if` rather than
+    // `#if`, so every branch is still seen by the compiler in every build.
+    if (GMX_GPU)
+    {
+        if (GMX_GPU_CUDA)
+        {
+            return "cuFFT";
+        }
+        if (GMX_GPU_OPENCL)
+        {
+            return "clFFT";
+        }
+        if (GMX_GPU_SYCL)
+        {
+            return "unknown";
+        }
+        // GMX_GPU is set but none of the known GPU flavours is.
+        GMX_RELEASE_ASSERT(false, "Unknown GPU configuration");
+        return "impossible";
+    }
+    return "none";
+}
+
 void gmx_print_version_info(gmx::TextWriter* writer)
 {
     writer->writeLine(formatString("GROMACS version:    %s", gmx_version()));
@@ -309,7 +338,8 @@ void gmx_print_version_info(gmx::TextWriter* writer)
 #endif
     writer->writeLine(formatString("GPU support:        %s", getGpuImplementationString()));
     writer->writeLine(formatString("SIMD instructions:  %s", GMX_SIMD_STRING));
-    writer->writeLine(formatString("FFT library:        %s", getFftDescriptionString()));
+    writer->writeLine(formatString("CPU FFT library:    %s", getCpuFftDescriptionString()));
+    writer->writeLine(formatString("GPU FFT library:    %s", getGpuFftDescriptionString()));
 #if GMX_TARGET_X86
     writer->writeLine(formatString("RDTSCP usage:       %s", GMX_USE_RDTSCP ? "enabled" : "disabled"));
 #endif