From 2b684af003232984fa5c59f9b94b8bc9f48c188d Mon Sep 17 00:00:00 2001
From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Thu, 30 Sep 2021 12:42:30 +0000
Subject: [PATCH] Add hipSYCL support to GPU 3DFFT

Made the testing of the complex-to-real transform less likely to pass
when there's a problem.

Refs #3965
---
 src/gromacs/ewald/pme_gpu_internal.cpp    |   4 +
 src/gromacs/fft/CMakeLists.txt            |  11 +-
 src/gromacs/fft/gpu_3dfft.cpp             |  20 +
 src/gromacs/fft/gpu_3dfft.h               |   9 +-
 src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp | 457 ++++++++++++++++++++++
 src/gromacs/fft/gpu_3dfft_sycl_rocfft.h   |  96 +++++
 src/gromacs/fft/tests/fft.cpp             |  28 +-
 7 files changed, 613 insertions(+), 12 deletions(-)
 create mode 100644 src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp
 create mode 100644 src/gromacs/fft/gpu_3dfft_sycl_rocfft.h

diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 185fea7de3..fa750002f5 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -616,7 +616,11 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
 #elif GMX_GPU_OPENCL
         const gmx::FftBackend backend = gmx::FftBackend::Ocl;
 #elif GMX_GPU_SYCL
+#    if GMX_SYCL_HIPSYCL
+        const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft;
+#    else
         const gmx::FftBackend backend = gmx::FftBackend::Sycl;
+#    endif
 #else
         GMX_RELEASE_ASSERT(false, "Unknown GPU backend");
         const gmx::FftBackend backend = gmx::FftBackend::Count;
diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt
index 060f9b8d6b..13571c1564 100644
--- a/src/gromacs/fft/CMakeLists.txt
+++ b/src/gromacs/fft/CMakeLists.txt
@@ -74,14 +74,17 @@ elseif (GMX_GPU_OPENCL)
         gpu_3dfft_ocl.cpp
         )
 elseif (GMX_GPU_SYCL)
-    # SYCL-TODO: proper implementation
+    if (GMX_SYCL_HIPSYCL)
+        set(3dfft_sycl_source gpu_3dfft_sycl_rocfft.cpp)
+    else()
+        set(3dfft_sycl_source gpu_3dfft.cpp gpu_3dfft_sycl.cpp)
+    endif()
     gmx_add_libgromacs_sources(
         # SYCL-specific sources
-        gpu_3dfft_sycl.cpp
+        ${3dfft_sycl_source}
         )
     _gmx_add_files_to_property(SYCL_SOURCES
-        gpu_3dfft.cpp
-        gpu_3dfft_sycl.cpp
+        ${3dfft_sycl_source}
         )
 endif()
 
diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp
index 9b931cfb2f..17ffe13f68 100644
--- a/src/gromacs/fft/gpu_3dfft.cpp
+++ b/src/gromacs/fft/gpu_3dfft.cpp
@@ -52,6 +52,9 @@
 #    include "gpu_3dfft_ocl.h"
 #elif GMX_GPU_SYCL
 #    include "gpu_3dfft_sycl.h"
+#    if GMX_SYCL_HIPSYCL
+#        include "gpu_3dfft_sycl_rocfft.h"
+#    endif
 #endif
 
 #if Heffte_FOUND
@@ -132,6 +135,23 @@ Gpu3dFft::Gpu3dFft(FftBackend           backend,
 #    elif GMX_GPU_SYCL
     switch (backend)
     {
+#        if GMX_SYCL_HIPSYCL
+        case FftBackend::SyclRocfft:
+            impl_ = std::make_unique<Gpu3dFft::ImplSyclRocfft>(allocateGrids,
+                                                               comm,
+                                                               gridSizesInXForEachRank,
+                                                               gridSizesInYForEachRank,
+                                                               nz,
+                                                               performOutOfPlaceFFT,
+                                                               context,
+                                                               pmeStream,
+                                                               realGridSize,
+                                                               realGridSizePadded,
+                                                               complexGridSizePadded,
+                                                               realGrid,
+                                                               complexGrid);
+            break;
+#        endif
         case FftBackend::Sycl:
             impl_ = std::make_unique<Gpu3dFft::ImplSycl>(allocateGrids,
                                                          comm,
diff --git a/src/gromacs/fft/gpu_3dfft.h b/src/gromacs/fft/gpu_3dfft.h
index a26dc8af3f..577108a154 100644
--- a/src/gromacs/fft/gpu_3dfft.h
+++ b/src/gromacs/fft/gpu_3dfft.h
@@ -67,10 +67,12 @@ class ArrayRef;
  */
 enum class FftBackend
 {
-    Cufft, // supports only single-GPU
-    Ocl,   // supports only single-GPU
-    Sycl,  // Not supported currently
+    Cufft,      // supports only single-GPU
+    Ocl,        // supports only single-GPU
     HeFFTe_CUDA,
+    SyclMkl,    // supports only single-GPU
+    SyclRocfft, // supports only single-GPU
+    Sycl,       // stubs for not supported configurations
     Count
 };
 
@@ -126,6 +128,7 @@ private:
     class Impl;
     class ImplCuFft;
     class ImplOcl;
+    class ImplSyclRocfft;
     class ImplSycl;
 
     template<typename backend_tag>
diff --git a/src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp
new file mode 100644
index 0000000000..8e59c7fe11
--- /dev/null
+++ b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp
@@ -0,0 +1,457 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements GPU 3D FFT routines for hipSYCL via rocFFT.
+ *
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ * For hipSYCL, in order to call FFT APIs from the respective vendors
+ * using the same DeviceStream as other operations, a vendor extension
+ * called "custom operations" is used (see hipSYCL
+ * doc/enqueue-custom-operation.md). That effectively enqueues an
+ * asynchronous host-side lambda into the same queue. The body of the
+ * lambda unpacks the runtime data structures to get the native
+ * handles and calls the native FFT APIs.
+ *
+ * For a 3D FFT, rocFFT requires a working buffer which it allocates
+ * itself if not provided. This might be slow enough to be worth
+ * optimizing. This working buffer could be provided in advance by
+ * calling rocfft_plan_get_work_buffer_size, allocating a buffer that
+ * persists suitably, and then using
+ * rocfft_execution_info_set_work_buffer in a custom operation.
+ *
+ * hipSYCL queues operate at a higher level of abstraction than hip
+ * streams, with the runtime distributing work to the latter to
+ * balance load. It is possible to set the HIP stream in
+ * rocfft_execution_info, but then there is no guarantee that a
+ * subsequent queue item will run using the same stream. So we
+ * currently do not attempt to set the stream.
+ *
+ *  \ingroup module_fft
+ */
+
+#include "gmxpre.h"
+
+#include "gpu_3dfft_sycl_rocfft.h"
+
+#include "gromacs/utility/enumerationhelpers.h"
+#include "gromacs/utility/exceptions.h"
+
+#include <vector>
+
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+#ifndef __HIPSYCL__
+#    error This file can only be compiled with hipSYCL enabled
+#endif
+
+#if !defined HIPSYCL_PLATFORM_ROCM
+#    error Only ROCM platform is supported for 3D FFT with hipSYCL
+#endif
+
+#include "rocfft.h"
+
+namespace gmx
+{
+
+namespace
+{
+
+//! Model the kinds of 3D FFT implemented
+enum class FftDirection : int
+{
+    RealToComplex,
+    ComplexToReal,
+    Count,
+};
+
+//! Strings that match enum rocfft_status_e in rocfft.h
+const std::array<const char*, rocfft_status_invalid_work_buffer + 1> c_rocfftErrorStrings = {
+    "success",
+    "failure",
+    "invalid argument value",
+    "invalid dimensions",
+    "invalid array type",
+    "invalid strides",
+    "invalid distance",
+    "invalid offset",
+    "invalid work buffer"
+};
+
+//! Helper for consistent error handling
+void handleFftError(rocfft_status result, const std::string& msg)
+{
+    if (result != rocfft_status_success)
+    {
+        if (result <= rocfft_status_invalid_work_buffer)
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString(
+                    "%s: (error code %d - %s)\n", msg.c_str(), result, c_rocfftErrorStrings[result])));
+        }
+        else
+        {
+            GMX_THROW(gmx::InternalError(gmx::formatString("%s: (error code %d)\n", msg.c_str(), result)));
+        }
+    }
+}
+
+//! Helper for consistent error handling
+void handleFftError(rocfft_status result, const std::string& direction, const std::string& msg)
+{
+    if (result != rocfft_status_success)
+    {
+        handleFftError(result, msg + " doing " + direction);
+    }
+}
+
+//! Provides RAII-style initialization of rocFFT library
+class RocfftInitializer
+{
+public:
+    RocfftInitializer()
+    {
+        rocfft_status result;
+        result = rocfft_setup();
+        handleFftError(result, "rocfft_setup failure");
+    }
+    ~RocfftInitializer()
+    {
+        // No need to handle any errors in a destructor, and
+        // anyway one cannot throw.
+        rocfft_cleanup();
+    }
+};
+
+//! All the persistent data for planning an executing a 3D FFT
+struct RocfftPlan
+{
+    //! Describes details of the data layout
+    rocfft_plan_description description = nullptr;
+    //! High level information about the plan
+    rocfft_plan plan = nullptr;
+    //! Destructor
+    ~RocfftPlan()
+    {
+        // No need to handle any errors in a destructor,
+        // and anyway one cannot throw.
+        if (plan)
+        {
+            rocfft_plan_destroy(plan);
+        }
+        if (description)
+        {
+            rocfft_plan_description_destroy(description);
+        }
+    }
+};
+
+//! Helper struct to reduce repetitive code setting up a 3D FFT plan
+struct PlanSetupData
+{
+    //! Format of the input array (real or hermitian)
+    rocfft_array_type arrayType;
+    //! Strides through the input array for the three dimensions
+    std::array<size_t, DIM> strides;
+    //! Total size of the input array (including padding)
+    size_t totalSize;
+};
+
+//! Compute the stride through the real 1D array
+std::array<size_t, DIM> makeRealStrides(ivec realGridSizePadded)
+{
+    return { 1, size_t(realGridSizePadded[ZZ]), size_t(realGridSizePadded[ZZ] * realGridSizePadded[YY]) };
+};
+
+//! Compute the stride through the complex 1D array
+std::array<size_t, DIM> makeComplexStrides(ivec complexGridSizePadded)
+{
+    return { 1,
+             size_t(complexGridSizePadded[XX]),
+             size_t(complexGridSizePadded[XX] * complexGridSizePadded[YY]) };
+}
+
+//! Compute total grid size
+size_t computeTotalSize(ivec gridSize)
+{
+    return size_t(gridSize[XX] * gridSize[YY] * gridSize[ZZ]);
+}
+
+/*! \brief Prepare plans for the forward and reverse transformation.
+ *
+ * Because these require device-side allocations, some of them must be
+ * done in a SYCL queue. */
+RocfftPlan makePlan(const std::string&     descriptiveString,
+                    rocfft_transform_type  transformType,
+                    const PlanSetupData&   inputPlanSetupData,
+                    const PlanSetupData&   outputPlanSetupData,
+                    ArrayRef<const size_t> rocfftRealGridSize,
+                    const DeviceStream&    pmeStream)
+{
+    rocfft_plan_description description = nullptr;
+    rocfft_status           result;
+    result = rocfft_plan_description_create(&description);
+    handleFftError(result, descriptiveString, "rocfft_plan_description_create failure");
+    result = rocfft_plan_description_set_data_layout(description,
+                                                     inputPlanSetupData.arrayType,
+                                                     outputPlanSetupData.arrayType,
+                                                     // No offsets are needed
+                                                     nullptr,
+                                                     nullptr,
+                                                     inputPlanSetupData.strides.size(),
+                                                     inputPlanSetupData.strides.data(),
+                                                     inputPlanSetupData.totalSize,
+                                                     outputPlanSetupData.strides.size(),
+                                                     outputPlanSetupData.strides.data(),
+                                                     outputPlanSetupData.totalSize);
+    handleFftError(result, descriptiveString, "rocfft_plan_description_set_data_layout failure");
+
+    // The plan creation depends on the identity of the GPU device, so
+    // we make sure it is made in the same queue where it will be
+    // used. The stream for execution can be set at the same time.
+
+    // First set up device buffers to receive the rocfft status values
+    rocfft_plan                        plan = nullptr;
+    cl::sycl::buffer<rocfft_status, 1> resultPlanCreate(1);
+
+    // Submit the planning to the queue. This is necessary so that we
+    // can ensure that the allocations in the planning go to the right
+    // context.
+    {
+        auto queue = pmeStream.stream();
+        // Make a buffer that is a view of the existing memory for a
+        // plan.
+        cl::sycl::buffer<rocfft_plan, 1> planView =
+                cl::sycl::make_async_writeback_view(&plan, cl::sycl::range(1), queue);
+        queue.submit([&](cl::sycl::handler& cgh) {
+            // Make the necessary accessors
+            auto a_plan = planView.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
+            auto a_resultPlanCreate =
+                    resultPlanCreate.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
+            cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& /*h*/) {
+                const int numBatches = 1;
+                // Unlike some other FFT APIs, in rocFFT the
+                // dimension of an FFT is the (vectorial) size
+                // of the problem (ie. rocfftRealGridSize), not
+                // the size of the input vector (which varies
+                // according to whether the input format is real
+                // or hermitian respectively for forward or
+                // reverse transforms).
+                a_resultPlanCreate[0] = rocfft_plan_create(&a_plan[0],
+                                                           rocfft_placement_notinplace,
+                                                           transformType,
+                                                           rocfft_precision_single,
+                                                           rocfftRealGridSize.size(),
+                                                           rocfftRealGridSize.data(),
+                                                           numBatches,
+                                                           description);
+            });
+        });
+    }
+    // Check for errors that happened while running the hipSYCL custom
+    // operation.
+    handleFftError(
+            resultPlanCreate.get_host_access()[0], descriptiveString, "rocfft_plan_create failure");
+
+    return RocfftPlan{ description, plan };
+}
+
+} // namespace
+
+//! Impl class
+class Gpu3dFft::ImplSyclRocfft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    Impl(bool                 allocateGrids,
+         MPI_Comm             comm,
+         ArrayRef<const int>  gridSizesInXForEachRank,
+         ArrayRef<const int>  gridSizesInYForEachRank,
+         const int            nz,
+         bool                 performOutOfPlaceFFT,
+         const DeviceContext& context,
+         const DeviceStream&  pmeStream,
+         ivec                 realGridSize,
+         ivec                 realGridSizePadded,
+         ivec                 complexGridSizePadded,
+         DeviceBuffer<float>* realGrid,
+         DeviceBuffer<float>* complexGrid);
+    /*! \brief Handle initializing the rocFFT library
+     *
+     * Make sure the library is initialized before the plans, etc. and
+     * not destructed before they are. */
+    RocfftInitializer init_;
+    //! Data for 3D FFT plans and execution
+    EnumerationArray<FftDirection, RocfftPlan> plans_;
+    //! Handle to the real grid buffer
+    cl::sycl::buffer<float, 1> realGrid_;
+    //! Handle to the complex grid buffer
+    cl::sycl::buffer<float, 1> complexGrid_;
+    /*! \brief Copy of PME stream
+     *
+     * This copy is guaranteed by the SYCL standard to work as if
+     * it was the original. */
+    cl::sycl::queue queue_;
+};
+
+Gpu3dFft::ImplSyclRocfft::Impl::Impl(bool allocateGrids,
+                                     MPI_Comm /*comm*/,
+                                     ArrayRef<const int> gridSizesInXForEachRank,
+                                     ArrayRef<const int> gridSizesInYForEachRank,
+                                     int /*nz*/,
+                                     bool performOutOfPlaceFFT,
+                                     const DeviceContext& /*context*/,
+                                     const DeviceStream&  pmeStream,
+                                     ivec                 realGridSize,
+                                     ivec                 realGridSizePadded,
+                                     ivec                 complexGridSizePadded,
+                                     DeviceBuffer<float>* realGrid,
+                                     DeviceBuffer<float>* complexGrid) :
+    plans_{
+        makePlan("real-to-complex",
+                 rocfft_transform_type_real_forward,
+                 // input
+                 PlanSetupData{ rocfft_array_type_real,
+                                makeRealStrides(realGridSizePadded),
+                                computeTotalSize(realGridSizePadded) },
+                 // output
+                 PlanSetupData{ rocfft_array_type_hermitian_interleaved,
+                                makeComplexStrides(complexGridSizePadded),
+                                computeTotalSize(complexGridSizePadded) },
+                 // Note that rocFFT requires that we reverse the dimension order when planning
+                 std::vector<size_t>{ size_t(realGridSize[ZZ]),
+                                      size_t(realGridSize[YY]),
+                                      size_t(realGridSize[XX]) },
+                 pmeStream),
+        // For rocFFT, the complex-to-real setup is the logical
+        // converse of the real-to-complex. The PlanSetupData objects
+        // are the same, but used in the opposite sense of
+        // input/output.
+        makePlan("complex-to-real",
+                 rocfft_transform_type_real_inverse,
+                 // input
+                 PlanSetupData{ rocfft_array_type_hermitian_interleaved,
+                                makeComplexStrides(complexGridSizePadded),
+                                computeTotalSize(complexGridSizePadded) },
+                 // output
+                 PlanSetupData{ rocfft_array_type_real,
+                                makeRealStrides(realGridSizePadded),
+                                computeTotalSize(realGridSizePadded) },
+                 // Note that rocFFT requires that we reverse the dimension order when planning
+                 std::vector<size_t>{ size_t(realGridSize[ZZ]),
+                                      size_t(realGridSize[YY]),
+                                      size_t(realGridSize[XX]) },
+                 pmeStream),
+    },
+    realGrid_(*realGrid->buffer_.get()),
+    complexGrid_(*complexGrid->buffer_.get()),
+    queue_(pmeStream.stream())
+{
+    GMX_RELEASE_ASSERT(performOutOfPlaceFFT, "Only out-of-place FFT is implemented in hipSYCL");
+    GMX_RELEASE_ASSERT(allocateGrids == false, "Grids need to be pre-allocated");
+    GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+                       "FFT decomposition not implemented with SYCL backend");
+}
+
+void Gpu3dFft::ImplSyclRocfft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+{
+    GMX_RELEASE_ASSERT((dir == GMX_FFT_REAL_TO_COMPLEX) || (dir == GMX_FFT_COMPLEX_TO_REAL),
+                       "Only real-to-complex and complex-to-real FFTs are implemented in hipSYCL");
+    FftDirection               direction;
+    cl::sycl::buffer<float, 1>*inputGrid = nullptr, *outputGrid = nullptr;
+    if (dir == GMX_FFT_REAL_TO_COMPLEX)
+    {
+        direction  = FftDirection::RealToComplex;
+        inputGrid  = &impl_->realGrid_;
+        outputGrid = &impl_->complexGrid_;
+    }
+    else
+    {
+        direction  = FftDirection::ComplexToReal;
+        inputGrid  = &impl_->complexGrid_;
+        outputGrid = &impl_->realGrid_;
+    }
+    // Enqueue the 3D FFT work
+    impl_->queue_.submit([&](cl::sycl::handler& cgh) {
+        auto inputGridAccessor = inputGrid->get_access(cgh, cl::sycl::read_only, cl::sycl::no_init);
+        auto outputGridAccessor = outputGrid->get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
+        // Use a hipSYCL custom operation to access the native buffers
+        // needed to call rocFFT
+        cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& h) {
+            void* d_inputGrid  = h.get_native_mem<cl::sycl::backend::hip>(inputGridAccessor);
+            void* d_outputGrid = h.get_native_mem<cl::sycl::backend::hip>(outputGridAccessor);
+            // Don't check results generated asynchronously,
+            // because we don't know what to do with them
+            rocfft_execute(impl_->plans_[direction].plan, &d_inputGrid, &d_outputGrid, nullptr);
+        });
+    });
+}
+
+Gpu3dFft::ImplSyclRocfft::ImplSyclRocfft(bool                 allocateGrids,
+                                         MPI_Comm             comm,
+                                         ArrayRef<const int>  gridSizesInXForEachRank,
+                                         ArrayRef<const int>  gridSizesInYForEachRank,
+                                         const int            nz,
+                                         bool                 performOutOfPlaceFFT,
+                                         const DeviceContext& context,
+                                         const DeviceStream&  pmeStream,
+                                         ivec                 realGridSize,
+                                         ivec                 realGridSizePadded,
+                                         ivec                 complexGridSizePadded,
+                                         DeviceBuffer<float>* realGrid,
+                                         DeviceBuffer<float>* complexGrid) :
+    impl_(std::make_unique<Impl>(allocateGrids,
+                                 comm,
+                                 gridSizesInXForEachRank,
+                                 gridSizesInYForEachRank,
+                                 nz,
+                                 performOutOfPlaceFFT,
+                                 context,
+                                 pmeStream,
+                                 realGridSize,
+                                 realGridSizePadded,
+                                 complexGridSizePadded,
+                                 realGrid,
+                                 complexGrid))
+{
+}
+
+Gpu3dFft::ImplSyclRocfft::~ImplSyclRocfft() = default;
+
+} // namespace gmx
diff --git a/src/gromacs/fft/gpu_3dfft_sycl_rocfft.h b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.h
new file mode 100644
index 0000000000..9dc428c79f
--- /dev/null
+++ b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.h
@@ -0,0 +1,96 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the GPU 3D FFT routines for hipSYCL via rocFFT.
+ *
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_SYCL_ROCFFT_H
+#define GMX_FFT_GPU_3DFFT_SYCL_ROCFFT_H
+
+#include <memory>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gpu_3dfft_impl.h"
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief A 3D FFT wrapper class for performing R2C/C2R
+ * transforms using rocFFT for hipSYCL targetting ROCm devices.
+ */
+class Gpu3dFft::ImplSyclRocfft : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplSyclRocfft(bool                 allocateGrids,
+                   MPI_Comm             comm,
+                   ArrayRef<const int>  gridSizesInXForEachRank,
+                   ArrayRef<const int>  gridSizesInYForEachRank,
+                   int                  nz,
+                   bool                 performOutOfPlaceFFT,
+                   const DeviceContext& context,
+                   const DeviceStream&  pmeStream,
+                   ivec                 realGridSize,
+                   ivec                 realGridSizePadded,
+                   ivec                 complexGridSizePadded,
+                   DeviceBuffer<float>* realGrid,
+                   DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplSyclRocfft() override;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    class Impl;
+
+    std::unique_ptr<Impl> impl_;
+};
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/fft/tests/fft.cpp b/src/gromacs/fft/tests/fft.cpp
index c421abcb4b..dfe7189795 100644
--- a/src/gromacs/fft/tests/fft.cpp
+++ b/src/gromacs/fft/tests/fft.cpp
@@ -81,7 +81,7 @@ namespace test
  * initializers, and we would not have to do so much useless copying
  * during the unit tests below.
  */
-const double inputdata[] = {
+const double inputdata[500] = {
     // print ",\n".join([",".join(["%4s"%(random.randint(-99,99)/10.,) for i in range(25)]) for j in range(20)])
     -3.5, 6.3,  1.2,  0.3,  1.1,  -5.7, 5.8,  -1.9, -6.3, -1.4, 7.4,  2.4,  -9.9, -7.2, 5.4,  6.1,
     -1.9, -7.6, 1.4,  -3.5, 0.7,  5.6,  -4.2, -1.1, -4.4, -6.3, -7.2, 4.6,  -3.0, -0.9, 7.2,  2.5,
@@ -364,7 +364,7 @@ TEST_F(FFTTest3D, Real5_6_9)
     checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
 }
 
-#if GMX_GPU_CUDA || GMX_GPU_OPENCL
+#if GMX_GPU_CUDA || GMX_GPU_OPENCL || (GMX_GPU_SYCL && GMX_SYCL_HIPSYCL)
 TEST_F(FFTTest3D, GpuReal5_6_9)
 {
     // Ensure library resources are managed appropriately
@@ -387,6 +387,8 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
 
         int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
         int sizeInReals = size * 2;
+        GMX_RELEASE_ASSERT(sizeof(inputdata) / sizeof(inputdata[0]) >= size_t(sizeInReals),
+                           "Size of inputdata is too small");
 
         // Set up the complex grid. Complex numbers take twice the
         // memory.
@@ -395,7 +397,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         // Use std::copy to convert from double to real easily
         std::copy(inputdata, inputdata + sizeInReals, in_.begin());
 
-        // Allocate the device buffers
+        SCOPED_TRACE("Allocating the device buffers");
         DeviceBuffer<float> realGrid, complexGrid;
         allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
         allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
@@ -404,6 +406,10 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         const FftBackend backend = FftBackend::Cufft;
 #    elif GMX_GPU_OPENCL
         const FftBackend backend = FftBackend::Ocl;
+#    elif GMX_GPU_SYCL
+#        if GMX_SYCL_HIPSYCL
+        const FftBackend backend = FftBackend::SyclRocfft;
+#        endif
 #    endif
         const bool         performOutOfPlaceFFT    = true;
         MPI_Comm           comm                    = MPI_COMM_NULL;
@@ -446,12 +452,23 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         checker.checkSequence(
                 complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
 
-        // Do the back transform
+        // Clear the real grid input data for the FFT so we can
+        // compute the back transform into it and observe that it did
+        // the work expected.
+        std::vector<float> outputRealGridValues(in_.size());
+        copyToDeviceBuffer(&realGrid,
+                           outputRealGridValues.data(),
+                           0,
+                           outputRealGridValues.size(),
+                           deviceStream,
+                           GpuApiCallBehavior::Sync,
+                           nullptr);
+
+        SCOPED_TRACE("Doing the back transform");
         gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
         deviceStream.synchronize();
 
         // Transfer the real grid back from the device
-        std::vector<float> outputRealGridValues(in_.size());
         copyFromDeviceBuffer(outputRealGridValues.data(),
                              &realGrid,
                              0,
@@ -462,6 +479,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
 
         checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
 
+        SCOPED_TRACE("Cleaning up");
         freeDeviceBuffer(&realGrid);
         freeDeviceBuffer(&complexGrid);
     }
-- 
2.22.0