SYCL: 3D FFT using oneMKL
authorAndrey Alekseenko <al42and@gmail.com>
Thu, 30 Sep 2021 18:27:37 +0000 (20:27 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Fri, 1 Oct 2021 09:49:24 +0000 (09:49 +0000)
Requires Intel oneAPI binary installation (the open-source version of
oneMKL does not support FFT yet).

Only in-place transforms are supported, du to a bug in MKL up to, at
least, 2021.4.0.

Also removes errorneous buffer clearing on "in-place" codepath in FFT tests.

Refs #3927.

cmake/gmxManageSYCL.cmake
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/fft/CMakeLists.txt
src/gromacs/fft/gpu_3dfft.cpp
src/gromacs/fft/gpu_3dfft.h
src/gromacs/fft/gpu_3dfft_sycl.cpp
src/gromacs/fft/gpu_3dfft_sycl_mkl.cpp [new file with mode: 0644]
src/gromacs/fft/gpu_3dfft_sycl_mkl.h [new file with mode: 0644]
src/gromacs/fft/tests/fft.cpp

index 3ca117d49112d498d3023ab1f936e0e9268d470b..79da7359be7b91e02725ec09fee694a82f49d442 100644 (file)
@@ -290,6 +290,11 @@ else()
         message(FATAL_ERROR "Cannot compile with SYCL Intel compiler. Try a different compiler or disable SYCL.")
     endif()
 
+    include(gmxManageFFTLibraries)
+    if(NOT GMX_FFT_MKL)
+        message(WARNING "Building SYCL version with ${GMX_FFT_LIBRARY} instead of MKL. GPU FFT is disabled!")
+    endif()
+
     # Add function wrapper similar to the one used by ComputeCPP and hipSYCL
     function(add_sycl_to_target)
         cmake_parse_arguments(
index fa750002f5766d2830c87c4c419efdc5c22aff60..96dc1f4db95a1b1953fca13a89f6c664a50e5e54 100644 (file)
@@ -616,7 +616,9 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
 #elif GMX_GPU_OPENCL
         const gmx::FftBackend backend = gmx::FftBackend::Ocl;
 #elif GMX_GPU_SYCL
-#    if GMX_SYCL_HIPSYCL
+#    if GMX_SYCL_DPCPP && GMX_FFT_MKL
+        const gmx::FftBackend backend = gmx::FftBackend::SyclMkl;
+#    elif GMX_SYCL_HIPSYCL
         const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft;
 #    else
         const gmx::FftBackend backend = gmx::FftBackend::Sycl;
index 13571c1564bf416487451dd268f0616a5bbc02c6..6ad355884042825c3bf52d3dd31fe395149ebc3d 100644 (file)
@@ -74,17 +74,33 @@ elseif (GMX_GPU_OPENCL)
         gpu_3dfft_ocl.cpp
         )
 elseif (GMX_GPU_SYCL)
-    if (GMX_SYCL_HIPSYCL)
-        set(3dfft_sycl_source gpu_3dfft_sycl_rocfft.cpp)
-    else()
-        set(3dfft_sycl_source gpu_3dfft.cpp gpu_3dfft_sycl.cpp)
+    if (NOT GMX_GPU_HIPSYCL AND GMX_FFT_MKL)
+            gmx_add_libgromacs_sources(
+            gpu_3dfft_sycl_mkl.cpp
+            )
+        _gmx_add_files_to_property(SYCL_SOURCES
+            gpu_3dfft_sycl_mkl.cpp
+            )
+    endif()
+    if (GMX_GPU_HIPSYCL)
+            gmx_add_libgromacs_sources(
+            gpu_3dfft_sycl_rocfft.cpp
+            )
+        _gmx_add_files_to_property(SYCL_SOURCES
+            gpu_3dfft_sycl_rocfft.cpp
+            )
     endif()
     gmx_add_libgromacs_sources(
-        # SYCL-specific sources
-        ${3dfft_sycl_source}
+        gpu_3dfft_sycl.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
-        ${3dfft_sycl_source}
+        gpu_3dfft_sycl.cpp
+        gpu_3dfft.cpp
+        )
+else()
+    gmx_add_libgromacs_sources(
+        # Stub sources for CPU-only build
+        gpu_3dfft.cpp
         )
 endif()
 
index 17ffe13f682e12354a74091320886fc97a48d41c..7dd21185e720d37356f876d62fa0770213041106 100644 (file)
@@ -52,6 +52,9 @@
 #    include "gpu_3dfft_ocl.h"
 #elif GMX_GPU_SYCL
 #    include "gpu_3dfft_sycl.h"
+#    if GMX_SYCL_DPCPP && GMX_FFT_MKL
+#        include "gpu_3dfft_sycl_mkl.h"
+#    endif
 #    if GMX_SYCL_HIPSYCL
 #        include "gpu_3dfft_sycl_rocfft.h"
 #    endif
@@ -135,6 +138,23 @@ Gpu3dFft::Gpu3dFft(FftBackend           backend,
 #    elif GMX_GPU_SYCL
     switch (backend)
     {
+#        if GMX_SYCL_DPCPP && GMX_FFT_MKL
+        case FftBackend::SyclMkl:
+            impl_ = std::make_unique<Gpu3dFft::ImplSyclMkl>(allocateGrids,
+                                                            comm,
+                                                            gridSizesInXForEachRank,
+                                                            gridSizesInYForEachRank,
+                                                            nz,
+                                                            performOutOfPlaceFFT,
+                                                            context,
+                                                            pmeStream,
+                                                            realGridSize,
+                                                            realGridSizePadded,
+                                                            complexGridSizePadded,
+                                                            realGrid,
+                                                            complexGrid);
+            break;
+#        endif
 #        if GMX_SYCL_HIPSYCL
         case FftBackend::SyclRocfft:
             impl_ = std::make_unique<Gpu3dFft::ImplSyclRocfft>(allocateGrids,
index 2643be275fea32d990ef8ddb1ef80c3e143198f0..729dac12e04d7000519637be425fde7c5c3acb93 100644 (file)
@@ -128,6 +128,7 @@ private:
     class Impl;
     class ImplCuFft;
     class ImplOcl;
+    class ImplSyclMkl;
     class ImplSyclRocfft;
     class ImplSycl;
 
index ff2abfd48582b9648363bdc6c878b9e547365ce2..a24c5bb718c5af70a0c180ee7f7c795063432669 100644 (file)
@@ -69,14 +69,14 @@ Gpu3dFft::ImplSycl::ImplSycl(bool /*allocateGrids*/,
                              DeviceBuffer<float>* /*realGrid*/,
                              DeviceBuffer<float>* /*complexGrid*/)
 {
-    GMX_THROW(NotImplementedError("GPU 3DFFT is not implemented in SYCL"));
+    GMX_THROW(NotImplementedError("Using SYCL build without GPU 3DFFT support"));
 }
 
 Gpu3dFft::ImplSycl::~ImplSycl() = default;
 
 void Gpu3dFft::ImplSycl::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
 {
-    GMX_THROW(NotImplementedError("Not implemented on SYCL yet"));
+    GMX_THROW(NotImplementedError("Using SYCL build without GPU 3DFFT support"));
 }
 
 #pragma clang diagnostic pop
diff --git a/src/gromacs/fft/gpu_3dfft_sycl_mkl.cpp b/src/gromacs/fft/gpu_3dfft_sycl_mkl.cpp
new file mode 100644 (file)
index 0000000..981f117
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Implements GPU 3D FFT routines for SYCL.
+ *
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \ingroup module_fft
+ *
+ *  In DPC++, we use Intel oneMKL to perform the FFT. It requires using the binary version of
+ *  MKL, since the open-source one does not support FFT yet (https://github.com/oneapi-src/oneMKL/issues/27).
+ *
+ *  There are issues with out-of-place transform, existing at least in oneAPI 2021.2-2021.4, so
+ *  we allow only in-place transforms.
+ */
+
+#include "gmxpre.h"
+
+#include "gpu_3dfft_sycl_mkl.h"
+
+#include "config.h"
+
+#include "gromacs/gpu_utils/devicebuffer_sycl.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+
+class DeviceContext;
+
+#if (!GMX_SYCL_DPCPP)
+#    error This file can only be compiled with Intel DPC++ compiler
+#endif
+
+#if (!GMX_FFT_MKL)
+#    error Must use MKL library for FFT when compiling with Intel DPC++ compiler
+#endif
+
+#include <cstddef>
+#include <oneapi/mkl/dfti.hpp>
+#include <oneapi/mkl/exceptions.hpp>
+#include <mkl_version.h>
+
+// oneAPI 2021.2.0 to 2021.4.0 have issues with backward out-of-place transform
+static constexpr bool sc_mklHasBuggyOutOfPlaceFFT = (INTEL_MKL_VERSION <= 20210004);
+
+namespace gmx
+{
+
+Gpu3dFft::ImplSyclMkl::Descriptor Gpu3dFft::ImplSyclMkl::initDescriptor(const ivec realGridSize)
+{
+    try
+    {
+        const std::vector<std::int64_t> realGridDimensions{ realGridSize[XX],
+                                                            realGridSize[YY],
+                                                            realGridSize[ZZ] };
+        return { realGridDimensions };
+    }
+    catch (oneapi::mkl::exception& exc)
+    {
+        GMX_THROW(InternalError(formatString("MKL failure while constructing descriptor: %s", exc.what())));
+    }
+}
+
+Gpu3dFft::ImplSyclMkl::ImplSyclMkl(bool allocateGrids,
+                                   MPI_Comm /*comm*/,
+                                   ArrayRef<const int> gridSizesInXForEachRank,
+                                   ArrayRef<const int> gridSizesInYForEachRank,
+                                   int /*nz*/,
+                                   const bool performOutOfPlaceFFT,
+                                   const DeviceContext& /*context*/,
+                                   const DeviceStream&  pmeStream,
+                                   ivec                 realGridSize,
+                                   ivec                 realGridSizePadded,
+                                   ivec                 complexGridSizePadded,
+                                   DeviceBuffer<float>* realGrid,
+                                   DeviceBuffer<float>* complexGrid) :
+    realGrid_(*realGrid->buffer_),
+    complexGrid_(*complexGrid->buffer_),
+    queue_(pmeStream.stream()),
+    r2cDescriptor_(initDescriptor(realGridSize)),
+    c2rDescriptor_(initDescriptor(realGridSize))
+{
+    GMX_RELEASE_ASSERT(!allocateGrids, "Grids needs to be pre-allocated");
+    GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+                       "Multi-rank FFT decomposition not implemented with SYCL MKL backend");
+
+    GMX_RELEASE_ASSERT(!(sc_mklHasBuggyOutOfPlaceFFT && performOutOfPlaceFFT),
+                       "The version of MKL used does not properly support out-of-place FFTs");
+
+    GMX_ASSERT(checkDeviceBuffer(*realGrid,
+                                 realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]),
+               "Real grid buffer is too small for the declared padded size");
+
+    GMX_ASSERT(checkDeviceBuffer(*complexGrid,
+                                 complexGridSizePadded[XX] * complexGridSizePadded[YY]
+                                         * complexGridSizePadded[ZZ] * 2),
+               "Complex grid buffer is too small for the declared padded size");
+
+    // MKL expects row-major
+    const std::array<MKL_LONG, 4> realGridStrides = {
+        0, static_cast<MKL_LONG>(realGridSizePadded[YY] * realGridSizePadded[ZZ]), realGridSizePadded[ZZ], 1
+    };
+    const std::array<MKL_LONG, 4> complexGridStrides = {
+        0,
+        static_cast<MKL_LONG>(complexGridSizePadded[YY] * complexGridSizePadded[ZZ]),
+        complexGridSizePadded[ZZ],
+        1
+    };
+
+    const auto placement = performOutOfPlaceFFT ? DFTI_NOT_INPLACE : DFTI_INPLACE;
+
+    try
+    {
+        using oneapi::mkl::dft::config_param;
+        r2cDescriptor_.set_value(config_param::INPUT_STRIDES, realGridStrides.data());
+        r2cDescriptor_.set_value(config_param::OUTPUT_STRIDES, complexGridStrides.data());
+        r2cDescriptor_.set_value(config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
+        r2cDescriptor_.set_value(config_param::PLACEMENT, placement);
+        r2cDescriptor_.commit(queue_);
+    }
+    catch (oneapi::mkl::exception& exc)
+    {
+        GMX_THROW(InternalError(
+                formatString("MKL failure while configuring R2C descriptor: %s", exc.what())));
+    }
+
+    try
+    {
+        using oneapi::mkl::dft::config_param;
+        c2rDescriptor_.set_value(config_param::INPUT_STRIDES, complexGridStrides.data());
+        c2rDescriptor_.set_value(config_param::OUTPUT_STRIDES, realGridStrides.data());
+        c2rDescriptor_.set_value(config_param::CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX);
+        c2rDescriptor_.set_value(config_param::PLACEMENT, placement);
+        c2rDescriptor_.commit(queue_);
+    }
+    catch (oneapi::mkl::exception& exc)
+    {
+        GMX_THROW(InternalError(
+                formatString("MKL failure while configuring C2R descriptor: %s", exc.what())));
+    }
+}
+
+Gpu3dFft::ImplSyclMkl::~ImplSyclMkl() = default;
+
+void Gpu3dFft::ImplSyclMkl::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+{
+    switch (dir)
+    {
+        case GMX_FFT_REAL_TO_COMPLEX:
+            try
+            {
+                oneapi::mkl::dft::compute_forward(r2cDescriptor_, realGrid_, complexGrid_);
+            }
+            catch (oneapi::mkl::exception& exc)
+            {
+                GMX_THROW(InternalError(
+                        formatString("MKL failure while executing R2C transform: %s", exc.what())));
+            }
+            break;
+        case GMX_FFT_COMPLEX_TO_REAL:
+            try
+            {
+                oneapi::mkl::dft::compute_backward(c2rDescriptor_, complexGrid_, realGrid_);
+            }
+            catch (oneapi::mkl::exception& exc)
+            {
+                GMX_THROW(InternalError(
+                        formatString("MKL failure while executing C2R transform: %s", exc.what())));
+            }
+            break;
+        default:
+            GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
+    }
+}
+
+} // namespace gmx
diff --git a/src/gromacs/fft/gpu_3dfft_sycl_mkl.h b/src/gromacs/fft/gpu_3dfft_sycl_mkl.h
new file mode 100644 (file)
index 0000000..08cd488
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief Declares the GPU 3D FFT routines.
+ *
+ *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Gaurav Garg <gaugarg@nvidia.com>
+ *  \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_SYCL_MKL_H
+#define GMX_FFT_GPU_3DFFT_SYCL_MKL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include <oneapi/mkl/dfti.hpp>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using SYCL + MKL.
+ */
+class Gpu3dFft::ImplSyclMkl : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplSyclMkl(bool                 allocateGrids,
+                MPI_Comm             comm,
+                ArrayRef<const int>  gridSizesInXForEachRank,
+                ArrayRef<const int>  gridSizesInYForEachRank,
+                int                  nz,
+                bool                 performOutOfPlaceFFT,
+                const DeviceContext& context,
+                const DeviceStream&  pmeStream,
+                ivec                 realGridSize,
+                ivec                 realGridSizePadded,
+                ivec                 complexGridSizePadded,
+                DeviceBuffer<float>* realGrid,
+                DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplSyclMkl() override;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    // Shorthand for the FFT descriptor (a.k.a. plan) type
+    using Descriptor =
+            oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>;
+
+    cl::sycl::buffer<float, 1> realGrid_;
+    cl::sycl::buffer<float, 1> complexGrid_;
+    cl::sycl::queue            queue_;
+    Descriptor                 r2cDescriptor_, c2rDescriptor_;
+
+    static Descriptor initDescriptor(const ivec realGridSize);
+};
+
+} // namespace gmx
+
+#endif
index dfe7189795e539d09d85a817a52459a81356d29d..e06d89e0d27794ba4a504a1f974eab331223ca0e 100644 (file)
@@ -364,7 +364,8 @@ TEST_F(FFTTest3D, Real5_6_9)
     checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
 }
 
-#if GMX_GPU_CUDA || GMX_GPU_OPENCL || (GMX_GPU_SYCL && GMX_SYCL_HIPSYCL)
+#if GMX_GPU_CUDA || GMX_GPU_OPENCL \
+        || (GMX_GPU_SYCL && (GMX_SYCL_HIPSYCL || (GMX_SYCL_DPCPP && GMX_FFT_MKL)))
 TEST_F(FFTTest3D, GpuReal5_6_9)
 {
     // Ensure library resources are managed appropriately
@@ -397,10 +398,16 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         // Use std::copy to convert from double to real easily
         std::copy(inputdata, inputdata + sizeInReals, in_.begin());
 
+        // DPCPP uses oneMKL, which seems to have troubles with out-of-place transforms
+        const bool performOutOfPlaceFFT = !GMX_SYCL_DPCPP;
+
         SCOPED_TRACE("Allocating the device buffers");
         DeviceBuffer<float> realGrid, complexGrid;
         allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
-        allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
+        if (performOutOfPlaceFFT)
+        {
+            allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
+        }
 
 #    if GMX_GPU_CUDA
         const FftBackend backend = FftBackend::Cufft;
@@ -409,9 +416,10 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
 #    elif GMX_GPU_SYCL
 #        if GMX_SYCL_HIPSYCL
         const FftBackend backend = FftBackend::SyclRocfft;
+#        elif GMX_SYCL_DPCPP && GMX_FFT_MKL
+        const FftBackend backend = FftBackend::SyclMkl;
 #        endif
 #    endif
-        const bool         performOutOfPlaceFFT    = true;
         MPI_Comm           comm                    = MPI_COMM_NULL;
         const bool         allocateGrid            = false;
         std::array<int, 1> gridSizesInXForEachRank = { 0 };
@@ -430,7 +438,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
                           realGridSizePadded,
                           complexGridSizePadded,
                           &realGrid,
-                          &complexGrid);
+                          performOutOfPlaceFFT ? &complexGrid : &realGrid);
 
         // Transfer the real grid input data for the FFT
         copyToDeviceBuffer(
@@ -443,7 +451,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
 
         // Check the complex grid (NB this data has not been normalized)
         copyFromDeviceBuffer(complexGridValues.data(),
-                             &complexGrid,
+                             performOutOfPlaceFFT ? &complexGrid : &realGrid,
                              0,
                              complexGridValues.size(),
                              deviceStream,
@@ -452,17 +460,20 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
         checker.checkSequence(
                 complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
 
-        // Clear the real grid input data for the FFT so we can
-        // compute the back transform into it and observe that it did
-        // the work expected.
         std::vector<float> outputRealGridValues(in_.size());
-        copyToDeviceBuffer(&realGrid,
-                           outputRealGridValues.data(),
-                           0,
-                           outputRealGridValues.size(),
-                           deviceStream,
-                           GpuApiCallBehavior::Sync,
-                           nullptr);
+        if (performOutOfPlaceFFT)
+        {
+            // Clear the real grid input data for the FFT so we can
+            // compute the back transform into it and observe that it did
+            // the work expected.
+            copyToDeviceBuffer(&realGrid,
+                               outputRealGridValues.data(),
+                               0,
+                               outputRealGridValues.size(),
+                               deviceStream,
+                               GpuApiCallBehavior::Sync,
+                               nullptr);
+        }
 
         SCOPED_TRACE("Doing the back transform");
         gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
@@ -481,7 +492,10 @@ TEST_F(FFTTest3D, GpuReal5_6_9)
 
         SCOPED_TRACE("Cleaning up");
         freeDeviceBuffer(&realGrid);
-        freeDeviceBuffer(&complexGrid);
+        if (performOutOfPlaceFFT)
+        {
+            freeDeviceBuffer(&complexGrid);
+        }
     }
 }