2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
5 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
38 * Tests utilities for fft calculations.
40 * Current reference data is generated in double precision using the Reference
41 * build type, except for the compiler (Apple Clang).
43 * \author Roland Schulz <roland@utk.edu>
48 #include "gromacs/fft/fft.h"
55 #include <gmock/gmock.h>
56 #include <gtest/gtest.h>
58 #include "gromacs/fft/gpu_3dfft.h"
59 #include "gromacs/fft/parallel_3dfft.h"
60 #include "gromacs/gpu_utils/clfftinitializer.h"
62 # include "gromacs/gpu_utils/devicebuffer.h"
64 #include "gromacs/utility/stringutil.h"
66 #include "testutils/refdata.h"
67 #include "testutils/test_hardware_environment.h"
68 #include "testutils/testasserts.h"
69 #include "testutils/testmatchers.h"
78 /*! \brief Input data for FFT tests.
80 * TODO If we require compilers that all support C++11 user-defined
81 * literals, then this array could be of type real, initialized with
82 * e.g. -3.5_r, which does not suffer from implicit narrowing with brace
83 * initializers, and we would not have to do so much useless copying
84 * during the unit tests below.
// Shared input values for the tests in this file. Each test copies a leading
// slice of this array into a std::vector<real> with std::copy, which also
// converts the values from double to real.
86 const double inputdata[500] = {
// The values were produced by the script below (note the Python 2 print
// statement): numbers with one decimal place drawn from [-9.9, 9.9].
87 // print ",\n".join([",".join(["%4s"%(random.randint(-99,99)/10.,) for i in range(25)]) for j in range(20)])
88 -3.5, 6.3, 1.2, 0.3, 1.1, -5.7, 5.8, -1.9, -6.3, -1.4, 7.4, 2.4, -9.9, -7.2, 5.4, 6.1,
89 -1.9, -7.6, 1.4, -3.5, 0.7, 5.6, -4.2, -1.1, -4.4, -6.3, -7.2, 4.6, -3.0, -0.9, 7.2, 2.5,
90 -3.6, 6.1, -3.2, -2.1, 6.5, -0.4, -9.0, 2.3, 8.4, 4.0, -5.2, -9.0, 4.7, -3.7, -2.0, -9.5,
91 -3.9, -3.6, 7.1, 0.8, -0.6, 5.2, -9.3, -4.5, 5.9, 2.2, -5.8, 5.0, 1.2, -0.1, 2.2, 0.2,
92 -7.7, 1.9, -8.4, 4.4, 2.3, -2.9, 6.7, 2.7, 5.8, -3.6, 8.9, 8.9, 4.3, 9.1, 9.3, -8.7,
93 4.1, 9.6, -6.2, 6.6, -9.3, 8.2, 4.5, 6.2, 9.4, -8.0, -6.8, -3.3, 7.2, 1.7, 0.6, -4.9,
94 9.8, 1.3, 3.2, -0.2, 9.9, 4.4, -9.9, -7.2, 4.4, 4.7, 7.2, -0.3, 0.3, -2.1, 8.4, -2.1,
95 -6.1, 4.1, -5.9, -2.2, -3.8, 5.2, -8.2, -7.8, -8.8, 6.7, -9.5, -4.2, 0.8, 8.3, 5.2, -9.0,
96 8.7, 9.8, -9.9, -7.8, -8.3, 9.0, -2.8, -9.2, -9.6, 8.4, 2.5, 6.0, -0.4, 1.3, -0.5, 9.1,
97 -9.5, -0.8, 1.9, -6.2, 4.3, -3.8, 8.6, -1.9, -2.1, -0.4, -7.1, -3.7, 9.1, -6.4, -0.6, 2.5,
98 8.0, -5.2, -9.8, -4.3, 4.5, 1.7, 9.3, 9.2, 1.0, 5.3, -4.5, 6.4, -6.6, 3.1, -6.8, 2.1,
99 2.0, 7.3, 8.6, 5.0, 5.2, 0.4, -7.1, 4.5, -9.2, -9.1, 0.2, -6.3, -1.1, -9.6, 7.4, -3.7,
100 -5.5, 2.6, -3.5, -0.7, 9.0, 9.8, -8.0, 3.6, 3.0, -2.2, -2.8, 0.8, 9.0, 2.8, 7.7, -0.7,
101 -5.0, -1.8, -2.3, -0.4, -6.2, -9.1, -9.2, 0.5, 5.7, -3.9, 2.1, 0.6, 0.4, 9.1, 7.4, 7.1,
102 -2.5, 7.3, 7.8, -4.3, 6.3, -0.8, -3.8, -1.5, 6.6, 2.3, 3.9, -4.6, 5.8, -7.4, 5.9, 2.8,
103 4.7, 3.9, -5.4, 9.1, -1.6, -1.9, -4.2, -2.6, 0.6, -5.1, 1.8, 5.2, 4.0, -6.2, 6.5, -9.1,
104 0.5, 2.1, 7.1, -8.6, 7.6, -9.7, -4.6, -5.7, 6.1, -1.8, -7.3, 9.4, 8.0, -2.6, -1.8, 5.7,
105 9.3, -7.9, 7.4, 6.3, 2.0, 9.6, -4.5, -6.2, 6.1, 2.3, 0.8, 5.9, -2.8, -3.5, -1.5, 6.0,
106 -4.9, 3.5, 7.7, -4.2, -9.7, 2.4, 8.1, 5.9, 3.4, -7.5, 7.5, 2.6, 4.7, 2.7, 2.2, 2.6,
107 6.2, 7.5, 0.2, -6.4, -2.8, -0.5, -0.3, 0.4, 1.2, 3.5, -4.0, -0.5, 9.3, -7.2, 8.5, -5.5,
108 -1.7, -5.3, 0.3, 3.9, -3.6, -3.6, 4.7, -8.1, 1.4, 4.0, 1.3, -4.3, -8.8, -7.3, 6.3, -7.5,
109 -9.0, 9.1, 4.5, -1.9, 1.9, 9.9, -1.7, -9.1, -5.1, 8.5, -9.3, 2.1, -5.8, -3.6, -0.8, -0.9,
110 -3.3, -2.7, 7.0, -7.2, -5.0, 7.4, -1.4, 0.0, -4.5, -9.7, 0.7, -1.0, -9.1, -5.3, 4.3, 3.4,
111 -6.6, 9.8, -1.1, 8.9, 5.0, 2.9, 0.2, -2.9, 0.8, 6.7, -0.6, 0.6, 4.1, 5.3, -1.7, -0.3,
112 4.2, 3.7, -8.3, 4.0, 1.3, 6.3, 0.2, 1.3, -1.1, -3.5, 2.8, -7.7, 6.2, -4.9, -9.9, 9.6,
113 3.0, -9.2, -8.0, -3.9, 7.9, -6.1, 6.0, 5.9, 9.6, 1.2, 6.2, 3.6, 2.1, 5.8, 9.2, -8.8,
114 8.8, -3.3, -9.2, 4.6, 1.8, 4.6, 2.9, -2.7, 4.2, 7.3, -0.4, 7.7, -7.0, 2.1, 0.3, 3.7,
115 3.3, -8.6, 9.8, 3.6, 3.1, 6.5, -2.4, 7.8, 7.5, 8.4, -2.8, -6.3, -5.1, -2.7, 9.3, -0.8,
116 -9.2, 7.9, 8.9, 3.4, 0.1, -5.3, -6.8, 4.9, 4.3, -0.7, -2.2, -3.2, -7.5, -2.3, 0.0, 8.1,
117 -9.2, -2.3, -5.7, 2.1, 2.6, 2.0, 0.3, -8.0, -2.0, -7.9, 6.6, 8.4, 4.0, -6.2, -6.9, -7.2,
118 7.7, -5.0, 5.3, 1.9, -5.3, -7.5, 8.8, 8.3, 9.0, 8.1, 3.2, 1.2, -5.4, -0.2, 2.1, -5.2,
// Base fixture shared by all FFT tests: owns the reference data, the real
// input/output buffers, the FFT flags and the default comparison tolerance.
123 class BaseFFTTest : public ::testing::Test
// All tests run their FFTs with GMX_FFT_FLAG_CONSERVATIVE.
126 BaseFFTTest() : flags_(GMX_FFT_FLAG_CONSERVATIVE) {}
// gmx_fft_cleanup() is called whenever a fixture instance is destroyed.
127 ~BaseFFTTest() override { gmx_fft_cleanup(); }
// Reference data from which the derived fixtures and tests obtain their checkers.
129 TestReferenceData data_;
// Input and output buffers; each test sizes and fills them itself.
130 std::vector<real> in_, out_;
132 // TODO: These tolerances are just something that has been observed
133 // to be sufficient to pass the tests. It would be nicer to
134 // actually argue about why they are sufficient (or what is).
135 // Should work for both one-way and forward+backward transform.
// NOTE(review): presumably the arguments are a magnitude of 10.0 and ULP
// counts of 64 (single precision) and 512 (double precision) -- confirm
// against the declaration in testutils/testasserts.h.
136 FloatingPointTolerance defaultTolerance_ = relativeToleranceAsPrecisionDependentUlp(10.0, 64, 512);
// Fixture for tests that use a single FFT handle (fft_) created by one of the
// gmx_fft_init_*() calls in the test body.
139 class FFTTest : public BaseFFTTest
// Starts without a handle and applies the base-class default tolerance to
// the root reference checker.
142 FFTTest() : fft_(nullptr) { checker_.setDefaultTolerance(defaultTolerance_); }
// Releases the handle (presumably from the destructor -- confirm).
147 gmx_fft_destroy(fft_);
// Root checker of the reference data held in data_.
150 TestReferenceChecker checker_ = data_.rootChecker();
// Fixture for the batched ("many") 1D transform tests; same layout as the
// single-transform fixture but the handle is released with
// gmx_many_fft_destroy().
154 class ManyFFTTest : public BaseFFTTest
// Starts without a handle and applies the base-class default tolerance to
// the root reference checker.
157 ManyFFTTest() : fft_(nullptr) { checker_.setDefaultTolerance(defaultTolerance_); }
158 ~ManyFFTTest() override
// Release the handle created by gmx_fft_init_many_1d*() in the test body.
162 gmx_many_fft_destroy(fft_);
// Root checker of the reference data held in data_.
165 TestReferenceChecker checker_ = data_.rootChecker();
170 // TODO: Add tests for aligned/not-aligned input/output memory
// Value-parameterized variant of FFTTest; the int parameter is used by the
// TEST_P bodies as the 1D transform length.
172 class FFTTest1D : public FFTTest, public ::testing::WithParamInterface<int>
// Fixture for the 3D transform tests. Unlike the 1D fixtures it has no
// checker member; each test creates its own checker from data_.
176 class FFTTest3D : public BaseFFTTest
// Starts without a parallel 3D FFT handle.
179 FFTTest3D() : fft_(nullptr) {}
180 ~FFTTest3D() override
// Release the handle set up by gmx_parallel_3dfft_init() in the test body.
184 gmx_parallel_3dfft_destroy(fft_);
// Handle for the parallel 3D FFT.
187 gmx_parallel_3dfft_t fft_;
// Complex 1D transform whose length is the test parameter. The forward and
// the backward output are each compared against reference data.
191 TEST_P(FFTTest1D, Complex)
193 const int nx = GetParam();
// 2 * nx values are read from inputdata below, so it must be large enough.
194 ASSERT_LE(nx * 2, static_cast<int>(sizeof(inputdata) / sizeof(inputdata[0])));
// std::copy converts the double input values to real.
196 in_ = std::vector<real>(nx * 2);
197 std::copy(inputdata, inputdata + nx * 2, in_.begin());
198 out_ = std::vector<real>(nx * 2);
200 real* out = &out_[0];
202 gmx_fft_init_1d(&fft_, nx, flags_);
// Both transforms are given the same input pointer: the backward transform
// is applied to the original input, not to the forward output, so this is
// not a round-trip check.
204 gmx_fft_1d(fft_, GMX_FFT_FORWARD, in, out);
205 checker_.checkSequenceArray(nx * 2, out, "forward");
206 gmx_fft_1d(fft_, GMX_FFT_BACKWARD, in, out);
207 checker_.checkSequenceArray(nx * 2, out, "backward");
// Real 1D transform whose real length is the test parameter: a
// real-to-complex and a complex-to-real transform, each compared against
// reference data.
210 TEST_P(FFTTest1D, Real)
212 const int rx = GetParam();
// Buffers are sized from cx = rx / 2 + 1 (presumably the number of complex
// values produced for rx real values -- confirm against the FFT API docs).
213 const int cx = (rx / 2 + 1);
// 2 * cx values are read from inputdata below, so it must be large enough.
214 ASSERT_LE(cx * 2, static_cast<int>(sizeof(inputdata) / sizeof(inputdata[0])));
// std::copy converts the double input values to real.
216 in_ = std::vector<real>(cx * 2);
217 std::copy(inputdata, inputdata + cx * 2, in_.begin());
218 out_ = std::vector<real>(cx * 2);
220 real* out = &out_[0];
222 gmx_fft_init_1d_real(&fft_, rx, flags_);
// Both transforms are given the same input pointer; the second one does not
// consume the output of the first.
224 gmx_fft_1d_real(fft_, GMX_FFT_REAL_TO_COMPLEX, in, out);
225 checker_.checkSequenceArray(cx * 2, out, "forward");
226 gmx_fft_1d_real(fft_, GMX_FFT_COMPLEX_TO_REAL, in, out);
// Only the first rx output values are compared for the complex-to-real case.
227 checker_.checkSequenceArray(rx, out, "backward");
// Run every FFTTest1D test with the transform lengths 7, 8, 25, 36 and 60.
230 INSTANTIATE_TEST_SUITE_P(7_8_25_36_60, FFTTest1D, ::testing::Values(7, 8, 25, 36, 60));
// Batched complex 1D transforms: N transforms of length nx (per the test name
// presumably nx = 48 and N = 5 -- confirm). The forward and the backward
// output are each compared against reference data.
233 TEST_F(ManyFFTTest, Complex1DLength48Multi5Test)
// nx * 2 * N values are taken from inputdata; std::copy converts them to real.
238 in_ = std::vector<real>(nx * 2 * N);
239 std::copy(inputdata, inputdata + nx * 2 * N, in_.begin());
240 out_ = std::vector<real>(nx * 2 * N);
242 real* out = &out_[0];
244 gmx_fft_init_many_1d(&fft_, nx, N, flags_);
// Both transforms are given the same input pointer; the backward transform
// does not consume the forward output.
246 gmx_fft_many_1d(fft_, GMX_FFT_FORWARD, in, out);
247 checker_.checkSequenceArray(nx * 2 * N, out, "forward");
248 gmx_fft_many_1d(fft_, GMX_FFT_BACKWARD, in, out);
249 checker_.checkSequenceArray(nx * 2 * N, out, "backward");
// Batched real 1D transforms: N transforms of real length rx (per the test
// name presumably rx = 48 and N = 5 -- confirm). Both directions are compared
// against reference data.
252 TEST_F(ManyFFTTest, Real1DLength48Multi5Test)
// Buffers are sized from cx = rx / 2 + 1 (presumably the number of complex
// values per transform -- confirm against the FFT API docs).
255 const int cx = (rx / 2 + 1);
// cx * 2 * N values are taken from inputdata; std::copy converts them to real.
258 in_ = std::vector<real>(cx * 2 * N);
259 std::copy(inputdata, inputdata + cx * 2 * N, in_.begin());
260 out_ = std::vector<real>(cx * 2 * N);
262 real* out = &out_[0];
264 gmx_fft_init_many_1d_real(&fft_, rx, N, flags_);
// Both transforms are given the same input pointer; the second one does not
// consume the output of the first.
266 gmx_fft_many_1d_real(fft_, GMX_FFT_REAL_TO_COMPLEX, in, out);
267 checker_.checkSequenceArray(cx * 2 * N, out, "forward");
268 gmx_fft_many_1d_real(fft_, GMX_FFT_COMPLEX_TO_REAL, in, out);
// Only the first rx * N output values are compared for the complex-to-real case.
269 checker_.checkSequenceArray(rx * N, out, "backward");
// Real-to-complex 2D transform of an rx-by-ny grid (per the test name
// presumably 18 by 15 -- confirm). Only the forward direction is checked.
272 TEST_F(FFTTest, Real2DLength18_15Test)
// Buffers are sized from cx = rx / 2 + 1 (presumably the number of complex
// values along the transformed real dimension -- confirm).
275 const int cx = (rx / 2 + 1);
// cx * 2 * ny values are taken from inputdata; std::copy converts them to real.
278 in_ = std::vector<real>(cx * 2 * ny);
279 std::copy(inputdata, inputdata + cx * 2 * ny, in_.begin());
280 out_ = std::vector<real>(cx * 2 * ny);
282 real* out = &out_[0];
284 gmx_fft_init_2d_real(&fft_, rx, ny, flags_);
286 gmx_fft_2d_real(fft_, GMX_FFT_REAL_TO_COMPLEX, in, out);
287 checker_.checkSequenceArray(cx * 2 * ny, out, "forward");
288 // The complex-to-real 2D transform is known to be wrong for gmx_fft_mkl,
288 // and it is not used, so the check below is disabled.
289 // gmx_fft_2d_real(fft_, GMX_FFT_COMPLEX_TO_REAL, in, out);
290 // checker_.checkSequenceArray(rx * ny, out, "backward");
296 /*! \brief Check that the real grid after forward and backward
297 * 3D transforms matches the input real grid.
 *
 * \param[in]     realGridSize          Unpadded grid dimensions; used for the
 *                                      normalization and the comparison extents.
 * \param[in]     realGridSizePadded    Padded grid dimensions; the ZZ entry is
 *                                      used as the stride between rows.
 * \param[in]     inputRealGrid         Values the back transform should reproduce.
 * \param[in,out] outputRealGridValues  Un-normalized back-transform result; it is
 *                                      normalized in place before the comparison.
 */
298 void checkRealGrid(const ivec realGridSize,
299 const ivec realGridSizePadded,
300 ArrayRef<const real> inputRealGrid,
301 ArrayRef<real> outputRealGridValues)
303 // Normalize the output (as the implementation does not
304 // normalize either FFT)
305 const real normalizationConstant = 1.0 / (realGridSize[XX] * realGridSize[YY] * realGridSize[ZZ]);
306 std::transform(outputRealGridValues.begin(),
307 outputRealGridValues.end(),
308 outputRealGridValues.begin(),
309 [normalizationConstant](const real r) { return r * normalizationConstant; });
310 // Check the real grid, skipping unused data from the padding
// This tolerance is looser than the fixture default because the values have
// gone through two transforms and a normalization.
// NOTE(review): presumably the arguments are a magnitude and a relative
// tolerance -- confirm against testutils/testasserts.h.
311 const auto realGridTolerance = relativeToleranceAsFloatingPoint(10, 1e-6);
// One iteration per (x, y) row: row i starts at i * realGridSizePadded[ZZ]
// and only its first realGridSize[ZZ] values are compared.
312 for (int i = 0; i < realGridSize[XX] * realGridSize[YY]; i++)
315 arrayRefFromArray(inputRealGrid.data() + i * realGridSizePadded[ZZ], realGridSize[ZZ]);
316 auto actual = arrayRefFromArray(outputRealGridValues.data() + i * realGridSizePadded[ZZ],
318 EXPECT_THAT(actual, Pointwise(RealEq(realGridTolerance), expected))
319 << formatString("checking backward transform part %d", i);
325 // TODO: test with threads and more than one MPI rank
// Forward and backward 3D real/complex transforms of a 5x6x9 grid. Checks the
// padded grid sizes and the complex grid against reference data, then checks
// that the back transform reproduces the input after normalization.
326 TEST_F(FFTTest3D, Real5_6_9)
328 int realGridSize[] = { 5, 6, 9 };
// Both communicators are MPI_COMM_NULL (see the TODO about MPI ranks).
329 MPI_Comm comm[] = { MPI_COMM_NULL, MPI_COMM_NULL };
332 ivec local_ndata, offset, realGridSizePadded, complexGridSizePadded, complex_order;
// This fixture has no checker member, so create one for this test.
333 TestReferenceChecker checker(data_.rootChecker());
334 checker.setDefaultTolerance(defaultTolerance_);
// rdata and cdata are passed by address and used below as the real and the
// complex grid (presumably set by the init call -- confirm).
336 gmx_parallel_3dfft_init(&fft_, realGridSize, &rdata, &cdata, comm, TRUE, 1);
// Query the grid layout; only the padded sizes are used further down.
338 gmx_parallel_3dfft_real_limits(fft_, local_ndata, offset, realGridSizePadded);
339 gmx_parallel_3dfft_complex_limits(fft_, complex_order, local_ndata, offset, complexGridSizePadded);
340 checker.checkVector(realGridSizePadded, "realGridSizePadded");
341 checker.checkVector(complexGridSizePadded, "complexGridSizePadded");
// The input buffer is sized to the same number of bytes as the padded
// complex grid, expressed as a count of real values.
342 int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
343 int sizeInBytes = size * sizeof(t_complex);
344 int sizeInReals = sizeInBytes / sizeof(real);
346 // Prepare the real grid
347 in_ = std::vector<real>(sizeInReals);
348 // Use std::copy to convert from double to real easily
349 std::copy(inputdata, inputdata + sizeInReals, in_.begin());
350 // Use memcpy to convert to t_complex easily
351 memcpy(rdata, in_.data(), sizeInBytes);
353 // Do the forward FFT to compute the complex grid
354 gmx_parallel_3dfft_execute(fft_, GMX_FFT_REAL_TO_COMPLEX, 0, nullptr);
356 // Check the complex grid (NB this data has not been normalized)
// View the complex grid as size * 2 real values for the reference check.
357 ArrayRef<real> complexGridValues = arrayRefFromArray(reinterpret_cast<real*>(cdata), size * 2);
358 checker.checkSequence(
359 complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
361 // Do the back transform
362 gmx_parallel_3dfft_execute(fft_, GMX_FFT_COMPLEX_TO_REAL, 0, nullptr);
// View the whole padded real grid; checkRealGrid() normalizes it in place
// and compares it with the original input in in_.
364 ArrayRef<real> outputRealGridValues = arrayRefFromArray(
365 rdata, realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]);
366 checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
371 /*! \brief Whether the FFT is in- or out-of-place
373 * DPCPP uses oneMKL, which seems to have troubles with out-of-place
// False (in-place) only when both GMX_SYCL_DPCPP and GMX_FFT_MKL are 1;
// true (out-of-place) in every other configuration.
375 constexpr bool sc_performOutOfPlaceFFT = !((GMX_SYCL_DPCPP == 1) && (GMX_FFT_MKL == 1));
377 /*! \brief Return the output grid depending on whether in- or out-of
380 * Some versions of clang complain of unused code if we would just
381 * branch on the value of sc_performOutOfPlaceFFT at run time, because
382 * in any single configuration there would indeed be unused code. So
383 * the two template specializations are needed so that the compiler
384 * only compiles the template that is used. */
// Primary template: declaration only. The definitions are the explicit
// specializations for false and true below.
385 template<bool performOutOfPlaceFFT>
386 DeviceBuffer<float>* actualOutputGrid(DeviceBuffer<float>* realGrid, DeviceBuffer<float>* complexGrid);
// Selects the in-place specialization for the DPC++ with MKL configuration.
388 # if GMX_SYCL_DPCPP && GMX_FFT_MKL
// In-place variant: the complexGrid argument is unnamed and therefore unused
// (presumably the real grid is returned -- confirm).
391 DeviceBuffer<float>* actualOutputGrid<false>(DeviceBuffer<float>* realGrid,
392 DeviceBuffer<float>* /* complexGrid */)
// Out-of-place variant: the realGrid argument is unnamed and therefore unused
// (presumably the complex grid is returned -- confirm).
400 DeviceBuffer<float>* actualOutputGrid<true>(DeviceBuffer<float>* /* realGrid */, DeviceBuffer<float>* complexGrid)
407 TEST_F(FFTTest3D, GpuReal5_6_9)
409 // Ensure library resources are managed appropriately
410 ClfftInitializer clfftInitializer;
411 for (const auto& testDevice : getTestHardwareEnvironment()->getTestDeviceList())
413 TestReferenceChecker checker(data_.rootChecker()); // Must be inside the loop to avoid warnings
414 checker.setDefaultTolerance(defaultTolerance_);
416 const DeviceContext& deviceContext = testDevice->deviceContext();
417 setActiveDevice(testDevice->deviceInfo());
418 const DeviceStream& deviceStream = testDevice->deviceStream();
420 ivec realGridSize = { 5, 6, 9 };
421 ivec realGridSizePadded = { realGridSize[XX], realGridSize[YY], (realGridSize[ZZ] / 2 + 1) * 2 };
422 ivec complexGridSizePadded = { realGridSize[XX], realGridSize[YY], (realGridSize[ZZ] / 2) + 1 };
424 checker.checkVector(realGridSizePadded, "realGridSizePadded");
425 checker.checkVector(complexGridSizePadded, "complexGridSizePadded");
427 int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
428 int sizeInReals = size * 2;
429 GMX_RELEASE_ASSERT(sizeof(inputdata) / sizeof(inputdata[0]) >= size_t(sizeInReals),
430 "Size of inputdata is too small");
432 // Set up the complex grid. Complex numbers take twice the
434 std::vector<float> complexGridValues(sizeInReals);
435 in_.resize(sizeInReals);
436 // Use std::copy to convert from double to real easily
437 std::copy(inputdata, inputdata + sizeInReals, in_.begin());
440 const FftBackend backend = FftBackend::Cufft;
441 # elif GMX_GPU_OPENCL
442 const FftBackend backend = FftBackend::Ocl;
444 # if GMX_SYCL_HIPSYCL
445 # if GMX_HIPSYCL_HAVE_HIP_TARGET
446 const FftBackend backend = FftBackend::SyclRocfft;
448 // Use stub backend so compilation succeeds
449 const FftBackend backend = FftBackend::Sycl;
450 // Don't complain about unused reference data
451 checker.disableUnusedEntriesCheck();
452 // Skip the rest of the test
453 GTEST_SKIP() << "Only rocFFT backend is supported with hipSYCL";
455 # elif GMX_SYCL_DPCPP
457 const FftBackend backend = FftBackend::SyclMkl;
459 // Use stub backend so compilation succeeds
460 const FftBackend backend = FftBackend::Sycl;
461 // Don't complain about unused reference data
462 checker.disableUnusedEntriesCheck();
463 // Skip the rest of the test
464 GTEST_SKIP() << "Only MKL backend is supported with DPC++";
467 # error "Unsupported SYCL implementation"
471 SCOPED_TRACE("Allocating the device buffers");
472 DeviceBuffer<float> realGrid, complexGrid;
473 allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
474 if (sc_performOutOfPlaceFFT)
476 allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
479 MPI_Comm comm = MPI_COMM_NULL;
480 const bool allocateGrid = false;
481 std::array<int, 1> gridSizesInXForEachRank = { 0 };
482 std::array<int, 1> gridSizesInYForEachRank = { 0 };
483 const int nz = realGridSize[ZZ];
484 Gpu3dFft gpu3dFft(backend,
487 gridSizesInXForEachRank,
488 gridSizesInYForEachRank,
490 sc_performOutOfPlaceFFT,
495 complexGridSizePadded,
497 actualOutputGrid<sc_performOutOfPlaceFFT>(&realGrid, &complexGrid));
499 // Transfer the real grid input data for the FFT
501 &realGrid, in_.data(), 0, in_.size(), deviceStream, GpuApiCallBehavior::Sync, nullptr);
503 // Do the forward FFT to compute the complex grid
504 CommandEvent* timingEvent = nullptr;
505 gpu3dFft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX, timingEvent);
506 deviceStream.synchronize();
508 // Check the complex grid (NB this data has not been normalized)
509 copyFromDeviceBuffer(complexGridValues.data(),
510 actualOutputGrid<sc_performOutOfPlaceFFT>(&realGrid, &complexGrid),
512 complexGridValues.size(),
514 GpuApiCallBehavior::Sync,
516 checker.checkSequence(
517 complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
519 std::vector<float> outputRealGridValues(in_.size());
520 if (sc_performOutOfPlaceFFT)
522 // Clear the real grid input data for the FFT so we can
523 // compute the back transform into it and observe that it did
524 // the work expected.
525 copyToDeviceBuffer(&realGrid,
526 outputRealGridValues.data(),
528 outputRealGridValues.size(),
530 GpuApiCallBehavior::Sync,
534 SCOPED_TRACE("Doing the back transform");
535 gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
536 deviceStream.synchronize();
538 // Transfer the real grid back from the device
539 copyFromDeviceBuffer(outputRealGridValues.data(),
542 outputRealGridValues.size(),
544 GpuApiCallBehavior::Sync,
547 checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
549 SCOPED_TRACE("Cleaning up");
550 freeDeviceBuffer(&realGrid);
551 if (sc_performOutOfPlaceFFT)
553 freeDeviceBuffer(&complexGrid);