2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
5 * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
38 * Tests utilities for fft calculations.
40 * Current reference data is generated in double precision using the Reference
41 * build type, except for the compiler (Apple Clang).
43 * \author Roland Schulz <roland@utk.edu>
48 #include "gromacs/fft/fft.h"
55 #include <gmock/gmock.h>
56 #include <gtest/gtest.h>
58 #include "gromacs/fft/gpu_3dfft.h"
59 #include "gromacs/fft/parallel_3dfft.h"
60 #include "gromacs/gpu_utils/clfftinitializer.h"
62 # include "gromacs/gpu_utils/devicebuffer.h"
64 #include "gromacs/utility/stringutil.h"
66 #include "testutils/refdata.h"
67 #include "testutils/test_hardware_environment.h"
68 #include "testutils/testasserts.h"
69 #include "testutils/testmatchers.h"
76 /*! \brief Input data for FFT tests.
78 * TODO If we require compilers that all support C++11 user literals,
79 * then this array could be of type real, initialized with e.g. -3.5_r
80 * that does not suffer from implicit narrowing with brace
81 * initializers, and we would not have to do so much useless copying
82 * during the unit tests below.
84 const double inputdata[500] = {
85 // print ",\n".join([",".join(["%4s"%(random.randint(-99,99)/10.,) for i in range(25)]) for j in range(20)])
86 -3.5, 6.3, 1.2, 0.3, 1.1, -5.7, 5.8, -1.9, -6.3, -1.4, 7.4, 2.4, -9.9, -7.2, 5.4, 6.1,
87 -1.9, -7.6, 1.4, -3.5, 0.7, 5.6, -4.2, -1.1, -4.4, -6.3, -7.2, 4.6, -3.0, -0.9, 7.2, 2.5,
88 -3.6, 6.1, -3.2, -2.1, 6.5, -0.4, -9.0, 2.3, 8.4, 4.0, -5.2, -9.0, 4.7, -3.7, -2.0, -9.5,
89 -3.9, -3.6, 7.1, 0.8, -0.6, 5.2, -9.3, -4.5, 5.9, 2.2, -5.8, 5.0, 1.2, -0.1, 2.2, 0.2,
90 -7.7, 1.9, -8.4, 4.4, 2.3, -2.9, 6.7, 2.7, 5.8, -3.6, 8.9, 8.9, 4.3, 9.1, 9.3, -8.7,
91 4.1, 9.6, -6.2, 6.6, -9.3, 8.2, 4.5, 6.2, 9.4, -8.0, -6.8, -3.3, 7.2, 1.7, 0.6, -4.9,
92 9.8, 1.3, 3.2, -0.2, 9.9, 4.4, -9.9, -7.2, 4.4, 4.7, 7.2, -0.3, 0.3, -2.1, 8.4, -2.1,
93 -6.1, 4.1, -5.9, -2.2, -3.8, 5.2, -8.2, -7.8, -8.8, 6.7, -9.5, -4.2, 0.8, 8.3, 5.2, -9.0,
94 8.7, 9.8, -9.9, -7.8, -8.3, 9.0, -2.8, -9.2, -9.6, 8.4, 2.5, 6.0, -0.4, 1.3, -0.5, 9.1,
95 -9.5, -0.8, 1.9, -6.2, 4.3, -3.8, 8.6, -1.9, -2.1, -0.4, -7.1, -3.7, 9.1, -6.4, -0.6, 2.5,
96 8.0, -5.2, -9.8, -4.3, 4.5, 1.7, 9.3, 9.2, 1.0, 5.3, -4.5, 6.4, -6.6, 3.1, -6.8, 2.1,
97 2.0, 7.3, 8.6, 5.0, 5.2, 0.4, -7.1, 4.5, -9.2, -9.1, 0.2, -6.3, -1.1, -9.6, 7.4, -3.7,
98 -5.5, 2.6, -3.5, -0.7, 9.0, 9.8, -8.0, 3.6, 3.0, -2.2, -2.8, 0.8, 9.0, 2.8, 7.7, -0.7,
99 -5.0, -1.8, -2.3, -0.4, -6.2, -9.1, -9.2, 0.5, 5.7, -3.9, 2.1, 0.6, 0.4, 9.1, 7.4, 7.1,
100 -2.5, 7.3, 7.8, -4.3, 6.3, -0.8, -3.8, -1.5, 6.6, 2.3, 3.9, -4.6, 5.8, -7.4, 5.9, 2.8,
101 4.7, 3.9, -5.4, 9.1, -1.6, -1.9, -4.2, -2.6, 0.6, -5.1, 1.8, 5.2, 4.0, -6.2, 6.5, -9.1,
102 0.5, 2.1, 7.1, -8.6, 7.6, -9.7, -4.6, -5.7, 6.1, -1.8, -7.3, 9.4, 8.0, -2.6, -1.8, 5.7,
103 9.3, -7.9, 7.4, 6.3, 2.0, 9.6, -4.5, -6.2, 6.1, 2.3, 0.8, 5.9, -2.8, -3.5, -1.5, 6.0,
104 -4.9, 3.5, 7.7, -4.2, -9.7, 2.4, 8.1, 5.9, 3.4, -7.5, 7.5, 2.6, 4.7, 2.7, 2.2, 2.6,
105 6.2, 7.5, 0.2, -6.4, -2.8, -0.5, -0.3, 0.4, 1.2, 3.5, -4.0, -0.5, 9.3, -7.2, 8.5, -5.5,
106 -1.7, -5.3, 0.3, 3.9, -3.6, -3.6, 4.7, -8.1, 1.4, 4.0, 1.3, -4.3, -8.8, -7.3, 6.3, -7.5,
107 -9.0, 9.1, 4.5, -1.9, 1.9, 9.9, -1.7, -9.1, -5.1, 8.5, -9.3, 2.1, -5.8, -3.6, -0.8, -0.9,
108 -3.3, -2.7, 7.0, -7.2, -5.0, 7.4, -1.4, 0.0, -4.5, -9.7, 0.7, -1.0, -9.1, -5.3, 4.3, 3.4,
109 -6.6, 9.8, -1.1, 8.9, 5.0, 2.9, 0.2, -2.9, 0.8, 6.7, -0.6, 0.6, 4.1, 5.3, -1.7, -0.3,
110 4.2, 3.7, -8.3, 4.0, 1.3, 6.3, 0.2, 1.3, -1.1, -3.5, 2.8, -7.7, 6.2, -4.9, -9.9, 9.6,
111 3.0, -9.2, -8.0, -3.9, 7.9, -6.1, 6.0, 5.9, 9.6, 1.2, 6.2, 3.6, 2.1, 5.8, 9.2, -8.8,
112 8.8, -3.3, -9.2, 4.6, 1.8, 4.6, 2.9, -2.7, 4.2, 7.3, -0.4, 7.7, -7.0, 2.1, 0.3, 3.7,
113 3.3, -8.6, 9.8, 3.6, 3.1, 6.5, -2.4, 7.8, 7.5, 8.4, -2.8, -6.3, -5.1, -2.7, 9.3, -0.8,
114 -9.2, 7.9, 8.9, 3.4, 0.1, -5.3, -6.8, 4.9, 4.3, -0.7, -2.2, -3.2, -7.5, -2.3, 0.0, 8.1,
115 -9.2, -2.3, -5.7, 2.1, 2.6, 2.0, 0.3, -8.0, -2.0, -7.9, 6.6, 8.4, 4.0, -6.2, -6.9, -7.2,
116 7.7, -5.0, 5.3, 1.9, -5.3, -7.5, 8.8, 8.3, 9.0, 8.1, 3.2, 1.2, -5.4, -0.2, 2.1, -5.2,
121 class BaseFFTTest : public ::testing::Test
124 BaseFFTTest() : flags_(GMX_FFT_FLAG_CONSERVATIVE) {}
125 ~BaseFFTTest() override { gmx_fft_cleanup(); }
127 TestReferenceData data_;
128 std::vector<real> in_, out_;
130 // TODO: These tolerances are just something that has been observed
131 // to be sufficient to pass the tests. It would be nicer to
132 // actually argue about why they are sufficient (or what is).
133 // Should work for both one-way and forward+backward transform.
134 FloatingPointTolerance defaultTolerance_ = relativeToleranceAsPrecisionDependentUlp(10.0, 64, 512);
137 class FFTTest : public BaseFFTTest
140 FFTTest() : fft_(nullptr) { checker_.setDefaultTolerance(defaultTolerance_); }
145 gmx_fft_destroy(fft_);
148 TestReferenceChecker checker_ = data_.rootChecker();
152 class ManyFFTTest : public BaseFFTTest
155 ManyFFTTest() : fft_(nullptr) { checker_.setDefaultTolerance(defaultTolerance_); }
156 ~ManyFFTTest() override
160 gmx_many_fft_destroy(fft_);
163 TestReferenceChecker checker_ = data_.rootChecker();
168 // TODO: Add tests for aligned/not-aligned input/output memory
170 class FFTTest1D : public FFTTest, public ::testing::WithParamInterface<int>
174 class FFTTest3D : public BaseFFTTest
177 FFTTest3D() : fft_(nullptr) {}
178 ~FFTTest3D() override
182 gmx_parallel_3dfft_destroy(fft_);
185 gmx_parallel_3dfft_t fft_;
189 TEST_P(FFTTest1D, Complex)
191 const int nx = GetParam();
192 ASSERT_LE(nx * 2, static_cast<int>(sizeof(inputdata) / sizeof(inputdata[0])));
194 in_ = std::vector<real>(nx * 2);
195 std::copy(inputdata, inputdata + nx * 2, in_.begin());
196 out_ = std::vector<real>(nx * 2);
198 real* out = &out_[0];
200 gmx_fft_init_1d(&fft_, nx, flags_);
202 gmx_fft_1d(fft_, GMX_FFT_FORWARD, in, out);
203 checker_.checkSequenceArray(nx * 2, out, "forward");
204 gmx_fft_1d(fft_, GMX_FFT_BACKWARD, in, out);
205 checker_.checkSequenceArray(nx * 2, out, "backward");
208 TEST_P(FFTTest1D, Real)
210 const int rx = GetParam();
211 const int cx = (rx / 2 + 1);
212 ASSERT_LE(cx * 2, static_cast<int>(sizeof(inputdata) / sizeof(inputdata[0])));
214 in_ = std::vector<real>(cx * 2);
215 std::copy(inputdata, inputdata + cx * 2, in_.begin());
216 out_ = std::vector<real>(cx * 2);
218 real* out = &out_[0];
220 gmx_fft_init_1d_real(&fft_, rx, flags_);
222 gmx_fft_1d_real(fft_, GMX_FFT_REAL_TO_COMPLEX, in, out);
223 checker_.checkSequenceArray(cx * 2, out, "forward");
224 gmx_fft_1d_real(fft_, GMX_FFT_COMPLEX_TO_REAL, in, out);
225 checker_.checkSequenceArray(rx, out, "backward");
228 INSTANTIATE_TEST_SUITE_P(7_8_25_36_60, FFTTest1D, ::testing::Values(7, 8, 25, 36, 60));
231 TEST_F(ManyFFTTest, Complex1DLength48Multi5Test)
236 in_ = std::vector<real>(nx * 2 * N);
237 std::copy(inputdata, inputdata + nx * 2 * N, in_.begin());
238 out_ = std::vector<real>(nx * 2 * N);
240 real* out = &out_[0];
242 gmx_fft_init_many_1d(&fft_, nx, N, flags_);
244 gmx_fft_many_1d(fft_, GMX_FFT_FORWARD, in, out);
245 checker_.checkSequenceArray(nx * 2 * N, out, "forward");
246 gmx_fft_many_1d(fft_, GMX_FFT_BACKWARD, in, out);
247 checker_.checkSequenceArray(nx * 2 * N, out, "backward");
250 TEST_F(ManyFFTTest, Real1DLength48Multi5Test)
253 const int cx = (rx / 2 + 1);
256 in_ = std::vector<real>(cx * 2 * N);
257 std::copy(inputdata, inputdata + cx * 2 * N, in_.begin());
258 out_ = std::vector<real>(cx * 2 * N);
260 real* out = &out_[0];
262 gmx_fft_init_many_1d_real(&fft_, rx, N, flags_);
264 gmx_fft_many_1d_real(fft_, GMX_FFT_REAL_TO_COMPLEX, in, out);
265 checker_.checkSequenceArray(cx * 2 * N, out, "forward");
266 gmx_fft_many_1d_real(fft_, GMX_FFT_COMPLEX_TO_REAL, in, out);
267 checker_.checkSequenceArray(rx * N, out, "backward");
270 TEST_F(FFTTest, Real2DLength18_15Test)
273 const int cx = (rx / 2 + 1);
276 in_ = std::vector<real>(cx * 2 * ny);
277 std::copy(inputdata, inputdata + cx * 2 * ny, in_.begin());
278 out_ = std::vector<real>(cx * 2 * ny);
280 real* out = &out_[0];
282 gmx_fft_init_2d_real(&fft_, rx, ny, flags_);
284 gmx_fft_2d_real(fft_, GMX_FFT_REAL_TO_COMPLEX, in, out);
285 checker_.checkSequenceArray(cx * 2 * ny, out, "forward");
286 // known to be wrong for gmx_fft_mkl. And not used.
287 // gmx_fft_2d_real(_fft,GMX_FFT_COMPLEX_TO_REAL,in,out);
288 // _checker.checkSequenceArray(rx*ny, out, "backward");
294 /*! \brief Check that the real grid after forward and backward
295 * 3D transforms matches the input real grid. */
296 void checkRealGrid(const ivec realGridSize,
297 const ivec realGridSizePadded,
298 ArrayRef<const real> inputRealGrid,
299 ArrayRef<real> outputRealGridValues)
301 // Normalize the output (as the implementation does not
302 // normalize either FFT)
303 const real normalizationConstant = 1.0 / (realGridSize[XX] * realGridSize[YY] * realGridSize[ZZ]);
304 std::transform(outputRealGridValues.begin(),
305 outputRealGridValues.end(),
306 outputRealGridValues.begin(),
307 [normalizationConstant](const real r) { return r * normalizationConstant; });
308 // Check the real grid, skipping unused data from the padding
309 const auto realGridTolerance = relativeToleranceAsFloatingPoint(10, 1e-6);
310 for (int i = 0; i < realGridSize[XX] * realGridSize[YY]; i++)
313 arrayRefFromArray(inputRealGrid.data() + i * realGridSizePadded[ZZ], realGridSize[ZZ]);
314 auto actual = arrayRefFromArray(outputRealGridValues.data() + i * realGridSizePadded[ZZ],
316 EXPECT_THAT(actual, Pointwise(RealEq(realGridTolerance), expected))
317 << formatString("checking backward transform part %d", i);
323 // TODO: test with threads and more than 1 MPI ranks
324 TEST_F(FFTTest3D, Real5_6_9)
326 int realGridSize[] = { 5, 6, 9 };
327 MPI_Comm comm[] = { MPI_COMM_NULL, MPI_COMM_NULL };
330 ivec local_ndata, offset, realGridSizePadded, complexGridSizePadded, complex_order;
331 TestReferenceChecker checker(data_.rootChecker());
332 checker.setDefaultTolerance(defaultTolerance_);
334 gmx_parallel_3dfft_init(&fft_, realGridSize, &rdata, &cdata, comm, TRUE, 1);
336 gmx_parallel_3dfft_real_limits(fft_, local_ndata, offset, realGridSizePadded);
337 gmx_parallel_3dfft_complex_limits(fft_, complex_order, local_ndata, offset, complexGridSizePadded);
338 checker.checkVector(realGridSizePadded, "realGridSizePadded");
339 checker.checkVector(complexGridSizePadded, "complexGridSizePadded");
340 int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
341 int sizeInBytes = size * sizeof(t_complex);
342 int sizeInReals = sizeInBytes / sizeof(real);
344 // Prepare the real grid
345 in_ = std::vector<real>(sizeInReals);
346 // Use std::copy to convert from double to real easily
347 std::copy(inputdata, inputdata + sizeInReals, in_.begin());
348 // Use memcpy to convert to t_complex easily
349 memcpy(rdata, in_.data(), sizeInBytes);
351 // Do the forward FFT to compute the complex grid
352 gmx_parallel_3dfft_execute(fft_, GMX_FFT_REAL_TO_COMPLEX, 0, nullptr);
354 // Check the complex grid (NB this data has not been normalized)
355 ArrayRef<real> complexGridValues = arrayRefFromArray(reinterpret_cast<real*>(cdata), size * 2);
356 checker.checkSequence(
357 complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
359 // Do the back transform
360 gmx_parallel_3dfft_execute(fft_, GMX_FFT_COMPLEX_TO_REAL, 0, nullptr);
362 ArrayRef<real> outputRealGridValues = arrayRefFromArray(
363 rdata, realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]);
364 checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
367 #if GMX_GPU_CUDA || GMX_GPU_OPENCL \
368 || (GMX_GPU_SYCL && (GMX_SYCL_HIPSYCL || (GMX_SYCL_DPCPP && GMX_FFT_MKL)))
369 TEST_F(FFTTest3D, GpuReal5_6_9)
371 // Ensure library resources are managed appropriately
372 ClfftInitializer clfftInitializer;
373 for (const auto& testDevice : getTestHardwareEnvironment()->getTestDeviceList())
375 TestReferenceChecker checker(data_.rootChecker()); // Must be inside the loop to avoid warnings
376 checker.setDefaultTolerance(defaultTolerance_);
378 const DeviceContext& deviceContext = testDevice->deviceContext();
379 setActiveDevice(testDevice->deviceInfo());
380 const DeviceStream& deviceStream = testDevice->deviceStream();
382 ivec realGridSize = { 5, 6, 9 };
383 ivec realGridSizePadded = { realGridSize[XX], realGridSize[YY], (realGridSize[ZZ] / 2 + 1) * 2 };
384 ivec complexGridSizePadded = { realGridSize[XX], realGridSize[YY], (realGridSize[ZZ] / 2) + 1 };
386 checker.checkVector(realGridSizePadded, "realGridSizePadded");
387 checker.checkVector(complexGridSizePadded, "complexGridSizePadded");
389 int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
390 int sizeInReals = size * 2;
391 GMX_RELEASE_ASSERT(sizeof(inputdata) / sizeof(inputdata[0]) >= size_t(sizeInReals),
392 "Size of inputdata is too small");
394 // Set up the complex grid. Complex numbers take twice the
396 std::vector<float> complexGridValues(sizeInReals);
397 in_.resize(sizeInReals);
398 // Use std::copy to convert from double to real easily
399 std::copy(inputdata, inputdata + sizeInReals, in_.begin());
401 // DPCPP uses oneMKL, which seems to have troubles with out-of-place transforms
402 const bool performOutOfPlaceFFT = !GMX_SYCL_DPCPP;
404 SCOPED_TRACE("Allocating the device buffers");
405 DeviceBuffer<float> realGrid, complexGrid;
406 allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
407 if (performOutOfPlaceFFT)
409 allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
413 const FftBackend backend = FftBackend::Cufft;
414 # elif GMX_GPU_OPENCL
415 const FftBackend backend = FftBackend::Ocl;
417 # if GMX_SYCL_HIPSYCL
418 const FftBackend backend = FftBackend::SyclRocfft;
419 # elif GMX_SYCL_DPCPP && GMX_FFT_MKL
420 const FftBackend backend = FftBackend::SyclMkl;
423 MPI_Comm comm = MPI_COMM_NULL;
424 const bool allocateGrid = false;
425 std::array<int, 1> gridSizesInXForEachRank = { 0 };
426 std::array<int, 1> gridSizesInYForEachRank = { 0 };
427 const int nz = realGridSize[ZZ];
428 Gpu3dFft gpu3dFft(backend,
431 gridSizesInXForEachRank,
432 gridSizesInYForEachRank,
434 performOutOfPlaceFFT,
439 complexGridSizePadded,
441 performOutOfPlaceFFT ? &complexGrid : &realGrid);
443 // Transfer the real grid input data for the FFT
445 &realGrid, in_.data(), 0, in_.size(), deviceStream, GpuApiCallBehavior::Sync, nullptr);
447 // Do the forward FFT to compute the complex grid
448 CommandEvent* timingEvent = nullptr;
449 gpu3dFft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX, timingEvent);
450 deviceStream.synchronize();
452 // Check the complex grid (NB this data has not been normalized)
453 copyFromDeviceBuffer(complexGridValues.data(),
454 performOutOfPlaceFFT ? &complexGrid : &realGrid,
456 complexGridValues.size(),
458 GpuApiCallBehavior::Sync,
460 checker.checkSequence(
461 complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
463 std::vector<float> outputRealGridValues(in_.size());
464 if (performOutOfPlaceFFT)
466 // Clear the real grid input data for the FFT so we can
467 // compute the back transform into it and observe that it did
468 // the work expected.
469 copyToDeviceBuffer(&realGrid,
470 outputRealGridValues.data(),
472 outputRealGridValues.size(),
474 GpuApiCallBehavior::Sync,
478 SCOPED_TRACE("Doing the back transform");
479 gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
480 deviceStream.synchronize();
482 // Transfer the real grid back from the device
483 copyFromDeviceBuffer(outputRealGridValues.data(),
486 outputRealGridValues.size(),
488 GpuApiCallBehavior::Sync,
491 checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
493 SCOPED_TRACE("Cleaning up");
494 freeDeviceBuffer(&realGrid);
495 if (performOutOfPlaceFFT)
497 freeDeviceBuffer(&complexGrid);