* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/fft/fft.h"
+#include "config.h"
+
#include <algorithm>
#include <vector>
+#include <gmock/gmock.h>
#include <gtest/gtest.h>
+#include "gromacs/fft/gpu_3dfft.h"
#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/clfftinitializer.h"
+#if GMX_GPU
+# include "gromacs/gpu_utils/devicebuffer.h"
+#endif
#include "gromacs/utility/stringutil.h"
#include "testutils/refdata.h"
+#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"
+#include "testutils/testmatchers.h"
+namespace gmx
+{
+namespace test
+{
namespace
{
* initializers, and we would not have to do so much useless copying
* during the unit tests below.
*/
-const double inputdata[] = {
+const double inputdata[500] = {
// print ",\n".join([",".join(["%4s"%(random.randint(-99,99)/10.,) for i in range(25)]) for j in range(20)])
-3.5, 6.3, 1.2, 0.3, 1.1, -5.7, 5.8, -1.9, -6.3, -1.4, 7.4, 2.4, -9.9, -7.2, 5.4, 6.1,
-1.9, -7.6, 1.4, -3.5, 0.7, 5.6, -4.2, -1.1, -4.4, -6.3, -7.2, 4.6, -3.0, -0.9, 7.2, 2.5,
class BaseFFTTest : public ::testing::Test
{
public:
- BaseFFTTest() : checker_(data_.rootChecker()), flags_(GMX_FFT_FLAG_CONSERVATIVE)
- {
- // TODO: These tolerances are just something that has been observed
- // to be sufficient to pass the tests. It would be nicer to
- // actually argue about why they are sufficient (or what is).
- checker_.setDefaultTolerance(gmx::test::relativeToleranceAsPrecisionDependentUlp(10.0, 64, 512));
- }
+ BaseFFTTest() : flags_(GMX_FFT_FLAG_CONSERVATIVE) {}
~BaseFFTTest() override { gmx_fft_cleanup(); }
- gmx::test::TestReferenceData data_;
- gmx::test::TestReferenceChecker checker_;
- std::vector<real> in_, out_;
- int flags_;
+ TestReferenceData data_;
+ std::vector<real> in_, out_;
+ int flags_;
+ // TODO: These tolerances are just something that has been observed
+ // to be sufficient to pass the tests. It would be nicer to
+ // actually argue about why they are sufficient (or what is).
+ // Should work for both one-way and forward+backward transform.
+ FloatingPointTolerance defaultTolerance_ = relativeToleranceAsPrecisionDependentUlp(10.0, 64, 512);
};
class FFTTest : public BaseFFTTest
{
public:
- FFTTest() : fft_(nullptr) {}
+ FFTTest() : fft_(nullptr) { checker_.setDefaultTolerance(defaultTolerance_); }
~FFTTest() override
{
if (fft_)
gmx_fft_destroy(fft_);
}
}
- gmx_fft_t fft_;
+ TestReferenceChecker checker_ = data_.rootChecker();
+ gmx_fft_t fft_;
};
class ManyFFTTest : public BaseFFTTest
{
public:
- ManyFFTTest() : fft_(nullptr) {}
+ ManyFFTTest() : fft_(nullptr) { checker_.setDefaultTolerance(defaultTolerance_); }
~ManyFFTTest() override
{
if (fft_)
gmx_many_fft_destroy(fft_);
}
}
- gmx_fft_t fft_;
+ TestReferenceChecker checker_ = data_.rootChecker();
+ gmx_fft_t fft_;
};
{
};
-class FFFTest3D : public BaseFFTTest
+class FFTTest3D : public BaseFFTTest
{
public:
- FFFTest3D() : fft_(nullptr) {}
- ~FFFTest3D() override
+ FFTTest3D() : fft_(nullptr) {}
+ ~FFTTest3D() override
{
if (fft_)
{
checker_.checkSequenceArray(rx, out, "backward");
}
-INSTANTIATE_TEST_CASE_P(7_8_25_36_60, FFTTest1D, ::testing::Values(7, 8, 25, 36, 60));
+INSTANTIATE_TEST_SUITE_P(7_8_25_36_60, FFTTest1D, ::testing::Values(7, 8, 25, 36, 60));
TEST_F(ManyFFTTest, Complex1DLength48Multi5Test)
// _checker.checkSequenceArray(rx*ny, out, "backward");
}
+namespace
+{
+
+/*! \brief Check that the real grid after forward and backward
+ * 3D transforms matches the input real grid. */
+void checkRealGrid(const ivec realGridSize,
+ const ivec realGridSizePadded,
+ ArrayRef<const real> inputRealGrid,
+ ArrayRef<real> outputRealGridValues)
+{
+ // Normalize the output (as the implementation does not
+ // normalize either FFT)
+ const real normalizationConstant = 1.0 / (realGridSize[XX] * realGridSize[YY] * realGridSize[ZZ]);
+ std::transform(outputRealGridValues.begin(),
+ outputRealGridValues.end(),
+ outputRealGridValues.begin(),
+ [normalizationConstant](const real r) { return r * normalizationConstant; });
+ // Check the real grid, skipping unused data from the padding
+ const auto realGridTolerance = relativeToleranceAsFloatingPoint(10, 1e-6);
+ for (int i = 0; i < realGridSize[XX] * realGridSize[YY]; i++)
+ {
+ auto expected =
+ arrayRefFromArray(inputRealGrid.data() + i * realGridSizePadded[ZZ], realGridSize[ZZ]);
+ auto actual = arrayRefFromArray(outputRealGridValues.data() + i * realGridSizePadded[ZZ],
+ realGridSize[ZZ]);
+ EXPECT_THAT(actual, Pointwise(RealEq(realGridTolerance), expected))
+ << formatString("checking backward transform part %d", i);
+ }
+}
+
+} // namespace
+
// TODO: test with threads and more than 1 MPI ranks
-TEST_F(FFFTest3D, Real5_6_9)
+TEST_F(FFTTest3D, Real5_6_9)
{
- int ndata[] = { 5, 6, 9 };
- MPI_Comm comm[] = { MPI_COMM_NULL, MPI_COMM_NULL };
+ int realGridSize[] = { 5, 6, 9 };
+ MPI_Comm comm[] = { MPI_COMM_NULL, MPI_COMM_NULL };
real* rdata;
t_complex* cdata;
- ivec local_ndata, offset, rsize, csize, complex_order;
+ ivec local_ndata, offset, realGridSizePadded, complexGridSizePadded, complex_order;
+ TestReferenceChecker checker(data_.rootChecker());
+ checker.setDefaultTolerance(defaultTolerance_);
- gmx_parallel_3dfft_init(&fft_, ndata, &rdata, &cdata, comm, TRUE, 1);
+ gmx_parallel_3dfft_init(&fft_, realGridSize, &rdata, &cdata, comm, TRUE, 1);
- gmx_parallel_3dfft_real_limits(fft_, local_ndata, offset, rsize);
- gmx_parallel_3dfft_complex_limits(fft_, complex_order, local_ndata, offset, csize);
- checker_.checkVector(rsize, "rsize");
- checker_.checkVector(csize, "csize");
- int size = csize[0] * csize[1] * csize[2];
+ gmx_parallel_3dfft_real_limits(fft_, local_ndata, offset, realGridSizePadded);
+ gmx_parallel_3dfft_complex_limits(fft_, complex_order, local_ndata, offset, complexGridSizePadded);
+ checker.checkVector(realGridSizePadded, "realGridSizePadded");
+ checker.checkVector(complexGridSizePadded, "complexGridSizePadded");
+ int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
int sizeInBytes = size * sizeof(t_complex);
int sizeInReals = sizeInBytes / sizeof(real);
+ // Prepare the real grid
in_ = std::vector<real>(sizeInReals);
// Use std::copy to convert from double to real easily
std::copy(inputdata, inputdata + sizeInReals, in_.begin());
// Use memcpy to convert to t_complex easily
memcpy(rdata, in_.data(), sizeInBytes);
+
+ // Do the forward FFT to compute the complex grid
gmx_parallel_3dfft_execute(fft_, GMX_FFT_REAL_TO_COMPLEX, 0, nullptr);
- // TODO use std::complex and add checkComplex for it
- checker_.checkSequenceArray(size * 2, reinterpret_cast<real*>(cdata), "forward");
- // Use std::copy to convert from double to real easily
- std::copy(inputdata, inputdata + sizeInReals, in_.begin());
- // Use memcpy to convert to t_complex easily
- memcpy(cdata, in_.data(), sizeInBytes);
+ // Check the complex grid (NB this data has not been normalized)
+ ArrayRef<real> complexGridValues = arrayRefFromArray(reinterpret_cast<real*>(cdata), size * 2);
+ checker.checkSequence(
+ complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
+
+ // Do the back transform
gmx_parallel_3dfft_execute(fft_, GMX_FFT_COMPLEX_TO_REAL, 0, nullptr);
- for (int i = 0; i < ndata[0] * ndata[1]; i++) // check sequence but skip unused data
+
+ ArrayRef<real> outputRealGridValues = arrayRefFromArray(
+ rdata, realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]);
+ checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
+}
+
+#if GMX_GPU
+
+/*! \brief Whether the FFT is in- or out-of-place
+ *
+ * DPCPP uses oneMKL, which seems to have troubles with out-of-place
+ * transforms. */
+constexpr bool sc_performOutOfPlaceFFT = !((GMX_SYCL_DPCPP == 1) && (GMX_FFT_MKL == 1));
+
+/*! \brief Return the output grid depending on whether in- or out-of
+ * place FFT is used
+ *
+ * Some versions of clang complain of unused code if we would just
+ * branch on the value of sc_performOutOfPlaceFFT at run time, because
+ * in any single configuration there would indeed be unused code. So
+ * the two template specializations are needed so that the compiler
+ * only compiles the template that is used. */
+template<bool performOutOfPlaceFFT>
+DeviceBuffer<float>* actualOutputGrid(DeviceBuffer<float>* realGrid, DeviceBuffer<float>* complexGrid);
+
+# if GMX_SYCL_DPCPP && GMX_FFT_MKL
+
+template<>
+DeviceBuffer<float>* actualOutputGrid<false>(DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* /* complexGrid */)
+{
+ return realGrid;
+};
+
+# else
+
+template<>
+DeviceBuffer<float>* actualOutputGrid<true>(DeviceBuffer<float>* /* realGrid */, DeviceBuffer<float>* complexGrid)
+{
+ return complexGrid;
+}
+
+# endif
+
+TEST_F(FFTTest3D, GpuReal5_6_9)
+{
+ // Ensure library resources are managed appropriately
+ ClfftInitializer clfftInitializer;
+ for (const auto& testDevice : getTestHardwareEnvironment()->getTestDeviceList())
{
- checker_.checkSequenceArray(ndata[2], rdata + i * rsize[2],
- gmx::formatString("backward %d", i).c_str());
+ TestReferenceChecker checker(data_.rootChecker()); // Must be inside the loop to avoid warnings
+ checker.setDefaultTolerance(defaultTolerance_);
+
+ const DeviceContext& deviceContext = testDevice->deviceContext();
+ setActiveDevice(testDevice->deviceInfo());
+ const DeviceStream& deviceStream = testDevice->deviceStream();
+
+ ivec realGridSize = { 5, 6, 9 };
+ ivec realGridSizePadded = { realGridSize[XX], realGridSize[YY], (realGridSize[ZZ] / 2 + 1) * 2 };
+ ivec complexGridSizePadded = { realGridSize[XX], realGridSize[YY], (realGridSize[ZZ] / 2) + 1 };
+
+ checker.checkVector(realGridSizePadded, "realGridSizePadded");
+ checker.checkVector(complexGridSizePadded, "complexGridSizePadded");
+
+ int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2];
+ int sizeInReals = size * 2;
+ GMX_RELEASE_ASSERT(sizeof(inputdata) / sizeof(inputdata[0]) >= size_t(sizeInReals),
+ "Size of inputdata is too small");
+
+ // Set up the complex grid. Complex numbers take twice the
+ // memory.
+ std::vector<float> complexGridValues(sizeInReals);
+ in_.resize(sizeInReals);
+ // Use std::copy to convert from double to real easily
+ std::copy(inputdata, inputdata + sizeInReals, in_.begin());
+
+# if GMX_GPU_CUDA
+ const FftBackend backend = FftBackend::Cufft;
+# elif GMX_GPU_OPENCL
+ const FftBackend backend = FftBackend::Ocl;
+# elif GMX_GPU_SYCL
+# if GMX_SYCL_HIPSYCL
+# if GMX_HIPSYCL_HAVE_HIP_TARGET
+ const FftBackend backend = FftBackend::SyclRocfft;
+# else
+ // Use stub backend so compilation succeeds
+ const FftBackend backend = FftBackend::Sycl;
+ // Don't complain about unused reference data
+ checker.disableUnusedEntriesCheck();
+ // Skip the rest of the test
+ GTEST_SKIP() << "Only rocFFT backend is supported with hipSYCL";
+# endif
+# elif GMX_SYCL_DPCPP
+# if GMX_FFT_MKL
+ const FftBackend backend = FftBackend::SyclMkl;
+# else
+ // Use stub backend so compilation succeeds
+ const FftBackend backend = FftBackend::Sycl;
+ // Don't complain about unused reference data
+ checker.disableUnusedEntriesCheck();
+ // Skip the rest of the test
+ GTEST_SKIP() << "Only MKL backend is supported with DPC++";
+# endif
+# else
+# error "Unsupported SYCL implementation"
+# endif
+# endif
+
+ SCOPED_TRACE("Allocating the device buffers");
+ DeviceBuffer<float> realGrid, complexGrid;
+ allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
+ if (sc_performOutOfPlaceFFT)
+ {
+ allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
+ }
+
+ MPI_Comm comm = MPI_COMM_NULL;
+ const bool allocateGrid = false;
+ std::array<int, 1> gridSizesInXForEachRank = { 0 };
+ std::array<int, 1> gridSizesInYForEachRank = { 0 };
+ const int nz = realGridSize[ZZ];
+ Gpu3dFft gpu3dFft(backend,
+ allocateGrid,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
+ sc_performOutOfPlaceFFT,
+ deviceContext,
+ deviceStream,
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ &realGrid,
+ actualOutputGrid<sc_performOutOfPlaceFFT>(&realGrid, &complexGrid));
+
+ // Transfer the real grid input data for the FFT
+ copyToDeviceBuffer(
+ &realGrid, in_.data(), 0, in_.size(), deviceStream, GpuApiCallBehavior::Sync, nullptr);
+
+ // Do the forward FFT to compute the complex grid
+ CommandEvent* timingEvent = nullptr;
+ gpu3dFft.perform3dFft(GMX_FFT_REAL_TO_COMPLEX, timingEvent);
+ deviceStream.synchronize();
+
+ // Check the complex grid (NB this data has not been normalized)
+ copyFromDeviceBuffer(complexGridValues.data(),
+ actualOutputGrid<sc_performOutOfPlaceFFT>(&realGrid, &complexGrid),
+ 0,
+ complexGridValues.size(),
+ deviceStream,
+ GpuApiCallBehavior::Sync,
+ nullptr);
+ checker.checkSequence(
+ complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex");
+
+ std::vector<float> outputRealGridValues(in_.size());
+ if (sc_performOutOfPlaceFFT)
+ {
+ // Clear the real grid input data for the FFT so we can
+ // compute the back transform into it and observe that it did
+ // the work expected.
+ copyToDeviceBuffer(&realGrid,
+ outputRealGridValues.data(),
+ 0,
+ outputRealGridValues.size(),
+ deviceStream,
+ GpuApiCallBehavior::Sync,
+ nullptr);
+ }
+
+ SCOPED_TRACE("Doing the back transform");
+ gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent);
+ deviceStream.synchronize();
+
+ // Transfer the real grid back from the device
+ copyFromDeviceBuffer(outputRealGridValues.data(),
+ &realGrid,
+ 0,
+ outputRealGridValues.size(),
+ deviceStream,
+ GpuApiCallBehavior::Sync,
+ nullptr);
+
+ checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues);
+
+ SCOPED_TRACE("Cleaning up");
+ freeDeviceBuffer(&realGrid);
+ if (sc_performOutOfPlaceFFT)
+ {
+ freeDeviceBuffer(&complexGrid);
+ }
}
}
+#endif
+
} // namespace
+} // namespace test
+} // namespace gmx