From 2b684af003232984fa5c59f9b94b8bc9f48c188d Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Thu, 30 Sep 2021 12:42:30 +0000 Subject: [PATCH] Add hipSYCL support to GPU 3DFFT Made the testing of the complex-to-real transform less likely to pass when there's a problem. Refs #3965 --- src/gromacs/ewald/pme_gpu_internal.cpp | 4 + src/gromacs/fft/CMakeLists.txt | 11 +- src/gromacs/fft/gpu_3dfft.cpp | 20 + src/gromacs/fft/gpu_3dfft.h | 9 +- src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp | 457 ++++++++++++++++++++++ src/gromacs/fft/gpu_3dfft_sycl_rocfft.h | 96 +++++ src/gromacs/fft/tests/fft.cpp | 28 +- 7 files changed, 613 insertions(+), 12 deletions(-) create mode 100644 src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp create mode 100644 src/gromacs/fft/gpu_3dfft_sycl_rocfft.h diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 185fea7de3..fa750002f5 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -616,7 +616,11 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu) #elif GMX_GPU_OPENCL const gmx::FftBackend backend = gmx::FftBackend::Ocl; #elif GMX_GPU_SYCL +# if GMX_SYCL_HIPSYCL + const gmx::FftBackend backend = gmx::FftBackend::SyclRocfft; +# else const gmx::FftBackend backend = gmx::FftBackend::Sycl; +# endif #else GMX_RELEASE_ASSERT(false, "Unknown GPU backend"); const gmx::FftBackend backend = gmx::FftBackend::Count; diff --git a/src/gromacs/fft/CMakeLists.txt b/src/gromacs/fft/CMakeLists.txt index 060f9b8d6b..13571c1564 100644 --- a/src/gromacs/fft/CMakeLists.txt +++ b/src/gromacs/fft/CMakeLists.txt @@ -74,14 +74,17 @@ elseif (GMX_GPU_OPENCL) gpu_3dfft_ocl.cpp ) elseif (GMX_GPU_SYCL) - # SYCL-TODO: proper implementation + if (GMX_SYCL_HIPSYCL) + set(3dfft_sycl_source gpu_3dfft_sycl_rocfft.cpp) + else() + set(3dfft_sycl_source gpu_3dfft.cpp gpu_3dfft_sycl.cpp) + endif() gmx_add_libgromacs_sources( # SYCL-specific sources - gpu_3dfft_sycl.cpp + ${3dfft_sycl_source} ) _gmx_add_files_to_property(SYCL_SOURCES - gpu_3dfft.cpp - gpu_3dfft_sycl.cpp + ${3dfft_sycl_source} ) endif() diff --git a/src/gromacs/fft/gpu_3dfft.cpp b/src/gromacs/fft/gpu_3dfft.cpp index 9b931cfb2f..17ffe13f68 100644 --- a/src/gromacs/fft/gpu_3dfft.cpp +++ b/src/gromacs/fft/gpu_3dfft.cpp @@ -52,6 +52,9 @@ # include "gpu_3dfft_ocl.h" #elif GMX_GPU_SYCL # include "gpu_3dfft_sycl.h" +# if GMX_SYCL_HIPSYCL +# include "gpu_3dfft_sycl_rocfft.h" +# endif #endif #if Heffte_FOUND @@ -132,6 +135,23 @@ Gpu3dFft::Gpu3dFft(FftBackend backend, # elif GMX_GPU_SYCL switch (backend) { +# if GMX_SYCL_HIPSYCL + case FftBackend::SyclRocfft: + impl_ = std::make_unique(allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid); + break; +# endif case FftBackend::Sycl: impl_ = std::make_unique(allocateGrids, comm, diff --git a/src/gromacs/fft/gpu_3dfft.h b/src/gromacs/fft/gpu_3dfft.h index a26dc8af3f..577108a154 100644 --- a/src/gromacs/fft/gpu_3dfft.h +++ b/src/gromacs/fft/gpu_3dfft.h @@ -67,10 +67,12 @@ class ArrayRef; */ enum class FftBackend { - Cufft, // supports only single-GPU - Ocl, // supports only single-GPU - Sycl, // Not supported currently + Cufft, // supports only single-GPU + Ocl, // supports only single-GPU HeFFTe_CUDA, + SyclMkl, // supports only single-GPU + SyclRocfft, // supports only single-GPU + Sycl, // stubs for not supported configurations Count }; @@ -126,6 +128,7 @@ private: class Impl; class ImplCuFft; class ImplOcl; + class ImplSyclRocfft; class ImplSycl; template diff --git a/src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp new file mode 100644 index 0000000000..8e59c7fe11 --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.cpp @@ -0,0 +1,457 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Implements GPU 3D FFT routines for hipSYCL via rocFFT. + * + * \author Andrey Alekseenko + * \author Mark Abraham + * + * For hipSYCL, in order to call FFT APIs from the respective vendors + * using the same DeviceStream as other operations, a vendor extension + * called "custom operations" is used (see hipSYCL + * doc/enqueue-custom-operation.md). That effectively enqueues an + * asynchronous host-side lambda into the same queue. The body of the + * lambda unpacks the runtime data structures to get the native + * handles and calls the native FFT APIs. + * + * For a 3D FFT, rocFFT requires a working buffer which it allocates + * itself if not provided. This might be slow enough to be worth + * optimizing. This working buffer could be provided in advance by + * calling rocfft_plan_get_work_buffer_size, allocating a buffer that + * persists suitably, and then using + * rocfft_execution_info_set_work_buffer in a custom operation. + * + * hipSYCL queues operate at a higher level of abstraction than hip + * streams, with the runtime distributing work to the latter to + * balance load. It is possible to set the HIP stream in + * rocfft_execution_info, but then there is no guarantee that a + * subsequent queue item will run using the same stream. So we + * currently do not attempt to set the stream. + * + * \ingroup module_fft + */ + +#include "gmxpre.h" + +#include "gpu_3dfft_sycl_rocfft.h" + +#include "gromacs/utility/enumerationhelpers.h" +#include "gromacs/utility/exceptions.h" + +#include + +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/gmxassert.h" + +#ifndef __HIPSYCL__ +# error This file can only be compiled with hipSYCL enabled +#endif + +#if !defined HIPSYCL_PLATFORM_ROCM +# error Only ROCM platform is supported for 3D FFT with hipSYCL +#endif + +#include "rocfft.h" + +namespace gmx +{ + +namespace +{ + +//! Model the kinds of 3D FFT implemented +enum class FftDirection : int +{ + RealToComplex, + ComplexToReal, + Count, +}; + +//! Strings that match enum rocfft_status_e in rocfft.h +const std::array c_rocfftErrorStrings = { + "success", + "failure", + "invalid argument value", + "invalid dimensions", + "invalid array type", + "invalid strides", + "invalid distance", + "invalid offset", + "invalid work buffer" +}; + +//! Helper for consistent error handling +void handleFftError(rocfft_status result, const std::string& msg) +{ + if (result != rocfft_status_success) + { + if (result <= rocfft_status_invalid_work_buffer) + { + GMX_THROW(gmx::InternalError(gmx::formatString( + "%s: (error code %d - %s)\n", msg.c_str(), result, c_rocfftErrorStrings[result]))); + } + else + { + GMX_THROW(gmx::InternalError(gmx::formatString("%s: (error code %d)\n", msg.c_str(), result))); + } + } +} + +//! Helper for consistent error handling +void handleFftError(rocfft_status result, const std::string& direction, const std::string& msg) +{ + if (result != rocfft_status_success) + { + handleFftError(result, msg + " doing " + direction); + } +} + +//! Provides RAII-style initialization of rocFFT library +class RocfftInitializer +{ +public: + RocfftInitializer() + { + rocfft_status result; + result = rocfft_setup(); + handleFftError(result, "rocfft_setup failure"); + } + ~RocfftInitializer() + { + // No need to handle any errors in a destructor, and + // anyway one cannot throw. + rocfft_cleanup(); + } +}; + +//! All the persistent data for planning an executing a 3D FFT +struct RocfftPlan +{ + //! Describes details of the data layout + rocfft_plan_description description = nullptr; + //! High level information about the plan + rocfft_plan plan = nullptr; + //! Destructor + ~RocfftPlan() + { + // No need to handle any errors in a destructor, + // and anyway one cannot throw. + if (plan) + { + rocfft_plan_destroy(plan); + } + if (description) + { + rocfft_plan_description_destroy(description); + } + } +}; + +//! Helper struct to reduce repetitive code setting up a 3D FFT plan +struct PlanSetupData +{ + //! Format of the input array (real or hermitian) + rocfft_array_type arrayType; + //! Strides through the input array for the three dimensions + std::array strides; + //! Total size of the input array (including padding) + size_t totalSize; +}; + +//! Compute the stride through the real 1D array +std::array makeRealStrides(ivec realGridSizePadded) +{ + return { 1, size_t(realGridSizePadded[ZZ]), size_t(realGridSizePadded[ZZ] * realGridSizePadded[YY]) }; +}; + +//! Compute the stride through the complex 1D array +std::array makeComplexStrides(ivec complexGridSizePadded) +{ + return { 1, + size_t(complexGridSizePadded[XX]), + size_t(complexGridSizePadded[XX] * complexGridSizePadded[YY]) }; +} + +//! Compute total grid size +size_t computeTotalSize(ivec gridSize) +{ + return size_t(gridSize[XX] * gridSize[YY] * gridSize[ZZ]); +} + +/*! \brief Prepare plans for the forward and reverse transformation. + * + * Because these require device-side allocations, some of them must be + * done in a SYCL queue. */ +RocfftPlan makePlan(const std::string& descriptiveString, + rocfft_transform_type transformType, + const PlanSetupData& inputPlanSetupData, + const PlanSetupData& outputPlanSetupData, + ArrayRef rocfftRealGridSize, + const DeviceStream& pmeStream) +{ + rocfft_plan_description description = nullptr; + rocfft_status result; + result = rocfft_plan_description_create(&description); + handleFftError(result, descriptiveString, "rocfft_plan_description_create failure"); + result = rocfft_plan_description_set_data_layout(description, + inputPlanSetupData.arrayType, + outputPlanSetupData.arrayType, + // No offsets are needed + nullptr, + nullptr, + inputPlanSetupData.strides.size(), + inputPlanSetupData.strides.data(), + inputPlanSetupData.totalSize, + outputPlanSetupData.strides.size(), + outputPlanSetupData.strides.data(), + outputPlanSetupData.totalSize); + handleFftError(result, descriptiveString, "rocfft_plan_description_set_data_layout failure"); + + // The plan creation depends on the identity of the GPU device, so + // we make sure it is made in the same queue where it will be + // used. The stream for execution can be set at the same time. + + // First set up device buffers to receive the rocfft status values + rocfft_plan plan = nullptr; + cl::sycl::buffer resultPlanCreate(1); + + // Submit the planning to the queue. This is necessary so that we + // can ensure that the allocations in the planning go to the right + // context. + { + auto queue = pmeStream.stream(); + // Make a buffer that is a view of the existing memory for a + // plan. + cl::sycl::buffer planView = + cl::sycl::make_async_writeback_view(&plan, cl::sycl::range(1), queue); + queue.submit([&](cl::sycl::handler& cgh) { + // Make the necessary accessors + auto a_plan = planView.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init); + auto a_resultPlanCreate = + resultPlanCreate.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init); + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& /*h*/) { + const int numBatches = 1; + // Unlike some other FFT APIs, in rocFFT the + // dimension of an FFT is the (vectorial) size + // of the problem (ie. rocfftRealGridSize), not + // the size of the input vector (which varies + // according to whether the input format is real + // or hermitian respectively for forward or + // reverse transforms). + a_resultPlanCreate[0] = rocfft_plan_create(&a_plan[0], + rocfft_placement_notinplace, + transformType, + rocfft_precision_single, + rocfftRealGridSize.size(), + rocfftRealGridSize.data(), + numBatches, + description); + }); + }); + } + // Check for errors that happened while running the hipSYCL custom + // operation. + handleFftError( + resultPlanCreate.get_host_access()[0], descriptiveString, "rocfft_plan_create failure"); + + return RocfftPlan{ description, plan }; +} + +} // namespace + +//! Impl class +class Gpu3dFft::ImplSyclRocfft::Impl +{ +public: + //! \copydoc Gpu3dFft::Impl::Impl + Impl(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + const int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + /*! \brief Handle initializing the rocFFT library + * + * Make sure the library is initialized before the plans, etc. and + * not destructed before they are. */ + RocfftInitializer init_; + //! Data for 3D FFT plans and execution + EnumerationArray plans_; + //! Handle to the real grid buffer + cl::sycl::buffer realGrid_; + //! Handle to the complex grid buffer + cl::sycl::buffer complexGrid_; + /*! \brief Copy of PME stream + * + * This copy is guaranteed by the SYCL standard to work as if + * it was the original. */ + cl::sycl::queue queue_; +}; + +Gpu3dFft::ImplSyclRocfft::Impl::Impl(bool allocateGrids, + MPI_Comm /*comm*/, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int /*nz*/, + bool performOutOfPlaceFFT, + const DeviceContext& /*context*/, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) : + plans_{ + makePlan("real-to-complex", + rocfft_transform_type_real_forward, + // input + PlanSetupData{ rocfft_array_type_real, + makeRealStrides(realGridSizePadded), + computeTotalSize(realGridSizePadded) }, + // output + PlanSetupData{ rocfft_array_type_hermitian_interleaved, + makeComplexStrides(complexGridSizePadded), + computeTotalSize(complexGridSizePadded) }, + // Note that rocFFT requires that we reverse the dimension order when planning + std::vector{ size_t(realGridSize[ZZ]), + size_t(realGridSize[YY]), + size_t(realGridSize[XX]) }, + pmeStream), + // For rocFFT, the complex-to-real setup is the logical + // converse of the real-to-complex. The PlanSetupData objects + // are the same, but used in the opposite sense of + // input/output. + makePlan("complex-to-real", + rocfft_transform_type_real_inverse, + // input + PlanSetupData{ rocfft_array_type_hermitian_interleaved, + makeComplexStrides(complexGridSizePadded), + computeTotalSize(complexGridSizePadded) }, + // output + PlanSetupData{ rocfft_array_type_real, + makeRealStrides(realGridSizePadded), + computeTotalSize(realGridSizePadded) }, + // Note that rocFFT requires that we reverse the dimension order when planning + std::vector{ size_t(realGridSize[ZZ]), + size_t(realGridSize[YY]), + size_t(realGridSize[XX]) }, + pmeStream), + }, + realGrid_(*realGrid->buffer_.get()), + complexGrid_(*complexGrid->buffer_.get()), + queue_(pmeStream.stream()) +{ + GMX_RELEASE_ASSERT(performOutOfPlaceFFT, "Only out-of-place FFT is implemented in hipSYCL"); + GMX_RELEASE_ASSERT(allocateGrids == false, "Grids need to be pre-allocated"); + GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1, + "FFT decomposition not implemented with SYCL backend"); +} + +void Gpu3dFft::ImplSyclRocfft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/) +{ + GMX_RELEASE_ASSERT((dir == GMX_FFT_REAL_TO_COMPLEX) || (dir == GMX_FFT_COMPLEX_TO_REAL), + "Only real-to-complex and complex-to-real FFTs are implemented in hipSYCL"); + FftDirection direction; + cl::sycl::buffer*inputGrid = nullptr, *outputGrid = nullptr; + if (dir == GMX_FFT_REAL_TO_COMPLEX) + { + direction = FftDirection::RealToComplex; + inputGrid = &impl_->realGrid_; + outputGrid = &impl_->complexGrid_; + } + else + { + direction = FftDirection::ComplexToReal; + inputGrid = &impl_->complexGrid_; + outputGrid = &impl_->realGrid_; + } + // Enqueue the 3D FFT work + impl_->queue_.submit([&](cl::sycl::handler& cgh) { + auto inputGridAccessor = inputGrid->get_access(cgh, cl::sycl::read_only, cl::sycl::no_init); + auto outputGridAccessor = outputGrid->get_access(cgh, cl::sycl::write_only, cl::sycl::no_init); + // Use a hipSYCL custom operation to access the native buffers + // needed to call rocFFT + cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& h) { + void* d_inputGrid = h.get_native_mem(inputGridAccessor); + void* d_outputGrid = h.get_native_mem(outputGridAccessor); + // Don't check results generated asynchronously, + // because we don't know what to do with them + rocfft_execute(impl_->plans_[direction].plan, &d_inputGrid, &d_outputGrid, nullptr); + }); + }); +} + +Gpu3dFft::ImplSyclRocfft::ImplSyclRocfft(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + const int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid) : + impl_(std::make_unique(allocateGrids, + comm, + gridSizesInXForEachRank, + gridSizesInYForEachRank, + nz, + performOutOfPlaceFFT, + context, + pmeStream, + realGridSize, + realGridSizePadded, + complexGridSizePadded, + realGrid, + complexGrid)) +{ +} + +Gpu3dFft::ImplSyclRocfft::~ImplSyclRocfft() = default; + +} // namespace gmx diff --git a/src/gromacs/fft/gpu_3dfft_sycl_rocfft.h b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.h new file mode 100644 index 0000000000..9dc428c79f --- /dev/null +++ b/src/gromacs/fft/gpu_3dfft_sycl_rocfft.h @@ -0,0 +1,96 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +/*! \internal \file + * \brief Declares the GPU 3D FFT routines for hipSYCL via rocFFT. + * + * \author Mark Abraham + * \ingroup module_fft + */ + +#ifndef GMX_FFT_GPU_3DFFT_SYCL_ROCFFT_H +#define GMX_FFT_GPU_3DFFT_SYCL_ROCFFT_H + +#include + +#include "gromacs/fft/fft.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/gmxmpi.h" +#include "gpu_3dfft_impl.h" + +class DeviceContext; +class DeviceStream; + +namespace gmx +{ + +/*! \internal \brief A 3D FFT wrapper class for performing R2C/C2R + * transforms using rocFFT for hipSYCL targetting ROCm devices. + */ +class Gpu3dFft::ImplSyclRocfft : public Gpu3dFft::Impl +{ +public: + //! \copydoc Gpu3dFft::Impl::Impl + ImplSyclRocfft(bool allocateGrids, + MPI_Comm comm, + ArrayRef gridSizesInXForEachRank, + ArrayRef gridSizesInYForEachRank, + int nz, + bool performOutOfPlaceFFT, + const DeviceContext& context, + const DeviceStream& pmeStream, + ivec realGridSize, + ivec realGridSizePadded, + ivec complexGridSizePadded, + DeviceBuffer* realGrid, + DeviceBuffer* complexGrid); + + //! \copydoc Gpu3dFft::Impl::~Impl + ~ImplSyclRocfft() override; + + //! \copydoc Gpu3dFft::Impl::perform3dFft + void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override; + +private: + class Impl; + + std::unique_ptr impl_; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/fft/tests/fft.cpp b/src/gromacs/fft/tests/fft.cpp index c421abcb4b..dfe7189795 100644 --- a/src/gromacs/fft/tests/fft.cpp +++ b/src/gromacs/fft/tests/fft.cpp @@ -81,7 +81,7 @@ namespace test * initializers, and we would not have to do so much useless copying * during the unit tests below. */ -const double inputdata[] = { +const double inputdata[500] = { // print ",\n".join([",".join(["%4s"%(random.randint(-99,99)/10.,) for i in range(25)]) for j in range(20)]) -3.5, 6.3, 1.2, 0.3, 1.1, -5.7, 5.8, -1.9, -6.3, -1.4, 7.4, 2.4, -9.9, -7.2, 5.4, 6.1, -1.9, -7.6, 1.4, -3.5, 0.7, 5.6, -4.2, -1.1, -4.4, -6.3, -7.2, 4.6, -3.0, -0.9, 7.2, 2.5, @@ -364,7 +364,7 @@ TEST_F(FFTTest3D, Real5_6_9) checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues); } -#if GMX_GPU_CUDA || GMX_GPU_OPENCL +#if GMX_GPU_CUDA || GMX_GPU_OPENCL || (GMX_GPU_SYCL && GMX_SYCL_HIPSYCL) TEST_F(FFTTest3D, GpuReal5_6_9) { // Ensure library resources are managed appropriately @@ -387,6 +387,8 @@ TEST_F(FFTTest3D, GpuReal5_6_9) int size = complexGridSizePadded[0] * complexGridSizePadded[1] * complexGridSizePadded[2]; int sizeInReals = size * 2; + GMX_RELEASE_ASSERT(sizeof(inputdata) / sizeof(inputdata[0]) >= size_t(sizeInReals), + "Size of inputdata is too small"); // Set up the complex grid. Complex numbers take twice the // memory. @@ -395,7 +397,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9) // Use std::copy to convert from double to real easily std::copy(inputdata, inputdata + sizeInReals, in_.begin()); - // Allocate the device buffers + SCOPED_TRACE("Allocating the device buffers"); DeviceBuffer realGrid, complexGrid; allocateDeviceBuffer(&realGrid, in_.size(), deviceContext); allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext); @@ -404,6 +406,10 @@ TEST_F(FFTTest3D, GpuReal5_6_9) const FftBackend backend = FftBackend::Cufft; # elif GMX_GPU_OPENCL const FftBackend backend = FftBackend::Ocl; +# elif GMX_GPU_SYCL +# if GMX_SYCL_HIPSYCL + const FftBackend backend = FftBackend::SyclRocfft; +# endif # endif const bool performOutOfPlaceFFT = true; MPI_Comm comm = MPI_COMM_NULL; @@ -446,12 +452,23 @@ TEST_F(FFTTest3D, GpuReal5_6_9) checker.checkSequence( complexGridValues.begin(), complexGridValues.end(), "ComplexGridAfterRealToComplex"); - // Do the back transform + // Clear the real grid input data for the FFT so we can + // compute the back transform into it and observe that it did + // the work expected. + std::vector outputRealGridValues(in_.size()); + copyToDeviceBuffer(&realGrid, + outputRealGridValues.data(), + 0, + outputRealGridValues.size(), + deviceStream, + GpuApiCallBehavior::Sync, + nullptr); + + SCOPED_TRACE("Doing the back transform"); gpu3dFft.perform3dFft(GMX_FFT_COMPLEX_TO_REAL, timingEvent); deviceStream.synchronize(); // Transfer the real grid back from the device - std::vector outputRealGridValues(in_.size()); copyFromDeviceBuffer(outputRealGridValues.data(), &realGrid, 0, @@ -462,6 +479,7 @@ TEST_F(FFTTest3D, GpuReal5_6_9) checkRealGrid(realGridSize, realGridSizePadded, in_, outputRealGridValues); + SCOPED_TRACE("Cleaning up"); freeDeviceBuffer(&realGrid); freeDeviceBuffer(&complexGrid); } -- 2.22.0