if (pme_gpu_settings(pmeGpu).performGPUFFT)
{
pmeGpu->archSpecific->fftSetup.resize(0);
- const bool useDecomposition = pme_gpu_settings(pmeGpu).useDecomposition;
- const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
- PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid;
+ const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
+ const bool allocateGrid = false;
+ MPI_Comm comm = MPI_COMM_NULL;
+ std::array<int, 1> gridOffsetsInXForEachRank = { 0 };
+ std::array<int, 1> gridOffsetsInYForEachRank = { 0 };
+#if GMX_GPU_CUDA
+ const gmx::FftBackend backend = gmx::FftBackend::Cufft;
+#elif GMX_GPU_OPENCL
+ const gmx::FftBackend backend = gmx::FftBackend::Ocl;
+#elif GMX_GPU_SYCL
+ const gmx::FftBackend backend = gmx::FftBackend::Sycl;
+#else
+ GMX_RELEASE_ASSERT(false, "Unknown GPU backend");
+ const gmx::FftBackend backend = gmx::FftBackend::Count;
+#endif
+
+ PmeGpuGridParams& grid = pme_gpu_get_kernel_params_base_ptr(pmeGpu)->grid;
for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
pmeGpu->archSpecific->fftSetup.push_back(
- std::make_unique<gmx::Gpu3dFft>(grid.realGridSize,
- grid.realGridSizePadded,
- grid.complexGridSizePadded,
- useDecomposition,
+ std::make_unique<gmx::Gpu3dFft>(backend,
+ allocateGrid,
+ comm,
+ gridOffsetsInXForEachRank,
+ gridOffsetsInYForEachRank,
+ grid.realGridSize[ZZ],
performOutOfPlaceFFT,
pmeGpu->archSpecific->deviceContext_,
pmeGpu->archSpecific->pmeStream_,
- grid.d_realGrid[gridIndex],
- grid.d_fourierGrid[gridIndex]));
+ grid.realGridSize,
+ grid.realGridSizePadded,
+ grid.complexGridSizePadded,
+ &(grid.d_realGrid[gridIndex]),
+ &(grid.d_fourierGrid[gridIndex])));
}
}
}
fft.cpp
fft5d.cpp
parallel_3dfft.cpp
+ gpu_3dfft.cpp
)
if (GMX_FFT_FFTPACK)
if (GMX_GPU_CUDA)
gmx_add_libgromacs_sources(
# CUDA-specific sources
- gpu_3dfft.cu
+ gpu_3dfft_cufft.cu
+ )
+ _gmx_add_files_to_property(CUDA_SOURCES
+ # Must add these files so they can include cuda_runtime.h
+ gpu_3dfft.cpp
)
elseif (GMX_GPU_OPENCL)
gmx_add_libgromacs_sources(
gpu_3dfft_sycl.cpp
)
_gmx_add_files_to_property(SYCL_SOURCES
- gpu_3dfft_sycl.cpp
- )
-else()
- gmx_add_libgromacs_sources(
- # Stub sources for CPU-only build
gpu_3dfft.cpp
+ gpu_3dfft_sycl.cpp
)
endif()
- * \brief Implements stub GPU 3D FFT routines for CPU-only builds
+ * \brief Implements the backend-dispatching GPU 3D FFT wrapper, with stubs for CPU-only builds
*
* \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Gaurav Garg <gaugarg@nvidia.com>
* \ingroup module_fft
*/
#include "gmxpre.h"
#include "gpu_3dfft.h"
+#include "gpu_3dfft_impl.h"
+#if GMX_GPU_CUDA
+# include "gpu_3dfft_cufft.h"
+#elif GMX_GPU_OPENCL
+# include "gpu_3dfft_ocl.h"
+#elif GMX_GPU_SYCL
+# include "gpu_3dfft_sycl.h"
+#endif
+
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
namespace gmx
# pragma clang diagnostic ignored "-Wmissing-noreturn"
#endif
-class Gpu3dFft::Impl
+#if (GMX_GPU_CUDA || GMX_GPU_OPENCL || GMX_GPU_SYCL)
+
+// GPU-enabled builds: creates the implementation object for the requested backend.
+// Each GPU framework accepts exactly one backend value (CUDA: Cufft, OpenCL: Ocl,
+// SYCL: Sycl); any other value throws InternalError. All remaining arguments are
+// forwarded unchanged to the constructor of the backend implementation.
+Gpu3dFft::Gpu3dFft(FftBackend backend,
+ bool allocateGrids,
+ MPI_Comm comm,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ const int nz,
+ bool performOutOfPlaceFFT,
+ const DeviceContext& context,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid)
{
-};
+# if GMX_GPU_CUDA
+ // CUDA build: only the cuFFT backend is available
+ switch (backend)
+ {
+ case FftBackend::Cufft:
+ impl_ = std::make_unique<Gpu3dFft::ImplCuFft>(allocateGrids,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
+ performOutOfPlaceFFT,
+ context,
+ pmeStream,
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ realGrid,
+ complexGrid);
+ break;
+ default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+ }
+# elif GMX_GPU_OPENCL
+ // OpenCL build: only the clFFT-based backend is available
+ switch (backend)
+ {
+ case FftBackend::Ocl:
+ impl_ = std::make_unique<Gpu3dFft::ImplOcl>(allocateGrids,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
+ performOutOfPlaceFFT,
+ context,
+ pmeStream,
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ realGrid,
+ complexGrid);
+ break;
+ default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+ }
+# elif GMX_GPU_SYCL
+ // SYCL build: only the SYCL backend can be requested
+ switch (backend)
+ {
+ case FftBackend::Sycl:
+ impl_ = std::make_unique<Gpu3dFft::ImplSycl>(allocateGrids,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
+ performOutOfPlaceFFT,
+ context,
+ pmeStream,
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ realGrid,
+ complexGrid);
+ break;
+ default: GMX_THROW(InternalError("Unsupported FFT backend requested"));
+ }
+# endif
+}
-Gpu3dFft::Gpu3dFft(ivec /*realGridSize*/,
- ivec /*realGridSizePadded*/,
- ivec /*complexGridSizePadded*/,
- const bool /*useDecomposition*/,
- const bool /*performOutOfPlaceFFT*/,
+#else
+
+// CPU-only builds: construction always throws InternalError, so all parameters are unused.
+Gpu3dFft::Gpu3dFft(FftBackend /*backend */,
+ bool /*allocateGrids*/,
+ MPI_Comm /*comm*/,
+ ArrayRef<const int> /*gridSizesInXForEachRank*/,
+ ArrayRef<const int> /*gridSizesInYForEachRank*/,
+ const int /*nz*/,
+ bool /*performOutOfPlaceFFT*/,
const DeviceContext& /*context*/,
const DeviceStream& /*pmeStream*/,
- DeviceBuffer<float> /*realGrid*/,
- DeviceBuffer<float> /*complexGrid*/)
+ ivec /*realGridSize*/,
+ ivec /*realGridSizePadded*/,
+ ivec /*complexGridSizePadded*/,
+ DeviceBuffer<float>* /*realGrid*/,
+ DeviceBuffer<float>* /*complexGrid*/)
{
GMX_THROW(InternalError("Cannot run GPU routines in a CPU-only configuration"));
}
+#endif
+
Gpu3dFft::~Gpu3dFft() = default;
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void Gpu3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
+// Forwards the transform request to the backend implementation created by the constructor.
+void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
{
- GMX_THROW(InternalError("Cannot run GPU routines in a CPU-only configuration"));
+ // impl_ is only assigned by the GPU-enabled constructor; the CPU-only constructor
+ // throws instead, so a null impl_ here indicates a CPU-only configuration.
+ GMX_RELEASE_ASSERT(impl_ != nullptr, "Cannot run GPU routines in a CPU-only configuration");
+ impl_->perform3dFft(dir, timingEvent);
}
#ifdef __clang__
*
* \author Aleksei Iupinov <a.yupinov@gmail.com>
* \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Gaurav Garg <gaugarg@nvidia.com>
* \ingroup module_fft
*/
#include "gromacs/fft/fft.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
class DeviceContext;
class DeviceStream;
namespace gmx
{
+template<typename T>
+class ArrayRef;
+
+/*! \internal \brief
+ * Enum specifying all GPU FFT backends supported by GROMACS.
+ *
+ * Some of the backends support only single GPU, some only multi-node, multi-GPU.
+ * The Gpu3dFft constructor accepts only the backend that matches the GPU framework
+ * the build was configured with and throws for any other value.
+ */
+enum class FftBackend
+{
+ Cufft, //!< cuFFT (CUDA builds); supports only single-GPU
+ Ocl, //!< clFFT (OpenCL builds); supports only single-GPU
+ Sycl, //!< SYCL builds; not supported currently
+ Count //!< Not a backend; presumably the number of backends, used as a placeholder value
+};
+
/*! \internal \brief
* A 3D FFT class for performing R2C/C2R transforms
- * \todo Make this class actually parallel over multiple GPUs
*/
class Gpu3dFft
{
public:
/*! \brief
- * Constructs GPU FFT plans for performing 3D FFT on a PME grid.
+ * Construct 3D FFT object for given backend
*
- * \param[in] realGridSize Dimensions of the real grid
- * \param[in] realGridSizePadded Dimensions of the real grid with padding
- * \param[in] complexGridSizePadded Dimensions of the real grid with padding
- * \param[in] useDecomposition Whether PME decomposition will be used
- * \param[in] performOutOfPlaceFFT Whether the FFT will be performed out-of-place
- * \param[in] context GPU context.
- * \param[in] pmeStream GPU stream for PME.
- * \param[in] realGrid Device buffer of floats for the real grid
- * \param[in] complexGrid Device buffer of complex floats for the complex grid
+ * \param[in] backend FFT backend to be instantiated
+ * \param[in] allocateGrids True if fft grids are to be allocated, false if pre-allocated
+ * \param[in] comm MPI communicator, used with distributed-FFT backends
+ * \param[in] gridSizesInXForEachRank Number of grid points used with each rank in X-dimension
+ * \param[in] gridSizesInYForEachRank Number of grid points used with each rank in Y-dimension
+ * \param[in] nz Grid dimension in Z
+ * \param[in] performOutOfPlaceFFT Whether the FFT will be performed out-of-place
+ * \param[in] context GPU context.
+ * \param[in] pmeStream GPU stream for PME.
+ * \param[in,out] realGridSize Dimensions of the local real grid, out if allocateGrids=true
+ * \param[in,out] realGridSizePadded Dimensions of the local real grid with padding, out if allocateGrids=true
+ * \param[in,out] complexGridSizePadded Dimensions of the local complex grid with padding, out if allocateGrids=true
+ * \param[in,out] realGrid Device buffer of floats for the local real grid, out if allocateGrids=true
+ * \param[in,out] complexGrid Device buffer of complex floats for the local complex grid, out if allocateGrids=true
*/
- Gpu3dFft(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- bool useDecomposition,
+ Gpu3dFft(FftBackend backend,
+ bool allocateGrids,
+ MPI_Comm comm,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ int nz,
bool performOutOfPlaceFFT,
const DeviceContext& context,
const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid);
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid);
/*! \brief Destroys the FFT plans. */
~Gpu3dFft();
private:
class Impl;
+ class ImplCuFft;
+ class ImplOcl;
+ class ImplSycl;
+
std::unique_ptr<Impl> impl_;
};
#include "gmxpre.h"
-#include "gpu_3dfft.h"
-
-#include <cufft.h>
+#include "gpu_3dfft_cufft.h"
#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
namespace gmx
{
-
-class Gpu3dFft::Impl
-{
-public:
- Impl(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- bool useDecomposition,
- bool performOutOfPlaceFFT,
- const DeviceContext& context,
- const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid);
- ~Impl();
-
- cufftHandle planR2C_;
- cufftHandle planC2R_;
- cufftReal* realGrid_;
- cufftComplex* complexGrid_;
-};
-
static void handleCufftError(cufftResult_t status, const char* msg)
{
if (status != CUFFT_SUCCESS)
}
}
-Gpu3dFft::Impl::Impl(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- const bool useDecomposition,
- const bool /*performOutOfPlaceFFT*/,
- const DeviceContext& /*context*/,
- const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid) :
- realGrid_(reinterpret_cast<cufftReal*>(realGrid)),
- complexGrid_(reinterpret_cast<cufftComplex*>(complexGrid))
+Gpu3dFft::ImplCuFft::ImplCuFft(bool allocateGrids,
+ MPI_Comm /*comm*/,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ const int /*nz*/,
+ bool /*performOutOfPlaceFFT*/,
+ const DeviceContext& /*context*/,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid) :
+ realGrid_(reinterpret_cast<cufftReal*>(*realGrid)),
+ complexGrid_(reinterpret_cast<cufftComplex*>(*complexGrid))
{
- GMX_RELEASE_ASSERT(!useDecomposition, "FFT decomposition not implemented");
+ GMX_RELEASE_ASSERT(allocateGrids == false, "Grids needs to be pre-allocated");
+ GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+ "FFT decomposition not implemented with cuFFT backend");
const int complexGridSizePaddedTotal =
complexGridSizePadded[XX] * complexGridSizePadded[YY] * complexGridSizePadded[ZZ];
handleCufftError(result, "cufftSetStream C2R failure");
}
-Gpu3dFft::Impl::~Impl()
+Gpu3dFft::ImplCuFft::~ImplCuFft()
{
cufftResult_t result;
result = cufftDestroy(planR2C_);
handleCufftError(result, "cufftDestroy C2R failure");
}
-void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
+// Executes the cuFFT plan that matches dir on the grids given at construction:
+// real-to-complex uses planR2C_, complex-to-real uses planC2R_. Any other direction
+// is a programming error and is rejected, matching the OpenCL implementation, which
+// also refuses unsupported directions. The timing event is not used by this backend.
+void Gpu3dFft::ImplCuFft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
{
cufftResult_t result;
if (dir == GMX_FFT_REAL_TO_COMPLEX)
{
- result = cufftExecR2C(impl_->planR2C_, impl_->realGrid_, impl_->complexGrid_);
+ result = cufftExecR2C(planR2C_, realGrid_, complexGrid_);
handleCufftError(result, "cuFFT R2C execution failure");
}
else
{
+ // Only two plans exist; do not silently run C2R for an unrelated direction
+ GMX_RELEASE_ASSERT(dir == GMX_FFT_COMPLEX_TO_REAL,
+ "The chosen 3D-FFT case is not implemented on GPUs");
- result = cufftExecC2R(impl_->planC2R_, impl_->complexGrid_, impl_->realGrid_);
+ result = cufftExecC2R(planC2R_, complexGrid_, realGrid_);
handleCufftError(result, "cuFFT C2R execution failure");
}
}
-Gpu3dFft::Gpu3dFft(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- const bool useDecomposition,
- const bool performOutOfPlaceFFT,
- const DeviceContext& context,
- const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid) :
- impl_(std::make_unique<Impl>(realGridSize,
- realGridSizePadded,
- complexGridSizePadded,
- useDecomposition,
- performOutOfPlaceFFT,
- context,
- pmeStream,
- realGrid,
- complexGrid))
-{
-}
-
-Gpu3dFft::~Gpu3dFft() = default;
-
} // namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the cuFFT-based implementation of the GPU 3D FFT routines.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_CUFFT_H
+#define GMX_FFT_GPU_3DFFT_CUFFT_H
+
+#include <memory>
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gpu_3dfft_impl.h"
+
+#include <cufft.h>
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using cuFFT
+ *
+ * Holds one cuFFT plan per transform direction. Both plans are destroyed in the
+ * destructor, so objects of this class must not be copied.
+ */
+class Gpu3dFft::ImplCuFft : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplCuFft(bool                 allocateGrids,
+              MPI_Comm             comm,
+              ArrayRef<const int>  gridSizesInXForEachRank,
+              ArrayRef<const int>  gridSizesInYForEachRank,
+              int                  nz,
+              bool                 performOutOfPlaceFFT,
+              const DeviceContext& context,
+              const DeviceStream&  pmeStream,
+              ivec                 realGridSize,
+              ivec                 realGridSizePadded,
+              ivec                 complexGridSizePadded,
+              DeviceBuffer<float>* realGrid,
+              DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplCuFft() override;
+
+    //! Copying is disabled: a copy would destroy the same cuFFT plans a second time
+    ImplCuFft(const ImplCuFft&) = delete;
+    //! Copy assignment is disabled for the same reason as copy construction
+    ImplCuFft& operator=(const ImplCuFft&) = delete;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    //! Plan executed for real-to-complex transforms
+    cufftHandle planR2C_;
+    //! Plan executed for complex-to-real transforms
+    cufftHandle planC2R_;
+    //! Real grid: the device buffer passed at construction, viewed as cufftReal
+    cufftReal* realGrid_;
+    //! Complex grid: the device buffer passed at construction, viewed as cufftComplex
+    cufftComplex* complexGrid_;
+};
+
+} // namespace gmx
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the implementation base class shared by all GPU 3D FFT backends.
+ *
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_IMPL_H
+#define GMX_FFT_GPU_3DFFT_IMPL_H
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/fft/gpu_3dfft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+
+
+namespace gmx
+{
+/*! \internal \brief
+ * Impl base class for all FFT backends
+ *
+ * Gpu3dFft holds the selected backend through a pointer to this class and
+ * forwards perform3dFft() to it; each backend derives from this class and
+ * implements that method.
+ */
+class Gpu3dFft::Impl
+{
+public:
+ //! Default constructor
+ Impl() = default;
+
+ /*! \brief
+ * Constructs GPU FFT plans for performing 3D FFT on a PME grid.
+ *
+ * NOTE(review): no definition of this overload is visible in this change and the
+ * backend constructors shown do not call it; it appears to serve as the
+ * documentation target for the backends' \\copydoc comments -- confirm.
+ *
+ * \param[in] allocateGrids True if fft grids are to be allocated, false if pre-allocated
+ * \param[in] comm MPI communicator, used with distributed-FFT backends
+ * \param[in] gridSizesInXForEachRank Number of grid points used with each rank in X-dimension
+ * \param[in] gridSizesInYForEachRank Number of grid points used with each rank in Y-dimension
+ * \param[in] nz Grid dimension in Z
+ * \param[in] performOutOfPlaceFFT Whether the FFT will be performed out-of-place
+ * \param[in] context GPU context.
+ * \param[in] pmeStream GPU stream for PME.
+ * \param[in,out] realGridSize Dimensions of the local real grid, out if allocateGrids=true
+ * \param[in,out] realGridSizePadded Dimensions of the local real grid with padding, out if allocateGrids=true
+ * \param[in,out] complexGridSizePadded Dimensions of the local complex grid with padding, out if allocateGrids=true
+ * \param[in,out] realGrid Device buffer of floats for the local real grid, out if allocateGrids=true
+ * \param[in,out] complexGrid Device buffer of complex floats for the local complex grid, out if allocateGrids=true
+ */
+ Impl(bool allocateGrids,
+ MPI_Comm comm,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ int nz,
+ bool performOutOfPlaceFFT,
+ const DeviceContext& context,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid);
+
+ /*! \brief Default destructor; virtual because backends are destroyed through this base class */
+ virtual ~Impl() = default;
+
+ //! \copydoc Gpu3dFft::perform3dFft
+ virtual void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) = 0;
+};
+
+} // namespace gmx
+
+#endif
#include "gmxpre.h"
-#include "gpu_3dfft.h"
+#include "gpu_3dfft_ocl.h"
#include <array>
#include <vector>
namespace gmx
{
-
-class Gpu3dFft::Impl
-{
-public:
- Impl(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- bool useDecomposition,
- bool performOutOfPlaceFFT,
- const DeviceContext& context,
- const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid);
- ~Impl();
-
- clfftPlanHandle planR2C_;
- clfftPlanHandle planC2R_;
- std::vector<cl_command_queue> commandStreams_;
- cl_mem realGrid_;
- cl_mem complexGrid_;
-};
-
//! Throws the exception on clFFT error
static void handleClfftError(clfftStatus status, const char* msg)
{
}
}
-Gpu3dFft::Impl::Impl(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- const bool useDecomposition,
- const bool performOutOfPlaceFFT,
- const DeviceContext& context,
- const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid) :
- realGrid_(realGrid), complexGrid_(complexGrid)
+Gpu3dFft::ImplOcl::ImplOcl(bool allocateGrids,
+ MPI_Comm /*comm*/,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ const int /*nz*/,
+ bool performOutOfPlaceFFT,
+ const DeviceContext& context,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid) :
+ realGrid_(*realGrid), complexGrid_(*complexGrid)
{
- GMX_RELEASE_ASSERT(!useDecomposition, "FFT decomposition not implemented");
+ GMX_RELEASE_ASSERT(allocateGrids == false, "Grids needs to be pre-allocated");
+ GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
+ "FFT decomposition not implemented with OpenCL backend");
cl_context clContext = context.context();
commandStreams_.push_back(pmeStream.stream());
// TODO: disable last transpose (clfftSetPlanTransposeResult)
}
-Gpu3dFft::Impl::~Impl()
+Gpu3dFft::ImplOcl::~ImplOcl()
{
clfftDestroyPlan(&planR2C_);
clfftDestroyPlan(&planC2R_);
}
-void Gpu3dFft::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
+void Gpu3dFft::ImplOcl::perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent)
{
cl_mem tempBuffer = nullptr;
constexpr std::array<cl_event, 0> waitEvents{ {} };
switch (dir)
{
case GMX_FFT_REAL_TO_COMPLEX:
- plan = impl_->planR2C_;
+ plan = planR2C_;
direction = CLFFT_FORWARD;
- inputGrids = &impl_->realGrid_;
- outputGrids = &impl_->complexGrid_;
+ inputGrids = &realGrid_;
+ outputGrids = &complexGrid_;
break;
case GMX_FFT_COMPLEX_TO_REAL:
- plan = impl_->planC2R_;
+ plan = planC2R_;
direction = CLFFT_BACKWARD;
- inputGrids = &impl_->complexGrid_;
- outputGrids = &impl_->realGrid_;
+ inputGrids = &complexGrid_;
+ outputGrids = &realGrid_;
break;
default:
GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
}
handleClfftError(clfftEnqueueTransform(plan,
direction,
- impl_->commandStreams_.size(),
- impl_->commandStreams_.data(),
+ commandStreams_.size(),
+ commandStreams_.data(),
waitEvents.size(),
waitEvents.data(),
timingEvent,
"clFFT execution failure");
}
-Gpu3dFft::Gpu3dFft(ivec realGridSize,
- ivec realGridSizePadded,
- ivec complexGridSizePadded,
- const bool useDecomposition,
- const bool performOutOfPlaceFFT,
- const DeviceContext& context,
- const DeviceStream& pmeStream,
- DeviceBuffer<float> realGrid,
- DeviceBuffer<float> complexGrid) :
- impl_(std::make_unique<Impl>(realGridSize,
- realGridSizePadded,
- complexGridSizePadded,
- useDecomposition,
- performOutOfPlaceFFT,
- context,
- pmeStream,
- realGrid,
- complexGrid))
-{
-}
-
-Gpu3dFft::~Gpu3dFft() = default;
-
} // namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the clFFT-based (OpenCL) implementation of the GPU 3D FFT routines.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_OCL_H
+#define GMX_FFT_GPU_3DFFT_OCL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/gmxmpi.h"
+
+#include <vector>
+
+#include <clFFT.h>
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using clFFT
+ *
+ * Holds one clFFT plan per transform direction. Both plans are destroyed in the
+ * destructor, so objects of this class must not be copied.
+ */
+class Gpu3dFft::ImplOcl : public Gpu3dFft::Impl
+{
+public:
+    //! \copydoc Gpu3dFft::Impl::Impl
+    ImplOcl(bool                 allocateGrids,
+            MPI_Comm             comm,
+            ArrayRef<const int>  gridSizesInXForEachRank,
+            ArrayRef<const int>  gridSizesInYForEachRank,
+            int                  nz,
+            bool                 performOutOfPlaceFFT,
+            const DeviceContext& context,
+            const DeviceStream&  pmeStream,
+            ivec                 realGridSize,
+            ivec                 realGridSizePadded,
+            ivec                 complexGridSizePadded,
+            DeviceBuffer<float>* realGrid,
+            DeviceBuffer<float>* complexGrid);
+
+    //! \copydoc Gpu3dFft::Impl::~Impl
+    ~ImplOcl() override;
+
+    //! Copying is disabled: a copy would destroy the same clFFT plans a second time
+    ImplOcl(const ImplOcl&) = delete;
+    //! Copy assignment is disabled for the same reason as copy construction
+    ImplOcl& operator=(const ImplOcl&) = delete;
+
+    //! \copydoc Gpu3dFft::Impl::perform3dFft
+    void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+
+private:
+    //! Plan enqueued for real-to-complex transforms
+    clfftPlanHandle planR2C_;
+    //! Plan enqueued for complex-to-real transforms
+    clfftPlanHandle planC2R_;
+    //! Command queues the transforms are enqueued on; receives the PME stream at construction
+    std::vector<cl_command_queue> commandStreams_;
+    //! Real grid: copy of the device buffer handle passed at construction
+    cl_mem realGrid_;
+    //! Complex grid: copy of the device buffer handle passed at construction
+    cl_mem complexGrid_;
+};
+
+} // namespace gmx
+
+#endif
#include "gmxpre.h"
-#include "gpu_3dfft.h"
+#include "gpu_3dfft_sycl.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/exceptions.h"
namespace gmx
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-noreturn"
-class Gpu3dFft::Impl
-{
-};
-
-Gpu3dFft::Gpu3dFft(ivec /*realGridSize*/,
- ivec /*realGridSizePadded*/,
- ivec /*complexGridSizePadded*/,
- const bool /*useDecomposition*/,
- const bool /*performOutOfPlaceFFT*/,
- const DeviceContext& /*context*/,
- const DeviceStream& /*pmeStream*/,
- DeviceBuffer<float> /*realGrid*/,
- DeviceBuffer<float> /*complexGrid*/)
+// Placeholder constructor: ignores every argument and always throws NotImplementedError.
+Gpu3dFft::ImplSycl::ImplSycl(bool /*allocateGrids*/,
+ MPI_Comm /*comm*/,
+ ArrayRef<const int> /*gridSizesInXForEachRank*/,
+ ArrayRef<const int> /*gridSizesInYForEachRank*/,
+ const int /*nz*/,
+ bool /*performOutOfPlaceFFT*/,
+ const DeviceContext& /*context*/,
+ const DeviceStream& /*pmeStream*/,
+ ivec /*realGridSize*/,
+ ivec /*realGridSizePadded*/,
+ ivec /*complexGridSizePadded*/,
+ DeviceBuffer<float>* /*realGrid*/,
+ DeviceBuffer<float>* /*complexGrid*/)
{
GMX_THROW(NotImplementedError("GPU 3DFFT is not implemented in SYCL"));
}
-Gpu3dFft::~Gpu3dFft() = default;
+Gpu3dFft::ImplSycl::~ImplSycl() = default;
-void Gpu3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
+// Placeholder: ignores both arguments and always throws NotImplementedError.
+void Gpu3dFft::ImplSycl::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
{
GMX_THROW(NotImplementedError("Not implemented on SYCL yet"));
}
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2016,2017,2018,2019,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the SYCL placeholder implementation of the GPU 3D FFT routines.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Gaurav Garg <gaugarg@nvidia.com>
+ * \ingroup module_fft
+ */
+
+#ifndef GMX_FFT_GPU_3DFFT_SYCL_H
+#define GMX_FFT_GPU_3DFFT_SYCL_H
+
+#include "gpu_3dfft_impl.h"
+
+#include "gromacs/fft/fft.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/gmxmpi.h"
+
+class DeviceContext;
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \internal \brief
+ * A 3D FFT wrapper class for performing R2C/C2R transforms using SYCL. Not yet implemented
+ *
+ * Both the constructor and perform3dFft() throw NotImplementedError; the class
+ * holds no data members.
+ */
+class Gpu3dFft::ImplSycl : public Gpu3dFft::Impl
+{
+public:
+ //! \copydoc Gpu3dFft::Impl::Impl
+ ImplSycl(bool allocateGrids,
+ MPI_Comm comm,
+ ArrayRef<const int> gridSizesInXForEachRank,
+ ArrayRef<const int> gridSizesInYForEachRank,
+ int nz,
+ bool performOutOfPlaceFFT,
+ const DeviceContext& context,
+ const DeviceStream& pmeStream,
+ ivec realGridSize,
+ ivec realGridSizePadded,
+ ivec complexGridSizePadded,
+ DeviceBuffer<float>* realGrid,
+ DeviceBuffer<float>* complexGrid);
+
+ //! \copydoc Gpu3dFft::Impl::~Impl
+ ~ImplSycl() override;
+
+ //! \copydoc Gpu3dFft::Impl::perform3dFft
+ void perform3dFft(gmx_fft_direction dir, CommandEvent* timingEvent) override;
+};
+
+} // namespace gmx
+
+#endif
allocateDeviceBuffer(&realGrid, in_.size(), deviceContext);
allocateDeviceBuffer(&complexGrid, complexGridValues.size(), deviceContext);
- const bool useDecomposition = false;
- const bool performOutOfPlaceFFT = true;
- Gpu3dFft gpu3dFft(realGridSize,
- realGridSizePadded,
- complexGridSizePadded,
- useDecomposition,
+# if GMX_GPU_CUDA
+ const FftBackend backend = FftBackend::Cufft;
+# elif GMX_GPU_OPENCL
+ const FftBackend backend = FftBackend::Ocl;
+# endif
+ const bool performOutOfPlaceFFT = true;
+ const MPI_Comm comm = MPI_COMM_NULL;
+ const bool allocateGrid = false;
+ std::array<int, 1> gridSizesInXForEachRank = { 0 };
+ std::array<int, 1> gridSizesInYForEachRank = { 0 };
+ const int nz = realGridSize[ZZ];
+ Gpu3dFft gpu3dFft(backend,
+ allocateGrid,
+ comm,
+ gridSizesInXForEachRank,
+ gridSizesInYForEachRank,
+ nz,
performOutOfPlaceFFT,
deviceContext,
deviceStream,
- realGrid,
- complexGrid);
+ realGridSize,
+ realGridSizePadded,
+ complexGridSizePadded,
+ &realGrid,
+ &complexGrid);
// Transfer the real grid input data for the FFT
copyToDeviceBuffer(
}
}
-// Construct a string that describes the library that provides FFT support to this build
-const char* getFftDescriptionString()
+//! Construct a string that describes the library that provides CPU FFT support to this build
+const char* getCpuFftDescriptionString()
{
// Define the FFT description string
#if GMX_FFT_FFTW3 || GMX_FFT_ARMPL_FFTW3
#endif
};
+/*! \brief Returns the name of the library that provides GPU FFT support to this build.
+ *
+ * The result depends only on the build configuration macros: "none" when the build
+ * has no GPU support, "cuFFT" for CUDA, "clFFT" for OpenCL and "unknown" for SYCL.
+ *
+ * \returns A pointer to a string literal; the caller must not free it.
+ */
+const char* getGpuFftDescriptionString()
+{
+    if (!(GMX_GPU))
+    {
+        return "none";
+    }
+    if (GMX_GPU_CUDA)
+    {
+        return "cuFFT";
+    }
+    if (GMX_GPU_OPENCL)
+    {
+        return "clFFT";
+    }
+    if (GMX_GPU_SYCL)
+    {
+        return "unknown";
+    }
+    // GMX_GPU is set but none of the known GPU frameworks is: the configuration is inconsistent
+    GMX_RELEASE_ASSERT(false, "Unknown GPU configuration");
+    return "impossible";
+}
+
+
void gmx_print_version_info(gmx::TextWriter* writer)
{
writer->writeLine(formatString("GROMACS version: %s", gmx_version()));
#endif
writer->writeLine(formatString("GPU support: %s", getGpuImplementationString()));
writer->writeLine(formatString("SIMD instructions: %s", GMX_SIMD_STRING));
- writer->writeLine(formatString("FFT library: %s", getFftDescriptionString()));
+ writer->writeLine(formatString("CPU FFT library: %s", getCpuFftDescriptionString()));
+ writer->writeLine(formatString("GPU FFT library: %s", getGpuFftDescriptionString()));
#if GMX_TARGET_X86
writer->writeLine(formatString("RDTSCP usage: %s", GMX_USE_RDTSCP ? "enabled" : "disabled"));
#endif