2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements GPU 3D FFT routines using HeFFTe.
39 * \author Gaurav Garg <gaugarg@nvidia.com>
45 #include "gpu_3dfft_heffte.h"
47 #include "gromacs/gpu_utils/device_stream.h"
48 #include "gromacs/utility/arrayref.h"
49 #include "gromacs/utility/fatalerror.h"
50 #include "gromacs/utility/gmxassert.h"
54 template<typename backend_tag>
55 Gpu3dFft::ImplHeFfte<backend_tag>::ImplHeFfte(bool allocateGrids,
57 ArrayRef<const int> gridSizesInXForEachRank,
58 ArrayRef<const int> gridSizesInYForEachRank,
60 bool performOutOfPlaceFFT,
61 const DeviceContext& /*context*/,
62 const DeviceStream& pmeStream,
64 ivec realGridSizePadded,
65 ivec complexGridSizePadded,
66 DeviceBuffer<float>* realGrid,
67 DeviceBuffer<float>* complexGrid) :
70 const int numDomainsX = gridSizesInXForEachRank.size();
71 const int numDomainsY = gridSizesInYForEachRank.size();
73 GMX_RELEASE_ASSERT(allocateGrids == true, "Grids cannot be pre-allocated");
74 GMX_RELEASE_ASSERT(performOutOfPlaceFFT == true, "Only out-of-place FFT supported");
75 GMX_RELEASE_ASSERT(numDomainsX * numDomainsY > 1,
76 "HeFFTe backend is expected to be used only with more than 1 rank");
78 // calculate grid offsets
79 std::vector<int> gridOffsetsInX(numDomainsX + 1);
80 std::vector<int> gridOffsetsInY(numDomainsY + 1);
82 gridOffsetsInX[0] = 0;
83 for (unsigned int i = 0; i < gridSizesInXForEachRank.size(); ++i)
85 gridOffsetsInX[i + 1] = gridOffsetsInX[i] + gridSizesInXForEachRank[i];
88 gridOffsetsInY[0] = 0;
89 for (unsigned int i = 0; i < gridSizesInYForEachRank.size(); ++i)
91 gridOffsetsInY[i + 1] = gridOffsetsInY[i] + gridSizesInYForEachRank[i];
95 MPI_Comm_rank(comm, &rank);
96 MPI_Comm_size(comm, &nProcs);
98 GMX_RELEASE_ASSERT(nProcs == numDomainsX * numDomainsY,
99 "Mismatch in communicator size and expected domain decomposition");
101 // define how ranks are mapped to 2d domain
102 int procY = rank % numDomainsY;
103 int procX = rank / numDomainsY;
105 // local real grid boxes
106 heffte::box3d<> const realBox = { { 0, gridOffsetsInY[procY], gridOffsetsInX[procX] },
107 { nz - 1, gridOffsetsInY[procY + 1] - 1, gridOffsetsInX[procX + 1] - 1 } };
109 const int nx = gridOffsetsInX[numDomainsX];
110 const int ny = gridOffsetsInY[numDomainsY];
112 // define shape of local complex grid boxes
113 std::vector<int> gridOffsetsInY_transformed(numDomainsX + 1);
114 std::vector<int> gridOffsetsInZ_transformed(numDomainsY + 1);
116 for (int i = 0; i < numDomainsX; i++)
118 gridOffsetsInY_transformed[i] = (i * ny + 0) / numDomainsX;
120 gridOffsetsInY_transformed[numDomainsX] = ny;
122 const int complexZDim = nz / 2 + 1;
123 for (int i = 0; i < numDomainsY; i++)
125 gridOffsetsInZ_transformed[i] = (i * complexZDim + 0) / numDomainsY;
127 gridOffsetsInZ_transformed[numDomainsY] = complexZDim;
129 // output order - YZX
130 // this avoids reordering of data in final fft as final fft is done along x-dimension with
131 // x being contiguous, leave the data as is in YZX order and don't bring it back in XYZ
132 heffte::box3d<> const complexBox = {
133 { gridOffsetsInZ_transformed[procY], gridOffsetsInY_transformed[procX], 0 },
134 { gridOffsetsInZ_transformed[procY + 1] - 1, gridOffsetsInY_transformed[procX + 1] - 1, nx - 1 },
138 // ToDo: useReorder=true and useAlltoall=true gave me best results in past but, verify it once again
139 const bool useReorder = true;
140 const bool useAlltoall = true;
141 const bool usePencils = false; // Not-used as GROMACS doesn't work with brick decomposition
142 heffte::plan_options options(useReorder, useAlltoall, usePencils);
144 // Define 3D FFT plan
145 fftPlan_ = std::make_unique<heffte::fft3d_r2c<backend_tag, int>>(realBox, complexBox, 0, comm, options);
147 // allocate grid and workspace_
148 localRealGrid_ = heffte::gpu::vector<float>(fftPlan_->size_inbox());
149 localComplexGrid_ = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_outbox());
150 workspace_ = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_workspace());
152 // write back the output data
153 *realGrid = localRealGrid_.data();
154 *complexGrid = (float*)localComplexGrid_.data();
156 realGridSize[XX] = gridSizesInXForEachRank[procX];
157 realGridSize[YY] = gridSizesInYForEachRank[procY];
158 realGridSize[ZZ] = nz;
160 realGridSizePadded[XX] = fftPlan_->inbox().size[2];
161 realGridSizePadded[YY] = fftPlan_->inbox().size[1];
162 realGridSizePadded[ZZ] = fftPlan_->inbox().size[0];
164 complexGridSizePadded[XX] = fftPlan_->outbox().size[2];
165 complexGridSizePadded[YY] = fftPlan_->outbox().size[1];
166 complexGridSizePadded[ZZ] = fftPlan_->outbox().size[0];
169 template<typename backend_tag>
170 void Gpu3dFft::ImplHeFfte<backend_tag>::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
172 // HeFFTe does all the computations in the default stream
173 // ToDo: We need some way to create DeviceStream class in GROMACS with default stream
174 // This way we can synchronize PME and default streams using events
175 stream_.synchronize();
179 case GMX_FFT_REAL_TO_COMPLEX:
180 fftPlan_->forward(localRealGrid_.data(), localComplexGrid_.data(), workspace_.data());
182 case GMX_FFT_COMPLEX_TO_REAL:
183 fftPlan_->backward(localComplexGrid_.data(), localRealGrid_.data(), workspace_.data());
186 GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
189 // ToDo: Same as above, we need some way to create DeviceStream from default stream
190 heffte::gpu::synchronize_default_stream();
193 // instantiate relevant HeFFTe backend
195 template class Gpu3dFft::ImplHeFfte<heffte::backend::cufft>;