src/gromacs/fft/gpu_3dfft_heffte.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief Implements GPU 3D FFT routines using HeFFTe.
  38  *
  39  *  \author Gaurav Garg <gaugarg@nvidia.com>
  40  *  \ingroup module_fft
  41  */
  42
  43 #include "gmxpre.h"
  44
  45 #include "gpu_3dfft_heffte.h"
  46
  47 #include "gromacs/gpu_utils/device_stream.h"
  48 #include "gromacs/utility/arrayref.h"
  49 #include "gromacs/utility/fatalerror.h"
  50 #include "gromacs/utility/gmxassert.h"
  51
  52 namespace gmx
  53 {
  54 template<typename backend_tag>
  55 Gpu3dFft::ImplHeFfte<backend_tag>::ImplHeFfte(bool                allocateGrids,
  56                                               MPI_Comm            comm,
  57                                               ArrayRef<const int> gridSizesInXForEachRank,
  58                                               ArrayRef<const int> gridSizesInYForEachRank,
  59                                               const int           nz,
  60                                               bool                performOutOfPlaceFFT,
  61                                               const DeviceContext& /*context*/,
  62                                               const DeviceStream&  pmeStream,
  63                                               ivec                 realGridSize,
  64                                               ivec                 realGridSizePadded,
  65                                               ivec                 complexGridSizePadded,
  66                                               DeviceBuffer<float>* realGrid,
  67                                               DeviceBuffer<float>* complexGrid) :
  68     stream_(pmeStream)
  69 {
  70     const int numDomainsX = gridSizesInXForEachRank.size();
  71     const int numDomainsY = gridSizesInYForEachRank.size();
  72
  73     GMX_RELEASE_ASSERT(allocateGrids == true, "Grids cannot be pre-allocated");
  74     GMX_RELEASE_ASSERT(performOutOfPlaceFFT == true, "Only out-of-place FFT supported");
  75     GMX_RELEASE_ASSERT(numDomainsX * numDomainsY > 1,
  76                        "HeFFTe backend is expected to be used only with more than 1 rank");
  77
  78     // calculate grid offsets
  79     std::vector<int> gridOffsetsInX(numDomainsX + 1);
  80     std::vector<int> gridOffsetsInY(numDomainsY + 1);
  81
  82     gridOffsetsInX[0] = 0;
  83     for (unsigned int i = 0; i < gridSizesInXForEachRank.size(); ++i)
  84     {
  85         gridOffsetsInX[i + 1] = gridOffsetsInX[i] + gridSizesInXForEachRank[i];
  86     }
  87
  88     gridOffsetsInY[0] = 0;
  89     for (unsigned int i = 0; i < gridSizesInYForEachRank.size(); ++i)
  90     {
  91         gridOffsetsInY[i + 1] = gridOffsetsInY[i] + gridSizesInYForEachRank[i];
  92     }
  93
  94     int rank, nProcs;
  95     MPI_Comm_rank(comm, &rank);
  96     MPI_Comm_size(comm, &nProcs);
  97
  98     GMX_RELEASE_ASSERT(nProcs == numDomainsX * numDomainsY,
  99                        "Mismatch in communicator size and expected domain decomposition");
 100
 101     // define how ranks are mapped to 2d domain
 102     int procY = rank % numDomainsY;
 103     int procX = rank / numDomainsY;
 104
 105     // local real grid boxes
 106     heffte::box3d<> const realBox = { { 0, gridOffsetsInY[procY], gridOffsetsInX[procX] },
 107                                       { nz - 1, gridOffsetsInY[procY + 1] - 1, gridOffsetsInX[procX + 1] - 1 } };
 108
 109     const int nx = gridOffsetsInX[numDomainsX];
 110     const int ny = gridOffsetsInY[numDomainsY];
 111
 112     // define shape of local complex grid boxes
 113     std::vector<int> gridOffsetsInY_transformed(numDomainsX + 1);
 114     std::vector<int> gridOffsetsInZ_transformed(numDomainsY + 1);
 115
 116     for (int i = 0; i < numDomainsX; i++)
 117     {
 118         gridOffsetsInY_transformed[i] = (i * ny + 0) / numDomainsX;
 119     }
 120     gridOffsetsInY_transformed[numDomainsX] = ny;
 121
 122     const int complexZDim = nz / 2 + 1;
 123     for (int i = 0; i < numDomainsY; i++)
 124     {
 125         gridOffsetsInZ_transformed[i] = (i * complexZDim + 0) / numDomainsY;
 126     }
 127     gridOffsetsInZ_transformed[numDomainsY] = complexZDim;
 128
 129     // output order - YZX
 130     // this avoids reordering of data in final fft as final fft is done along x-dimension with
 131     // x being contiguous, leave the data as is in YZX order and don't bring it back in XYZ
 132     heffte::box3d<> const complexBox = {
 133         { gridOffsetsInZ_transformed[procY], gridOffsetsInY_transformed[procX], 0 },
 134         { gridOffsetsInZ_transformed[procY + 1] - 1, gridOffsetsInY_transformed[procX + 1] - 1, nx - 1 },
 135         { 2, 0, 1 }
 136     };
 137
 138     // ToDo: useReorder=true and useAlltoall=true gave me best results in past but, verify it once again
 139     const bool useReorder  = true;
 140     const bool useAlltoall = true;
 141     const bool usePencils  = false; // Not-used as GROMACS doesn't work with brick decomposition
 142     heffte::plan_options options(useReorder, useAlltoall, usePencils);
 143
 144     // Define 3D FFT plan
 145     fftPlan_ = std::make_unique<heffte::fft3d_r2c<backend_tag, int>>(realBox, complexBox, 0, comm, options);
 146
 147     // allocate grid and workspace_
 148     localRealGrid_    = heffte::gpu::vector<float>(fftPlan_->size_inbox());
 149     localComplexGrid_ = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_outbox());
 150     workspace_        = heffte::gpu::vector<std::complex<float>>(fftPlan_->size_workspace());
 151
 152     // write back the output data
 153     *realGrid    = localRealGrid_.data();
 154     *complexGrid = (float*)localComplexGrid_.data();
 155
 156     realGridSize[XX] = gridSizesInXForEachRank[procX];
 157     realGridSize[YY] = gridSizesInYForEachRank[procY];
 158     realGridSize[ZZ] = nz;
 159
 160     realGridSizePadded[XX] = fftPlan_->inbox().size[2];
 161     realGridSizePadded[YY] = fftPlan_->inbox().size[1];
 162     realGridSizePadded[ZZ] = fftPlan_->inbox().size[0];
 163
 164     complexGridSizePadded[XX] = fftPlan_->outbox().size[2];
 165     complexGridSizePadded[YY] = fftPlan_->outbox().size[1];
 166     complexGridSizePadded[ZZ] = fftPlan_->outbox().size[0];
 167 }
 168
 169 template<typename backend_tag>
 170 void Gpu3dFft::ImplHeFfte<backend_tag>::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
 171 {
 172     // HeFFTe does all the computations in the default stream
 173     // ToDo: We need some way to create DeviceStream class in GROMACS with default stream
 174     // This way we can synchronize PME and default streams using events
 175     stream_.synchronize();
 176
 177     switch (dir)
 178     {
 179         case GMX_FFT_REAL_TO_COMPLEX:
 180             fftPlan_->forward(localRealGrid_.data(), localComplexGrid_.data(), workspace_.data());
 181             break;
 182         case GMX_FFT_COMPLEX_TO_REAL:
 183             fftPlan_->backward(localComplexGrid_.data(), localRealGrid_.data(), workspace_.data());
 184             break;
 185         default:
 186             GMX_THROW(NotImplementedError("The chosen 3D-FFT case is not implemented on GPUs"));
 187     }
 188
 189     // ToDo: Same as above, we need some way to create DeviceStream from default stream
 190     heffte::gpu::synchronize_default_stream();
 191 }
 192
 193 // instantiate relevant HeFFTe backend
 194 #if GMX_GPU_CUDA
 195 template class Gpu3dFft::ImplHeFfte<heffte::backend::cufft>;
 196 #endif
 197
 198 } // namespace gmx