SYCL: Avoid using no_init read accessor in rocFFT
[alexxy/gromacs.git] / src / gromacs / fft / gpu_3dfft_sycl_rocfft.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2021, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  *  \brief Implements GPU 3D FFT routines for hipSYCL via rocFFT.
38  *
39  *  \author Andrey Alekseenko <al42and@gmail.com>
40  *  \author Mark Abraham <mark.j.abraham@gmail.com>
41  *
42  * For hipSYCL, in order to call FFT APIs from the respective vendors
43  * using the same DeviceStream as other operations, a vendor extension
44  * called "custom operations" is used (see hipSYCL
45  * doc/enqueue-custom-operation.md). That effectively enqueues an
46  * asynchronous host-side lambda into the same queue. The body of the
47  * lambda unpacks the runtime data structures to get the native
48  * handles and calls the native FFT APIs.
49  *
50  * For a 3D FFT, rocFFT requires a working buffer which it allocates
51  * itself if not provided. This might be slow enough to be worth
52  * optimizing. This working buffer could be provided in advance by
53  * calling rocfft_plan_get_work_buffer_size, allocating a buffer that
54  * persists suitably, and then using
55  * rocfft_execution_info_set_work_buffer in a custom operation.
56  *
57  * hipSYCL queues operate at a higher level of abstraction than hip
58  * streams, with the runtime distributing work to the latter to
59  * balance load. It is possible to set the HIP stream in
60  * rocfft_execution_info, but then there is no guarantee that a
61  * subsequent queue item will run using the same stream. So we
62  * currently do not attempt to set the stream.
63  *
64  *  \ingroup module_fft
65  */
66
67 #include "gmxpre.h"
68
69 #include "gpu_3dfft_sycl_rocfft.h"
70
71 #include "gromacs/utility/enumerationhelpers.h"
72 #include "gromacs/utility/exceptions.h"
73
74 #include <vector>
75
76 #include "gromacs/gpu_utils/device_stream.h"
77 #include "gromacs/gpu_utils/devicebuffer.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/gmxassert.h"
80
81 #ifndef __HIPSYCL__
82 #    error This file can only be compiled with hipSYCL enabled
83 #endif
84
85 #if !defined HIPSYCL_PLATFORM_ROCM
86 #    error Only ROCM platform is supported for 3D FFT with hipSYCL
87 #endif
88
89 #include "rocfft.h"
90
91 namespace gmx
92 {
93
94 namespace
95 {
96
97 //! Model the kinds of 3D FFT implemented
98 enum class FftDirection : int
99 {
100     RealToComplex,
101     ComplexToReal,
102     Count,
103 };
104
105 //! Strings that match enum rocfft_status_e in rocfft.h
106 const std::array<const char*, rocfft_status_invalid_work_buffer + 1> c_rocfftErrorStrings = {
107     "success",
108     "failure",
109     "invalid argument value",
110     "invalid dimensions",
111     "invalid array type",
112     "invalid strides",
113     "invalid distance",
114     "invalid offset",
115     "invalid work buffer"
116 };
117
118 //! Helper for consistent error handling
119 void handleFftError(rocfft_status result, const std::string& msg)
120 {
121     if (result != rocfft_status_success)
122     {
123         if (result <= rocfft_status_invalid_work_buffer)
124         {
125             GMX_THROW(gmx::InternalError(gmx::formatString(
126                     "%s: (error code %d - %s)\n", msg.c_str(), result, c_rocfftErrorStrings[result])));
127         }
128         else
129         {
130             GMX_THROW(gmx::InternalError(gmx::formatString("%s: (error code %d)\n", msg.c_str(), result)));
131         }
132     }
133 }
134
135 //! Helper for consistent error handling
136 void handleFftError(rocfft_status result, const std::string& direction, const std::string& msg)
137 {
138     if (result != rocfft_status_success)
139     {
140         handleFftError(result, msg + " doing " + direction);
141     }
142 }
143
144 //! Provides RAII-style initialization of rocFFT library
145 class RocfftInitializer
146 {
147 public:
148     RocfftInitializer()
149     {
150         rocfft_status result;
151         result = rocfft_setup();
152         handleFftError(result, "rocfft_setup failure");
153     }
154     ~RocfftInitializer()
155     {
156         // No need to handle any errors in a destructor, and
157         // anyway one cannot throw.
158         rocfft_cleanup();
159     }
160 };
161
162 //! All the persistent data for planning an executing a 3D FFT
163 struct RocfftPlan
164 {
165     //! Describes details of the data layout
166     rocfft_plan_description description = nullptr;
167     //! High level information about the plan
168     rocfft_plan plan = nullptr;
169     //! Destructor
170     ~RocfftPlan()
171     {
172         // No need to handle any errors in a destructor,
173         // and anyway one cannot throw.
174         if (plan)
175         {
176             rocfft_plan_destroy(plan);
177         }
178         if (description)
179         {
180             rocfft_plan_description_destroy(description);
181         }
182     }
183 };
184
185 //! Helper struct to reduce repetitive code setting up a 3D FFT plan
186 struct PlanSetupData
187 {
188     //! Format of the input array (real or hermitian)
189     rocfft_array_type arrayType;
190     //! Strides through the input array for the three dimensions
191     std::array<size_t, DIM> strides;
192     //! Total size of the input array (including padding)
193     size_t totalSize;
194 };
195
196 //! Compute the stride through the real 1D array
197 std::array<size_t, DIM> makeRealStrides(ivec realGridSizePadded)
198 {
199     return { 1, size_t(realGridSizePadded[ZZ]), size_t(realGridSizePadded[ZZ] * realGridSizePadded[YY]) };
200 };
201
202 //! Compute the stride through the complex 1D array
203 std::array<size_t, DIM> makeComplexStrides(ivec complexGridSizePadded)
204 {
205     return { 1,
206              size_t(complexGridSizePadded[XX]),
207              size_t(complexGridSizePadded[XX] * complexGridSizePadded[YY]) };
208 }
209
210 //! Compute total grid size
211 size_t computeTotalSize(ivec gridSize)
212 {
213     return size_t(gridSize[XX] * gridSize[YY] * gridSize[ZZ]);
214 }
215
216 /*! \brief Prepare plans for the forward and reverse transformation.
217  *
218  * Because these require device-side allocations, some of them must be
219  * done in a SYCL queue. */
220 RocfftPlan makePlan(const std::string&     descriptiveString,
221                     rocfft_transform_type  transformType,
222                     const PlanSetupData&   inputPlanSetupData,
223                     const PlanSetupData&   outputPlanSetupData,
224                     ArrayRef<const size_t> rocfftRealGridSize,
225                     const DeviceStream&    pmeStream)
226 {
227     rocfft_plan_description description = nullptr;
228     rocfft_status           result;
229     result = rocfft_plan_description_create(&description);
230     handleFftError(result, descriptiveString, "rocfft_plan_description_create failure");
231     result = rocfft_plan_description_set_data_layout(description,
232                                                      inputPlanSetupData.arrayType,
233                                                      outputPlanSetupData.arrayType,
234                                                      // No offsets are needed
235                                                      nullptr,
236                                                      nullptr,
237                                                      inputPlanSetupData.strides.size(),
238                                                      inputPlanSetupData.strides.data(),
239                                                      inputPlanSetupData.totalSize,
240                                                      outputPlanSetupData.strides.size(),
241                                                      outputPlanSetupData.strides.data(),
242                                                      outputPlanSetupData.totalSize);
243     handleFftError(result, descriptiveString, "rocfft_plan_description_set_data_layout failure");
244
245     // The plan creation depends on the identity of the GPU device, so
246     // we make sure it is made in the same queue where it will be
247     // used. The stream for execution can be set at the same time.
248
249     // First set up device buffers to receive the rocfft status values
250     rocfft_plan                        plan = nullptr;
251     cl::sycl::buffer<rocfft_status, 1> resultPlanCreate(1);
252
253     // Submit the planning to the queue. This is necessary so that we
254     // can ensure that the allocations in the planning go to the right
255     // context.
256     {
257         auto queue = pmeStream.stream();
258         // Make a buffer that is a view of the existing memory for a
259         // plan.
260         cl::sycl::buffer<rocfft_plan, 1> planView =
261                 cl::sycl::make_async_writeback_view(&plan, cl::sycl::range(1), queue);
262         queue.submit([&](cl::sycl::handler& cgh) {
263             // Make the necessary accessors
264             auto a_plan = planView.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
265             auto a_resultPlanCreate =
266                     resultPlanCreate.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
267             cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& /*h*/) {
268                 const int numBatches = 1;
269                 // Unlike some other FFT APIs, in rocFFT the
270                 // dimension of an FFT is the (vectorial) size
271                 // of the problem (ie. rocfftRealGridSize), not
272                 // the size of the input vector (which varies
273                 // according to whether the input format is real
274                 // or hermitian respectively for forward or
275                 // reverse transforms).
276                 a_resultPlanCreate[0] = rocfft_plan_create(&a_plan[0],
277                                                            rocfft_placement_notinplace,
278                                                            transformType,
279                                                            rocfft_precision_single,
280                                                            rocfftRealGridSize.size(),
281                                                            rocfftRealGridSize.data(),
282                                                            numBatches,
283                                                            description);
284             });
285         });
286     }
287     // Check for errors that happened while running the hipSYCL custom
288     // operation.
289     handleFftError(
290             resultPlanCreate.get_host_access()[0], descriptiveString, "rocfft_plan_create failure");
291
292     return RocfftPlan{ description, plan };
293 }
294
295 } // namespace
296
297 //! Impl class
298 class Gpu3dFft::ImplSyclRocfft::Impl
299 {
300 public:
301     //! \copydoc Gpu3dFft::Impl::Impl
302     Impl(bool                 allocateGrids,
303          MPI_Comm             comm,
304          ArrayRef<const int>  gridSizesInXForEachRank,
305          ArrayRef<const int>  gridSizesInYForEachRank,
306          const int            nz,
307          bool                 performOutOfPlaceFFT,
308          const DeviceContext& context,
309          const DeviceStream&  pmeStream,
310          ivec                 realGridSize,
311          ivec                 realGridSizePadded,
312          ivec                 complexGridSizePadded,
313          DeviceBuffer<float>* realGrid,
314          DeviceBuffer<float>* complexGrid);
315     /*! \brief Handle initializing the rocFFT library
316      *
317      * Make sure the library is initialized before the plans, etc. and
318      * not destructed before they are. */
319     RocfftInitializer init_;
320     //! Data for 3D FFT plans and execution
321     EnumerationArray<FftDirection, RocfftPlan> plans_;
322     //! Handle to the real grid buffer
323     cl::sycl::buffer<float, 1> realGrid_;
324     //! Handle to the complex grid buffer
325     cl::sycl::buffer<float, 1> complexGrid_;
326     /*! \brief Copy of PME stream
327      *
328      * This copy is guaranteed by the SYCL standard to work as if
329      * it was the original. */
330     cl::sycl::queue queue_;
331 };
332
333 Gpu3dFft::ImplSyclRocfft::Impl::Impl(bool allocateGrids,
334                                      MPI_Comm /*comm*/,
335                                      ArrayRef<const int> gridSizesInXForEachRank,
336                                      ArrayRef<const int> gridSizesInYForEachRank,
337                                      int /*nz*/,
338                                      bool performOutOfPlaceFFT,
339                                      const DeviceContext& /*context*/,
340                                      const DeviceStream&  pmeStream,
341                                      ivec                 realGridSize,
342                                      ivec                 realGridSizePadded,
343                                      ivec                 complexGridSizePadded,
344                                      DeviceBuffer<float>* realGrid,
345                                      DeviceBuffer<float>* complexGrid) :
346     plans_{
347         makePlan("real-to-complex",
348                  rocfft_transform_type_real_forward,
349                  // input
350                  PlanSetupData{ rocfft_array_type_real,
351                                 makeRealStrides(realGridSizePadded),
352                                 computeTotalSize(realGridSizePadded) },
353                  // output
354                  PlanSetupData{ rocfft_array_type_hermitian_interleaved,
355                                 makeComplexStrides(complexGridSizePadded),
356                                 computeTotalSize(complexGridSizePadded) },
357                  // Note that rocFFT requires that we reverse the dimension order when planning
358                  std::vector<size_t>{ size_t(realGridSize[ZZ]),
359                                       size_t(realGridSize[YY]),
360                                       size_t(realGridSize[XX]) },
361                  pmeStream),
362         // For rocFFT, the complex-to-real setup is the logical
363         // converse of the real-to-complex. The PlanSetupData objects
364         // are the same, but used in the opposite sense of
365         // input/output.
366         makePlan("complex-to-real",
367                  rocfft_transform_type_real_inverse,
368                  // input
369                  PlanSetupData{ rocfft_array_type_hermitian_interleaved,
370                                 makeComplexStrides(complexGridSizePadded),
371                                 computeTotalSize(complexGridSizePadded) },
372                  // output
373                  PlanSetupData{ rocfft_array_type_real,
374                                 makeRealStrides(realGridSizePadded),
375                                 computeTotalSize(realGridSizePadded) },
376                  // Note that rocFFT requires that we reverse the dimension order when planning
377                  std::vector<size_t>{ size_t(realGridSize[ZZ]),
378                                       size_t(realGridSize[YY]),
379                                       size_t(realGridSize[XX]) },
380                  pmeStream),
381     },
382     realGrid_(*realGrid->buffer_.get()),
383     complexGrid_(*complexGrid->buffer_.get()),
384     queue_(pmeStream.stream())
385 {
386     GMX_RELEASE_ASSERT(performOutOfPlaceFFT, "Only out-of-place FFT is implemented in hipSYCL");
387     GMX_RELEASE_ASSERT(allocateGrids == false, "Grids need to be pre-allocated");
388     GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
389                        "FFT decomposition not implemented with SYCL backend");
390 }
391
392 void Gpu3dFft::ImplSyclRocfft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
393 {
394     GMX_RELEASE_ASSERT((dir == GMX_FFT_REAL_TO_COMPLEX) || (dir == GMX_FFT_COMPLEX_TO_REAL),
395                        "Only real-to-complex and complex-to-real FFTs are implemented in hipSYCL");
396     FftDirection               direction;
397     cl::sycl::buffer<float, 1>*inputGrid = nullptr, *outputGrid = nullptr;
398     if (dir == GMX_FFT_REAL_TO_COMPLEX)
399     {
400         direction  = FftDirection::RealToComplex;
401         inputGrid  = &impl_->realGrid_;
402         outputGrid = &impl_->complexGrid_;
403     }
404     else
405     {
406         direction  = FftDirection::ComplexToReal;
407         inputGrid  = &impl_->complexGrid_;
408         outputGrid = &impl_->realGrid_;
409     }
410     // Enqueue the 3D FFT work
411     impl_->queue_.submit([&](cl::sycl::handler& cgh) {
412         auto inputGridAccessor = inputGrid->get_access(cgh, cl::sycl::read_only);
413         auto outputGridAccessor = outputGrid->get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
414         // Use a hipSYCL custom operation to access the native buffers
415         // needed to call rocFFT
416         cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& h) {
417             void* d_inputGrid  = h.get_native_mem<cl::sycl::backend::hip>(inputGridAccessor);
418             void* d_outputGrid = h.get_native_mem<cl::sycl::backend::hip>(outputGridAccessor);
419             // Don't check results generated asynchronously,
420             // because we don't know what to do with them
421             rocfft_execute(impl_->plans_[direction].plan, &d_inputGrid, &d_outputGrid, nullptr);
422         });
423     });
424 }
425
426 Gpu3dFft::ImplSyclRocfft::ImplSyclRocfft(bool                 allocateGrids,
427                                          MPI_Comm             comm,
428                                          ArrayRef<const int>  gridSizesInXForEachRank,
429                                          ArrayRef<const int>  gridSizesInYForEachRank,
430                                          const int            nz,
431                                          bool                 performOutOfPlaceFFT,
432                                          const DeviceContext& context,
433                                          const DeviceStream&  pmeStream,
434                                          ivec                 realGridSize,
435                                          ivec                 realGridSizePadded,
436                                          ivec                 complexGridSizePadded,
437                                          DeviceBuffer<float>* realGrid,
438                                          DeviceBuffer<float>* complexGrid) :
439     impl_(std::make_unique<Impl>(allocateGrids,
440                                  comm,
441                                  gridSizesInXForEachRank,
442                                  gridSizesInYForEachRank,
443                                  nz,
444                                  performOutOfPlaceFFT,
445                                  context,
446                                  pmeStream,
447                                  realGridSize,
448                                  realGridSizePadded,
449                                  complexGridSizePadded,
450                                  realGrid,
451                                  complexGrid))
452 {
453 }
454
455 Gpu3dFft::ImplSyclRocfft::~ImplSyclRocfft() = default;
456
457 } // namespace gmx