2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements GPU 3D FFT routines for hipSYCL via rocFFT.
39 * \author Andrey Alekseenko <al42and@gmail.com>
40 * \author Mark Abraham <mark.j.abraham@gmail.com>
42 * For hipSYCL, in order to call FFT APIs from the respective vendors
43 * using the same DeviceStream as other operations, a vendor extension
44 * called "custom operations" is used (see hipSYCL
45 * doc/enqueue-custom-operation.md). That effectively enqueues an
46 * asynchronous host-side lambda into the same queue. The body of the
47 * lambda unpacks the runtime data structures to get the native
48 * handles and calls the native FFT APIs.
50 * For a 3D FFT, rocFFT requires a working buffer which it allocates
51 * itself if not provided. This might be slow enough to be worth
52 * optimizing. This working buffer could be provided in advance by
53 * calling rocfft_plan_get_work_buffer_size, allocating a buffer that
54 * persists suitably, and then using
55 * rocfft_execution_info_set_work_buffer in a custom operation.
57 * hipSYCL queues operate at a higher level of abstraction than hip
58 * streams, with the runtime distributing work to the latter to
59 * balance load. It is possible to set the HIP stream in
60 * rocfft_execution_info, but then there is no guarantee that a
61 * subsequent queue item will run using the same stream. So we
62 * currently do not attempt to set the stream.
69 #include "gpu_3dfft_sycl_rocfft.h"
71 #include "gromacs/utility/enumerationhelpers.h"
72 #include "gromacs/utility/exceptions.h"
76 #include "gromacs/gpu_utils/device_stream.h"
77 #include "gromacs/gpu_utils/devicebuffer.h"
78 #include "gromacs/utility/exceptions.h"
79 #include "gromacs/utility/gmxassert.h"
82 # error This file can only be compiled with hipSYCL enabled
85 #if !defined HIPSYCL_PLATFORM_ROCM
86 # error Only ROCM platform is supported for 3D FFT with hipSYCL
97 //! Model the kinds of 3D FFT implemented
98 enum class FftDirection : int
105 //! Strings that match enum rocfft_status_e in rocfft.h
106 const std::array<const char*, rocfft_status_invalid_work_buffer + 1> c_rocfftErrorStrings = {
109 "invalid argument value",
110 "invalid dimensions",
111 "invalid array type",
115 "invalid work buffer"
118 //! Helper for consistent error handling
119 void handleFftError(rocfft_status result, const std::string& msg)
121 if (result != rocfft_status_success)
123 if (result <= rocfft_status_invalid_work_buffer)
125 GMX_THROW(gmx::InternalError(gmx::formatString(
126 "%s: (error code %d - %s)\n", msg.c_str(), result, c_rocfftErrorStrings[result])));
130 GMX_THROW(gmx::InternalError(gmx::formatString("%s: (error code %d)\n", msg.c_str(), result)));
135 //! Helper for consistent error handling
136 void handleFftError(rocfft_status result, const std::string& direction, const std::string& msg)
138 if (result != rocfft_status_success)
140 handleFftError(result, msg + " doing " + direction);
144 //! Provides RAII-style initialization of rocFFT library
145 class RocfftInitializer
150 rocfft_status result;
151 result = rocfft_setup();
152 handleFftError(result, "rocfft_setup failure");
156 // No need to handle any errors in a destructor, and
157 // anyway one cannot throw.
162 //! All the persistent data for planning an executing a 3D FFT
165 //! Describes details of the data layout
166 rocfft_plan_description description = nullptr;
167 //! High level information about the plan
168 rocfft_plan plan = nullptr;
172 // No need to handle any errors in a destructor,
173 // and anyway one cannot throw.
176 rocfft_plan_destroy(plan);
180 rocfft_plan_description_destroy(description);
185 //! Helper struct to reduce repetitive code setting up a 3D FFT plan
188 //! Format of the input array (real or hermitian)
189 rocfft_array_type arrayType;
190 //! Strides through the input array for the three dimensions
191 std::array<size_t, DIM> strides;
192 //! Total size of the input array (including padding)
196 //! Compute the stride through the real 1D array
197 std::array<size_t, DIM> makeRealStrides(ivec realGridSizePadded)
199 return { 1, size_t(realGridSizePadded[ZZ]), size_t(realGridSizePadded[ZZ] * realGridSizePadded[YY]) };
202 //! Compute the stride through the complex 1D array
203 std::array<size_t, DIM> makeComplexStrides(ivec complexGridSizePadded)
206 size_t(complexGridSizePadded[XX]),
207 size_t(complexGridSizePadded[XX] * complexGridSizePadded[YY]) };
210 //! Compute total grid size
211 size_t computeTotalSize(ivec gridSize)
213 return size_t(gridSize[XX] * gridSize[YY] * gridSize[ZZ]);
216 /*! \brief Prepare plans for the forward and reverse transformation.
218 * Because these require device-side allocations, some of them must be
219 * done in a SYCL queue. */
220 RocfftPlan makePlan(const std::string& descriptiveString,
221 rocfft_transform_type transformType,
222 const PlanSetupData& inputPlanSetupData,
223 const PlanSetupData& outputPlanSetupData,
224 ArrayRef<const size_t> rocfftRealGridSize,
225 const DeviceStream& pmeStream)
227 rocfft_plan_description description = nullptr;
228 rocfft_status result;
229 result = rocfft_plan_description_create(&description);
230 handleFftError(result, descriptiveString, "rocfft_plan_description_create failure");
231 result = rocfft_plan_description_set_data_layout(description,
232 inputPlanSetupData.arrayType,
233 outputPlanSetupData.arrayType,
234 // No offsets are needed
237 inputPlanSetupData.strides.size(),
238 inputPlanSetupData.strides.data(),
239 inputPlanSetupData.totalSize,
240 outputPlanSetupData.strides.size(),
241 outputPlanSetupData.strides.data(),
242 outputPlanSetupData.totalSize);
243 handleFftError(result, descriptiveString, "rocfft_plan_description_set_data_layout failure");
245 // The plan creation depends on the identity of the GPU device, so
246 // we make sure it is made in the same queue where it will be
247 // used. The stream for execution can be set at the same time.
249 // First set up device buffers to receive the rocfft status values
250 rocfft_plan plan = nullptr;
251 cl::sycl::buffer<rocfft_status, 1> resultPlanCreate(1);
253 // Submit the planning to the queue. This is necessary so that we
254 // can ensure that the allocations in the planning go to the right
257 auto queue = pmeStream.stream();
258 // Make a buffer that is a view of the existing memory for a
260 cl::sycl::buffer<rocfft_plan, 1> planView =
261 cl::sycl::make_async_writeback_view(&plan, cl::sycl::range(1), queue);
262 queue.submit([&](cl::sycl::handler& cgh) {
263 // Make the necessary accessors
264 auto a_plan = planView.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
265 auto a_resultPlanCreate =
266 resultPlanCreate.get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
267 cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& /*h*/) {
268 const int numBatches = 1;
269 // Unlike some other FFT APIs, in rocFFT the
270 // dimension of an FFT is the (vectorial) size
271 // of the problem (ie. rocfftRealGridSize), not
272 // the size of the input vector (which varies
273 // according to whether the input format is real
274 // or hermitian respectively for forward or
275 // reverse transforms).
276 a_resultPlanCreate[0] = rocfft_plan_create(&a_plan[0],
277 rocfft_placement_notinplace,
279 rocfft_precision_single,
280 rocfftRealGridSize.size(),
281 rocfftRealGridSize.data(),
287 // Check for errors that happened while running the hipSYCL custom
290 resultPlanCreate.get_host_access()[0], descriptiveString, "rocfft_plan_create failure");
292 return RocfftPlan{ description, plan };
298 class Gpu3dFft::ImplSyclRocfft::Impl
301 //! \copydoc Gpu3dFft::Impl::Impl
302 Impl(bool allocateGrids,
304 ArrayRef<const int> gridSizesInXForEachRank,
305 ArrayRef<const int> gridSizesInYForEachRank,
307 bool performOutOfPlaceFFT,
308 const DeviceContext& context,
309 const DeviceStream& pmeStream,
311 ivec realGridSizePadded,
312 ivec complexGridSizePadded,
313 DeviceBuffer<float>* realGrid,
314 DeviceBuffer<float>* complexGrid);
315 /*! \brief Handle initializing the rocFFT library
317 * Make sure the library is initialized before the plans, etc. and
318 * not destructed before they are. */
319 RocfftInitializer init_;
320 //! Data for 3D FFT plans and execution
321 EnumerationArray<FftDirection, RocfftPlan> plans_;
322 //! Handle to the real grid buffer
323 cl::sycl::buffer<float, 1> realGrid_;
324 //! Handle to the complex grid buffer
325 cl::sycl::buffer<float, 1> complexGrid_;
326 /*! \brief Copy of PME stream
328 * This copy is guaranteed by the SYCL standard to work as if
329 * it was the original. */
330 cl::sycl::queue queue_;
333 Gpu3dFft::ImplSyclRocfft::Impl::Impl(bool allocateGrids,
335 ArrayRef<const int> gridSizesInXForEachRank,
336 ArrayRef<const int> gridSizesInYForEachRank,
338 bool performOutOfPlaceFFT,
339 const DeviceContext& /*context*/,
340 const DeviceStream& pmeStream,
342 ivec realGridSizePadded,
343 ivec complexGridSizePadded,
344 DeviceBuffer<float>* realGrid,
345 DeviceBuffer<float>* complexGrid) :
347 makePlan("real-to-complex",
348 rocfft_transform_type_real_forward,
350 PlanSetupData{ rocfft_array_type_real,
351 makeRealStrides(realGridSizePadded),
352 computeTotalSize(realGridSizePadded) },
354 PlanSetupData{ rocfft_array_type_hermitian_interleaved,
355 makeComplexStrides(complexGridSizePadded),
356 computeTotalSize(complexGridSizePadded) },
357 // Note that rocFFT requires that we reverse the dimension order when planning
358 std::vector<size_t>{ size_t(realGridSize[ZZ]),
359 size_t(realGridSize[YY]),
360 size_t(realGridSize[XX]) },
362 // For rocFFT, the complex-to-real setup is the logical
363 // converse of the real-to-complex. The PlanSetupData objects
364 // are the same, but used in the opposite sense of
366 makePlan("complex-to-real",
367 rocfft_transform_type_real_inverse,
369 PlanSetupData{ rocfft_array_type_hermitian_interleaved,
370 makeComplexStrides(complexGridSizePadded),
371 computeTotalSize(complexGridSizePadded) },
373 PlanSetupData{ rocfft_array_type_real,
374 makeRealStrides(realGridSizePadded),
375 computeTotalSize(realGridSizePadded) },
376 // Note that rocFFT requires that we reverse the dimension order when planning
377 std::vector<size_t>{ size_t(realGridSize[ZZ]),
378 size_t(realGridSize[YY]),
379 size_t(realGridSize[XX]) },
382 realGrid_(*realGrid->buffer_.get()),
383 complexGrid_(*complexGrid->buffer_.get()),
384 queue_(pmeStream.stream())
386 GMX_RELEASE_ASSERT(performOutOfPlaceFFT, "Only out-of-place FFT is implemented in hipSYCL");
387 GMX_RELEASE_ASSERT(allocateGrids == false, "Grids need to be pre-allocated");
388 GMX_RELEASE_ASSERT(gridSizesInXForEachRank.size() == 1 && gridSizesInYForEachRank.size() == 1,
389 "FFT decomposition not implemented with SYCL backend");
392 void Gpu3dFft::ImplSyclRocfft::perform3dFft(gmx_fft_direction dir, CommandEvent* /*timingEvent*/)
394 GMX_RELEASE_ASSERT((dir == GMX_FFT_REAL_TO_COMPLEX) || (dir == GMX_FFT_COMPLEX_TO_REAL),
395 "Only real-to-complex and complex-to-real FFTs are implemented in hipSYCL");
396 FftDirection direction;
397 cl::sycl::buffer<float, 1>*inputGrid = nullptr, *outputGrid = nullptr;
398 if (dir == GMX_FFT_REAL_TO_COMPLEX)
400 direction = FftDirection::RealToComplex;
401 inputGrid = &impl_->realGrid_;
402 outputGrid = &impl_->complexGrid_;
406 direction = FftDirection::ComplexToReal;
407 inputGrid = &impl_->complexGrid_;
408 outputGrid = &impl_->realGrid_;
410 // Enqueue the 3D FFT work
411 impl_->queue_.submit([&](cl::sycl::handler& cgh) {
412 auto inputGridAccessor = inputGrid->get_access(cgh, cl::sycl::read_only, cl::sycl::no_init);
413 auto outputGridAccessor = outputGrid->get_access(cgh, cl::sycl::write_only, cl::sycl::no_init);
414 // Use a hipSYCL custom operation to access the native buffers
415 // needed to call rocFFT
416 cgh.hipSYCL_enqueue_custom_operation([=](cl::sycl::interop_handle& h) {
417 void* d_inputGrid = h.get_native_mem<cl::sycl::backend::hip>(inputGridAccessor);
418 void* d_outputGrid = h.get_native_mem<cl::sycl::backend::hip>(outputGridAccessor);
419 // Don't check results generated asynchronously,
420 // because we don't know what to do with them
421 rocfft_execute(impl_->plans_[direction].plan, &d_inputGrid, &d_outputGrid, nullptr);
426 Gpu3dFft::ImplSyclRocfft::ImplSyclRocfft(bool allocateGrids,
428 ArrayRef<const int> gridSizesInXForEachRank,
429 ArrayRef<const int> gridSizesInYForEachRank,
431 bool performOutOfPlaceFFT,
432 const DeviceContext& context,
433 const DeviceStream& pmeStream,
435 ivec realGridSizePadded,
436 ivec complexGridSizePadded,
437 DeviceBuffer<float>* realGrid,
438 DeviceBuffer<float>* complexGrid) :
439 impl_(std::make_unique<Impl>(allocateGrids,
441 gridSizesInXForEachRank,
442 gridSizesInYForEachRank,
444 performOutOfPlaceFFT,
449 complexGridSizePadded,
455 Gpu3dFft::ImplSyclRocfft::~ImplSyclRocfft() = default;