2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief This file contains internal function definitions for performing the PME calculations on GPU.
39 * These are not meant to be exposed outside of the PME GPU code.
40 * As of now, their bodies are still in the common pme_gpu.cpp files.
42 * \author Aleksei Iupinov <a.yupinov@gmail.com>
43 * \ingroup module_ewald
46 #ifndef GMX_EWALD_PME_GPU_INTERNAL_H
47 #define GMX_EWALD_PME_GPU_INTERNAL_H
49 #include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
50 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
51 #include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
53 #include "pme_gpu_types_host.h"
54 #include "pme_output.h"
57 struct DeviceInformation;
59 class GpuEventSynchronizer;
62 struct gmx_pme_t; // used in pme_gpu_reinit, pme_gpu_getEnergyAndVirial, pme_gpu_getOutput and pme_gpu_wait_finish_task
65 enum class PmeForceOutputHandling;
69 struct PmeGpuSettings;
73 //! Grid index of FEP state A (or unperturbed system)
74 # define FEP_STATE_A 0
77 //! Grid index of FEP state B
78 # define FEP_STATE_B 1
88 //! Type of spline data
89 enum class PmeSplineDataType
92 Derivatives, // dtheta
93 }; // TODO move this into new and shiny pme.h (pme-types.h?)
95 //! PME grid dimension ordering (from major to minor)
96 enum class GridOrdering
102 /*! \libinternal \brief
103 * Returns the block size requirement for the atom data
105 * The GPU version of PME requires that the coordinates array have a
106 * size divisible by the returned number.
108 * \returns Number of atoms in a single GPU atom data chunk, which
109 * determines a minimum divisor of the size of the memory allocated.
111 int pme_gpu_get_atom_data_block_size();
113 /*! \libinternal \brief
114 * Synchronizes the current computation, waiting for the GPU kernels/transfers to finish.
116 * \param[in] pmeGpu The PME GPU structure.
118 GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
120 /*! \libinternal \brief
121 * Allocates the fixed size energy and virial buffer both on GPU and CPU.
123 * \param[in,out] pmeGpu The PME GPU structure.
125 void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu);
127 /*! \libinternal \brief
128 * Frees the energy and virial memory both on GPU and CPU.
130 * \param[in] pmeGpu The PME GPU structure.
132 void pme_gpu_free_energy_virial(PmeGpu* pmeGpu);
134 /*! \libinternal \brief
135 * Clears the energy and virial memory on GPU with 0.
136 * Should be called at the end of PME computation which returned energy/virial.
138 * \param[in] pmeGpu The PME GPU structure.
140 void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu);
142 /*! \libinternal \brief
143 * Reallocates and copies the pre-computed B-spline values to the GPU.
145 * \param[in,out] pmeGpu The PME GPU structure.
146 * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
147 * state or FEP state A and 1 is Coulomb in FEP state B.
149 void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, int gridIndex = 0);
151 /*! \libinternal \brief
152 * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
154 * \param[in] pmeGpu The PME GPU structure.
156 void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu);
158 /*! \libinternal \brief
159 * Reallocates the GPU buffer for the PME forces.
161 * \param[in] pmeGpu The PME GPU structure.
163 void pme_gpu_realloc_forces(PmeGpu* pmeGpu);
165 /*! \libinternal \brief
166 * Frees the GPU buffer for the PME forces.
168 * \param[in] pmeGpu The PME GPU structure.
170 void pme_gpu_free_forces(const PmeGpu* pmeGpu);
172 /*! \libinternal \brief
173 * Copies the forces from the CPU buffer to the GPU (to reduce them with the PME GPU gathered
174 * forces). To be called e.g. after the bonded calculations.
176 * \param[in] pmeGpu The PME GPU structure.
178 void pme_gpu_copy_input_forces(PmeGpu* pmeGpu);
180 /*! \libinternal \brief
181 * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
183 * \param[in] pmeGpu The PME GPU structure.
185 void pme_gpu_copy_output_forces(PmeGpu* pmeGpu);
187 /*! \libinternal \brief
188 * Checks whether work in the PME GPU stream has completed.
190 * \param[in] pmeGpu The PME GPU structure.
192 * \returns True if work in the PME stream has completed.
194 bool pme_gpu_stream_query(const PmeGpu* pmeGpu);
196 /*! \libinternal \brief
197 * Reallocates the buffer on the GPU and copies the charges/coefficients from the CPU buffer.
198 * Clears the padded part if needed.
200 * \param[in] pmeGpu The PME GPU structure.
201 * \param[in] h_coefficients The input atom charges/coefficients.
202 * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
203 * state or FEP state A and 1 is Coulomb in FEP state B.
205 * Does not need to be done for every PME computation, only whenever the local charges change.
206 * (So, in the beginning of the run, or on DD step).
208 void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
209 const float* h_coefficients,
212 /*! \libinternal \brief
213 * Frees the charges/coefficients on the GPU.
215 * \param[in] pmeGpu The PME GPU structure.
217 void pme_gpu_free_coefficients(const PmeGpu* pmeGpu);
219 /*! \libinternal \brief
220 * Reallocates the buffers on the GPU and the host for the atoms spline data.
222 * \param[in,out] pmeGpu The PME GPU structure.
224 void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu);
226 /*! \libinternal \brief
227 * Frees the buffers on the GPU for the atoms spline data.
229 * \param[in] pmeGpu The PME GPU structure.
231 void pme_gpu_free_spline_data(const PmeGpu* pmeGpu);
233 /*! \libinternal \brief
234 * Reallocates the buffers on the GPU and the host for the particle gridline indices.
236 * \param[in,out] pmeGpu The PME GPU structure.
238 void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu);
240 /*! \libinternal \brief
241 * Frees the buffer on the GPU for the particle gridline indices.
243 * \param[in] pmeGpu The PME GPU structure.
245 void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu);
247 /*! \libinternal \brief
248 * Reallocates the real space grid and the complex reciprocal grid (if needed) on the GPU.
250 * \param[in] pmeGpu The PME GPU structure.
252 void pme_gpu_realloc_grids(PmeGpu* pmeGpu);
254 /*! \libinternal \brief
255 * Frees the real space grid and the complex reciprocal grid (if needed) on the GPU.
257 * \param[in] pmeGpu The PME GPU structure.
259 void pme_gpu_free_grids(const PmeGpu* pmeGpu);
261 /*! \libinternal \brief
262 * Clears the real space grid on the GPU.
263 * Should be called at the end of each computation.
265 * \param[in] pmeGpu The PME GPU structure.
267 void pme_gpu_clear_grids(const PmeGpu* pmeGpu);
269 /*! \libinternal \brief
270 * Reallocates and copies the pre-computed fractional coordinates' shifts to the GPU.
272 * \param[in] pmeGpu The PME GPU structure.
274 void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu);
276 /*! \libinternal \brief
277 * Frees the pre-computed fractional coordinates' shifts on the GPU.
279 * \param[in] pmeGpu The PME GPU structure.
281 void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu);
283 /*! \libinternal \brief
284 * Copies the input real-space grid from the host to the GPU.
286 * \param[in] pmeGpu The PME GPU structure.
287 * \param[in] h_grid The host-side grid buffer.
288 * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
289 * state or FEP state A and 1 is Coulomb in FEP state B.
291 void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, int gridIndex = 0);
293 /*! \libinternal \brief
294 * Copies the output real-space grid from the GPU to the host.
296 * \param[in] pmeGpu The PME GPU structure.
297 * \param[out] h_grid The host-side grid buffer.
298 * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
299 * state or FEP state A and 1 is Coulomb in FEP state B.
301 void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, int gridIndex = 0);
303 /*! \libinternal \brief
304 * Copies the spread output spline data and gridline indices from the GPU to the host.
306 * \param[in] pmeGpu The PME GPU structure.
308 void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu);
310 /*! \libinternal \brief
311 * Copies the gather input spline data and gridline indices from the host to the GPU.
313 * \param[in] pmeGpu The PME GPU structure.
315 void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
317 /*! \libinternal \brief
318 * Waits for the grid copying to the host-side buffer after spreading to finish.
320 * \param[in] pmeGpu The PME GPU structure.
322 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
324 /*! \libinternal \brief
325 * Initializes the CUDA FFT structures.
327 * \param[in] pmeGpu The PME GPU structure.
329 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu);
331 /*! \libinternal \brief
332 * Destroys the CUDA FFT structures.
334 * \param[in] pmeGpu The PME GPU structure.
336 void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu);
338 /* The PME stages themselves */
340 /*! \libinternal \brief
341 * A GPU spline computation and charge spreading function.
343 * \param[in] pmeGpu The PME GPU structure.
344 * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are
345 * ready in the device memory; can be nullptr when invoked
346 * on a separate PME rank or from PME tests.
347 * \param[out] h_grids The host-side grid buffers (used only if the result
348 * of the spread is expected on the host, e.g. testing
350 * \param[in] computeSplines Should the computation of spline parameters and gridline
351 * indices be performed.
352 * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
353 * \param[in] lambda The lambda value of the current system state.
354 * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active
355 * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when
356 * direct GPU PME-PP communication is active
358 GPU_FUNC_QUALIFIER void
359 pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
360 GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
361 float** GPU_FUNC_ARGUMENT(h_grids),
362 bool GPU_FUNC_ARGUMENT(computeSplines),
363 bool GPU_FUNC_ARGUMENT(spreadCharges),
364 real GPU_FUNC_ARGUMENT(lambda),
365 bool GPU_FUNC_ARGUMENT(useGpuDirectComm),
366 gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
368 /*! \libinternal \brief
369 * 3D FFT R2C/C2R routine.
371 * \param[in] pmeGpu The PME GPU structure.
372 * \param[in] direction Transform direction (real-to-complex or complex-to-real)
373 * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
374 * state or FEP state A and 1 is Coulomb in FEP state B.
376 void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex = 0);
378 /*! \libinternal \brief
379 * A GPU Fourier space solving function.
381 * \param[in] pmeGpu The PME GPU structure.
382 * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
383 * state or FEP state A and 1 is Coulomb in FEP state B.
384 * \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
385 * \param[in] gridOrdering Specifies the dimension ordering of the complex grid. TODO: store this information?
386 * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed.
388 GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
389 int GPU_FUNC_ARGUMENT(gridIndex),
390 t_complex* GPU_FUNC_ARGUMENT(h_grid),
391 GridOrdering GPU_FUNC_ARGUMENT(gridOrdering),
392 bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM;
394 /*! \libinternal \brief
395 * A GPU force gathering function.
397 * \param[in] pmeGpu The PME GPU structure.
398 * \param[in] h_grids The host-side grid buffers (used only in testing mode).
399 * \param[in] lambda The lambda value to use.
401 GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
402 float** GPU_FUNC_ARGUMENT(h_grids),
403 float GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
406 /*! \brief Sets the device pointer to coordinate data
407 * \param[in] pmeGpu The PME GPU structure.
408 * \param[in] d_x Pointer to coordinate data
410 GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
411 DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
413 /*! \brief Return pointer to device copy of force data.
414 * \param[in] pmeGpu The PME GPU structure.
415 * \returns Pointer to force data
417 GPU_FUNC_QUALIFIER DeviceBuffer<gmx::RVec> pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
418 GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<gmx::RVec>{});
420 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
421 * \param[in] pmeGpu The PME GPU structure.
422 * \returns Pointer to sync object
424 GPU_FUNC_QUALIFIER GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(
425 const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
427 /*! \libinternal \brief
428 * Returns the PME GPU settings
430 * \param[in] pmeGpu The PME GPU structure; must be a valid (non-null) pointer, as it is dereferenced.
431 * \returns A const reference to the \c settings member of \p pmeGpu (not a copy).
433 inline const PmeGpuSettings& pme_gpu_settings(const PmeGpu* pmeGpu)
435 return pmeGpu->settings;
438 /*! \libinternal \brief
439 * Returns the PME GPU staging object
441 * \param[in] pmeGpu The PME GPU structure; must be a valid (non-null) pointer, as it is dereferenced.
442 * \returns A const reference to the \c staging member of \p pmeGpu (not a copy).
444 inline const PmeGpuStaging& pme_gpu_staging(const PmeGpu* pmeGpu)
446 return pmeGpu->staging;
449 /*! \libinternal \brief
450 * Sets whether the PME module is running in testing mode: stores \p testing in settings.copyAllOutputs and selects GpuApiCallBehavior::Sync transfers when testing is on, GpuApiCallBehavior::Async otherwise.
452 * \param[in,out] pmeGpu The PME GPU structure; its settings are modified.
453 * \param[in] testing Whether testing mode is on.
455 inline void pme_gpu_set_testing(PmeGpu* pmeGpu, bool testing)
459 pmeGpu->settings.copyAllOutputs = testing;
460 pmeGpu->settings.transferKind = testing ? GpuApiCallBehavior::Sync : GpuApiCallBehavior::Async;
464 /* A block of C++ functions that live in pme_gpu_internal.cpp */
466 /*! \libinternal \brief
467 * Returns the energy and virial GPU outputs, useful for testing.
469 * It is the caller's responsibility to be aware of whether the GPU
470 * handled the solve stage.
472 * \param[in] pme The PME structure.
473 * \param[in] lambda The lambda value to use when calculating the results.
474 * \param[out] output Pointer to output where energy and virial should be stored.
476 GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
477 float GPU_FUNC_ARGUMENT(lambda),
478 PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM;
480 /*! \libinternal \brief
481 * Returns the GPU outputs (forces, energy and virial)
483 * \param[in] pme The PME structure.
484 * \param[in] computeEnergyAndVirial Whether the energy and virial are being computed
485 * \param[in] lambdaQ The Coulomb lambda to use when finalizing the output.
486 * \returns The output object.
488 GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
489 bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
490 real GPU_FUNC_ARGUMENT(lambdaQ))
491 GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
493 /*! \libinternal \brief
494 * Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
496 * \param[in] pmeGpu The PME GPU structure.
497 * \param[in] box The unit cell box.
499 GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
500 const matrix GPU_FUNC_ARGUMENT(box)) GPU_FUNC_TERM;
502 /*! \libinternal \brief
503 * Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
504 * If forces were computed, they will have arrived at the external host buffer provided to gather.
505 * If virial/energy were computed, they will have arrived into the internal staging buffer
506 * (even though that should have already happened before even launching the gather).
507 * Finally, cudaEvent_t based GPU timers get updated if enabled. They also need stream synchronization for correctness.
508 * Additionally, device-side buffers are cleared asynchronously for the next computation.
510 * \param[in] pmeGpu The PME GPU structure.
512 void pme_gpu_finish_computation(const PmeGpu* pmeGpu);
514 /*! \libinternal \brief
515 * Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
517 * \param[in] pmeGpu The PME GPU structure.
518 * \param[out] gridSize Pointer to the grid dimensions to fill in.
519 * \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
521 GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
522 gmx::IVec* GPU_FUNC_ARGUMENT(gridSize),
523 gmx::IVec* GPU_FUNC_ARGUMENT(paddedGridSize)) GPU_FUNC_TERM;
525 /*! \libinternal \brief
526 * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
528 * \param[in,out] pme The PME structure.
529 * \param[in] deviceContext The GPU context.
530 * \param[in] deviceStream The GPU stream.
531 * \param[in,out] pmeGpuProgram The handle to the program/kernel data created outside (e.g. in unit tests/runner)
533 * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
535 GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
536 const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
537 const DeviceStream* GPU_FUNC_ARGUMENT(deviceStream),
538 const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
540 /*! \libinternal \brief
541 * Destroys the PME GPU data at the end of the run.
543 * \param[in] pmeGpu The PME GPU structure.
545 GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM;
547 /*! \libinternal \brief
548 * Reallocates the local atoms data (charges, coordinates, etc.). Copies the charges to the GPU.
550 * \param[in] pmeGpu The PME GPU structure.
551 * \param[in] nAtoms The number of particles.
552 * \param[in] chargesA The pointer to the host-side array of particle charges in the unperturbed state or FEP state A.
553 * \param[in] chargesB The pointer to the host-side array of particle charges in FEP state B.
555 * This is a function that should only be called in the beginning of the run and on domain
556 * decomposition. Should be called before the pme_gpu_set_io_ranges.
558 GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
559 int GPU_FUNC_ARGUMENT(nAtoms),
560 const real* GPU_FUNC_ARGUMENT(chargesA),
561 const real* GPU_FUNC_ARGUMENT(chargesB) = nullptr) GPU_FUNC_TERM;
563 /*! \brief \libinternal
564 * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
566 * This clears the device-side working buffers in preparation for new computation.
568 * \param[in] pmeGpu The PME GPU structure.
570 void pme_gpu_reinit_computation(const PmeGpu* pmeGpu);
573 * Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
574 * (if they were to be computed).
576 * \param[in] pme The PME data structure.
577 * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed.
578 * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
579 * \param[out] wcycle The wallclock counter.
580 * \return The output forces, energy and virial
582 GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
583 bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
584 real GPU_FUNC_ARGUMENT(lambdaQ),
585 gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle))
586 GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});